Compare commits

..

3 Commits

Author SHA1 Message Date
voje
c6b8426fb3 added adjective handling (appending _ to headwords) 2019-04-19 07:41:50 +02:00
af4f6045bb prevent duplicate entries in DB 2019-04-15 20:48:10 +02:00
f0b0abac1b added functors and headwords to db entry 2019-04-15 02:34:53 +02:00
3 changed files with 78 additions and 15 deletions

View File

@@ -107,12 +107,12 @@ class Parser():
yield from self.xml_file_to_generator(self.ssj_file) yield from self.xml_file_to_generator(self.ssj_file)
def parse_xml_file(self, xml_file): def parse_xml_file(self, xml_file):
tstart = time.time() # tstart = time.time()
file_data = [] file_data = []
for tpl in self.xml_file_to_generator(xml_file): for tpl in self.xml_file_to_generator(xml_file):
file_data += [tpl[1]] file_data += [tpl[1]]
tend = time.time() tend = time.time()
self.logger.info("Parsed {} in {:.4f} s".format(xml_file, tend - tstart)) # self.logger.info("Parsed {} in {:.4f} s".format(xml_file, tend - tstart))
return file_data return file_data
def xml_file_to_generator(self, xml_file): def xml_file_to_generator(self, xml_file):
@@ -211,7 +211,7 @@ class Parser():
"text": sentence_text, "text": sentence_text,
"tokens": sentence_tokens, "tokens": sentence_tokens,
"jos_links": jos_links, "jos_links": jos_links,
"srl_links": srl_links_parsed "srl_links": srl_links_parsed,
} }
self.stats["parsed_count"] += 1 self.stats["parsed_count"] += 1
yield (xml_file, sentence_entry) yield (xml_file, sentence_entry)

View File

@@ -1 +1,2 @@
from corpusparser.Parser import Parser from corpusparser.Parser import Parser
from corpusparser.main import enriched_lemma

View File

@@ -4,20 +4,48 @@ import argparse
import logging import logging
import json import json
from pymongo import MongoClient from pymongo import MongoClient
import pymongo
import sys import sys
from multiprocessing import Pool from multiprocessing import Pool
import time
logging.basicConfig(level=logging.INFO) logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
n_kres_files = -1 # for logging n_kres_files = -1 # for logging
def enriched_lemma(token):
    """Return the token's lemma, with "_" appended unless its MSD tag
    starts with "G".

    NOTE(review): "G" presumably selects verbs in the JOS MSD tagset, so
    non-verb headwords get the "_" suffix — confirm against the tagset docs.
    """
    lemma = token["lemma"]
    is_g_category = token["msd"][0] == "G"
    return lemma if is_g_category else lemma + "_"
def _helper_tid_to_token(tid, tokens):
for t in tokens:
if t["tid"] == tid:
return t
return None
def _db_preprocess(e):
    """Add derived "headwords" and "functors" fields to a sentence entry.

    Headwords are the enriched lemmas of the tokens acting as SRL link
    sources (the distinct "from" tids); functors are the distinct SRL
    labels ("afun"). Entries with no SRL links get empty lists.
    Mutates *e* in place and also returns it.
    """
    if e["srl_links"] is None:
        e["headwords"] = []
        e["functors"] = []
    else:
        hw_tids = list(set([x["from"] for x in e["srl_links"]]))
        hw_tokens = [_helper_tid_to_token(tid, e["tokens"]) for tid in hw_tids]
        # Fix: _helper_tid_to_token returns None for a tid that has no
        # matching token; skip those instead of crashing in
        # enriched_lemma(None) on inconsistent annotations.
        e["headwords"] = [enriched_lemma(t) for t in hw_tokens if t is not None]
        e["functors"] = list(set([x["afun"] for x in e["srl_links"]]))
    return e
# handler for concurrency # handler for concurrency
def _handle_kres_file_tpl(kres_file_tpl): def _handle_kres_file_tpl(kres_file_tpl):
tstart = time.time()
kres_file_idx = kres_file_tpl[0] kres_file_idx = kres_file_tpl[0]
kres_file = kres_file_tpl[1] kres_file = kres_file_tpl[1]
logging.info("Handling {} ({}/{})".format(
kres_file, kres_file_idx, n_kres_files))
kres_data = kres_parser.parse_xml_file(kres_file) kres_data = kres_parser.parse_xml_file(kres_file)
if args.output == "file": if args.output == "file":
kres_outdir = outdir / "kres_json" kres_outdir = outdir / "kres_json"
@@ -26,9 +54,43 @@ def _handle_kres_file_tpl(kres_file_tpl):
with kres_outfile.open("w") as fp: with kres_outfile.open("w") as fp:
json.dump(kres_data, fp) json.dump(kres_data, fp)
elif args.output == "db": elif args.output == "db":
# mongoclient needs to be created after forking
dbclient = MongoClient(
"mongodb://{}".format(args.dbaddr),
username=args.dbuser,
password=args.dbpass,
authSource="valdb",
authMechanism='SCRAM-SHA-1'
)
valdb = dbclient.valdb
kres_col = valdb["kres"] kres_col = valdb["kres"]
# HUUUUGE BOTTLENECK
"""
for sentence in kres_data: for sentence in kres_data:
kres_col.update({"sid": sentence["sid"]}, sentence, upsert=True) kres_col.update({"sid": sentence["sid"]}, sentence, upsert=True)
"""
# skip if one of the sentences is already in DB
if kres_col.find({"sid": kres_data[0]["sid"]}).count() > 0:
logging.info("File {} already in DB ({}/{})".format(
kres_file, kres_file_idx, n_kres_files))
return
kres_data_1 = [_db_preprocess(x) for x in kres_data]
kres_col.insert_many(kres_data_1) # much much better (just make sure sid has a unique index)
logging.info("Inserted data from {} ({}/{}) in {:.2f} s".format(
kres_file, kres_file_idx, n_kres_files, time.time() - tstart))
def _get_dbclient(args):
    """Build a MongoClient from the CLI args (dbaddr, dbuser, dbpass),
    authenticating against the "valdb" database with SCRAM-SHA-1."""
    uri = "mongodb://{}".format(args.dbaddr)
    return MongoClient(
        uri,
        username=args.dbuser,
        password=args.dbpass,
        authSource="valdb",
        authMechanism='SCRAM-SHA-1',
    )
if __name__ == "__main__": if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Parsing corpora kres and ssj500k.") parser = argparse.ArgumentParser(description="Parsing corpora kres and ssj500k.")
@@ -44,19 +106,16 @@ if __name__ == "__main__":
args = parser.parse_args() args = parser.parse_args()
outdir = None outdir = None
valdb = None
if args.output == "file": if args.output == "file":
outdir = Path(args.outdir) outdir = Path(args.outdir)
outdir.mkdir(parents=True, exist_ok=True) outdir.mkdir(parents=True, exist_ok=True)
elif args.output == "db": elif args.output == "db":
dbclient = MongoClient( # Force unique sid
"mongodb://{}".format(args.dbaddr), dbclient = _get_dbclient(args)
username=args.dbuser, for corpus in ["kres", "ssj"]:
password=args.dbpass, dbclient.valdb[corpus].ensure_index([("sid", pymongo.ASCENDING)])
authSource="valdb", dbclient.valdb[corpus].ensure_index([("headwords", pymongo.ASCENDING)])
authMechanism='SCRAM-SHA-1' dbclient.valdb[corpus].ensure_index([("functors", pymongo.ASCENDING)])
)
valdb = dbclient.valdb
# SSJ # SSJ
logger.info("Parsing Ssj: {}".format(args.ssj_file)) logger.info("Parsing Ssj: {}".format(args.ssj_file))
@@ -67,8 +126,11 @@ if __name__ == "__main__":
with ssj_outfile.open("w") as fp: with ssj_outfile.open("w") as fp:
json.dump(ssj_data, fp) json.dump(ssj_data, fp)
elif args.output == "db": elif args.output == "db":
dbclient = _get_dbclient(args)
valdb = dbclient.valdb
ssj_col = valdb["ssj"] ssj_col = valdb["ssj"]
for sentence in ssj_data: for sentence in ssj_data:
sentence = _db_preprocess(sentence)
ssj_col.update({"sid": sentence["sid"]}, sentence, upsert=True) ssj_col.update({"sid": sentence["sid"]}, sentence, upsert=True)