added functors and headwords to db entry

This commit is contained in:
voje 2019-04-15 02:34:53 +02:00
parent 86e56767dd
commit f0b0abac1b
2 changed files with 66 additions and 14 deletions

View File

@ -107,12 +107,12 @@ class Parser():
yield from self.xml_file_to_generator(self.ssj_file) yield from self.xml_file_to_generator(self.ssj_file)
def parse_xml_file(self, xml_file): def parse_xml_file(self, xml_file):
tstart = time.time() # tstart = time.time()
file_data = [] file_data = []
for tpl in self.xml_file_to_generator(xml_file): for tpl in self.xml_file_to_generator(xml_file):
file_data += [tpl[1]] file_data += [tpl[1]]
tend = time.time() tend = time.time()
self.logger.info("Parsed {} in {:.4f} s".format(xml_file, tend - tstart)) # self.logger.info("Parsed {} in {:.4f} s".format(xml_file, tend - tstart))
return file_data return file_data
def xml_file_to_generator(self, xml_file): def xml_file_to_generator(self, xml_file):
@ -211,7 +211,7 @@ class Parser():
"text": sentence_text, "text": sentence_text,
"tokens": sentence_tokens, "tokens": sentence_tokens,
"jos_links": jos_links, "jos_links": jos_links,
"srl_links": srl_links_parsed "srl_links": srl_links_parsed,
} }
self.stats["parsed_count"] += 1 self.stats["parsed_count"] += 1
yield (xml_file, sentence_entry) yield (xml_file, sentence_entry)

View File

@ -4,20 +4,44 @@ import argparse
import logging import logging
import json import json
from pymongo import MongoClient from pymongo import MongoClient
import pymongo
import sys import sys
from multiprocessing import Pool from multiprocessing import Pool
import time
logging.basicConfig(level=logging.INFO) logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
n_kres_files = -1 # for logging n_kres_files = -1 # for logging
def _helper_tid_to_token(tid, tokens):
for t in tokens:
if t["tid"] == tid:
return t
return None
def _db_preprocess(e):
if e["srl_links"] is None:
e["headwords"] = []
e["functors"] = []
else:
hw_tids = list(set([x["from"] for x in e["srl_links"]]))
hw_tokens = [_helper_tid_to_token(tid, e["tokens"]) for tid in hw_tids]
headwords = [(t["lemma"] if t["msd"][0] == "G" else t["lemma"] + "_") for t in hw_tokens]
e["headwords"] = headwords
functors = list(set([x["afun"] for x in e["srl_links"]]))
e["functors"] = functors
return e
# handler for concurrency # handler for concurrency
def _handle_kres_file_tpl(kres_file_tpl): def _handle_kres_file_tpl(kres_file_tpl):
tstart = time.time()
kres_file_idx = kres_file_tpl[0] kres_file_idx = kres_file_tpl[0]
kres_file = kres_file_tpl[1] kres_file = kres_file_tpl[1]
logging.info("Handling {} ({}/{})".format(
kres_file, kres_file_idx, n_kres_files))
kres_data = kres_parser.parse_xml_file(kres_file) kres_data = kres_parser.parse_xml_file(kres_file)
if args.output == "file": if args.output == "file":
kres_outdir = outdir / "kres_json" kres_outdir = outdir / "kres_json"
@ -26,9 +50,37 @@ def _handle_kres_file_tpl(kres_file_tpl):
with kres_outfile.open("w") as fp: with kres_outfile.open("w") as fp:
json.dump(kres_data, fp) json.dump(kres_data, fp)
elif args.output == "db": elif args.output == "db":
# mongoclient needs to be created after forking
dbclient = MongoClient(
"mongodb://{}".format(args.dbaddr),
username=args.dbuser,
password=args.dbpass,
authSource="valdb",
authMechanism='SCRAM-SHA-1'
)
valdb = dbclient.valdb
kres_col = valdb["kres"] kres_col = valdb["kres"]
# HUUUUGE BOTTLENECK
"""
for sentence in kres_data: for sentence in kres_data:
kres_col.update({"sid": sentence["sid"]}, sentence, upsert=True) kres_col.update({"sid": sentence["sid"]}, sentence, upsert=True)
"""
kres_data_1 = [_db_preprocess(x) for x in kres_data]
kres_col.insert_many(kres_data_1) # much much better (just make sure sid has a unique index)
logging.info("Handled {} ({}/{}) in {:.2f} s".format(
kres_file, kres_file_idx, n_kres_files, time.time() - tstart))
def _get_dbclient(args):
    """Build a MongoClient from the CLI arguments.

    *args* must provide dbaddr, dbuser and dbpass; the client
    authenticates against the "valdb" auth source via SCRAM-SHA-1.
    """
    uri = "mongodb://{}".format(args.dbaddr)
    return MongoClient(
        uri,
        username=args.dbuser,
        password=args.dbpass,
        authSource="valdb",
        authMechanism='SCRAM-SHA-1',
    )
if __name__ == "__main__": if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Parsing corpora kres and ssj500k.") parser = argparse.ArgumentParser(description="Parsing corpora kres and ssj500k.")
@ -44,19 +96,16 @@ if __name__ == "__main__":
args = parser.parse_args() args = parser.parse_args()
outdir = None outdir = None
valdb = None
if args.output == "file": if args.output == "file":
outdir = Path(args.outdir) outdir = Path(args.outdir)
outdir.mkdir(parents=True, exist_ok=True) outdir.mkdir(parents=True, exist_ok=True)
elif args.output == "db": elif args.output == "db":
dbclient = MongoClient( # Force unique sid
"mongodb://{}".format(args.dbaddr), dbclient = _get_dbclient(args)
username=args.dbuser, for corpus in ["kres", "ssj"]:
password=args.dbpass, dbclient.valdb[corpus].ensure_index([("sid", pymongo.ASCENDING)])
authSource="valdb", dbclient.valdb[corpus].ensure_index([("headwords", pymongo.ASCENDING)])
authMechanism='SCRAM-SHA-1' dbclient.valdb[corpus].ensure_index([("functors", pymongo.ASCENDING)])
)
valdb = dbclient.valdb
# SSJ # SSJ
logger.info("Parsing Ssj: {}".format(args.ssj_file)) logger.info("Parsing Ssj: {}".format(args.ssj_file))
@ -67,8 +116,11 @@ if __name__ == "__main__":
with ssj_outfile.open("w") as fp: with ssj_outfile.open("w") as fp:
json.dump(ssj_data, fp) json.dump(ssj_data, fp)
elif args.output == "db": elif args.output == "db":
dbclient = _get_dbclient(args)
valdb = dbclient.valdb
ssj_col = valdb["ssj"] ssj_col = valdb["ssj"]
for sentence in ssj_data: for sentence in ssj_data:
sentence = _db_preprocess(sentence)
ssj_col.update({"sid": sentence["sid"]}, sentence, upsert=True) ssj_col.update({"sid": sentence["sid"]}, sentence, upsert=True)