Add functors and headwords to the db entry

This commit is contained in:
voje 2019-04-15 02:34:53 +02:00
parent 86e56767dd
commit f0b0abac1b
2 changed files with 66 additions and 14 deletions

View File

@ -107,12 +107,12 @@ class Parser():
yield from self.xml_file_to_generator(self.ssj_file)
def parse_xml_file(self, xml_file):
tstart = time.time()
# tstart = time.time()
file_data = []
for tpl in self.xml_file_to_generator(xml_file):
file_data += [tpl[1]]
tend = time.time()
self.logger.info("Parsed {} in {:.4f} s".format(xml_file, tend - tstart))
# self.logger.info("Parsed {} in {:.4f} s".format(xml_file, tend - tstart))
return file_data
def xml_file_to_generator(self, xml_file):
@ -211,7 +211,7 @@ class Parser():
"text": sentence_text,
"tokens": sentence_tokens,
"jos_links": jos_links,
"srl_links": srl_links_parsed
"srl_links": srl_links_parsed,
}
self.stats["parsed_count"] += 1
yield (xml_file, sentence_entry)

View File

@ -4,20 +4,44 @@ import argparse
import logging
import json
from pymongo import MongoClient
import pymongo
import sys
from multiprocessing import Pool
import time
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
n_kres_files = -1 # for logging
def _helper_tid_to_token(tid, tokens):
for t in tokens:
if t["tid"] == tid:
return t
return None
def _db_preprocess(e):
if e["srl_links"] is None:
e["headwords"] = []
e["functors"] = []
else:
hw_tids = list(set([x["from"] for x in e["srl_links"]]))
hw_tokens = [_helper_tid_to_token(tid, e["tokens"]) for tid in hw_tids]
headwords = [(t["lemma"] if t["msd"][0] == "G" else t["lemma"] + "_") for t in hw_tokens]
e["headwords"] = headwords
functors = list(set([x["afun"] for x in e["srl_links"]]))
e["functors"] = functors
return e
# handler for concurrency
def _handle_kres_file_tpl(kres_file_tpl):
tstart = time.time()
kres_file_idx = kres_file_tpl[0]
kres_file = kres_file_tpl[1]
logging.info("Handling {} ({}/{})".format(
kres_file, kres_file_idx, n_kres_files))
kres_data = kres_parser.parse_xml_file(kres_file)
if args.output == "file":
kres_outdir = outdir / "kres_json"
@ -26,9 +50,37 @@ def _handle_kres_file_tpl(kres_file_tpl):
with kres_outfile.open("w") as fp:
json.dump(kres_data, fp)
elif args.output == "db":
# mongoclient needs to be created after forking
dbclient = MongoClient(
"mongodb://{}".format(args.dbaddr),
username=args.dbuser,
password=args.dbpass,
authSource="valdb",
authMechanism='SCRAM-SHA-1'
)
valdb = dbclient.valdb
kres_col = valdb["kres"]
# HUUUUGE BOTTLENECK
"""
for sentence in kres_data:
kres_col.update({"sid": sentence["sid"]}, sentence, upsert=True)
"""
kres_data_1 = [_db_preprocess(x) for x in kres_data]
kres_col.insert_many(kres_data_1) # much much better (just make sure sid has a unique index)
logging.info("Handled {} ({}/{}) in {:.2f} s".format(
kres_file, kres_file_idx, n_kres_files, time.time() - tstart))
def _get_dbclient(args):
    """Create a MongoClient authenticated against the "valdb" database.

    Connection details (address, user, password) come from the parsed
    CLI *args*. Must be called after forking when used with workers.
    """
    uri = "mongodb://{}".format(args.dbaddr)
    return MongoClient(
        uri,
        username=args.dbuser,
        password=args.dbpass,
        authSource="valdb",
        authMechanism='SCRAM-SHA-1',
    )
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Parsing corpora kres and ssj500k.")
@ -44,19 +96,16 @@ if __name__ == "__main__":
args = parser.parse_args()
outdir = None
valdb = None
if args.output == "file":
outdir = Path(args.outdir)
outdir.mkdir(parents=True, exist_ok=True)
elif args.output == "db":
dbclient = MongoClient(
"mongodb://{}".format(args.dbaddr),
username=args.dbuser,
password=args.dbpass,
authSource="valdb",
authMechanism='SCRAM-SHA-1'
)
valdb = dbclient.valdb
# Force unique sid
dbclient = _get_dbclient(args)
for corpus in ["kres", "ssj"]:
dbclient.valdb[corpus].ensure_index([("sid", pymongo.ASCENDING)])
dbclient.valdb[corpus].ensure_index([("headwords", pymongo.ASCENDING)])
dbclient.valdb[corpus].ensure_index([("functors", pymongo.ASCENDING)])
# SSJ
logger.info("Parsing Ssj: {}".format(args.ssj_file))
@ -67,8 +116,11 @@ if __name__ == "__main__":
with ssj_outfile.open("w") as fp:
json.dump(ssj_data, fp)
elif args.output == "db":
dbclient = _get_dbclient(args)
valdb = dbclient.valdb
ssj_col = valdb["ssj"]
for sentence in ssj_data:
sentence = _db_preprocess(sentence)
ssj_col.update({"sid": sentence["sid"]}, sentence, upsert=True)