Compare commits

...

12 Commits

3 changed files with 172 additions and 67 deletions

View File

@@ -5,7 +5,7 @@ from lxml import etree
import logging import logging
import time import time
logging.basicConfig(level=logging.INFO) # logging.basicConfig(level=logging.INFO)
# Read input file(.xml, .json; kres or ssj500k). # Read input file(.xml, .json; kres or ssj500k).
# Create an iterator that outputs resulting sentences (python dict format). # Create an iterator that outputs resulting sentences (python dict format).
@@ -61,7 +61,10 @@ class Parser():
def parse_any_links_ssj(self, sent_el, links_type): def parse_any_links_ssj(self, sent_el, links_type):
lgrps = sent_el.findall(".//linkGrp") lgrps = sent_el.findall(".//linkGrp")
links = [x for x in lgrps if x.get("type") == links_type][0] try:
links = [x for x in lgrps if x.get("type") == links_type][0]
except:
return []
res_links = [] res_links = []
for link in links: for link in links:
tar = self.parse_ssj_target_arg(link.get("target")) tar = self.parse_ssj_target_arg(link.get("target"))
@@ -107,12 +110,12 @@ class Parser():
yield from self.xml_file_to_generator(self.ssj_file) yield from self.xml_file_to_generator(self.ssj_file)
def parse_xml_file(self, xml_file): def parse_xml_file(self, xml_file):
tstart = time.time() # tstart = time.time()
file_data = [] file_data = []
for tpl in self.xml_file_to_generator(xml_file): for tpl in self.xml_file_to_generator(xml_file):
file_data += [tpl[1]] file_data += [tpl[1]]
tend = time.time() tend = time.time()
self.logger.info("Parsed {} in {:.4f} s".format(xml_file, tend - tstart)) # self.logger.info("Parsed {} in {:.4f} s".format(xml_file, tend - tstart))
return file_data return file_data
def xml_file_to_generator(self, xml_file): def xml_file_to_generator(self, xml_file):
@@ -211,7 +214,7 @@ class Parser():
"text": sentence_text, "text": sentence_text,
"tokens": sentence_tokens, "tokens": sentence_tokens,
"jos_links": jos_links, "jos_links": jos_links,
"srl_links": srl_links_parsed "srl_links": srl_links_parsed,
} }
self.stats["parsed_count"] += 1 self.stats["parsed_count"] += 1
yield (xml_file, sentence_entry) yield (xml_file, sentence_entry)

View File

@@ -1 +1,2 @@
from corpusparser.Parser import Parser from corpusparser.Parser import Parser
from corpusparser.main import enriched_lemma

View File

@@ -4,51 +4,62 @@ import argparse
import logging import logging
import json import json
from pymongo import MongoClient from pymongo import MongoClient
import pymongo
import sys import sys
from multiprocessing import Pool from multiprocessing import Pool
import time
logging.basicConfig(level=logging.INFO) CORPORA = ["kres", "ssj"]
# logging.basicConfig(filename=Path("/var/tmp/corpusparser.log"), filemode='a', level=logging.INFO)
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
n_kres_files = -1 # for logging # lfh = logging.FileHandler("/project/logs/fill-database.log")
lfh = logging.StreamHandler(sys.stdout)
formatter = logging.Formatter("%(asctime)s %(levelname)s %(message)s")
lfh.setFormatter(formatter)
logger.addHandler(lfh)
logger.setLevel(logging.INFO)
n_chunks = -1
def enriched_lemma(token):
    """Return the lemma of ``token``, marked with a trailing "_" unless its MSD starts with "G".

    Args:
        token: mapping that provides at least the keys ``"lemma"`` and ``"msd"``.

    Returns:
        ``token["lemma"]`` unchanged when the first element of ``token["msd"]``
        is ``"G"``; otherwise ``token["lemma"] + "_"``.

    Raises:
        KeyError: if ``"msd"`` or ``"lemma"`` is missing from ``token``.
    """
    # NOTE(review): "G" is presumably the verb category of the corpus' MSD
    # tagset -- confirm against the tagset documentation.
    msd = token["msd"]
    # An empty tag cannot start with "G"; guard it instead of letting
    # msd[0] raise IndexError.
    starts_with_g = bool(msd) and msd[0] == "G"
    return token["lemma"] if starts_with_g else token["lemma"] + "_"
def _helper_tid_to_token(tid, tokens):
for t in tokens:
if t["tid"] == tid:
return t
return None
def _db_preprocess(e):
    """Add the derived ``"headwords"`` and ``"functors"`` fields to a sentence entry.

    Args:
        e: sentence entry (mapping) with the keys ``"srl_links"`` and
            ``"tokens"``. Each SRL link is read through its ``"from"`` and
            ``"afun"`` keys; each token through ``"tid"`` (plus whatever
            ``enriched_lemma`` reads).

    Returns:
        The same object ``e``, modified in place:
        * ``e["headwords"]`` -- ``enriched_lemma`` of the token referenced by
          each distinct ``"from"`` tid, in first-seen order;
        * ``e["functors"]`` -- the distinct ``"afun"`` values, in first-seen
          order.
        Both are empty lists when ``e["srl_links"]`` is ``None``.
    """
    links = e["srl_links"]
    if links is None:
        e["headwords"] = []
        e["functors"] = []
        return e

    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # stored lists are reproducible between runs (set() ordering is not).
    hw_tids = list(dict.fromkeys(link["from"] for link in links))

    headwords = []
    for tid in hw_tids:
        token = _helper_tid_to_token(tid, e["tokens"])
        if token is None:
            # A link pointing at a tid that is absent from this sentence would
            # otherwise crash enriched_lemma(None); skip it and leave a trace.
            logger.warning(
                "Sentence %s: srl link refers to unknown token id %s, skipping",
                e.get("sid"), tid)
            continue
        headwords.append(enriched_lemma(token))
    e["headwords"] = headwords

    e["functors"] = list(dict.fromkeys(link["afun"] for link in links))
    return e
# handler for concurrency # handler for concurrency
def _handle_kres_file_tpl(kres_file_tpl): def _handle_kres_file_chunk(kres_file_chunk):
kres_file_idx = kres_file_tpl[0] tstart = time.time()
kres_file = kres_file_tpl[1] kres_chunk_idx = kres_file_chunk[0]
logging.info("Handling {} ({}/{})".format( kres_chunk = kres_file_chunk[1]
kres_file, kres_file_idx, n_kres_files))
kres_data = kres_parser.parse_xml_file(kres_file)
if args.output == "file":
kres_outdir = outdir / "kres_json"
kres_outdir.mkdir(parents=True, exist_ok=True)
kres_outfile = kres_outdir / Path(kres_file.name.split(".")[0]).with_suffix(".json")
with kres_outfile.open("w") as fp:
json.dump(kres_data, fp)
elif args.output == "db":
kres_col = valdb["kres"]
for sentence in kres_data:
kres_col.update({"sid": sentence["sid"]}, sentence, upsert=True)
if __name__ == "__main__": dbclient = None
parser = argparse.ArgumentParser(description="Parsing corpora kres and ssj500k.") db_payload = []
parser.add_argument('--kres-folder', required=True) if args.output == "db":
parser.add_argument('--kres-srl-folder', required=True) # mongoclient needs to be created after forking
parser.add_argument('--ssj-file', required=True)
parser.add_argument('--output', required=False, default=None)
parser.add_argument('--outdir', required=False, default=None)
parser.add_argument('--dbaddr', required=False, default=None)
parser.add_argument('--dbuser', required=False, default=None)
parser.add_argument('--dbpass', required=False, default=None)
parser.add_argument('--cores', required=False, default=1)
args = parser.parse_args()
outdir = None
valdb = None
if args.output == "file":
outdir = Path(args.outdir)
outdir.mkdir(parents=True, exist_ok=True)
elif args.output == "db":
dbclient = MongoClient( dbclient = MongoClient(
"mongodb://{}".format(args.dbaddr), "mongodb://{}".format(args.dbaddr),
username=args.dbuser, username=args.dbuser,
@@ -56,35 +67,125 @@ if __name__ == "__main__":
authSource="valdb", authSource="valdb",
authMechanism='SCRAM-SHA-1' authMechanism='SCRAM-SHA-1'
) )
valdb = dbclient.valdb # dbclient.valdb["kres"]
for kres_file in kres_chunk:
try:
kres_data = kres_parser.parse_xml_file(kres_file)
except:
logger.error("Failed to parse file: {}".format(kres_file))
continue
if args.output == "file":
kres_outdir = outdir / "kres_json"
kres_outdir.mkdir(parents=True, exist_ok=True)
kres_outfile = kres_outdir / Path(kres_file.name.split(".")[0]).with_suffix(".json")
with kres_outfile.open("w") as fp:
json.dump(kres_data, fp)
# SSJ elif args.output == "db":
logger.info("Parsing Ssj: {}".format(args.ssj_file)) """
ssj_parser = Parser(corpus="ssj") if dbclient.valdb["kres"].find({"sid": kres_data[0]["sid"]}).count() > 0:
ssj_data = ssj_parser.parse_xml_file(Path(args.ssj_file)) logger.info("File {} already in DB, closing chunk ({}/{})".format(
if args.output == "file": kres_file, kres_chunk_idx, n_chunks))
ssj_outfile = outdir / "ssj500k.json" dbclient.close()
with ssj_outfile.open("w") as fp: return
json.dump(ssj_data, fp) """
elif args.output == "db":
ssj_col = valdb["ssj"]
for sentence in ssj_data:
ssj_col.update({"sid": sentence["sid"]}, sentence, upsert=True)
kres_data_1 = [_db_preprocess(x) for x in kres_data]
db_payload += kres_data_1
# Kres try:
logger.info("Parsing Kres: {}".format(args.kres_folder)) dbclient.valdb["kres"].insert_many(db_payload, ordered=False) # much much better (just make sure sid has a unique index)
kres_parser = Parser( except:
corpus="kres", logger.error("Failed inserting kres files chunk ({}/{}) in {:.2f} s".format(
kres_srl_folder=args.kres_srl_folder kres_chunk_idx, n_chunks, time.time() - tstart))
logger.info("Db insert: chunks ({}/{}) in {:.2f} s".format(
kres_chunk_idx, n_chunks, time.time() - tstart))
dbclient.close()
def _get_dbclient(args):
dbclient = MongoClient(
"mongodb://{}".format(args.dbaddr),
username=args.dbuser,
password=args.dbpass,
authSource="valdb",
authMechanism='SCRAM-SHA-1'
) )
return dbclient
# [(idx, filepath)]
kres_files = [x for x in Path(args.kres_folder).iterdir()]
kres_files = [x for x in enumerate(kres_files)]
n_kres_files = len(kres_files)
p = Pool(int(args.cores)) if __name__ == "__main__":
p.map(_handle_kres_file_tpl, kres_files) parser = argparse.ArgumentParser(description="Parsing corpora kres and ssj500k.")
parser.add_argument('--corpus', required=True)
parser.add_argument('--kres-folder', required=False)
parser.add_argument('--kres-srl-folder', required=False)
parser.add_argument('--ssj-file', required=False)
parser.add_argument('--output', required=False, default=None)
parser.add_argument('--outdir', required=False, default=None)
parser.add_argument('--dbaddr', required=False, default=None)
parser.add_argument('--dbuser', required=False, default=None)
parser.add_argument('--dbpass', required=False, default=None)
parser.add_argument('--cores', required=False, default=1)
parser.add_argument('--chunk-size', required=False, default=3)
args = parser.parse_args()
corpus = args.corpus
assert (corpus in CORPORA), "Wrong corpus name."
outdir = None
if args.output == "file":
outdir = Path(args.outdir)
outdir.mkdir(parents=True, exist_ok=True)
elif args.output == "db":
dbclient = _get_dbclient(args)
dbclient.valdb[corpus].ensure_index([("sid", pymongo.ASCENDING)])
dbclient.valdb[corpus].ensure_index([("headwords", pymongo.ASCENDING)])
dbclient.valdb[corpus].ensure_index([("functors", pymongo.ASCENDING)])
dbclient.close()
if corpus == "ssj":
logger.info("Parsing Ssj: {}".format(args.ssj_file))
ssj_parser = Parser(logger=logger, corpus="ssj")
ssj_data = ssj_parser.parse_xml_file(Path(args.ssj_file))
if args.output == "file":
ssj_outfile = outdir / "ssj500k.json"
with ssj_outfile.open("w") as fp:
json.dump(ssj_data, fp)
elif args.output == "db":
dbclient = _get_dbclient(args)
valdb = dbclient.valdb
ssj_col = valdb["ssj"]
for sentence in ssj_data:
sentence = _db_preprocess(sentence)
ssj_col.update({"sid": sentence["sid"]}, sentence, upsert=True)
dbclient.close()
if corpus == "kres":
# Kres
logger.info("Parsing Kres: {}".format(args.kres_folder))
kres_parser = Parser(
logger=logger,
corpus="kres",
kres_srl_folder=args.kres_srl_folder
)
kres_files = [x for x in Path(args.kres_folder).iterdir()]
kres_files = sorted(kres_files, key=lambda x: x.name)
kres_files_chunks = []
i = 0
while i < len(kres_files):
# kres_files_chunks += kres_files[i:(i+args.chunk_size)]
new_i = i + int(args.chunk_size)
kres_files_chunks += [kres_files[i:new_i]]
i = new_i
kres_files_chunks = [x for x in enumerate(kres_files_chunks)]
n_chunks = len(kres_files_chunks)
p = Pool(int(args.cores))
p.map(_handle_kres_file_chunk, kres_files_chunks)
logger.info("Finished parsing.") logger.info("Finished parsing.")