diff --git a/src/pkg/corpusparser/corpusparser/Parser.py b/src/pkg/corpusparser/corpusparser/Parser.py index 476a87c..d908d04 100644 --- a/src/pkg/corpusparser/corpusparser/Parser.py +++ b/src/pkg/corpusparser/corpusparser/Parser.py @@ -10,7 +10,7 @@ logging.basicConfig(level=logging.INFO) # Create an iterator that outputs resulting sentences (python dict format). class Parser(): - def __init__(self, corpus, infiles, output=None, outdir=None, dbaddr=None, logger=None): + def __init__(self, corpus, infiles, logger=None): if corpus == "kres": self.kres_folder = Path(infiles[0]) @@ -20,13 +20,6 @@ class Parser(): else: raise ValueError("Argument corpus should be 'ssj' or 'kres'.") - self.output = output # None | file | db - if self.output == "file": - self.outdir = Path(outdir) - self.outdir.mkdir(parents=True, exist_ok=True) - elif self.output == "db": - self.dbaddr = "TODO" - self.corpus = corpus self.W_TAGS = ['w'] self.C_TAGS = ['c'] diff --git a/src/pkg/corpusparser/corpusparser/main.py b/src/pkg/corpusparser/corpusparser/main.py index cb23c8c..b6dd803 100644 --- a/src/pkg/corpusparser/corpusparser/main.py +++ b/src/pkg/corpusparser/corpusparser/main.py @@ -1,3 +1,4 @@ +from pathlib import Path from corpusparser import Parser import argparse import logging @@ -8,37 +9,54 @@ logger = logging.getLogger(__name__) ## Main handles command line arguments and writing to files / DB. 
+def ssj_to_json_file(sentence_generator, outfile):
+    # this function is based on the fact that files are parsed sequentially
+    data_buffer = []
+    for s in sentence_generator:
+        sdata = s[1]
+        data_buffer.append(sdata)
+
+    # Dump the data part (s[1]) of every buffered sentence as one JSON list.
+    with open(outfile, "w") as fp:
+        logger.info("Writing to {}".format(outfile))
+        json.dump(data_buffer, fp)
+
+def kres_to_json_files():  # placeholder: not implemented yet
+    return "TODO"
+
+def to_db():  # placeholder: not implemented yet
+    return "TODO"
+
 if __name__ == "__main__":
-	parser = argparse.ArgumentParser(description="Parsing corpora kres and ssj500k.")
-	parser.add_argument('--kres-folder', required=True)
-	parser.add_argument('--kres-srl-folder', required=True)
-	parser.add_argument('--ssj-file', required=True)
-	parser.add_argument('--output', required=False, default=None)
-	parser.add_argument('--outdir', required=False, default=None)
-	parser.add_argument('--dbaddr', required=False, default=None)
-	args = parser.parse_args()
-
-	# parse ssj
-	logger.info("Parsing ssj500k: {}".format(args.ssj_file))
-	ssj_parser = Parser(
-		corpus="ssj",
-		infiles=[args.ssj_file],
-		output=args.output,
-		outdir=args.outdir,
-	)
-	res = [x[1]["sid"] for x in ssj_parser.sentence_generator()]
-	logger.info("Parsed {} sentences (ssj500k)".format(len(res)))
-
-	# parse kres
-	logger.info("Parsing Kres: {}".format(args.ssj_file))
-	kres_parser = Parser(
-		corpus="kres",
-		infiles=[args.kres_folder, args.kres_srl_folder],
-		output=args.output,
-		outdir=args.outdir,
-	)
-	res = [x[1]["sid"] for x in kres_parser.sentence_generator()]
-	logger.info("Parsed {} sentences (kres)".format(len(res)))
+    parser = argparse.ArgumentParser(description="Parsing corpora kres and ssj500k.")
+    parser.add_argument('--kres-folder', required=True)
+    parser.add_argument('--kres-srl-folder', required=True)
+    parser.add_argument('--ssj-file', required=True)
+    parser.add_argument('--output', required=False, default=None)
+    parser.add_argument('--outdir', required=False, default=None)
+    parser.add_argument('--dbaddr', required=False, default=None)
+    args = parser.parse_args()
+
+    # parse ssj
+    logger.info("Parsing ssj500k: {}".format(args.ssj_file))
+    ssj_parser = Parser(
+        corpus="ssj",
+        infiles=[args.ssj_file],
+    )
+    # res = [x[1]["sid"] for x in ssj_parser.sentence_generator()]
+    # logger.info("Parsed {} sentences (ssj500k)".format(len(res)))
+
+    # ssj to json (written to --outdir when given, else the previous fixed location)
+    ssj_to_json_file(ssj_parser.sentence_generator(), Path(args.outdir or "/home/voje/workdir") / "ssj.json")
+
+    # parse kres
+    logger.info("Parsing Kres: {}".format(args.kres_folder))
+    kres_parser = Parser(
+        corpus="kres",
+        infiles=[args.kres_folder, args.kres_srl_folder],
+    )
+    res = [x[1]["sid"] for x in kres_parser.sentence_generator()]
+    logger.info("Parsed {} sentences (kres)".format(len(res)))
 
 ## Handling output is situational --- implement it outside of Parser.
 
@@ -47,12 +65,3 @@ if __name__ == "__main__":
 # 2. parse and save to DB
 # TODO
 
-def handle_output(self, sent_ent, xml_file):
-	if self.output is None:
-		pass
-	if self.output == "file":
-		outfile = Path(self.outdir) / Path(xml_file.name.split(".")[0]).with_suffix(".json")
-		with outfile.open("a") as fp:
-			print(sent_ent)
-			json.dumps(sent_ent, fp)
-