From f87eafd9f2fe473cafcf680e886de0188bb28759 Mon Sep 17 00:00:00 2001
From: voje
Date: Sun, 17 Mar 2019 13:22:01 +0100
Subject: [PATCH] corpusparser as standalone project

---
 Makefile                                     |  35 ++++
 README.md                                    |  41 ++++
 corpusparser.egg-info/PKG-INFO               |  10 +
 corpusparser.egg-info/SOURCES.txt            |   5 +
 corpusparser.egg-info/dependency_links.txt   |   1 +
 corpusparser.egg-info/not-zip-safe           |   1 +
 corpusparser.egg-info/top_level.txt          |   1 +
 corpusparser/Parser.py                       | 196 +++++++++++++++++++
 corpusparser/__init__.py                     |   1 +
 corpusparser/main.py                         | 102 ++++++++++
 dockerfiles/cjvt-corpusparser-env/Dockerfile |  15 ++
 dockerfiles/cjvt-corpusparser-env/Makefile   |  19 ++
 setup.py                                     |  10 +
 13 files changed, 437 insertions(+)
 create mode 100644 Makefile
 create mode 100644 README.md
 create mode 100644 corpusparser.egg-info/PKG-INFO
 create mode 100644 corpusparser.egg-info/SOURCES.txt
 create mode 100644 corpusparser.egg-info/dependency_links.txt
 create mode 100644 corpusparser.egg-info/not-zip-safe
 create mode 100644 corpusparser.egg-info/top_level.txt
 create mode 100644 corpusparser/Parser.py
 create mode 100644 corpusparser/__init__.py
 create mode 100644 corpusparser/main.py
 create mode 100644 dockerfiles/cjvt-corpusparser-env/Dockerfile
 create mode 100644 dockerfiles/cjvt-corpusparser-env/Makefile
 create mode 100644 setup.py

diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..1a297a9
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,35 @@
+# All required components, to create and fill a database,
+# instantiate backend and frontend.
+
+MAKE_ROOT = $(shell pwd)
+
+### Input data
+# I received ssj500k in one .xml file;
+# Kres is composed of many .xml files.
+# I generated SRL tags for Kres in separate .json files
+# (for each kres.xml file there is a kres.json file with SRL tags).
+SSJ_FILE = "$(MAKE_ROOT)/data/samples/ssj_example/ssj500k-sl.body.sample.xml"
+KRES_FOLDER = "$(MAKE_ROOT)/data/samples/kres_example"
+KRES_SRL_FOLDER = "$(MAKE_ROOT)/data/kres_srl"
+OUTPUT = "file"
+OUTDIR = "/home/voje/workdir/test_out"
+DBADDR = ""
+
+DB_ADM_USER = testadmin
+DB_ADM_PASS = testadminpass
+DB_USR_USER = testuser
+DB_USR_PASS = testuserpass
+export
+
+.PHONY: cjvt-corpusparser-env
+
+all: cjvt-corpusparser-env install
+
+# prerequisite (environment)
+cjvt-corpusparser-env:
+	cd dockerfiles/cjvt-corpusparser-env; $(MAKE)
+
+# commands run inside the container:
+install:
+	pip3 install -e .
+
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..61cb0b2
--- /dev/null
+++ b/README.md
@@ -0,0 +1,41 @@
+# corpusparser
+A tool for parsing ssj500k and Kres into a unified .json format.
+
+## Quickstart
+Run `make`. You will get a container with python3 and this package installed.
+
+## Input
+### ssj500k
+To parse ssj500k, point to the monolithic `ssj500k-sl.body.xml` file (tested on ssj500k 2.1).
+
+### Kres
+To parse Kres, point to two folders:
+* the Kres folder, containing several (around 20k) .xml files (`F00XXXXX.xml.parsed.xml`);
+* the Kres SRL folder, containing SRL links for the corresponding F00...xml files (`F00XXXXX.srl.json`).
+
+## Internal data format
+This is the internal python dict data format. It can be stored to file as `.json` or stored into a database for application usage.
+```python
+{
+    'sid': 'F0034713.5.0',
+    'text': 'Mednarodni denarni sklad je odobril 30 milijard evrov vredno posojilo Grčiji. ',
+    'tokens': [
+        {'text': 'Mednarodni', 'lemma': 'mednaroden', 'msd': 'Ppnmeid', 'word': True, 'tid': 1},
+        {'text': 'denarni', 'lemma': 'denaren', 'msd': 'Ppnmeid', 'word': True, 'tid': 2},
+        {'text': 'sklad', 'lemma': 'sklad', 'msd': 'Somei', 'word': True, 'tid': 3},
+        {'text': 'je', 'lemma': 'biti', 'msd': 'Gp-ste-n', 'word': True, 'tid': 4},
+        {'text': 'odobril', 'lemma': 'odobriti', 'msd': 'Ggdd-em', 'word': True, 'tid': 5},
+        {'text': '30', 'lemma': '30', 'msd': 'Kag', 'word': True, 'tid': 6},
+        {'text': 'milijard', 'lemma': 'milijarda', 'msd': 'Sozmr', 'word': True, 'tid': 7},  # ...
+    ],
+    'jos_links': [
+        {'to': 1, 'from': 3, 'afun': 'dol'},
+        {'to': 2, 'from': 3, 'afun': 'dol'},
+        {'to': 3, 'from': 5, 'afun': 'ena'},  # ...
+    ],
+    'srl_links': [
+        {'to': 3, 'from': 5, 'afun': 'ACT'},
+        {'to': 7, 'from': 5, 'afun': 'PAT'}
+    ]
+}
+```
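The dict above is what downstream code receives, either from the generated `.json` files or straight from `Parser.sentence_generator()`. A minimal sketch of consuming it; the sample sentence is abridged from the README, and `sent` stands in for any parsed sentence:

```python
# Illustrative only: `sent` is one sentence dict in the format documented above.
sent = {
    'sid': 'F0034713.5.0',
    'text': 'Mednarodni denarni sklad je odobril 30 milijard evrov vredno posojilo Grčiji. ',
    'tokens': [
        {'text': 'sklad', 'lemma': 'sklad', 'msd': 'Somei', 'word': True, 'tid': 3},
        {'text': 'odobril', 'lemma': 'odobriti', 'msd': 'Ggdd-em', 'word': True, 'tid': 5},
        {'text': 'milijard', 'lemma': 'milijarda', 'msd': 'Sozmr', 'word': True, 'tid': 7},
    ],
    'jos_links': [{'to': 3, 'from': 5, 'afun': 'ena'}],
    'srl_links': [{'to': 3, 'from': 5, 'afun': 'ACT'},
                  {'to': 7, 'from': 5, 'afun': 'PAT'}],
}

# Look up tokens by their sentence-internal id ('tid').
by_tid = {t['tid']: t for t in sent['tokens']}

# Print SRL edges as (head lemma, role, dependent lemma) triples.
for link in sent['srl_links']:
    head = by_tid[link['from']]['lemma']
    dep = by_tid[link['to']]['lemma']
    print('{} -[{}]-> {}'.format(head, link['afun'], dep))
# odobriti -[ACT]-> sklad
# odobriti -[PAT]-> milijarda
```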
diff --git a/corpusparser.egg-info/PKG-INFO b/corpusparser.egg-info/PKG-INFO
new file mode 100644
index 0000000..6935cc4
--- /dev/null
+++ b/corpusparser.egg-info/PKG-INFO
@@ -0,0 +1,10 @@
+Metadata-Version: 1.0
+Name: corpusparser
+Version: 0.0.1
+Summary: Parser for kres and ssj500k
+Home-page: UNKNOWN
+Author: Kristjan Voje
+Author-email: kristjan.voje@gmail.com
+License: MIT
+Description: UNKNOWN
+Platform: UNKNOWN
diff --git a/corpusparser.egg-info/SOURCES.txt b/corpusparser.egg-info/SOURCES.txt
new file mode 100644
index 0000000..998012a
--- /dev/null
+++ b/corpusparser.egg-info/SOURCES.txt
@@ -0,0 +1,5 @@
+corpusparser.egg-info/PKG-INFO
+corpusparser.egg-info/SOURCES.txt
+corpusparser.egg-info/dependency_links.txt
+corpusparser.egg-info/not-zip-safe
+corpusparser.egg-info/top_level.txt
\ No newline at end of file
diff --git a/corpusparser.egg-info/dependency_links.txt b/corpusparser.egg-info/dependency_links.txt
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/corpusparser.egg-info/dependency_links.txt
@@ -0,0 +1 @@
+
diff --git a/corpusparser.egg-info/not-zip-safe b/corpusparser.egg-info/not-zip-safe
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/corpusparser.egg-info/not-zip-safe
@@ -0,0 +1 @@
+
diff --git a/corpusparser.egg-info/top_level.txt b/corpusparser.egg-info/top_level.txt
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/corpusparser.egg-info/top_level.txt
@@ -0,0 +1 @@
+
diff --git a/corpusparser/Parser.py b/corpusparser/Parser.py
new file mode 100644
index 0000000..d908d04
--- /dev/null
+++ b/corpusparser/Parser.py
@@ -0,0 +1,196 @@
+from pathlib import Path
+import re
+import json
+from lxml import etree
+import logging
+
+logging.basicConfig(level=logging.INFO)
+
+# Read the input file(s) (.xml, .json; kres or ssj500k).
+# Create an iterator that outputs the resulting sentences (python dict format).
+class Parser():
+
+    def __init__(self, corpus, infiles, logger=None):
+
+        if corpus == "kres":
+            self.kres_folder = Path(infiles[0])
+            self.kres_srl_folder = Path(infiles[1])
+        elif corpus == "ssj":
+            self.ssj_file = Path(infiles[0])
+        else:
+            raise ValueError("Argument corpus should be 'ssj' or 'kres'.")
+
+        self.corpus = corpus
+        self.W_TAGS = ['w']
+        self.C_TAGS = ['c']
+        self.S_TAGS = ['S', 'pc']
+        self.logger = logger or logging.getLogger(__name__)
+        self.stats = {
+            "parsed_count": 0,
+            "missing_srl": []
+        }
+
+    def parse_jos_links(self, sent_el):
+        if self.corpus == "kres":
+            return self.parse_jos_links_kres(sent_el)
+        else:
+            # 'syntax' is the linkGrp we're looking for
+            return self.parse_any_links_ssj(sent_el, "syntax")
+
+    def parse_jos_links_kres(self, sent_el):
+        lgrps = sent_el.findall(".//links")
+        if len(lgrps) < 1:
+            raise IOError("Can't find links.")
+        res_links = []
+        for link in lgrps[0]:
+            res_links += [{
+                "from": int(link.get("from").split(".")[-1]),
+                "afun": link.get("afun"),
+                "to": int(link.get("dep").split(".")[-1]),
+            }]
+        return res_links
+
+    def parse_ssj_target_arg(self, text):
+        # Decode a two-part "target" attribute into [from, to] token indices:
+        # a sentence-level id (no trailing .tN) maps to 0, a token id tN maps to N.
+        # Illustrative examples (ids are made up):
+        #   target="ssj1.1 ssj1.1.t6"    -> from: 0, to: 6
+        #   target="ssj1.1.t6 ssj1.1.t7" -> from: 6, to: 7
+        lst = [x.split(".")[-1] for x in text.split(" ")]
+        return [int(x[1:] if x[0] == "t" else 0) for x in lst]
+
+    def parse_any_links_ssj(self, sent_el, links_type):
+        lgrps = sent_el.findall(".//linkGrp")
+        links = [x for x in lgrps if x.get("type") == links_type][0]
+        res_links = []
+        for link in links:
+            tar = self.parse_ssj_target_arg(link.get("target"))
+            res_links += [{
+                "from": tar[0],
+                "afun": link.get("ana").split(":")[1],
+                "to": tar[1],
+            }]
+        return res_links
+
+    def parse_srl_links(self, sent_el, sent_srl_links=None):
+        if self.corpus == "kres":
+            return self.parse_srl_links_kres(sent_el, sent_srl_links)
+        else:
+            return self.parse_any_links_ssj(sent_el, "SRL")
+
+    def parse_srl_links_kres(self, sent_el, sent_srl_links):
+        # sent_srl_links come from the corresponding .srl.json file,
+        # which is read in parse_xml_file()
+        res_links = []
+        for link in sent_srl_links:
+            res_links += [{
+                "from": int(link["from"]),
+                "afun": link["arg"],
+                "to": int(link["dep"]),
+            }]
+        return res_links
+
+    def sentence_generator(self):
+        # Using a generator so we don't copy the whole corpus around in memory.
+        if self.corpus == "kres":
+            for xml_file in self.kres_folder.iterdir():
+                yield from self.parse_xml_file(xml_file)
+        else:
+            yield from self.parse_xml_file(self.ssj_file)
+
+    def parse_xml_file(self, xml_file):
+        srl_from_json = {}
+        if self.corpus == "kres":
+            # in the case of Kres, read the SRL links from a separate json file
+            file_id = xml_file.name.split(".")[0]
+            json_file = self.kres_srl_folder / Path(file_id).with_suffix(".srl.json")
+            with json_file.open("r") as fp:
+                srl_from_json = json.loads(fp.read())
+
+        with xml_file.open("rb") as fp:
+            bstr = fp.read()
+
+        # remove namespaces
+        utf8str = bstr.decode("utf-8")
+        utf8str = re.sub('\\sxmlns="[^"]+"', '', utf8str, count=1)
+        utf8str = re.sub(' xml:', ' ', utf8str)
+
+        root = etree.XML(utf8str.encode("utf-8"))
+
+        divs = []  # in ssj, there are divs; in Kres, there are separate files
+        if self.corpus == "kres":
+            divs = [root]
+        else:
+            divs = root.findall(".//div")
+
+        # parse divs
+        for div in divs:
+            f_id = div.get("id")
+
+            # parse paragraphs
+            for p in div.findall(".//p"):
+                p_id = p.get("id").split(".")[-1]
+
+                # parse sentences
+                for s in p.findall(".//s"):
+                    s_id = s.get("id").split(".")[-1]
+                    sentence_text = ""
+                    sentence_tokens = []
+
+                    # parse tokens
+                    for el in s.iter():
+                        if el.tag in self.W_TAGS:
+                            el_id = el.get("id").split(".")[-1]
+                            if el_id[0] == 't':
+                                el_id = el_id[1:]  # ssj W_TAG ids start with t
+                            sentence_text += el.text
+                            sentence_tokens += [{
+                                "word": True,
+                                "tid": int(el_id),
+                                "text": el.text,
+                                "lemma": el.get("lemma"),
+                                "msd": (el.get("msd") if self.corpus == "kres"
+                                        else el.get("ana").split(":")[-1]),
+                            }]
+                        elif el.tag in self.C_TAGS:
+                            # only Kres' C_TAGS have ids
+                            el_id = el.get("id") or "none"
+                            el_id = el_id.split(".")[-1]
+                            sentence_text += el.text
+                            sentence_tokens += [{
+                                "word": False,
+                                "tid": (int(el_id) if self.corpus == "kres" else -1),
+                                "text": el.text,
+                            }]
+                        elif el.tag in self.S_TAGS:
+                            # Kres' S_TAGS carry no .text; they mark separators
+                            sentence_text += " "
+                        else:
+                            # skip links and linkGrps
+                            pass
+                    sentence_id = "{}.{}.{}".format(f_id, p_id, s_id)
+
+                    jos_links = self.parse_jos_links(s)
+
+                    if self.corpus == "kres":
+                        srl_links_raw = srl_from_json.get(sentence_id)
+                        if srl_links_raw is None:
+                            srl_links_parsed = None
+                            self.stats["missing_srl"] += [(sentence_id, sentence_text)]
+                        else:
+                            srl_links_parsed = self.parse_srl_links(s, srl_links_raw)
+                    else:
+                        srl_links_parsed = self.parse_srl_links(s)
+                        if len(srl_links_parsed) == 0:
+                            self.stats["missing_srl"] += [(sentence_id, sentence_text)]
+
+                    sentence_entry = {
+                        "sid": sentence_id,
+                        "text": sentence_text,
+                        "tokens": sentence_tokens,
+                        "jos_links": jos_links,
+                        "srl_links": srl_links_parsed
+                    }
+                    self.stats["parsed_count"] += 1
+                    yield (xml_file, sentence_entry)
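`parse_xml_file` strips the default namespace and the `xml:` prefixes with two regexes before handing the document to lxml, so the XPath queries above can stay short (`.//s` instead of namespace-qualified paths). A standalone sketch of just that step; the input snippet is made up for illustration:

```python
import re
from lxml import etree

# A made-up TEI-like fragment with a default namespace and an xml:id.
raw = b'<TEI xmlns="http://www.tei-c.org/ns/1.0"><s xml:id="F0034713.5.0"/></TEI>'

utf8str = raw.decode("utf-8")
utf8str = re.sub('\\sxmlns="[^"]+"', '', utf8str, count=1)  # drop the default namespace
utf8str = re.sub(' xml:', ' ', utf8str)                     # turn xml:id into plain id

root = etree.XML(utf8str.encode("utf-8"))
print(root.find(".//s").get("id"))  # -> F0034713.5.0
```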
diff --git a/corpusparser/__init__.py b/corpusparser/__init__.py
new file mode 100644
index 0000000..d993b8c
--- /dev/null
+++ b/corpusparser/__init__.py
@@ -0,0 +1 @@
+from corpusparser.Parser import Parser
\ No newline at end of file
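`__init__.py` re-exports the class, so `from corpusparser import Parser` works, as in `main.py` below. A minimal sketch of driving the iterator directly; the input path is a hypothetical placeholder:

```python
from corpusparser import Parser

# Hypothetical path; substitute your own copy of the corpus.
ssj_parser = Parser(corpus="ssj", infiles=["data/ssj500k-sl.body.xml"])

# sentence_generator() yields (source file, sentence dict) tuples.
for xml_file, sentence in ssj_parser.sentence_generator():
    print(xml_file, sentence["sid"], sentence["text"][:40])

# The parser keeps simple counters while it runs.
print(ssj_parser.stats["parsed_count"], "sentences,",
      len(ssj_parser.stats["missing_srl"]), "without SRL links")
```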
diff --git a/corpusparser/main.py b/corpusparser/main.py
new file mode 100644
index 0000000..e0ba065
--- /dev/null
+++ b/corpusparser/main.py
@@ -0,0 +1,102 @@
+from pathlib import Path
+from corpusparser import Parser
+import argparse
+import logging
+import json
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+## Main handles command line arguments and writing to files / DB.
+
+def ssj_to_json_file(sentence_generator, outfolder):
+    # this function relies on the files being parsed sequentially
+    outfolder = Path(outfolder)
+    outfolder.mkdir(parents=True, exist_ok=True)
+    outfile = outfolder / "ssj500k.json"
+
+    data_buffer = []
+    for s in sentence_generator:
+        sdata = s[1]
+        data_buffer += [sdata]
+
+    with outfile.open("w") as fp:
+        logger.info("Writing to {}".format(outfile))
+        json.dump(data_buffer, fp)
+
+def kres_to_json_files(sentence_generator, outfolder):
+    outfolder = Path(outfolder) / "kres_json"
+    outfolder.mkdir(parents=True, exist_ok=True)
+
+    def write_buffer_to_file(outfile, outfile_buffer):
+        logger.info("Writing file: {}".format(outfile))
+        with outfile.open("w") as fp:
+            json.dump(outfile_buffer, fp)
+
+    outfile_buffer = None
+    current_outfile = None
+    for s in sentence_generator:
+        infile = s[0]
+        outfile = outfolder / Path(infile.name.split(".")[0]).with_suffix(".json")
+
+        # the parser walks files sequentially; when a new file starts,
+        # flush the previous one
+        if current_outfile is None:
+            current_outfile = outfile
+            outfile_buffer = []
+        elif outfile != current_outfile:
+            write_buffer_to_file(current_outfile, outfile_buffer)
+            current_outfile = outfile
+            outfile_buffer = []
+
+        # update buffer
+        sdata = s[1]
+        outfile_buffer += [sdata]
+
+    # flush the last file (the generator may yield nothing at all)
+    if current_outfile is not None:
+        write_buffer_to_file(current_outfile, outfile_buffer)
+
+
+def to_db():
+    return "TODO"
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Parsing corpora kres and ssj500k.")
+    parser.add_argument('--kres-folder', required=True)
+    parser.add_argument('--kres-srl-folder', required=True)
+    parser.add_argument('--ssj-file', required=True)
+    parser.add_argument('--output', required=False, default=None)
+    parser.add_argument('--outdir', required=False, default=None)
+    parser.add_argument('--dbaddr', required=False, default=None)
+    args = parser.parse_args()
+
+    # parse ssj
+    logger.info("Parsing ssj500k: {}".format(args.ssj_file))
+    ssj_parser = Parser(
+        corpus="ssj",
+        infiles=[args.ssj_file],
+    )
+
+    # ssj to json
+    ssj_to_json_file(ssj_parser.sentence_generator(), args.outdir)
+
+    # parse kres
+    logger.info("Parsing Kres: {}".format(args.kres_folder))
+    kres_parser = Parser(
+        corpus="kres",
+        infiles=[args.kres_folder, args.kres_srl_folder],
+    )
+
+    # kres to json
+    kres_to_json_files(kres_parser.sentence_generator(), args.outdir)
+
+
+## Handling output is situational --- implement it outside of Parser.
+## Parser returns tuples (orig_file, element)
+# 1. parse per-file and output to file (JSON)
+# 2. parse and save to DB
+
+# TODO
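`kres_to_json_files` hand-rolls the flush-on-file-change logic. Since the generator yields `(file, sentence)` pairs in file order, the same grouping can be expressed with `itertools.groupby`; a sketch of that equivalent formulation (not what the patch uses):

```python
import json
from itertools import groupby
from pathlib import Path

def kres_to_json_files_grouped(sentence_generator, outfolder):
    # Group consecutive (xml_file, sentence) pairs by source file
    # and write one .json file per group.
    outfolder = Path(outfolder) / "kres_json"
    outfolder.mkdir(parents=True, exist_ok=True)
    for infile, group in groupby(sentence_generator, key=lambda s: s[0]):
        outfile = outfolder / (infile.name.split(".")[0] + ".json")
        with outfile.open("w") as fp:
            json.dump([sentence for _, sentence in group], fp)
```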
diff --git a/dockerfiles/cjvt-corpusparser-env/Dockerfile b/dockerfiles/cjvt-corpusparser-env/Dockerfile
new file mode 100644
index 0000000..e44a6e6
--- /dev/null
+++ b/dockerfiles/cjvt-corpusparser-env/Dockerfile
@@ -0,0 +1,15 @@
+FROM ubuntu:16.04
+
+RUN apt-get update --fix-missing
+RUN apt-get install -y \
+vim \
+python3 \
+python3-pip \
+sshfs
+
+RUN pip3 install \
+    lxml \
+    argparse \
+    pathlib
+
+ENV PYTHONIOENCODING UTF-8
diff --git a/dockerfiles/cjvt-corpusparser-env/Makefile b/dockerfiles/cjvt-corpusparser-env/Makefile
new file mode 100644
index 0000000..4d4cf1f
--- /dev/null
+++ b/dockerfiles/cjvt-corpusparser-env/Makefile
@@ -0,0 +1,19 @@
+IMAGE_NAME="cjvt-corpusparser-env"
+
+all: build run
+
+build:
+	docker build . -t $(IMAGE_NAME)
+
+run:
+	docker run \
+		-it \
+		-v /home/${USER}:/home/${USER} \
+		--user $(shell id -u):$(shell id -g) \
+		-v /etc/passwd:/etc/passwd \
+		-v /etc/group:/etc/group \
+		-v $(MAKE_ROOT):/project \
+		-w /project \
+		$(IMAGE_NAME) \
+		/bin/bash
+
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..da967a7
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,10 @@
+from setuptools import setup
+
+setup(name='corpusparser',
+      version='0.0.1',
+      description=u"Parser for kres and ssj500k",
+      author=u"Kristjan Voje",
+      author_email='kristjan.voje@gmail.com',
+      license='MIT',
+      packages=['corpusparser'],
+      zip_safe=False,
+)
\ No newline at end of file