diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..ecb8742 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +data/samples/ +*/pycache/ +*egg-info/ diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..51a0afe --- /dev/null +++ b/Makefile @@ -0,0 +1,34 @@ +# All required components, to create and fill a database, +# instantiate backend and frontend. + +MAKE_ROOT = $(shell pwd) + +### Input data +# I received ssj500k in one .xml file, +# kres is composed of many .xml files +# I generated srl tags for kres in separate .json files +# (for each kres.xml file there is a kres.json file with srl tags) +SSJ_FILE = "$(MAKE_ROOT)/data/samples/ssj_example/ssj500k-sl.body.sample.xml" +KRES_FOLDER = "$(MAKE_ROOT)/data/samples/kres_example" +KRES_SRL_FOLDER = "$(MAKE_ROOT)/data/kres_srl" +export + +.PHONY: dev-env preflight + +all: + echo "Select an argument" + +# prereq (environment) +dev-env: + cd dockerfiles; cd dev-env; $(MAKE) + +# run these inside dev-env container +data/samples: + cd data; tar xzvf samples.tar.gz + +# installs our python code as packages +# when debugging, run this once, then run python3 ... by hand +preflight: data/samples + pip3 install -e src/pkg/corpusparser/. 
+ python3 src/preflight/main_parse.py --kres-folder $(KRES_FOLDER) \ + --ssj-file $(SSJ_FILE) --kres-srl-folder $(KRES_SRL_FOLDER) diff --git a/conf/main.conf b/conf/main.conf deleted file mode 100644 index e69de29..0000000 diff --git a/data/kres_srl b/data/kres_srl new file mode 120000 index 0000000..f1acfc4 --- /dev/null +++ b/data/kres_srl @@ -0,0 +1 @@ +/home/kristjan/kres_srl/final_json/ \ No newline at end of file diff --git a/data/samples.tar.gz b/data/samples.tar.gz new file mode 100644 index 0000000..ba89432 Binary files /dev/null and b/data/samples.tar.gz differ diff --git a/dockerfiles/dev-env/Dockerfile b/dockerfiles/dev-env/Dockerfile new file mode 100644 index 0000000..3a494d2 --- /dev/null +++ b/dockerfiles/dev-env/Dockerfile @@ -0,0 +1,17 @@ +FROM ubuntu:16.04 + +RUN apt-get update --fix-missing +RUN apt-get install -y \ +vim \ +python3 \ +python3-pip \ +sshfs + +RUN pip3 install \ + lxml \ + pandas \ + sklearn \ + argparse \ + pathlib + +ENV PYTHONIOENCODING UTF-8 diff --git a/dockerfiles/dev-env/Makefile b/dockerfiles/dev-env/Makefile new file mode 100644 index 0000000..7f5dd50 --- /dev/null +++ b/dockerfiles/dev-env/Makefile @@ -0,0 +1,18 @@ +IMAGE_NAME="cjvt-dev-env" + +all: build run + +build: + docker build . -t $(IMAGE_NAME) + +run: + docker run \ + -it \ + -v /home/${USER}:/home/${USER} \ + --user $(shell id -u):$(shell id -g) \ + -v /etc/passwd:/etc/passwd \ + -v /etc/group:/etc/group \ + -v $(MAKE_ROOT):/project \ + -w /project \ + $(IMAGE_NAME) \ + /bin/bash diff --git a/src/pkg/anchor b/src/pkg/anchor deleted file mode 100644 index e69de29..0000000 diff --git a/src/pkg/corpusparser/corpusparser/Parser.py b/src/pkg/corpusparser/corpusparser/Parser.py new file mode 100644 index 0000000..dc93554 --- /dev/null +++ b/src/pkg/corpusparser/corpusparser/Parser.py @@ -0,0 +1,151 @@ +from corpusparser import Sentence +from pathlib import Path +import re +from lxml import etree + +# Read input file(.xml, .json; kres or ssj500k). 
# Create an iterator that outputs resulting sentences (python dict format).
class Parser():
    """Parser for the kres and ssj500k corpora.

    Sentences are returned as plain python dicts with the keys
    "sid", "text", "tokens" and "jos_links".
    """

    def __init__(self, corpus, infiles):
        """Remember which corpus is parsed and where its input lives.

        corpus:  "kres" or "ssj".
        infiles: list of paths; for "kres" it is
                 [kres_folder, kres_srl_folder], for "ssj" it is [ssj_file].

        Raises ValueError for any other corpus name.
        """
        if corpus == "kres":
            self.kres_folder = Path(infiles[0])
            self.kres_srl_folder = Path(infiles[1])
        elif corpus == "ssj":
            self.ssj_file = Path(infiles[0])
        else:
            raise ValueError("Argument corpus should be 'ssj' or 'kres'.")

        self.corpus = corpus
        # tag names handled as word, non-word and spacing elements
        self.W_TAGS = ['w']
        self.C_TAGS = ['c']
        self.S_TAGS = ['S', 'pc']

    def parse_jos_links(self, sent_el):
        """Return the list of link dicts of a sentence element.

        Dispatches to the corpus specific implementation.
        """
        if self.corpus == "kres":
            return self.parse_jos_links_kres(sent_el)
        return self.parse_jos_links_ssj(sent_el)

    def parse_ssj_target_arg(self, text):
        """Convert an ssj link "target" attribute into a list of ints.

        Every whitespace separated reference is reduced to its last
        dot-separated component. A component of the form "t<N>" yields N,
        anything else yields 0.

        "#ssj1.1.1 #ssj1.1.1.t6"    -> [0, 6]
        "#ssj1.1.1.t6 #ssj1.1.1.t7" -> [6, 7]
        """
        # split() (instead of split(" ")) tolerates repeated whitespace,
        # startswith() tolerates an empty component
        last_parts = [ref.split(".")[-1] for ref in text.split()]
        return [
            int(part[1:]) if part.startswith("t") else 0
            for part in last_parts
        ]

    def parse_jos_links_kres(self, sent_el):
        """Read the links of a kres sentence element.

        Uses the children of the first <links> descendant; each child
        contributes its "from", "afun" and "dep" attributes.

        Raises IOError when the sentence has no <links> element.
        """
        lgrps = sent_el.findall(".//links")
        if not lgrps:
            raise IOError("Can't find links.")
        return [
            {
                "from": int(link.get("from").split(".")[-1]),
                "afun": link.get("afun"),
                "to": int(link.get("dep").split(".")[-1]),
            }
            for link in lgrps[0]
        ]

    def parse_jos_links_ssj(self, sent_el):
        """Read the links of an ssj sentence element.

        Uses the children of the first <linkGrp> descendant; each child
        contributes its "target" and "ana" attributes.

        Raises IOError when the sentence has no <linkGrp> element.
        """
        lgrps = sent_el.findall(".//linkGrp")
        if not lgrps:
            raise IOError("Can't find links.")
        res_links = []
        for link in lgrps[0]:
            tar = self.parse_ssj_target_arg(link.get("target"))
            res_links.append({
                "from": tar[0],
                "afun": link.get("ana").split(":")[1],
                # second parsed target index (was the literal list [1])
                "to": tar[1],
            })
        return res_links

    def parse(self):
        """Parse the configured corpus.

        Returns the dict produced by parse_xml_file() for ssj.
        Parsing of kres is not implemented yet; None is returned.
        """
        if self.corpus == "kres":
            print("parse kres: TODO")
            return None
        return self.parse_xml_file(self.ssj_file)

    def _parse_sentence_tokens(self, sent_el):
        """Collect (sentence_text, tokens) from one sentence element."""
        text_parts = []
        tokens = []
        for el in sent_el.iter():
            if el.tag in self.W_TAGS:
                el_id = el.get("id").split(".")[-1]
                if el_id[0] == 't':
                    el_id = el_id[1:]  # ssj W_TAG ids start with t
                text_parts.append(el.text or "")
                tokens.append({
                    "word": True,
                    "tid": int(el_id),
                    "text": el.text,
                    "lemma": el.get("lemma"),
                    "msd": (el.get("msd") if self.corpus == "kres"
                            else el.get("ana").split(":")[-1]),
                })
            elif el.tag in self.C_TAGS:
                # only Kres' C_TAGS have ids
                el_id = (el.get("id") or "none").split(".")[-1]
                text_parts.append(el.text or "")
                tokens.append({
                    "word": False,
                    "tid": el_id,
                    "text": el.text,
                })
            elif el.tag in self.S_TAGS:
                # Kres' doesn't contain .text
                text_parts.append(" ")
            # everything else (links, linkGroups, ...) is skipped
        return "".join(text_parts), tokens

    def parse_xml_file(self, filepath):
        """Parse one corpus .xml file.

        filepath: pathlib.Path of the file.

        Returns a dict that maps "<file id>.<paragraph id>.<sentence id>"
        to a dict with the keys "sid", "text", "tokens" and "jos_links".

        Raises KeyError when a sentence id occurs twice and IOError when a
        sentence has no links.
        """
        with filepath.open("rb") as fp:
            bstr = fp.read()

        # remove namespaces, so that plain tag and attribute names work
        utf8str = bstr.decode("utf-8")
        utf8str = re.sub('\\sxmlns="[^"]+"', '', utf8str, count=1)
        utf8str = re.sub(' xml:', ' ', utf8str)

        root = etree.XML(utf8str.encode("utf-8"))

        # in ssj, there are divs, in Kres, there are separate files
        if self.corpus == "kres":
            divs = [root]
        else:
            divs = root.findall(".//div")

        # TODO: make a generator instead of holding the whole corpus
        # in memory
        res_dict = {}
        for div in divs:
            f_id = div.get("id")
            for p in div.findall(".//p"):
                p_id = p.get("id").split(".")[-1]
                for s in p.findall(".//s"):
                    s_id = s.get("id").split(".")[-1]
                    sentence_text, sentence_tokens = (
                        self._parse_sentence_tokens(s))
                    sentence_id = "{}.{}.{}".format(f_id, p_id, s_id)
                    if sentence_id in res_dict:
                        raise KeyError(
                            "duplicated id: {}".format(sentence_id))
                    res_dict[sentence_id] = {
                        "sid": sentence_id,
                        "text": sentence_text,
                        "tokens": sentence_tokens,
                        "jos_links": self.parse_jos_links(s),
                    }
        return res_dict


# Sentence.py is a separate file in the diff; it shares these lines.
class Sentence():
    """Placeholder for a sentence object (not implemented yet)."""

    def __init__(self):
        print("Sentence __init__(): TODO")
"""Preflight script: parse the ssj500k file given on the command line."""
import argparse

from corpusparser import Parser

if __name__ == "__main__":
    arg_parser = argparse.ArgumentParser(
        description="Parsing corpora kres and ssj500k.")
    # all three locations are mandatory, even though only ssj is parsed here
    for flag in ('--kres-folder', '--kres-srl-folder', '--ssj-file'):
        arg_parser.add_argument(flag, required=True)
    cli_args = arg_parser.parse_args()

    # parse ssj
    Parser(corpus="ssj", infiles=[cli_args.ssj_file]).parse()