working on parser

2019-03-10 22:11:18 +01:00
parent 46f09be1d1
commit 1654548310
16 changed files with 255 additions and 0 deletions
@@ -0,0 +1,3 @@
+data/samples/
+__pycache__/
+*egg-info/
@@ -0,0 +1,38 @@
+# All required components to create and fill a database,
+# and to instantiate the backend and frontend.
+
+MAKE_ROOT = $(shell pwd)
+
+### Input data
+# I received ssj500k in one .xml file,
+# kres is composed of many .xml files
+# I generated srl tags for kres in separate .json files
+# (for each kres.xml file there is a kres.json file with srl tags)
+SSJ_FILE = "$(MAKE_ROOT)/data/samples/ssj_example/ssj500k-sl.body.sample.xml"
+KRES_FOLDER = "$(MAKE_ROOT)/data/samples/kres_example"
+KRES_SRL_FOLDER = "$(MAKE_ROOT)/data/kres_srl"
+# a bare export passes every variable on to sub-makes and recipe shells
+export
+
+# none of these targets creates a file of its own name
+.PHONY: all dev-env preflight
+
+all:
+	echo "Select an argument"
+
+# prereq (environment)
+# -C stops with an error when the directory is missing, whereas a
+# "cd ...; cd ...; make" chain would carry on and run make in the wrong directory
+dev-env:
+	$(MAKE) -C dockerfiles/dev-env
+
+# run these inside dev-env container
+data/samples:
+	cd data && tar xzvf samples.tar.gz
+
+# installs our python code as packages
+# when debugging, run this once, then run python3 ... by hand
+preflight: data/samples
+	pip3 install -e src/pkg/corpusparser/.
+	python3 src/preflight/main_parse.py --kres-folder $(KRES_FOLDER) \
+		--ssj-file $(SSJ_FILE) --kres-srl-folder $(KRES_SRL_FOLDER)
@@ -0,0 +1 @@
+/home/kristjan/kres_srl/final_json/
@@ -0,0 +1,19 @@
+# Development environment image (Python 3, pip and the libraries listed below).
+FROM ubuntu:16.04
+
+# update and install run in one layer, so a cached package index
+# is never reused for a later, changed package list
+RUN apt-get update --fix-missing && apt-get install -y \
+	vim \
+	python3 \
+	python3-pip \
+	sshfs
+
+# scikit-learn is the actual distribution name ("sklearn" on PyPI is a
+# deprecated alias); argparse and pathlib are part of the Python 3 standard library
+RUN pip3 install \
+	lxml \
+	pandas \
+	scikit-learn
+
+ENV PYTHONIOENCODING=UTF-8
@@ -0,0 +1,27 @@
+# Builds the dev-env docker image and opens an interactive shell in it.
+IMAGE_NAME="cjvt-dev-env"
+
+# MAKE_ROOT is normally inherited from the top-level Makefile. The fallback
+# assumes this file lives two directories below the project root
+# (dockerfiles/dev-env) - adjust if it is moved.
+MAKE_ROOT ?= $(abspath $(CURDIR)/../..)
+
+.PHONY: all build run
+
+all: build run
+
+build:
+	docker build . -t $(IMAGE_NAME)
+
+# runs as the calling user, with the home directory and the project mounted
+run:
+	docker run \
+		-it \
+		-v /home/${USER}:/home/${USER} \
+		--user $(shell id -u):$(shell id -g) \
+		-v /etc/passwd:/etc/passwd \
+		-v /etc/group:/etc/group \
+		-v $(MAKE_ROOT):/project \
+		-w /project \
+		$(IMAGE_NAME) \
+		/bin/bash
@@ -0,0 +1,182 @@
+from corpusparser import Sentence
+from pathlib import Path
+import re
+from lxml import etree
+
+# Read input file(.xml, .json; kres or ssj500k).
+# Create an iterator that outputs resulting sentences (python dict format).
+class Parser():
+    """Reader for the kres and ssj500k corpora.
+
+    Every sentence is returned as a dict with the keys "sid", "text",
+    "tokens" and "jos_links".
+    """
+
+    def __init__(self, corpus, infiles):
+        """Store the corpus name and the locations of its input.
+
+        corpus: "kres" or "ssj".
+        infiles: [kres_folder, kres_srl_folder] for "kres",
+            [ssj_file] for "ssj".
+        Raises ValueError for any other corpus name.
+        """
+        if corpus == "kres":
+            self.kres_folder = Path(infiles[0])
+            self.kres_srl_folder = Path(infiles[1])
+        elif corpus == "ssj":
+            self.ssj_file = Path(infiles[0])
+        else:
+            raise ValueError("Argument corpus should be 'ssj' or 'kres'.")
+
+        self.corpus = corpus
+        self.W_TAGS = ['w']  # elements stored as word tokens
+        self.C_TAGS = ['c']  # elements stored as non-word tokens
+        self.S_TAGS = ['S', 'pc']  # elements that add one space to the text
+
+    def parse_jos_links(self, sent_el):
+        """Return the links of sent_el, read with the parser for self.corpus."""
+        if self.corpus == "kres":
+            return self.parse_jos_links_kres(sent_el)
+        return self.parse_jos_links_ssj(sent_el)
+
+    def parse_ssj_target_arg(self, text):
+        """Convert an ssj link "target" attribute to [from, to] indices.
+
+        Each whitespace-separated reference is cut down to its last
+        dot-separated part: "tN" yields N, anything else yields 0.
+
+        from: 0, to: 6
+        <link ana="syn:modra" target="#ssj1.1.3 #ssj1.1.3.t6"/>
+        from: 6, to: 7
+        <link ana="syn:dol" target="#ssj1.1.3.t6 #ssj1.1.3.t7"/>
+        """
+        lst = [ref.split(".")[-1] for ref in text.split()]
+        # startswith() also copes with an empty last part
+        return [int(x[1:]) if x.startswith("t") else 0 for x in lst]
+
+    def parse_jos_links_kres(self, sent_el):
+        """Read the first <links> group below sent_el.
+
+        Returns a list of {"from", "afun", "to"} dicts; "from" and "to" are
+        the last dot-separated parts of the "from" and "dep" attributes.
+        Raises IOError when the sentence has no <links> group.
+        """
+        lgrps = sent_el.findall(".//links")
+        if len(lgrps) < 1:
+            raise IOError("Can't find links.")
+        return [{
+            "from": int(link.get("from").split(".")[-1]),
+            "afun": link.get("afun"),
+            "to": int(link.get("dep").split(".")[-1]),
+        } for link in lgrps[0]]
+
+    def parse_jos_links_ssj(self, sent_el):
+        """Read the first <linkGrp> below sent_el.
+
+        Returns a list of {"from", "afun", "to"} dicts; "afun" is the part
+        of the "ana" attribute after the colon, "from" and "to" come from
+        parse_ssj_target_arg().
+        Raises IOError when the sentence has no <linkGrp>.
+        """
+        lgrps = sent_el.findall(".//linkGrp")
+        if len(lgrps) < 1:
+            raise IOError("Can't find links.")
+        res_links = []
+        for link in lgrps[0]:
+            tar = self.parse_ssj_target_arg(link.get("target"))
+            res_links.append({
+                "from": tar[0],
+                "afun": link.get("ana").split(":")[1],
+                "to": tar[1],  # second entry of the parsed target pair
+            })
+        return res_links
+
+    def parse(self):
+        """Parse the configured corpus and return the sentence dict.
+
+        Only ssj is implemented; for kres a TODO note is printed and
+        None is returned.
+        """
+        if self.corpus == "kres":
+            print("parse kres: TODO")
+            return None
+        return self.parse_xml_file(self.ssj_file)
+
+    def parse_xml_file(self, filepath):
+        """Parse one corpus .xml file.
+
+        filepath: pathlib.Path of the file to read.
+        Returns a dict that maps sentence ids to sentence dicts. An id is
+        "<div id>.<p>.<s>", where <p> and <s> are the last dot-separated
+        parts of the paragraph and sentence ids.
+        Raises KeyError when a sentence id occurs more than once.
+        """
+        # the file only has to stay open for the read itself
+        with filepath.open("rb") as fp:
+            bstr = fp.read()
+
+        # remove namespaces
+        utf8str = bstr.decode("utf-8")
+        utf8str = re.sub('\\sxmlns="[^"]+"', '', utf8str, count=1)
+        utf8str = re.sub(' xml:', ' ', utf8str)
+
+        root = etree.XML(utf8str.encode("utf-8"))
+
+        # in ssj, there are divs, in Kres, there are separate files
+        if self.corpus == "kres":
+            divs = [root]
+        else:
+            divs = root.findall(".//div")
+
+        # TODO: make a generator instead of holding the whole corpus in memory
+        res_dict = {}
+        for div in divs:
+            f_id = div.get("id")
+            for p in div.findall(".//p"):
+                p_id = p.get("id").split(".")[-1]
+                for s in p.findall(".//s"):
+                    s_id = s.get("id").split(".")[-1]
+                    sentence_id = "{}.{}.{}".format(f_id, p_id, s_id)
+                    if sentence_id in res_dict:
+                        raise KeyError("duplicated id: {}".format(sentence_id))
+                    sentence_text, sentence_tokens = self._parse_tokens(s)
+                    res_dict[sentence_id] = {
+                        "sid": sentence_id,
+                        "text": sentence_text,
+                        "tokens": sentence_tokens,
+                        "jos_links": self.parse_jos_links(s)
+                    }
+        return res_dict
+
+    def _parse_tokens(self, sent_el):
+        """Collect the text and the token list of one sentence element."""
+        text_parts = []
+        tokens = []
+        for el in sent_el.iter():
+            if el.tag in self.W_TAGS:
+                el_id = el.get("id").split(".")[-1]
+                if el_id.startswith("t"):
+                    el_id = el_id[1:]  # ssj W_TAG ids start with t
+                text_parts.append(el.text or "")
+                tokens.append({
+                    "word": True,
+                    "tid": int(el_id),
+                    "text": el.text,
+                    "lemma": el.get("lemma"),
+                    "msd": (el.get("msd") if self.corpus == "kres"
+                        else el.get("ana").split(":")[-1]),
+                })
+            elif el.tag in self.C_TAGS:
+                # only Kres' C_TAGS have ids
+                el_id = (el.get("id") or "none").split(".")[-1]
+                text_parts.append(el.text or "")
+                tokens.append({
+                    "word": False,
+                    "tid": el_id,
+                    "text": el.text,
+                })
+            elif el.tag in self.S_TAGS:
+                # Kres' <S /> doesn't contain .text
+                text_parts.append(" ")
+            # everything else (links and linkGroups) is skipped
+        return "".join(text_parts), tokens
@@ -0,0 +1,5 @@
+class Sentence():
+    """Placeholder for a parsed sentence; not implemented yet."""
+
+    def __init__(self):
+        print("Sentence __init__(): TODO")
@@ -0,0 +1,2 @@
+from corpusparser.Parser import Parser
+from corpusparser.Sentence import Sentence
@@ -0,0 +1,14 @@
+from setuptools import find_packages, setup
+
+# packages have to be declared: without them a regular (non-editable)
+# install would ship no modules at all
+setup(
+    name='corpusparser',
+    version='0.0.1',
+    description=u"Parser for kres and ssj500k",
+    author=u"Kristjan Voje",
+    author_email='kristjan.voje@gmail.com',
+    license='MIT',
+    packages=find_packages(),
+    zip_safe=False,
+)
@@ -0,0 +1,14 @@
+from corpusparser import Parser
+import argparse
+
+# Command-line entry point: reads the corpus locations and parses ssj500k.
+if __name__ == "__main__":
+    arg_parser = argparse.ArgumentParser(
+        description="Parsing corpora kres and ssj500k.")
+    # all three locations are mandatory, although only --ssj-file is used below
+    for flag in ('--kres-folder', '--kres-srl-folder', '--ssj-file'):
+        arg_parser.add_argument(flag, required=True)
+    cli_args = arg_parser.parse_args()
+
+    # parse ssj
+    Parser(corpus="ssj", infiles=[cli_args.ssj_file]).parse()