working on parser

2019-03-10 22:11:18 +01:00 · 2019-03-10 22:11:18 +01:00 · 1654548310
commit 1654548310
parent 46f09be1d1
16 changed files with 255 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,3 @@
 data/samples/
 # bytecode caches: no "*/" prefix, so the pattern matches at any directory depth
 __pycache__/
 *egg-info/
--- a/34
+++ b/34
@ -0,0 +1,34 @@
 # All required components, to create and fill a database,
 # instantiate backend and frontend.
 # Absolute path of the directory make runs in. ":=" evaluates it once and
 # $(CURDIR) avoids forking a shell just to call pwd.
 MAKE_ROOT := $(CURDIR)
 ### Input data
 # I received ssj500k in one .xml file,
 # kres is composed of many .xml files
 # I generated srl tags for kres in separate .json files
 # (for each kres.xml file there is a kres.json file with srl tags)
 SSJ_FILE = "$(MAKE_ROOT)/data/samples/ssj_example/ssj500k-sl.body.sample.xml"
 KRES_FOLDER = "$(MAKE_ROOT)/data/samples/kres_example"
 KRES_SRL_FOLDER = "$(MAKE_ROOT)/data/kres_srl"
 # Export every variable above to recipes and sub-makes
 # (dockerfiles/dev-env/Makefile reads MAKE_ROOT).
 export
 # None of these targets is a file, so all of them are declared phony.
 .PHONY: all dev-env preflight
 all:
 	echo "Select an argument"
 # prereq (environment)
 # -C changes directory and fails loudly if it does not exist, instead of
 # re-running make in the wrong directory after a failed "cd".
 dev-env:
 	$(MAKE) -C dockerfiles/dev-env
 # run these inside dev-env container
 # "&&" keeps tar from unpacking into the wrong directory if "cd" fails.
 data/samples:
 	cd data && tar xzvf samples.tar.gz
 # installs our python code as packages
 # when debugging, run this once, then run python3 ... by hand
 preflight: data/samples
 	pip3 install -e src/pkg/corpusparser/.
 	python3 src/preflight/main_parse.py --kres-folder $(KRES_FOLDER) \
 		--ssj-file $(SSJ_FILE) --kres-srl-folder $(KRES_SRL_FOLDER)
--- a/conf/main.conf
+++ b/conf/main.conf
--- a/data/kres_srl
+++ b/data/kres_srl
@ -0,0 +1 @@
 /home/kristjan/kres_srl/final_json/
--- a/data/samples.tar.gz
+++ b/data/samples.tar.gz
--- a/dockerfiles/dev-env/Dockerfile
+++ b/dockerfiles/dev-env/Dockerfile
@ -0,0 +1,17 @@
 FROM ubuntu:16.04
 # Refresh the package index and install in the same layer, so a cached
 # "update" layer is never combined with a later, different package list.
 RUN apt-get update --fix-missing && apt-get install -y \
 	vim \
 	python3 \
 	python3-pip \
 	sshfs
 # Third-party Python libraries. "scikit-learn" is the real distribution name
 # ("sklearn" on PyPI is only a deprecated alias). argparse and pathlib are
 # part of the Python 3 standard library and need no installation.
 RUN pip3 install \
 	lxml \
 	pandas \
 	scikit-learn
 # Make Python read and write UTF-8 on stdio regardless of the container locale.
 ENV PYTHONIOENCODING=UTF-8
--- a/dockerfiles/dev-env/Makefile
+++ b/dockerfiles/dev-env/Makefile
@ -0,0 +1,18 @@
 # Builds the development image and starts an interactive container from it.
 IMAGE_NAME="cjvt-dev-env"
 # MAKE_ROOT is normally exported by the top-level Makefile. Fall back to the
 # repository root (two levels above dockerfiles/dev-env) so that this
 # Makefile also works when invoked directly.
 MAKE_ROOT ?= $(abspath $(CURDIR)/../..)
 # None of the targets below produces a file of that name.
 .PHONY: all build run
 all: build run
 build:
 	docker build . -t $(IMAGE_NAME)
 # Runs as the invoking user (uid:gid) with the host's passwd/group files and
 # home directory mounted; the project root is mounted at /project.
 run:
 	docker run \
 		-it \
 		-v /home/${USER}:/home/${USER} \
 		--user $(shell id -u):$(shell id -g) \
 		-v /etc/passwd:/etc/passwd \
 		-v /etc/group:/etc/group \
 		-v $(MAKE_ROOT):/project \
 		-w /project \
 		$(IMAGE_NAME) \
 		/bin/bash
--- a/src/pkg/anchor
+++ b/src/pkg/anchor
--- a/src/pkg/corpusparser/corpusparser/Parser.py
+++ b/src/pkg/corpusparser/corpusparser/Parser.py
@ -0,0 +1,151 @@
 from corpusparser import Sentence
 from pathlib import Path
 import re
 from lxml import etree
 # Read input file(.xml, .json; kres or ssj500k).  
 # Create an iterator that outputs resulting sentences (python dict format).  
 class Parser():
    def __init__(self, corpus, infiles):
        if corpus == "kres":
            self.kres_folder = Path(infiles[0])
            self.kres_srl_folder = Path(infiles[1])
        elif corpus == "ssj":
            self.ssj_file = Path(infiles[0])
        else:
            raise ValueError("Argument corpus should be 'ssj' or 'kres'.")
        self.corpus = corpus
        self.W_TAGS = ['w']
        self.C_TAGS = ['c']
        self.S_TAGS = ['S', 'pc']
    def parse_jos_links(self, sent_el):
        if self.corpus == "kres":
            return self.parse_jos_links_kres(sent_el)
        else:
            return self.parse_jos_links_ssj(sent_el)
    def parse_ssj_target_arg(self, text):
        # from: 0, to: 6
        # <link ana="syn:modra" target="#ssj1.1.3 #ssj1.1.3.t6"/>
        # from: 6, to: 7
        # <link ana="syn:dol" target="#ssj1.1.3.t6 #ssj1.1.3.t7"/>
        lst = [x.split(".")[-1] for x in text.split(" ")]
        return [int(x[1:] if x[0] == "t" else 0) for x in lst]
    def parse_jos_links_kres(self, sent_el):
        lgrps = sent_el.findall(".//links")
        if len(lgrps) < 1:
            raise IOError("Can't find links.")
        res_links = []
        for link in lgrps[0]:
            res_links += [{
                "from": int(link.get("from").split(".")[-1]),
                "afun": link.get("afun"),
                "to": int(link.get("dep").split(".")[-1]),
            }]
        return res_links
    def parse_jos_links_ssj(self, sent_el):
        lgrps = sent_el.findall(".//linkGrp")
        if len(lgrps) < 1:
            # print(etree.tostring(sent_el))
            raise IOError("Can't find links.")
        res_links = []
        for link in lgrps[0]:
            print(link)
            tar = self.parse_ssj_target_arg(link.get("target"))
            res_links += [{
                "from": tar[0],
                "afun": link.get("ana").split(":")[1],
                "to": [1],
            }]
        return res_links
    def parse(self):
        if self.corpus == "kres":
            print("parse kres: TODO")
        else:
            self.parse_xml_file(self.ssj_file)
    def parse_xml_file(self, filepath):
        """Parse one corpus XML file into a dict of sentences.

        filepath: object with a pathlib-style .open("rb") method.

        Returns {sentence_id: sentence} where sentence_id is
        "<div id>.<paragraph id>.<sentence id>" and sentence is a dict with
        the keys "sid", "text", "tokens" and "jos_links".

        Raises KeyError when a sentence id occurs twice; exceptions raised by
        parse_jos_links (e.g. IOError for a sentence without links) propagate.
        """
        res_dict = {}
        with filepath.open("rb") as fp:
            utf8str = fp.read().decode("utf-8")

        # remove namespaces so plain tag and attribute names can be used below
        utf8str = re.sub('\\sxmlns="[^"]+"', '', utf8str, count=1)
        utf8str = re.sub(' xml:', ' ', utf8str)
        root = etree.XML(utf8str.encode("utf-8"))

        # in ssj, there are divs, in Kres, there are separate files
        if self.corpus == "kres":
            divs = [root]
        else:
            divs = root.findall(".//div")

        # parse divs
        for div in divs:
            f_id = div.get("id")
            # parse paragraphs
            for p in div.findall(".//p"):
                p_id = p.get("id").split(".")[-1]
                # parse sentences (all of them; an earlier debugging `break`
                # stopped after the first sentence of each paragraph)
                for s in p.findall(".//s"):
                    s_id = s.get("id").split(".")[-1]
                    sentence_text = ""
                    sentence_tokens = []
                    # parse tokens
                    for el in s.iter():
                        if el.tag in self.W_TAGS:
                            el_id = el.get("id").split(".")[-1]
                            if el_id[0] == 't':
                                el_id = el_id[1:]  # ssj W_TAG ids start with t
                            # an empty element has text None; treat it as ""
                            el_text = el.text or ""
                            sentence_text += el_text
                            sentence_tokens.append({
                                "word": True,
                                "tid": int(el_id),
                                "text": el_text,
                                "lemma": el.get("lemma"),
                                "msd": (el.get("msd") if self.corpus == "kres"
                                        else el.get("ana").split(":")[-1]),
                            })
                        elif el.tag in self.C_TAGS:
                            # only Kres' C_TAGS have ids
                            el_id = el.get("id") or "none"
                            el_id = el_id.split(".")[-1]
                            el_text = el.text or ""
                            sentence_text += el_text
                            sentence_tokens.append({
                                "word": False,
                                "tid": el_id,
                                "text": el_text,
                            })
                        elif el.tag in self.S_TAGS:
                            # Kres' <S /> doesn't contain .text
                            sentence_text += " "
                        # everything else (links, link groups, the sentence
                        # element itself) contributes no token

                    sentence_id = "{}.{}.{}".format(f_id, p_id, s_id)
                    # TODO: make a generator instead of holding the whole
                    # corpus in memory
                    if sentence_id in res_dict:
                        raise KeyError("duplicated id: {}".format(sentence_id))
                    res_dict[sentence_id] = {
                        "sid": sentence_id,
                        "text": sentence_text,
                        "tokens": sentence_tokens,
                        "jos_links": self.parse_jos_links(s)
                    }
        return res_dict
--- a/src/pkg/corpusparser/corpusparser/Sentence.py
+++ b/src/pkg/corpusparser/corpusparser/Sentence.py
@ -0,0 +1,3 @@
 class Sentence():
 	"""Placeholder for a parsed sentence; not implemented yet."""
 	def __init__(self):
 		# `self` was missing, so Sentence() raised TypeError.
 		print("Sentence __init__(): TODO")
--- a/src/pkg/corpusparser/corpusparser/init.py
+++ b/src/pkg/corpusparser/corpusparser/init.py
@ -0,0 +1,2 @@
 from corpusparser.Parser import Parser
 from corpusparser.Sentence import Sentence
--- a/src/pkg/corpusparser/corpusparser/pycache/Parser.cpython-35.pyc
+++ b/src/pkg/corpusparser/corpusparser/pycache/Parser.cpython-35.pyc
--- a/src/pkg/corpusparser/corpusparser/pycache/Sentence.cpython-35.pyc
+++ b/src/pkg/corpusparser/corpusparser/pycache/Sentence.cpython-35.pyc
--- a/src/pkg/corpusparser/corpusparser/pycache/init.cpython-35.pyc
+++ b/src/pkg/corpusparser/corpusparser/pycache/init.cpython-35.pyc
--- a/src/pkg/corpusparser/setup.py
+++ b/src/pkg/corpusparser/setup.py
@ -0,0 +1,10 @@
 from setuptools import setup
 # Distribution metadata for the corpusparser package.
 setup(name='corpusparser',
  version='0.0.1',
  description=u"Parser for kres and ssj500k",
  author=u"Kristjan Voje",
  author_email='kristjan.voje@gmail.com',
  license='MIT',
  # Without an explicit package list a regular (non-editable) install
  # would ship no code at all.
  packages=['corpusparser'],
  zip_safe=False,
 )
--- a/src/preflight/main_parse.py
+++ b/src/preflight/main_parse.py
@ -0,0 +1,16 @@
 from corpusparser import Parser 
 import argparse
 if __name__ == "__main__":
 	# Command line: all three input locations are mandatory.
 	cli = argparse.ArgumentParser(description="Parsing corpora kres and ssj500k.")
 	for flag in ('--kres-folder', '--kres-srl-folder', '--ssj-file'):
 		cli.add_argument(flag, required=True)
 	options = cli.parse_args()
 	# parse ssj
 	Parser(corpus="ssj", infiles=[options.ssj_file]).parse()
		`@ -0,0 +1,2 @@`
							`from corpusparser.Parser import Parser`
							`from corpusparser.Sentence import Sentence`