java_python env dockerfile, progress on parser

2019-02-12 08:24:46 +01:00 · 2019-02-12 08:24:46 +01:00 · 47bb4ce0ad
commit 47bb4ce0ad
parent b617fb5e16
9 changed files with 56 additions and 7 deletions
--- a/dockerfiles/all/Dockerfile
+++ b/dockerfiles/all/Dockerfile
@ -0,0 +1,12 @@
+FROM ubuntu:16.04
+
+# Run `apt-get update` and `apt-get install` in the same layer so that a
+# cached, stale package index is never reused when the package list changes.
+RUN apt-get update && apt-get install -y \
+    vim \
+    default-jdk \
+    python3 \
+    python3-pip
+
+RUN pip3 install lxml
+
+# Force UTF-8 for Python's stdin/stdout/stderr regardless of container locale.
+ENV PYTHONIOENCODING=UTF-8
--- a/dockerfiles/all/README.md
+++ b/dockerfiles/all/README.md
@ -0,0 +1,12 @@
+You might want to mount this whole repo into the docker container.  
+Also mount data locations.  
+
+Example container:
+```bash
+$ docker build . -t python_java
+$ docker run \
+    -it \
+    -v $(echo $(cd ../..; pwd)):/cjvt-srl-tagging \
+    python_java \
+    /bin/bash
+```
--- a/tools/parser/README.md
+++ b/tools/parser/README.md
@ -1,3 +1,6 @@
 ## msdmap.py
 Help conversion between english and slovenian MSD.  
 Hardcoded values from online documentation (html tables).  
+
+## Sources
+[1] (conll09 data format) https://nlpado.de/~sebastian/pub/papers/conll09_hajic.pdf
--- a/tools/parser/init.pyc
+++ b/tools/parser/init.pyc
--- a/tools/parser/pycache/init.cpython-37.pyc
+++ b/tools/parser/pycache/init.cpython-37.pyc
--- a/tools/parser/pycache/parser.cpython-37.pyc
+++ b/tools/parser/pycache/parser.cpython-37.pyc
--- a/tools/parser/msdmap.py
+++ b/tools/parser/msdmap.py
@ -9,7 +9,7 @@ class Msdmap():
            ("prislov", "R", "Adverb", "R"),
            ("zaimek", "Z", "Pronoun", "P"),
            ("števnik", "K", "Numeral", "M"),
-            ("predlog", "D", "Preposition", "S"),
+            ("predlog", "D", "Adposition", "S"),
            ("veznik", "V", "Conjunction", "C"),
            ("členek", "L", "Particle", "Q"),
            ("medmet", "M", "Interjection", "I"),
@ -290,6 +290,10 @@ class Msdmap():
        for pos in self.pos_val:
            if pos[7] == en_category and pos[col] == query:
                return pos
+        print("---")
+        print(en_category)
+        print(col)
+        print(query)
        raise ValueError("Wrong part of speech value.")

    def msd_from_slo(self, msd):
--- a/tools/parser/parser.py
+++ b/tools/parser/parser.py
@ -1,5 +1,6 @@
 from lxml import etree
 import re
+from parser.msdmap import Msdmap

 W_TAGS = ['w']
 C_TAGS = ['c']
@ -16,13 +17,15 @@ S_TAGS = ['S', 'pc']
 def parse_tei(filepath):
    guess_corpus = None  # SSJ | KRES
    res_dict = {}
-    with open(filepath, "r") as fp:
+    with open(filepath, "rb") as fp:
        # remove namespaces
-        xmlstr = fp.read()
-        xmlstr = re.sub('\\sxmlns="[^"]+"', '', xmlstr, count=1)
-        xmlstr = re.sub(' xml:', ' ', xmlstr)
+        bstr = fp.read()

-        root = etree.XML(xmlstr.encode("utf-8"))
+        utf8str = bstr.decode("utf-8")
+        utf8str = re.sub('\\sxmlns="[^"]+"', '', utf8str, count=1)
+        utf8str = re.sub(' xml:', ' ', utf8str)
+
+        root = etree.XML(utf8str.encode("utf-8"))

        divs = []  # in ssj, there are divs, in Kres, there are separate files
        if "id" in root.keys():
@ -104,16 +107,28 @@ def parse_links(s_el):


 def to_conll09(sentence_entry):
+
+    def fillpred(pos, feat):
+        if False:
+            # todo
+            return "Y"
+        return "_"
+
+    msdm = Msdmap()
    # works with kres, with parsed links
    out_str = ""
    for token in sentence_entry["tokens"]:
        if token[0] != "w":
            continue
+        msd = msdm.msd_from_slo(token[4])
+        fprd = fillpred("todo", "todo")
+        print(msd)
        print(token)
        print(sentence_entry["links"])
        t_id = token[1]
        print(t_id)
-        out_str += "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(
+        #            1   3   4   5   6   7   8   9  10  11  12  13  14
+        out_str += "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(
            t_id,  # id
            token[2],  # form
            token[3],  # lemma
@ -126,6 +141,9 @@ def to_conll09(sentence_entry):
            sentence_entry["links"][t_id][2],  # phead
            sentence_entry["links"][t_id][1],  # deprel
            sentence_entry["links"][t_id][1],  # pdeprel
+            fprd,  # fillpred
+            (token[3] if fprd == "Y" else "_"),  # pred
+            "todo"  # apredn...
        )
    out_str += "\n"
    return out_str
--- a/tools/parser/parser.pyc
+++ b/tools/parser/parser.pyc