diff --git a/dockerfiles/all/Dockerfile b/dockerfiles/all/Dockerfile new file mode 100644 index 0000000..e6159e9 --- /dev/null +++ b/dockerfiles/all/Dockerfile @@ -0,0 +1,12 @@ +FROM ubuntu:16.04 + +RUN apt-get update +RUN apt-get install -y \ +vim \ +default-jdk \ +python3 \ +python3-pip + +RUN pip3 install lxml + +ENV PYTHONIOENCODING UTF-8 diff --git a/dockerfiles/all/README.md b/dockerfiles/all/README.md new file mode 100644 index 0000000..166f5b1 --- /dev/null +++ b/dockerfiles/all/README.md @@ -0,0 +1,12 @@ +You might want to mount this whole repo into the docker container. +Also mount data locations. + +Example container: +```bash +$ docker build . -t my_python +$ docker run \ + -it \ + -v $(echo $(cd ../..; pwd)):/cjvt-srl-tagging \ + python_java \ + /bin/bash +``` diff --git a/tools/parser/README.md b/tools/parser/README.md index ef6ae42..a52097a 100644 --- a/tools/parser/README.md +++ b/tools/parser/README.md @@ -1,3 +1,6 @@ ## msdmap.py Help conversion between english and slovenian MSD. Hardcoded values from online documentation (html tables). + +## Sources +[1] (conll09 data format) https://nlpado.de/~sebastian/pub/papers/conll09_hajic.pdf \ No newline at end of file diff --git a/tools/parser/__init__.pyc b/tools/parser/__init__.pyc deleted file mode 100644 index d333b89..0000000 Binary files a/tools/parser/__init__.pyc and /dev/null differ diff --git a/tools/parser/__pycache__/__init__.cpython-37.pyc b/tools/parser/__pycache__/__init__.cpython-37.pyc deleted file mode 100644 index ecc16a3..0000000 Binary files a/tools/parser/__pycache__/__init__.cpython-37.pyc and /dev/null differ diff --git a/tools/parser/__pycache__/parser.cpython-37.pyc b/tools/parser/__pycache__/parser.cpython-37.pyc deleted file mode 100644 index d2e9eff..0000000 Binary files a/tools/parser/__pycache__/parser.cpython-37.pyc and /dev/null differ diff --git a/tools/parser/msdmap.py b/tools/parser/msdmap.py index 7404faa..929ed74 100644 --- a/tools/parser/msdmap.py +++ b/tools/parser/msdmap.py @@ -9,7 +9,7 @@ class Msdmap(): ("prislov", "R", "Adverb", "R"), ("zaimek", "Z", "Pronoun", "P"), ("števnik", "K", "Numeral", "M"), - ("predlog", "D", "Preposition", "S"), + ("predlog", "D", "Adposition", "S"), ("veznik", "V", "Conjunction", "C"), ("členek", "L", "Particle", "Q"), ("medmet", "M", "Interjection", "I"), @@ -290,6 +290,10 @@ class Msdmap(): for pos in self.pos_val: if pos[7] == en_category and pos[col] == query: return pos + print("---") + print(en_category) + print(col) + print(query) raise ValueError("Wrong part of speech value.") def msd_from_slo(self, msd): diff --git a/tools/parser/parser.py b/tools/parser/parser.py index 2a08c2b..0a91b5b 100644 --- a/tools/parser/parser.py +++ b/tools/parser/parser.py @@ -1,5 +1,6 @@ from lxml import etree import re +from parser.msdmap import Msdmap W_TAGS = ['w'] C_TAGS = ['c'] @@ -16,13 +17,15 @@ S_TAGS = ['S', 'pc'] def parse_tei(filepath): guess_corpus = None # SSJ | KRES res_dict = {} - with open(filepath, "r") as fp: + with open(filepath, "rb") as fp: # remove namespaces - xmlstr = fp.read() - xmlstr = re.sub('\\sxmlns="[^"]+"', '', xmlstr, count=1) - xmlstr = re.sub(' xml:', ' ', xmlstr) + bstr = fp.read() - root = etree.XML(xmlstr.encode("utf-8")) + utf8str = bstr.decode("utf-8") + utf8str = re.sub('\\sxmlns="[^"]+"', '', utf8str, count=1) + utf8str = re.sub(' xml:', ' ', utf8str) + + root = etree.XML(utf8str.encode("utf-8")) divs = [] # in ssj, there are divs, in Kres, there are separate files if "id" in root.keys(): @@ -104,16 +107,28 @@ def parse_links(s_el): def to_conll09(sentence_entry): + + def fillpred(pos, feat): + if False: + # todo + return "Y" + return "_" + + msdm = Msdmap() # works with kres, with parsed links out_str = "" for token in sentence_entry["tokens"]: if token[0] != "w": continue + msd = msdm.msd_from_slo(token[4]) + fprd = fillpred("todo", "todo") + print(msd) print(token) print(sentence_entry["links"]) t_id = token[1] print(t_id) - out_str += "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format( + # 1 3 4 5 6 7 8 9 10 11 12 13 14 + out_str += "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format( t_id, # id token[2], # form token[3], # lemma @@ -126,6 +141,9 @@ def to_conll09(sentence_entry): sentence_entry["links"][t_id][2], # phead sentence_entry["links"][t_id][1], # deprel sentence_entry["links"][t_id][1], # pdeprel + fprd, # fillpred + (token[3] if fprd == "Y" else "_"), # pred + "todo" # apredn... ) out_str += "\n" return out_str diff --git a/tools/parser/parser.pyc b/tools/parser/parser.pyc deleted file mode 100644 index 5ddae18..0000000 Binary files a/tools/parser/parser.pyc and /dev/null differ