java_python env dockerfile, progress on parser

This commit is contained in:
voje 2019-02-12 08:24:46 +01:00
parent b617fb5e16
commit 47bb4ce0ad
9 changed files with 56 additions and 7 deletions

View File

@ -0,0 +1,12 @@
# Development image providing a JDK together with Python 3 and lxml.
FROM ubuntu:16.04
# Update and install in a single layer so that a cached, stale package index
# is never reused when the package list changes; remove the index afterwards
# to keep the layer small.
RUN apt-get update && apt-get install -y \
    vim \
    default-jdk \
    python3 \
    python3-pip \
    && rm -rf /var/lib/apt/lists/*
RUN pip3 install lxml
# Make Python use UTF-8 for stdin/stdout/stderr regardless of the container locale.
ENV PYTHONIOENCODING=UTF-8

12
dockerfiles/all/README.md Normal file
View File

@ -0,0 +1,12 @@
You might want to mount this whole repo into the docker container.
Also mount data locations.
Example container:
```bash
$ docker build . -t python_java
$ docker run \
-it \
-v $(echo $(cd ../..; pwd)):/cjvt-srl-tagging \
python_java \
/bin/bash
```

View File

@ -1,3 +1,6 @@
## msdmap.py
Helps convert between English and Slovenian MSD.
The values are hardcoded from the online documentation (HTML tables).
## Sources
[1] (conll09 data format) https://nlpado.de/~sebastian/pub/papers/conll09_hajic.pdf

Binary file not shown.

View File

@ -9,7 +9,7 @@ class Msdmap():
("prislov", "R", "Adverb", "R"),
("zaimek", "Z", "Pronoun", "P"),
("števnik", "K", "Numeral", "M"),
("predlog", "D", "Preposition", "S"),
("predlog", "D", "Adposition", "S"),
("veznik", "V", "Conjunction", "C"),
("členek", "L", "Particle", "Q"),
("medmet", "M", "Interjection", "I"),
@ -290,6 +290,10 @@ class Msdmap():
for pos in self.pos_val:
if pos[7] == en_category and pos[col] == query:
return pos
print("---")
print(en_category)
print(col)
print(query)
raise ValueError("Wrong part of speech value.")
def msd_from_slo(self, msd):

View File

@ -1,5 +1,6 @@
from lxml import etree
import re
from parser.msdmap import Msdmap
W_TAGS = ['w']
C_TAGS = ['c']
@ -16,13 +17,15 @@ S_TAGS = ['S', 'pc']
def parse_tei(filepath):
guess_corpus = None # SSJ | KRES
res_dict = {}
with open(filepath, "r") as fp:
with open(filepath, "rb") as fp:
# remove namespaces
xmlstr = fp.read()
xmlstr = re.sub('\\sxmlns="[^"]+"', '', xmlstr, count=1)
xmlstr = re.sub(' xml:', ' ', xmlstr)
bstr = fp.read()
root = etree.XML(xmlstr.encode("utf-8"))
utf8str = bstr.decode("utf-8")
utf8str = re.sub('\\sxmlns="[^"]+"', '', utf8str, count=1)
utf8str = re.sub(' xml:', ' ', utf8str)
root = etree.XML(utf8str.encode("utf-8"))
divs = [] # in ssj, there are divs, in Kres, there are separate files
if "id" in root.keys():
@ -104,16 +107,28 @@ def parse_links(s_el):
def to_conll09(sentence_entry):
def fillpred(pos, feat):
if False:
# todo
return "Y"
return "_"
msdm = Msdmap()
# works with kres, with parsed links
out_str = ""
for token in sentence_entry["tokens"]:
if token[0] != "w":
continue
msd = msdm.msd_from_slo(token[4])
fprd = fillpred("todo", "todo")
print(msd)
print(token)
print(sentence_entry["links"])
t_id = token[1]
print(t_id)
out_str += "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(
# 1 3 4 5 6 7 8 9 10 11 12 13 14
out_str += "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(
t_id, # id
token[2], # form
token[3], # lemma
@ -126,6 +141,9 @@ def to_conll09(sentence_entry):
sentence_entry["links"][t_id][2], # phead
sentence_entry["links"][t_id][1], # deprel
sentence_entry["links"][t_id][1], # pdeprel
fprd, # fillpred
(token[3] if fprd == "Y" else "_"), # pred
"todo" # apredn...
)
out_str += "\n"
return out_str

Binary file not shown.