java_python env dockerfile, progress on parser
This commit is contained in:
parent
b617fb5e16
commit
47bb4ce0ad
12
dockerfiles/all/Dockerfile
Normal file
12
dockerfiles/all/Dockerfile
Normal file
|
@ -0,0 +1,12 @@
|
|||
FROM ubuntu:16.04
|
||||
|
||||
RUN apt-get update
|
||||
RUN apt-get install -y \
|
||||
vim \
|
||||
default-jdk \
|
||||
python3 \
|
||||
python3-pip
|
||||
|
||||
RUN pip3 install lxml
|
||||
|
||||
ENV PYTHONIOENCODING UTF-8
|
12
dockerfiles/all/README.md
Normal file
12
dockerfiles/all/README.md
Normal file
|
@ -0,0 +1,12 @@
|
|||
You might want to mount this whole repo into the docker container.
|
||||
Also mount data locations.
|
||||
|
||||
Example container:
|
||||
```bash
|
||||
$ docker build . -t my_python
|
||||
$ docker run \
|
||||
-it \
|
||||
-v $(echo $(cd ../..; pwd)):/cjvt-srl-tagging \
|
||||
my_python \
|
||||
/bin/bash
|
||||
```
|
|
@ -1,3 +1,6 @@
|
|||
## msdmap.py
|
||||
Helps convert between English and Slovenian MSD.
|
||||
Hardcoded values from online documentation (html tables).
|
||||
|
||||
## Sources
|
||||
[1] (conll09 data format) https://nlpado.de/~sebastian/pub/papers/conll09_hajic.pdf
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
|
@ -9,7 +9,7 @@ class Msdmap():
|
|||
("prislov", "R", "Adverb", "R"),
|
||||
("zaimek", "Z", "Pronoun", "P"),
|
||||
("števnik", "K", "Numeral", "M"),
|
||||
("predlog", "D", "Preposition", "S"),
|
||||
("predlog", "D", "Adposition", "S"),
|
||||
("veznik", "V", "Conjunction", "C"),
|
||||
("členek", "L", "Particle", "Q"),
|
||||
("medmet", "M", "Interjection", "I"),
|
||||
|
@ -290,6 +290,10 @@ class Msdmap():
|
|||
for pos in self.pos_val:
|
||||
if pos[7] == en_category and pos[col] == query:
|
||||
return pos
|
||||
print("---")
|
||||
print(en_category)
|
||||
print(col)
|
||||
print(query)
|
||||
raise ValueError("Wrong part of speech value.")
|
||||
|
||||
def msd_from_slo(self, msd):
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
from lxml import etree
|
||||
import re
|
||||
from parser.msdmap import Msdmap
|
||||
|
||||
W_TAGS = ['w']
|
||||
C_TAGS = ['c']
|
||||
|
@ -16,13 +17,15 @@ S_TAGS = ['S', 'pc']
|
|||
def parse_tei(filepath):
|
||||
guess_corpus = None # SSJ | KRES
|
||||
res_dict = {}
|
||||
with open(filepath, "r") as fp:
|
||||
with open(filepath, "rb") as fp:
|
||||
# remove namespaces
|
||||
xmlstr = fp.read()
|
||||
xmlstr = re.sub('\\sxmlns="[^"]+"', '', xmlstr, count=1)
|
||||
xmlstr = re.sub(' xml:', ' ', xmlstr)
|
||||
bstr = fp.read()
|
||||
|
||||
root = etree.XML(xmlstr.encode("utf-8"))
|
||||
utf8str = bstr.decode("utf-8")
|
||||
utf8str = re.sub('\\sxmlns="[^"]+"', '', utf8str, count=1)
|
||||
utf8str = re.sub(' xml:', ' ', utf8str)
|
||||
|
||||
root = etree.XML(utf8str.encode("utf-8"))
|
||||
|
||||
divs = [] # in ssj, there are divs, in Kres, there are separate files
|
||||
if "id" in root.keys():
|
||||
|
@ -104,16 +107,28 @@ def parse_links(s_el):
|
|||
|
||||
|
||||
def to_conll09(sentence_entry):
|
||||
|
||||
def fillpred(pos, feat):
|
||||
if False:
|
||||
# todo
|
||||
return "Y"
|
||||
return "_"
|
||||
|
||||
msdm = Msdmap()
|
||||
# works with kres, with parsed links
|
||||
out_str = ""
|
||||
for token in sentence_entry["tokens"]:
|
||||
if token[0] != "w":
|
||||
continue
|
||||
msd = msdm.msd_from_slo(token[4])
|
||||
fprd = fillpred("todo", "todo")
|
||||
print(msd)
|
||||
print(token)
|
||||
print(sentence_entry["links"])
|
||||
t_id = token[1]
|
||||
print(t_id)
|
||||
out_str += "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(
|
||||
# 1 3 4 5 6 7 8 9 10 11 12 13 14
|
||||
out_str += "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(
|
||||
t_id, # id
|
||||
token[2], # form
|
||||
token[3], # lemma
|
||||
|
@ -126,6 +141,9 @@ def to_conll09(sentence_entry):
|
|||
sentence_entry["links"][t_id][2], # phead
|
||||
sentence_entry["links"][t_id][1], # deprel
|
||||
sentence_entry["links"][t_id][1], # pdeprel
|
||||
fprd, # fillpred
|
||||
(token[3] if fprd == "Y" else "_"), # pred
|
||||
"todo" # apredn...
|
||||
)
|
||||
out_str += "\n"
|
||||
return out_str
|
||||
|
|
Binary file not shown.
Loading…
Reference in New Issue
Block a user