parsed links

This commit is contained in:
voje 2019-02-12 02:48:34 +01:00
parent c398de66f7
commit b617fb5e16
7 changed files with 10044 additions and 116 deletions

9982
data/ssj500k-sl.sample.xml Normal file

File diff suppressed because it is too large Load Diff

View File

@ -1,3 +1,7 @@
FROM python
RUN pip install lxml
# for my convenience
RUN apt-get update
RUN apt-get install -y vim

View File

@ -7,7 +7,6 @@ $ docker build . -t my_python
$ docker run \
-it \
-v /home/kristjan/git/cjvt-srl-tagging:/cjvt-srl-tagging \
-v /home/kristjan/some_corpus_data:/some_corpus_data \
my_python \
/bin/bash
```

View File

@ -1,23 +1,23 @@
from parser import parser
import os
from os.path import join
import sys
SSJ500K_2_1 = 27829 # number of sentences
if __name__ == "__main__":
# make sure you sanitize every input into unicode
# make sure you sanitize every input into unicode
print("parsing ssj")
# ssj_file = "/home/kristjan/git/diploma/data/ssj500k-sl.TEI/ssj500k-sl.body.xml"
# ssj_file = "/dipldata/ssj500k-sl.TEI/ssj500k-sl.body.xml"
ssj_file = "/dipldata/ssj500k-sl.TEI/ssj500k-sl.body.sample.xml" # smaller file
ssj_dict = parser.parse_tei(ssj_file)
# assert (len(ssj_dict) == 27829), "Parsed wrong number of sentences."
print("parsing ssj")
ssj_file = "../data/ssj500k-sl.sample.xml"
ssj_dict = parser.parse_tei(ssj_file)
# assert (len(ssj_dict) == 27829), "Parsed wrong number of sentences."
print("end parsing ssj")
print("parsing kres")
# kres_file = "../data/kres_example/F0019343.xml.parsed.xml"
kres_dir = "../data/kres_example/"
for kres_file in os.listdir(kres_dir):
parser.parse_tei(join(kres_dir, kres_file))
print("end parsing kres")
print("parsing kres")
# kres_file = "../data/kres_example/F0019343.xml.parsed.xml"
kres_dir = "../data/kres_example/"
for kres_file in os.listdir(kres_dir):
res_dict = parser.parse_tei(join(kres_dir, kres_file))
for _, sentence in res_dict.items():
parser.to_conll09(sentence)
print("end parsing kres")

View File

@ -56,7 +56,7 @@ def parse_tei(filepath):
sentence_text += el.text
sentence_tokens += [(
"w",
el_id,
int(el_id),
el.text,
el.get("lemma"),
(el.get("msd") if guess_corpus == "KRES"
@ -73,116 +73,59 @@ def parse_tei(filepath):
sentence_text += " "
else:
# pass links and linkGroups
# print(el.tag)
pass
sentence_id = "{}.{}.{}".format(f_id, p_id, s_id)
"""
print(sentence_id)
print(sentence_text)
print(sentence_tokens)
"""
if sentence_id in res_dict:
raise KeyError("duplicated id: {}".format(sentence_id))
res_dict[sentence_id] = {
"sid": sentence_id,
"text": sentence_text,
"tokens": sentence_tokens
"tokens": sentence_tokens,
"links": (
parse_links(s) if guess_corpus == "KRES" else None
)
}
return res_dict
def msd_slo_to_ang(msd):
# mapping table: http://nl.ijs.si/imp/msd/html-sl/#msd.index.values
# 3.1.1: list of POS
# 3.3.1: list of values
def parse_links(s_el):
    """Collect the dependency links of one sentence element.

    Searches *s_el* for ``links`` descendants and reads the children of the
    first one found. For every child, the part after the last "." of its
    ``dep`` and ``from`` attributes is converted to ``int``.

    Returns a dict keyed by the dependent's number; each value is the tuple
    ``(afun, dep, from)``.

    Raises IOError when *s_el* contains no ``links`` descendant.
    """
    link_groups = s_el.findall(".//links")
    if not link_groups:
        raise IOError("Can't find links.")

    def _last_number(dotted):
        # Keep only the trailing numeric component of a dotted id.
        return int(dotted.split(".")[-1])

    parsed = {}
    for link_el in link_groups[0]:
        dependent = _last_number(link_el.get("dep"))
        head = _last_number(link_el.get("from"))
        parsed[dependent] = (link_el.get("afun"), dependent, head)
    return parsed
msd = "Sosei"
def slo_pos(msd):
    """Return the first element of *msd* (e.g. "S" for the tag "Sosei")."""
    leading = msd[0]
    return leading
def pos_slo_ang_map(col, query):
    """Find a part-of-speech row by the value held in one of its columns.

    Each row is (Slovene name, Slovene code, English name, English code).
    *col* is the index of the column compared against *query*; the first
    row whose column equals *query* is returned as a tuple.

    Raises ValueError when no row matches.
    """
    table = (
        ("samostalnik", "S", "Noun", "N"),
        ("glagol", "G", "Verb", "V"),
        ("pridevnik", "P", "Adjective", "A"),
        ("prislov", "R", "Adverb", "R"),
        ("zaimek", "Z", "Pronoun", "P"),
        ("števnik", "K", "Numeral", "M"),
        ("predlog", "D", "Preposition", "S"),
        ("veznik", "V", "Conjunction", "C"),
        ("členek", "L", "Particle", "Q"),
        ("medmet", "M", "Interjection", "I"),
        ("okrajšava", "O", "Abbreviation", "Y"),
        ("neuvrščeno", "N", "Residual", "X"),
    )
    found = next((row for row in table if row[col] == query), None)
    if found is None:
        raise ValueError("Wrong part of speech.")
    return found
def pos_val_map(col, query):
    """Find an attribute-value row by the value held in one of its columns.

    Each row has eight columns:
    (sl_value, sl_code, sl_attribute, sl_category,
     en_value, en_code, en_attribute, en_category).
    *col* is the index of the column compared against *query*; the first
    row whose column equals *query* is returned as a tuple.

    The English value strings are stored without surrounding whitespace so
    that a lookup such as ``pos_val_map(4, "main")`` succeeds.

    Raises ValueError when no row matches.
    """
    pos_val = [
        ("arabski", "a", "zapis", "števnik",
         "digit", "d", "Form", "Numeral"),
        ("besedni", "b", "zapis", "števnik",
         "letter", "l", "Form", "Numeral"),
        ("deležje", "d", "vrsta", "prislov",
         "participle", "r", "Type", "Adverb"),
        ("deležniški", "d", "vrsta", "pridevnik",
         "participle", "p", "Type", "Adjective"),
        ("dovršni", "d", "vid", "glagol",
         "perfective", "e", "Aspect", "Verb"),
        ("dvovidski", "v", "vid", "glagol",
         "biaspectual", "b", "Aspect", "Verb"),
        ("glavni", "g", "vrsta", "glagol",
         "main", "m", "Type", "Verb"),
        ("lastno_ime", "l", "vrsta", "samostalnik",
         "proper", "p", "Type", "Noun"),
        ("moški", "m", "spol", "samostalnik",
         "masculine", "m", "Gender", "Noun"),
        ("nedoločeno", "n", "stopnja", "pridevnik",
         "positive", "p", "Degree", "Adjective"),
        ("nedoločeno", "n", "stopnja", "prislov",
         "positive", "p", "Degree", "Adverb"),
        ("nedovršni", "n", "vid", "glagol",
         "progressive", "p", "Aspect", "Verb"),
        ("občno_ime", "o", "vrsta", "samostalnik",
         "common", "c", "Type", "Noun"),
        ("pomožni", "p", "vrsta", "glagol",
         "auxiliary", "a", "Type", "Verb"),
        ("presežnik", "s", "stopnja", "pridevnik",
         "superlative", "s", "Degree", "Adjective"),
        ("presežnik", "s", "stopnja", "prislov",
         "superlative", "s", "Degree", "Adverb"),
        ("primernik", "p", "stopnja", "pridevnik",
         "comparative", "c", "Degree", "Adjective"),
        ("primernik", "r", "stopnja", "prislov",
         "comparative", "c", "Degree", "Adverb"),
        ("program", "p", "vrsta", "neuvrščeno",
         "program", "p", "Type", "Residual"),
        ("rimski", "r", "zapis", "števnik",
         "roman", "r", "Form", "Numeral"),
        ("splošni", "p", "vrsta", "pridevnik",
         "general", "g", "Type", "Adjective"),
        ("splošni", "s", "vrsta", "prislov",
         "general", "g", "Type", "Adverb"),
        ("srednji", "s", "spol", "samostalnik",
         "neuter", "n", "Gender", "Noun"),
        ("svojilni", "s", "vrsta", "pridevnik",
         "possessive", "s", "Type", "Adjective"),
        ("tipkarska", "t", "vrsta", "neuvrščeno",
         "typo", "t", "Type", "Residual"),
        ("tujejezično", "j", "vrsta", "neuvrščeno",
         "foreign", "f", "Type", "Residual"),
        ("ženski", "z", "spol", "samostalnik",
         "feminine", "f", "Gender", "Noun"),
    ]
    for pos in pos_val:
        if pos[col] == query:
            return pos
    raise ValueError("Wrong part of speech value.")
def to_conll09(sentence_entry):
    """Serialize the word tokens of one parsed sentence as CoNLL-09 rows.

    Works with KRES sentences, i.e. entries whose "links" were parsed.

    *sentence_entry* is a dict with:
      - "tokens": sequence of token tuples; only those whose first element
        is "w" are emitted. For these, index 1 is the token id, index 2
        the form and index 3 the lemma.
      - "links": mapping from token id to an ``(afun, dep, from)`` tuple,
        as built by ``parse_links``.

    Returns a string with one tab-separated row of 12 columns per word
    token (id, form, lemma, plemma, pos, ppos, feat, pfeat, head, phead,
    deprel, pdeprel), each ending in a newline, followed by one blank line.
    The pos/ppos/feat/pfeat columns are still "todo" placeholders.

    Raises KeyError when a word token has no entry in "links".
    """
    rows = []
    for token in sentence_entry["tokens"]:
        if token[0] != "w":
            continue
        t_id = token[1]
        link = sentence_entry["links"][t_id]
        # link layout is (afun, dep, from): the relation label is index 0
        # and the head is index 2; index 1 merely repeats the token id.
        deprel = link[0]
        head = link[2]
        columns = (
            t_id,      # id
            token[2],  # form
            token[3],  # lemma
            token[3],  # plemma
            "todo",    # pos (TODO)
            "todo",    # ppos (TODO)
            "todo",    # feat (TODO)
            "todo",    # pfeat (TODO)
            head,      # head
            head,      # phead
            deprel,    # deprel
            deprel,    # pdeprel
        )
        rows.append("\t".join("{}".format(col) for col in columns) + "\n")
    # A blank line terminates the sentence block.
    rows.append("\n")
    return "".join(rows)