forked from kristjan/cjvt-srl-tagging
Adapted code to ssj500k and added its branch
@@ -1,3 +1,5 @@
+import copy
+
 from lxml import etree
 import re
 from parser.msd.msdmap import Msdmap
@@ -5,6 +7,7 @@ import pickle
 from pathlib import Path
 from fillpred_model.step1 import build_model_row
 import sys
+import xml.etree.ElementTree as ET

 class Parser:
     # reads a TEI xml file and returns a dictionary:
@@ -29,17 +32,23 @@ class Parser:
     def parse_tei(self, filepath):

         def parse_links(s_el):
-            lgrps = s_el.findall(".//links")
+            sent_id = '#' + s_el.get('id')
+            lgrps = s_el.findall(".//linkGrp")
             if len(lgrps) < 1:
                 raise IOError("Can't find links.")
             res_links = {}
-            for link in lgrps[0]:
-                dep = int(link.get("dep").split(".")[-1])
-                res_links[dep] = (
-                    link.get("afun"),
-                    dep,
-                    int(link.get("from").split(".")[-1]),
-                )
+            for lgrp in lgrps:
+                if lgrp.get("type") == "JOS-SYN":
+                    for link in lgrp:
+                        jos_type = link.get("ana").split(":")[-1]
+                        link_data = link.get("target").split(" ")
+                        link_from = int(link_data[1].split('.')[-1][1:])
+                        link_to = int(link_data[0].split('.')[-1][1:]) if sent_id != link_data[0] else 0
+                        res_links[link_from] = (
+                            jos_type,
+                            link_from,
+                            link_to,
+                        )
             return res_links

         guess_corpus = None # SSJ | KRES
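The rewritten parse_links reads JOS-SYN dependencies out of each link's ana and target attributes instead of the old afun/dep/from attributes. A minimal sketch of what the new branch computes for a single link (the element and ids below are illustrative, not taken from the corpus):

from lxml import etree

link = etree.fromstring('<link ana="jos-syn:Sb" target="#ssj1.1.1 #ssj1.1.1.t1"/>')
sent_id = '#ssj1.1.1'
jos_type = link.get("ana").split(":")[-1]           # 'Sb'
link_data = link.get("target").split(" ")           # ['#ssj1.1.1', '#ssj1.1.1.t1']
link_from = int(link_data[1].split('.')[-1][1:])    # 1, the dependent token 't1'
# head token number, or 0 when the first target is the sentence itself (root)
link_to = int(link_data[0].split('.')[-1][1:]) if sent_id != link_data[0] else 0
print({link_from: (jos_type, link_from, link_to)})  # {1: ('Sb', 1, 0)}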
@@ -79,6 +88,11 @@

             # parse sentences
             for s in p.findall(".//s"):
+                # test if sentence has jos-syn annotations and doesn't have SRL
+                sent_annot_type_list = [links.get('type') for links in s.findall(".//linkGrp")]
+                if 'JOS-SYN' not in sent_annot_type_list or 'UD-SYN' not in sent_annot_type_list or 'SRL' in sent_annot_type_list:
+                    continue
+
                 s_id = s.get("id").split(".")[-1]
                 sentence_text = ""
                 sentence_list = []
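The new guard keeps only sentences that already carry both the JOS-SYN and UD-SYN annotation layers and no SRL layer yet. A quick illustration of the condition (the type lists are made up):

for types in (['JOS-SYN', 'UD-SYN'],           # kept: both syntaxes, no SRL yet
              ['JOS-SYN'],                     # skipped: UD-SYN missing
              ['JOS-SYN', 'UD-SYN', 'SRL']):   # skipped: SRL already present
    skip = 'JOS-SYN' not in types or 'UD-SYN' not in types or 'SRL' in types
    print(types, 'skip' if skip else 'keep')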
@@ -87,21 +101,29 @@
                     # parse tokens
                     for el in s.iter():
                         if el.tag in self.W_TAGS:
-                            if guess_corpus != "GIGA":
-                                el_id = el.get("id").split(".")[-1]
-                                if el_id[0] == 't':
-                                    el_id = el_id[1:] # ssj W_TAG ids start with t
-                                sentence_text += el.text
-                                sentence_tokens += [(
-                                    "w",
-                                    int(el_id),
-                                    el.text,
-                                    el.get("lemma"),
-                                    (el.get("msd") if guess_corpus == "KRES" or guess_corpus == "GIGA"
-                                     else el.get("ana").split(":")[-1]),
-                                )]
-                            else:
-                                sentence_list.append(el.text)
+                            el_id = el.get("id").split(".")[-1]
+                            if el_id[0] == 't':
+                                el_id = el_id[1:] # ssj W_TAG ids start with t
+                            sentence_text += el.text
+                            uPosTag = None
+                            uPosFeats = []
+                            for msd_el in el.get("msd").split('|'):
+                                key, val = msd_el.split('=')
+                                if key == 'UPosTag':
+                                    uPosTag = val
+                                else:
+                                    uPosFeats.append(msd_el)
+                            uPosFeats = '|'.join(uPosFeats)
+                            sentence_tokens += [(
+                                "w",
+                                int(el_id),
+                                el.text,
+                                el.get("lemma"),
+                                (el.get("msd") if guess_corpus == "KRES" or guess_corpus == "GIGA"
+                                 else el.get("ana").split(":")[-1]),
+                                uPosTag,
+                                uPosFeats
+                            )]
                         elif el.tag in self.C_TAGS:
                             # only Kres' C_TAGS have ids
                             if guess_corpus != "GIGA":
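The new token branch expects each token's msd attribute to pack the UD part-of-speech tag and its features into one pipe-separated string, which the loop above splits apart. A sketch with an illustrative attribute value:

msd = "UPosTag=NOUN|Case=Nom|Gender=Fem|Number=Sing"
uPosTag = None
uPosFeats = []
for msd_el in msd.split('|'):
    key, val = msd_el.split('=')
    if key == 'UPosTag':
        uPosTag = val
    else:
        uPosFeats.append(msd_el)
print(uPosTag, '|'.join(uPosFeats))  # NOUN Case=Nom|Gender=Fem|Number=Sing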
@@ -110,33 +132,243 @@ class Parser:
                             sentence_text += el.text
                             sentence_tokens += [("c", el_id, el.text,)]
                         elif el.tag in self.S_TAGS:
-                            # Kres' <S /> doesn't contain .text
-                            if guess_corpus == "GIGA":
-                                sentence_list.append(el.text)
-                            else:
-                                sentence_text += " "
+                            el_id = el.get("id").split(".")[-1]
+                            if el_id[0] == 't':
+                                el_id = el_id[1:] # ssj W_TAG ids start with t
+                            sentence_text += el.text
+                            uPosTag = None
+                            uPosFeats = []
+                            for msd_el in el.get("msd").split('|'):
+                                key, val = msd_el.split('=')
+                                if key == 'UPosTag':
+                                    uPosTag = val
+                                else:
+                                    uPosFeats.append(msd_el)
+                            uPosFeats = '|'.join(uPosFeats)
+                            sentence_tokens += [(
+                                "pc",
+                                int(el_id),
+                                el.text,
+                                el.text,
+                                (el.get("msd") if guess_corpus == "KRES" or guess_corpus == "GIGA"
+                                 else el.get("ana").split(":")[-1]),
+                                uPosTag,
+                                uPosFeats
+                            )]
                         else:
                             # pass links and linkGroups
                             pass
sentence_id = "{}.{}.{}".format(f_id, p_id, s_id)
|
||||
sentence_id = s.get("id")
|
||||
if sentence_id in res_dict:
|
||||
raise KeyError("duplicated id: {}".format(sentence_id))
|
||||
if guess_corpus == "GIGA":
|
||||
res_dict[sentence_id] = {
|
||||
"sid": sentence_id,
|
||||
"text": ' '.join(sentence_list),
|
||||
"tokens": None,
|
||||
"links": None
|
||||
}
|
||||
else:
|
||||
res_dict[sentence_id] = {
|
||||
"sid": sentence_id,
|
||||
"text": sentence_text,
|
||||
"tokens": sentence_tokens,
|
||||
"links": (
|
||||
parse_links(s) if guess_corpus == "KRES" else None
|
||||
)
|
||||
}
|
||||
|
||||
res_dict[sentence_id] = {
|
||||
"sid": sentence_id,
|
||||
"text": sentence_text,
|
||||
"tokens": sentence_tokens,
|
||||
"links": (
|
||||
parse_links(s)
|
||||
)
|
||||
}
|
||||
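# NOTE: after this change every entry has the same shape; roughly (values
# illustrative, the 7-tuple mirrors the sentence_tokens append above):
#   res_dict["ssj1.1.1"] = {
#       "sid": "ssj1.1.1",
#       "text": "...",
#       "tokens": [("w", 1, "Vse", "ves", "Pg-npn", "PRON", "Case=Nom|Number=Plur"), ...],
#       "links": {1: ("Sb", 1, 3)},
#   }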
+        fp.close()
+        return res_dict
+
+
+    def minimize_tei(self, filepath, jsondata):
+        def set_xml_attr(node, attribute, value):
+            node.attrib['{http://www.w3.org/XML/1998/namespace}' + attribute] = value
+
+        def parse_links(s_el):
+            sent_id = '#' + s_el.get('id')
+            lgrps = s_el.findall(".//linkGrp")
+            if len(lgrps) < 1:
+                raise IOError("Can't find links.")
+            res_links = {}
+            for lgrp in lgrps:
+                if lgrp.get("type") == "JOS-SYN":
+                    for link in lgrp:
+                        jos_type = link.get("ana").split(":")[-1]
+                        link_data = link.get("target").split(" ")
+                        link_from = int(link_data[1].split('.')[-1][1:])
+                        link_to = int(link_data[0].split('.')[-1][1:]) if sent_id != link_data[0] else 0
+                        res_links[link_from] = (
+                            jos_type,
+                            link_from,
+                            link_to,
+                        )
+            return res_links
+
+        guess_corpus = None # SSJ | KRES
+        res_dict = {}
+        # with filepath.open("rb") as fp, open("../data/ssj500k2.3/final_tei/res.xml", 'w') as sf:
+        with filepath.open("rb") as fp:
+            used_ssj_documents = set([k.split('.')[0] for k, v in jsondata.items()])
+            used_ssj_paragraphs = set(['.'.join(k.split('.')[:-1]) for k, v in jsondata.items()])
+            used_ssj_sentences = set([k for k, v in jsondata.items()])
+
+            ET.register_namespace("", "http://www.tei-c.org/ns/1.0")
+            tree = ET.parse(fp)
+            root_res = tree.getroot()
+            # root_res = copy.deepcopy(root)
+            ns = '{http://www.w3.org/XML/1998/namespace}'
+            ns2 = '{http://www.tei-c.org/ns/1.0}'
+
+            for doc in list(root_res):
+                doc_id = doc.get(ns + 'id')
+                if doc_id not in used_ssj_documents:
+                    root_res.remove(doc)
+                    continue
+
+                for par in list(doc):
+                    par_id = par.get(ns + 'id')
+                    if par_id not in used_ssj_paragraphs:
+                        if par.tag != ns2 + 'bibl':
+                            doc.remove(par)
+                        continue
+
+                    for sen in list(par):
+                        sen_id = sen.get(ns + 'id')
+                        if sen_id not in used_ssj_sentences:
+                            par.remove(sen)
+                            continue
+
+                        linkGrp = ET.Element(f'{ns2}linkGrp')
+
+                        linkGrp.attrib[f'targFunc'] = 'head argument'
+                        linkGrp.attrib[f'type'] = 'SRL'
+
+                        for srl_el in jsondata[sen_id]:
+                            link = ET.Element(f'{ns2}link')
+                            link.attrib['ana'] = f'srl:{srl_el["arg"]}'
+                            link.attrib['target'] = f'#{sen_id}.t{srl_el["from"]} #{sen_id}.t{srl_el["dep"]}'
+                            linkGrp.append(link)
+                        sen.append(linkGrp)
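# NOTE: the jsondata records consumed by the loop above are expected to look
# like this (keys taken from the code, values illustrative; cf. the sample
# linkGrp in the comment below):
#   jsondata = {"ssj1.1.1": [{"arg": "TIME", "from": 6, "dep": 3},
#                            {"arg": "QUANT", "from": 6, "dep": 5}]}
# each record becomes e.g. <link ana="srl:TIME" target="#ssj1.1.1.t6 #ssj1.1.1.t3"/>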
+
+
+            # <linkGrp corresp="#ssj1.1.1" targFunc="head argument" type="SRL">
+            #     <link ana="srl:TIME" target="#ssj1.1.1.t6 #ssj1.1.1.t3"/>
+            #     <link ana="srl:QUANT" target="#ssj1.1.1.t6 #ssj1.1.1.t5"/>
+            #     <link ana="srl:TIME" target="#ssj1.1.1.t8 #ssj1.1.1.t11"/>
+            #     <link ana="srl:PAT" target="#ssj1.1.1.t23 #ssj1.1.1.t21"/>
+            #     <link ana="srl:ACT" target="#ssj1.1.1.t23 #ssj1.1.1.t22"/>
+            #     <link ana="srl:RESLT" target="#ssj1.1.1.t18 #ssj1.1.1.t23"/>
+            # </linkGrp>
+            # print('aaa')
+
+            # sf.write(etree.tostring(tree, pretty_print=True, encoding='utf-8').decode())
+            tree.write("../data/ssj500k2.3/final_tei/res.xml", encoding='utf-8')
+
+        return
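# NOTE: everything from here to the end of minimize_tei sits below the
# unconditional return above and is therefore unreachable; it appears to be
# carried over from parse_tei.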
+        divs = [] # in ssj, there are divs, in Kres, there are separate files
+        if "id" in root.keys():
+            # Kres files start with <TEI id=...>
+            if root.get("id")[0:2] == 'GF':
+                guess_corpus = "GIGA"
+            else:
+                guess_corpus = "KRES"
+            divs = [root]
+        else:
+            guess_corpus = "SSJ"
+            divs = root.findall(".//div")
+
+        # parse divs
+        for div in divs:
+            f_id = div.get("id")
+
+            if guess_corpus == "GIGA":
+                div = div.findall(".//body")[0]
+
+            # parse paragraphs
+            for p in div.findall(".//p"):
+                p_id = p.get("id").split(".")[-1]
+
+                # parse sentences
+                for s in p.findall(".//s"):
+                    # test if sentence has jos-syn annotations and doesn't have SRL
+                    sent_annot_type_list = [links.get('type') for links in s.findall(".//linkGrp")]
+                    if 'JOS-SYN' not in sent_annot_type_list or 'UD-SYN' not in sent_annot_type_list or 'SRL' in sent_annot_type_list:
+                        del s
+                        continue
+
+                    s_id = s.get("id").split(".")[-1]
+                    sentence_text = ""
+                    sentence_list = []
+                    sentence_tokens = []
+
+                    # parse tokens
+                    for el in s.iter():
+                        if el.tag in self.W_TAGS:
+                            el_id = el.get("id").split(".")[-1]
+                            if el_id[0] == 't':
+                                el_id = el_id[1:] # ssj W_TAG ids start with t
+                            sentence_text += el.text
+                            uPosTag = None
+                            uPosFeats = []
+                            for msd_el in el.get("msd").split('|'):
+                                key, val = msd_el.split('=')
+                                if key == 'UPosTag':
+                                    uPosTag = val
+                                else:
+                                    uPosFeats.append(msd_el)
+                            uPosFeats = '|'.join(uPosFeats)
+                            sentence_tokens += [(
+                                "w",
+                                int(el_id),
+                                el.text,
+                                el.get("lemma"),
+                                (el.get("msd") if guess_corpus == "KRES" or guess_corpus == "GIGA"
+                                 else el.get("ana").split(":")[-1]),
+                                uPosTag,
+                                uPosFeats
+                            )]
+                        elif el.tag in self.C_TAGS:
+                            # only Kres' C_TAGS have ids
+                            if guess_corpus != "GIGA":
+                                el_id = el.get("id") or "none"
+                                el_id = el_id.split(".")[-1]
+                            sentence_text += el.text
+                            sentence_tokens += [("c", el_id, el.text,)]
+                        elif el.tag in self.S_TAGS:
+                            el_id = el.get("id").split(".")[-1]
+                            if el_id[0] == 't':
+                                el_id = el_id[1:] # ssj W_TAG ids start with t
+                            sentence_text += el.text
+                            uPosTag = None
+                            uPosFeats = []
+                            for msd_el in el.get("msd").split('|'):
+                                key, val = msd_el.split('=')
+                                if key == 'UPosTag':
+                                    uPosTag = val
+                                else:
+                                    uPosFeats.append(msd_el)
+                            uPosFeats = '|'.join(uPosFeats)
+                            sentence_tokens += [(
+                                "pc",
+                                int(el_id),
+                                el.text,
+                                el.text,
+                                (el.get("msd") if guess_corpus == "KRES" or guess_corpus == "GIGA"
+                                 else el.get("ana").split(":")[-1]),
+                                uPosTag,
+                                uPosFeats
+                            )]
+                        else:
+                            # pass links and linkGroups
+                            pass
+                    sentence_id = s.get("id")
+                    if sentence_id in res_dict:
+                        raise KeyError("duplicated id: {}".format(sentence_id))
+
+                    res_dict[sentence_id] = {
+                        "sid": sentence_id,
+                        "text": sentence_text,
+                        "tokens": sentence_tokens,
+                        "links": (
+                            parse_links(s)
+                        )
+                    }
+        et = etree.ElementTree(root)
+        et.write("../data/ssj500k2.3/final_tei/res.xml", pretty_print=True, encoding='unicode')
         fp.close()
         return res_dict

@@ -157,12 +389,8 @@ class Parser:

         # handle stop signs
         if token[0] != "w":
-            out_str += '\t'.join(
-                [t_id] +
-                [form for x in range(7)] +
-                ["0", "0", "modra", "modra", "_", "_"] +
-                ["\n"]
-            )
+            out_list = [t_id] + [form for x in range(7)] + ["0", "0", "modra", "modra", "_", "_"] + ["\n"]
+            out_str += '\t'.join(map(str, out_list))
             continue

         pos = self.msdmap.slo_msd_to_eng_pos(token[4])
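The rewrite builds the row as a list first and routes it through map(str, ...), so a numeric t_id no longer breaks the join (the old '\t'.join raises TypeError on non-string items). A sketch with illustrative values:

t_id, form = 4, "."
out_list = [t_id] + [form for x in range(7)] + ["0", "0", "modra", "modra", "_", "_"] + ["\n"]
print('\t'.join(map(str, out_list)))  # prints the dummy CoNLL-style row for a stop sign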