forked from kristjan/cjvt-srl-tagging
Big changes
This commit is contained in:
@@ -57,7 +57,10 @@ class Parser:
|
||||
divs = [] # in ssj, there are divs, in Kres, there are separate files
|
||||
if "id" in root.keys():
|
||||
# Kres files start with <TEI id=...>
|
||||
guess_corpus = "KRES"
|
||||
if root.get("id")[0:2] == 'GF':
|
||||
guess_corpus = "GIGA"
|
||||
else:
|
||||
guess_corpus = "KRES"
|
||||
divs = [root]
|
||||
else:
|
||||
guess_corpus = "SSJ"
|
||||
@@ -65,7 +68,10 @@ class Parser:
|
||||
|
||||
# parse divs
|
||||
for div in divs:
|
||||
f_id = div.get("id")
|
||||
f_id = div.get("id")[:-6]
|
||||
|
||||
if guess_corpus == "GIGA":
|
||||
div = div.findall(".//body")[0]
|
||||
|
||||
# parse paragraphs
|
||||
for p in div.findall(".//p"):
|
||||
@@ -75,46 +81,62 @@ class Parser:
|
||||
for s in p.findall(".//s"):
|
||||
s_id = s.get("id").split(".")[-1]
|
||||
sentence_text = ""
|
||||
sentence_list = []
|
||||
sentence_tokens = []
|
||||
|
||||
# parse tokens
|
||||
for el in s.iter():
|
||||
if el.tag in self.W_TAGS:
|
||||
el_id = el.get("id").split(".")[-1]
|
||||
if el_id[0] == 't':
|
||||
el_id = el_id[1:] # ssj W_TAG ids start with t
|
||||
sentence_text += el.text
|
||||
sentence_tokens += [(
|
||||
"w",
|
||||
int(el_id),
|
||||
el.text,
|
||||
el.get("lemma"),
|
||||
(el.get("msd") if guess_corpus == "KRES"
|
||||
else el.get("ana").split(":")[-1]),
|
||||
)]
|
||||
if guess_corpus != "GIGA":
|
||||
el_id = el.get("id").split(".")[-1]
|
||||
if el_id[0] == 't':
|
||||
el_id = el_id[1:] # ssj W_TAG ids start with t
|
||||
sentence_text += el.text
|
||||
sentence_tokens += [(
|
||||
"w",
|
||||
int(el_id),
|
||||
el.text,
|
||||
el.get("lemma"),
|
||||
(el.get("msd") if guess_corpus == "KRES" or guess_corpus == "GIGA"
|
||||
else el.get("ana").split(":")[-1]),
|
||||
)]
|
||||
else:
|
||||
sentence_list.append(el.text)
|
||||
elif el.tag in self.C_TAGS:
|
||||
# only Kres' C_TAGS have ids
|
||||
el_id = el.get("id") or "none"
|
||||
el_id = el_id.split(".")[-1]
|
||||
sentence_text += el.text
|
||||
sentence_tokens += [("c", el_id, el.text,)]
|
||||
if guess_corpus != "GIGA":
|
||||
el_id = el.get("id") or "none"
|
||||
el_id = el_id.split(".")[-1]
|
||||
sentence_text += el.text
|
||||
sentence_tokens += [("c", el_id, el.text,)]
|
||||
elif el.tag in self.S_TAGS:
|
||||
# Kres' <S /> doesn't contain .text
|
||||
sentence_text += " "
|
||||
if guess_corpus == "GIGA":
|
||||
sentence_list.append(el.text)
|
||||
else:
|
||||
sentence_text += " "
|
||||
else:
|
||||
# pass links and linkGroups
|
||||
pass
|
||||
sentence_id = "{}.{}.{}".format(f_id, p_id, s_id)
|
||||
if sentence_id in res_dict:
|
||||
raise KeyError("duplicated id: {}".format(sentence_id))
|
||||
res_dict[sentence_id] = {
|
||||
"sid": sentence_id,
|
||||
"text": sentence_text,
|
||||
"tokens": sentence_tokens,
|
||||
"links": (
|
||||
parse_links(s) if guess_corpus == "KRES" else None
|
||||
)
|
||||
}
|
||||
if guess_corpus == "GIGA":
|
||||
res_dict[sentence_id] = {
|
||||
"sid": sentence_id,
|
||||
"text": ' '.join(sentence_list),
|
||||
"tokens": None,
|
||||
"links": None
|
||||
}
|
||||
else:
|
||||
res_dict[sentence_id] = {
|
||||
"sid": sentence_id,
|
||||
"text": sentence_text,
|
||||
"tokens": sentence_tokens,
|
||||
"links": (
|
||||
parse_links(s) if guess_corpus == "KRES" else None
|
||||
)
|
||||
}
|
||||
fp.close()
|
||||
return res_dict
|
||||
|
||||
@@ -123,7 +145,7 @@ class Parser:
|
||||
|
||||
def fillpred(tsv_row):
|
||||
mrow = build_model_row(tsv_row)
|
||||
x = mrow[:-1]
|
||||
x = mrow[:-1]
|
||||
y = self.fillpred_model.predict([x])
|
||||
return y[0] # bool
|
||||
|
||||
|
||||
Reference in New Issue
Block a user