Big changes

This commit is contained in:
2022-02-04 11:24:47 +01:00
parent a6cee3d459
commit c1ecc4cdbc
18 changed files with 1384 additions and 53 deletions

View File

@@ -57,7 +57,10 @@ class Parser:
divs = [] # in ssj, there are divs, in Kres, there are separate files
if "id" in root.keys():
# Kres files start with <TEI id=...>
guess_corpus = "KRES"
if root.get("id")[0:2] == 'GF':
guess_corpus = "GIGA"
else:
guess_corpus = "KRES"
divs = [root]
else:
guess_corpus = "SSJ"
@@ -65,7 +68,10 @@ class Parser:
# parse divs
for div in divs:
f_id = div.get("id")
f_id = div.get("id")[:-6]
if guess_corpus == "GIGA":
div = div.findall(".//body")[0]
# parse paragraphs
for p in div.findall(".//p"):
@@ -75,46 +81,62 @@ class Parser:
for s in p.findall(".//s"):
s_id = s.get("id").split(".")[-1]
sentence_text = ""
sentence_list = []
sentence_tokens = []
# parse tokens
for el in s.iter():
if el.tag in self.W_TAGS:
el_id = el.get("id").split(".")[-1]
if el_id[0] == 't':
el_id = el_id[1:] # ssj W_TAG ids start with t
sentence_text += el.text
sentence_tokens += [(
"w",
int(el_id),
el.text,
el.get("lemma"),
(el.get("msd") if guess_corpus == "KRES"
else el.get("ana").split(":")[-1]),
)]
if guess_corpus != "GIGA":
el_id = el.get("id").split(".")[-1]
if el_id[0] == 't':
el_id = el_id[1:] # ssj W_TAG ids start with t
sentence_text += el.text
sentence_tokens += [(
"w",
int(el_id),
el.text,
el.get("lemma"),
(el.get("msd") if guess_corpus == "KRES" or guess_corpus == "GIGA"
else el.get("ana").split(":")[-1]),
)]
else:
sentence_list.append(el.text)
elif el.tag in self.C_TAGS:
# only Kres' C_TAGS have ids
el_id = el.get("id") or "none"
el_id = el_id.split(".")[-1]
sentence_text += el.text
sentence_tokens += [("c", el_id, el.text,)]
if guess_corpus != "GIGA":
el_id = el.get("id") or "none"
el_id = el_id.split(".")[-1]
sentence_text += el.text
sentence_tokens += [("c", el_id, el.text,)]
elif el.tag in self.S_TAGS:
# Kres' <S /> doesn't contain .text
sentence_text += " "
if guess_corpus == "GIGA":
sentence_list.append(el.text)
else:
sentence_text += " "
else:
# pass links and linkGroups
pass
sentence_id = "{}.{}.{}".format(f_id, p_id, s_id)
if sentence_id in res_dict:
raise KeyError("duplicated id: {}".format(sentence_id))
res_dict[sentence_id] = {
"sid": sentence_id,
"text": sentence_text,
"tokens": sentence_tokens,
"links": (
parse_links(s) if guess_corpus == "KRES" else None
)
}
if guess_corpus == "GIGA":
res_dict[sentence_id] = {
"sid": sentence_id,
"text": ' '.join(sentence_list),
"tokens": None,
"links": None
}
else:
res_dict[sentence_id] = {
"sid": sentence_id,
"text": sentence_text,
"tokens": sentence_tokens,
"links": (
parse_links(s) if guess_corpus == "KRES" else None
)
}
fp.close()
return res_dict
@@ -123,7 +145,7 @@ class Parser:
def fillpred(tsv_row):
# Build a model row from one TSV row, run self.fillpred_model on it, and
# return the first prediction.
# NOTE(review): `self` is not a parameter of this function, so it is
# presumably a closure defined inside a method; the enclosing scope is not
# visible in this view -- confirm.
mrow = build_model_row(tsv_row)
# Everything except the last element of the model row is used as the model
# input.
# NOTE(review): the assignment below appears twice; presumably these are the
# removed and added sides of a whitespace-only change in this diff view, and
# the real file contains it once -- confirm against the file.
x = mrow[:-1]
x = mrow[:-1]
# predict() receives a one-item list and the first result is returned.
y = self.fillpred_model.predict([x])
return y[0] # bool