tag_ids now reports errors instead of just exiting.

This commit is contained in:
Ozbolt Menegatti 2019-01-02 11:41:50 +01:00
parent 7aa3db49ac
commit da6747e1c4

View File

@ -15,23 +15,19 @@ ids_tags = {}
def resolve_ids(id_content, tag_content, num): def resolve_ids(id_content, tag_content, num):
print("\r", num, end="\t")
id_tree = ElementTree.XML(id_content.replace(REPLACE, "").replace("xml:", "")) id_tree = ElementTree.XML(id_content.replace(REPLACE, "").replace("xml:", ""))
tag_tree = ElementTree.XML(tag_content.replace(REPLACE, "").replace("xml:", "")) tag_tree = ElementTree.XML(tag_content.replace(REPLACE, "").replace("xml:", ""))
id_sentences = list(id_tree.findall(".//w")) id_sentences = list(id_tree.findall(".//w"))
tag_sentences = list(tag_tree.findall(".//w")) tag_sentences = list(tag_tree.findall(".//w"))
id_l = len(id_sentences)
tag_l = len(tag_sentences)
assert(id_l == tag_l)
for id_w, tag_w in zip(id_sentences, tag_sentences): for id_w, tag_w in zip(id_sentences, tag_sentences):
if id_w.text != tag_w.text:
print(id_w.text, tag_w.text)
exit(0)
id_ = id_w.attrib['id'] id_ = id_w.attrib['id']
if id_w.text != tag_w.text:
print("Non-matching w tag #{}: {} != {}".format(id_, id_w.text, tag_w.text) )
return
tag = tag_w.attrib['msd'] tag = tag_w.attrib['msd']
lemma = tag_w.attrib['lemma'] lemma = tag_w.attrib['lemma']
ids_tags[int(id_)] = (tag, lemma) ids_tags[int(id_)] = (tag, lemma)
@ -56,6 +52,7 @@ for filepath in sorted(pathlib.Path(IN_FOLDER_TAGGED).glob("**/*.xml")):
filenum = int(re.match(r"(\d+).*", filename).groups()[0]) filenum = int(re.match(r"(\d+).*", filename).groups()[0])
with open(str(filepath), "r") as fp: with open(str(filepath), "r") as fp:
print("\r", filepath, end="\t")
resolve_ids(ids_files[filenum], fp.read(), filenum) resolve_ids(ids_files[filenum], fp.read(), filenum)