diff --git a/tag_ids.py b/tag_ids.py index 73cb1cb..03b2bdc 100644 --- a/tag_ids.py +++ b/tag_ids.py @@ -15,23 +15,19 @@ ids_tags = {} def resolve_ids(id_content, tag_content, num): - print("\r", num, end="\t") id_tree = ElementTree.XML(id_content.replace(REPLACE, "").replace("xml:", "")) tag_tree = ElementTree.XML(tag_content.replace(REPLACE, "").replace("xml:", "")) id_sentences = list(id_tree.findall(".//w")) tag_sentences = list(tag_tree.findall(".//w")) - id_l = len(id_sentences) - tag_l = len(tag_sentences) - assert(id_l == tag_l) - for id_w, tag_w in zip(id_sentences, tag_sentences): + id_ = id_w.attrib['id'] + if id_w.text != tag_w.text: - print(id_w.text, tag_w.text) - exit(0) + print("Non-matching w tag #{}: {} != {}".format(id_, id_w.text, tag_w.text) ) + return - id_ = id_w.attrib['id'] tag = tag_w.attrib['msd'] lemma = tag_w.attrib['lemma'] ids_tags[int(id_)] = (tag, lemma) @@ -56,6 +52,7 @@ for filepath in sorted(pathlib.Path(IN_FOLDER_TAGGED).glob("**/*.xml")): filenum = int(re.match(r"(\d+).*", filename).groups()[0]) with open(str(filepath), "r") as fp: + print("\r", filepath, end="\t") resolve_ids(ids_files[filenum], fp.read(), filenum)