"""Collect (msd, lemma) annotations for word ids from paired XML files.

Usage::

    python <this script> IN_FOLDER_IDS IN_FOLDER_TAGGED OUT_FILE

Every ``*.xml`` file found (recursively) under IN_FOLDER_IDS is paired with
the ``*.xml`` file under IN_FOLDER_TAGGED whose file name starts with the same
number.  The ``<w>`` elements of each pair are walked in lockstep; the ``id``
of the first is mapped to the ``(msd, lemma)`` attributes of the second.  The
resulting mapping is merged into the pickle stored at OUT_FILE (new entries
override old ones) and written back.
"""
import xml.etree.ElementTree as ElementTree
import re
import pathlib
import sys
import pickle

# Default TEI namespace declaration.  It is removed from the raw text before
# parsing so that elements can be looked up by their bare tag name (".//w").
REPLACE = "xmlns=\"http://www.tei-c.org/ns/1.0\""

# file number -> raw text of the corresponding file from the ids folder
ids_files = {}
# integer word id -> (msd, lemma) taken from the tagged files
ids_tags = {}

# The leading digits of a file name are the number used to pair files.
_FILE_NUMBER = re.compile(r"(\d+).*")


def _strip_namespaces(content):
    """Return *content* without the TEI xmlns declaration and "xml:" prefixes.

    This is a plain textual replacement: every occurrence of "xml:" is
    removed (turning e.g. ``xml:id`` into ``id``), including any that happen
    to occur inside element text.
    """
    return content.replace(REPLACE, "").replace("xml:", "")


def resolve_ids(id_content, tag_content, num):
    """Record (msd, lemma) for every word id of one file pair in ``ids_tags``.

    Args:
        id_content: XML text whose ``<w>`` elements carry an ``id`` attribute
            (``xml:id`` before prefix stripping) convertible to ``int``.
        tag_content: XML text whose ``<w>`` elements carry ``msd`` and
            ``lemma`` attributes.
        num: Number of the file pair; only used in diagnostics.

    The ``<w>`` elements of both documents are paired by position.  At the
    first pair whose text differs a message is printed and processing of this
    file pair stops; entries recorded before that point are kept.  If the
    documents contain a different number of ``<w>`` elements, a warning is
    written to stderr and only the common prefix is paired.

    Raises:
        xml.etree.ElementTree.ParseError: if either text is not well-formed.
        KeyError: if a required attribute (``id``, ``msd``, ``lemma``) is
            missing.
        ValueError: if an ``id`` is not an integer.
    """
    id_tree = ElementTree.XML(_strip_namespaces(id_content))
    tag_tree = ElementTree.XML(_strip_namespaces(tag_content))

    id_words = id_tree.findall(".//w")
    tag_words = tag_tree.findall(".//w")

    if len(id_words) != len(tag_words):
        # zip() below would otherwise drop the surplus words silently.
        print(
            "File #{}: {} id words vs {} tagged words; "
            "only the common prefix is paired".format(
                num, len(id_words), len(tag_words)),
            file=sys.stderr,
        )

    for id_w, tag_w in zip(id_words, tag_words):
        id_ = id_w.attrib['id']
        if id_w.text != tag_w.text:
            print("Non-matching w tag #{}: {} != {}".format(
                id_, id_w.text, tag_w.text))
            return
        ids_tags[int(id_)] = (tag_w.attrib['msd'], tag_w.attrib['lemma'])


def _numbered_xml_files(folder):
    """Yield ``(number, path)`` for each ``*.xml`` file below *folder*.

    Files are visited in sorted path order.  Files whose name does not start
    with digits cannot be paired and are skipped with a warning on stderr.
    """
    for filepath in sorted(pathlib.Path(folder).glob("**/*.xml")):
        if not filepath.is_file():
            continue
        # Path.name is separator-agnostic, unlike splitting the path on '/'.
        match = _FILE_NUMBER.match(filepath.name)
        if match is None:
            print("Skipping {}: file name does not start with a number".format(
                filepath), file=sys.stderr)
            continue
        yield int(match.group(1)), filepath


def main(argv=None):
    """Run the script; *argv* defaults to ``sys.argv``."""
    argv = sys.argv if argv is None else argv
    if len(argv) < 4:
        sys.exit("usage: {} IN_FOLDER_IDS IN_FOLDER_TAGGED OUT_FILE".format(
            argv[0] if argv else "script"))
    in_folder_ids, in_folder_tagged, out_file = argv[1:4]

    # XML defaults to UTF-8, so do not depend on the platform's locale.
    for filenum, filepath in _numbered_xml_files(in_folder_ids):
        with filepath.open("r", encoding="utf-8") as fp:
            ids_files[filenum] = fp.read()

    for filenum, filepath in _numbered_xml_files(in_folder_tagged):
        print("\r", filepath, end="\t")
        if filenum not in ids_files:
            print("Skipping {}: no ids file numbered {}".format(
                filepath, filenum), file=sys.stderr)
            continue
        with filepath.open("r", encoding="utf-8") as fp:
            resolve_ids(ids_files[filenum], fp.read(), filenum)

    merged = {}
    out_path = pathlib.Path(out_file)
    if out_path.is_file():
        # NOTE(security): pickle.load can execute arbitrary code.  Only point
        # OUT_FILE at a pickle previously written by this script.
        with out_path.open("rb") as fp:
            merged = pickle.load(fp)

    # Freshly resolved entries take precedence over previously stored ones.
    merged.update(ids_tags)

    with out_path.open("wb") as fp:
        pickle.dump(merged, fp)


if __name__ == "__main__":
    main()