"""Swap ID attributes in an XML file for fresh ``msd``/``lemma`` attributes.

Usage::

    python <this script> IDS_PICKLE IN_XML OUT_FILE

IDS_PICKLE is a pickled mapping ``{int id: (msd, lemma)}``.  Every
``msd="..."`` / ``lemma="..."`` attribute already present in IN_XML is
dropped, then every ``ozbo_id="<n>"`` attribute is replaced by the
``msd``/``lemma`` pair stored under ``n``.  The result is re-parsed to make
sure it is still well-formed and written to OUT_FILE as UTF-8.
"""
import pickle
import re
import sys
import xml.etree.ElementTree as ElementTree
from xml.sax.saxutils import escape

ID = "ozbo_id"

# Attribute names must not be matched in the middle of a longer name
# (e.g. "old-msd" or "prev_ozbo_id"), otherwise half a name is left behind.
_NAME_START = r"(?<![\w:.-])"

# [^"]* instead of \S+: a greedy \S+ can run past the closing quote into
# element text, and it misses empty values and values containing spaces.
_STALE_ATTR_RE = re.compile(_NAME_START + r'(?:msd|lemma)="[^"]*"')

# The quotes around the numeric id are optional, as in the input data.
_ID_ATTR_RE = re.compile(
    _NAME_START + r'{} *= *"?(\d+)"?'.format(re.escape(ID))
)


def _attr_value(value):
    """Return *value* escaped for use inside a double-quoted XML attribute."""
    return escape(str(value), {'"': "&quot;"})


def _strip_stale_annotations(content):
    """Return *content* with every existing msd/lemma attribute removed."""
    return _STALE_ATTR_RE.sub("", content)


def _inject_annotations(content, ids_dict):
    """Replace each ID attribute in *content* with its msd/lemma pair.

    Raises:
        KeyError: if an id found in *content* is missing from *ids_dict*.
    """
    def replacement(match):
        msd, lemma = ids_dict[int(match.group(1))]
        return ' msd="{}" lemma="{}"'.format(
            _attr_value(msd), _attr_value(lemma)
        )

    return _ID_ATTR_RE.sub(replacement, content)


def main(argv=None):
    """Run the conversion.

    Args:
        argv: ``[IDS_PICKLE, IN_XML, OUT_FILE]``; defaults to ``sys.argv[1:]``.
            Arguments beyond the first three are ignored.

    Raises:
        SystemExit: if fewer than three arguments are given.
        KeyError: if the XML references an id missing from the pickle.
        xml.etree.ElementTree.ParseError: if the rewritten XML is not
            well-formed.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if len(args) < 3:
        raise SystemExit("usage: IDS_PICKLE IN_XML OUT_FILE")
    ids_pickle, in_xml, out_file = args[:3]

    # SECURITY: pickle.load executes arbitrary code embedded in the file.
    # Only run this script on pickles from a trusted source.
    with open(ids_pickle, "rb") as fp:
        ids_dict = pickle.load(fp)

    # Decode explicitly: the output is UTF-8, so the input must not depend
    # on the platform's default encoding.
    with open(in_xml, "r", encoding="utf-8") as fp:
        content = fp.read()
    print("XML read")

    # Remove old msd-s and lemma-s.
    content = _strip_stale_annotations(content)
    print("removed old msd's lemma's successfully")

    content_out = _inject_annotations(content, ids_dict)
    print("added msd's lemma's successfully")

    # Parsing doubles as a well-formedness check on the text-level edits.
    xml_tree = ElementTree.XML(content_out)
    print("reparsed xml, all good!")

    with open(out_file, "wb") as fp:
        fp.write(ElementTree.tostring(xml_tree, encoding='utf8'))


if __name__ == "__main__":
    main()