import xml.etree.ElementTree as ElementTree
import pickle
import sys
import re
# Name of the XML attribute whose integer value keys into the pickled dict.
ID = "ozbo_id"


def _strip_old_annotations(content):
    """Return *content* with existing ``msd="..."`` / ``lemma="..."`` removed.

    Only values made of non-whitespace characters are matched (``\\S+``), so
    an attribute whose value contains whitespace is left in place.
    """
    # remove old msd-s and lemma-s
    msd_matcher = r"(msd|lemma)=\"\S+\""
    return re.sub(msd_matcher, '', content)


def _insert_annotations(content, ids_dict):
    """Return *content* with fresh msd/lemma attributes after every ID attribute.

    Args:
        content: XML text that has already had its old annotations stripped.
        ids_dict: mapping from the integer ID to a ``(msd, lemma)`` pair.

    Raises:
        KeyError: if an ID found in *content* is missing from *ids_dict*.
    """
    matcher = r"{} *= *\"?(\d+)\"?".format(ID)
    content_out = []
    prev_end = 0
    for f in re.finditer(matcher, content):
        msd, lemma = ids_dict[int(f.group(1))]
        # Copy the untouched text up to and including the ID attribute, then
        # place the new attributes directly after it.
        content_out.append(content[prev_end:f.end()])
        content_out.append(" msd=\"{}\" lemma=\"{}\"".format(msd, lemma))
        prev_end = f.end()
    # Keep everything that follows the last ID attribute.
    content_out.append(content[prev_end:])
    return "".join(content_out)


def main(argv=None):
    """Replace the msd/lemma attributes of an XML file using a pickled lookup.

    Args:
        argv: argument list shaped like ``sys.argv``; index 1 is the pickle
            with the ID -> (msd, lemma) mapping, index 2 the input XML file
            and index 3 the output file. Defaults to ``sys.argv``.
    """
    if argv is None:
        argv = sys.argv
    ids_pickle, in_xml, out_file = argv[1], argv[2], argv[3]

    # NOTE(security): pickle.load can execute arbitrary code; only pass
    # pickle files that come from a trusted source.
    with open(ids_pickle, "rb") as fp:
        ids_dict = pickle.load(fp)

    with open(in_xml, "r") as fp:
        content = fp.read()
    print("XML read")

    content = _strip_old_annotations(content)
    print("removed old msd's lemma's successfully")

    content_out = _insert_annotations(content, ids_dict)
    print("added msd's lemma's successfully")

    # Parsing the result doubles as a well-formedness check before writing.
    xml_tree = ElementTree.XML(content_out)
    print("reparsed xml, all good!")

    with open(out_file, "wb") as fp:
        fp.write(ElementTree.tostring(xml_tree, encoding='utf8'))


if __name__ == "__main__":
    main()