You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

45 lines
1.0 KiB

5 years ago
import xml.etree.ElementTree as ElementTree
import pickle
import sys
import re
# Command-line tool: replace every `ozbo_id` attribute in an XML file with the
# msd/lemma pair stored for that id in a pickled lookup table.
#
# Usage: <script> IDS_PICKLE IN_XML OUT_FILE

# Name of the XML attribute whose numeric value keys into the pickled table.
ID = "ozbo_id"

# An existing msd="..." / lemma="..." attribute.  The value is matched with
# [^"]* (not \S+) so that empty values and values containing spaces are also
# removed, and so a match can never run past the closing quote into later
# text that happens to contain another '"'.  The leading \b keeps attributes
# that merely end in "msd"/"lemma" (e.g. old_msd) intact.
_OLD_ANNOTATION_RE = re.compile(r"\b(?:msd|lemma)=\"[^\"]*\"")

# The ID attribute, allowing spaces around '=' and optional quotes around the
# number; group 1 captures the digits.
_ID_ATTRIBUTE_RE = re.compile(r"{} *= *\"?(\d+)\"?".format(ID))


def strip_old_annotations(content):
    """Return *content* with every existing msd/lemma attribute removed.

    Only the attribute text itself is deleted; surrounding whitespace is left
    in place.
    """
    return _OLD_ANNOTATION_RE.sub("", content)


def insert_annotations(content, ids_dict):
    """Replace each ID attribute in *content* with its msd/lemma attributes.

    *ids_dict* maps the integer value of the ID attribute to a
    ``(msd, lemma)`` pair.

    Raises:
        KeyError: if an id found in *content* is missing from *ids_dict*.
    """
    def annotation_for(match):
        msd, lemma = ids_dict[int(match.group(1))]
        # NOTE(review): values are inserted verbatim, so they must already be
        # valid XML attribute text (no raw '"', '&' or '<') -- confirm how the
        # pickle was produced.  The reparse in main() fails loudly otherwise.
        return " msd=\"{}\" lemma=\"{}\"".format(msd, lemma)

    # A callable replacement is used literally, so backslashes in the values
    # are not interpreted as regex escapes.
    return _ID_ATTRIBUTE_RE.sub(annotation_for, content)


def main(argv=None):
    """Run the tool; *argv* defaults to ``sys.argv``.

    Reads the pickled id table and the input XML, swaps the annotations,
    reparses the result to make sure it is still well-formed XML and writes
    it to the output file as UTF-8.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 4:
        prog = argv[0] if argv else "script"
        sys.exit("usage: {} IDS_PICKLE IN_XML OUT_FILE".format(prog))
    ids_pickle, in_xml, out_file = argv[1:4]

    # SECURITY: pickle.load can execute arbitrary code while loading; only
    # run this on pickle files from a trusted source.
    with open(ids_pickle, "rb") as fp:
        ids_dict = pickle.load(fp)

    # Read explicitly as UTF-8 (the output is written as UTF-8 as well)
    # instead of relying on the platform's locale encoding.
    with open(in_xml, "r", encoding="utf-8") as fp:
        content = fp.read()
    print("XML read")

    # remove old msd-s and lemma-s
    content = strip_old_annotations(content)
    print("removed old msd's lemma's successfully")

    content_out = insert_annotations(content, ids_dict)
    print("added msd's lemma's successfully")

    # Reparsing validates that the textual edits left well-formed XML.
    xml_tree = ElementTree.XML(content_out)
    print("reparsed xml, all good!")

    with open(out_file, "wb") as fp:
        fp.write(ElementTree.tostring(xml_tree, encoding='utf8'))


if __name__ == "__main__":
    main()