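"""Re-annotate an XML file with msd/lemma attributes.

Existing ``msd="..."`` and ``lemma="..."`` attributes are removed, then every
``ozbo_id`` attribute is replaced by the msd/lemma pair stored for that id in
a pickled dictionary.

Command-line arguments: IDS_PICKLE IN_XML OUT_FILE
"""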
import pickle
import re
import sys
import xml.etree.ElementTree as ElementTree
from xml.sax.saxutils import quoteattr

ID = "ozbo_id"

# A double-quoted msd="..." / lemma="..." attribute.  `[^"]*` keeps the match
# inside one attribute value (and also covers empty values and values that
# contain spaces); the look-behind stops the pattern from firing on attribute
# names that merely end in "msd" or "lemma".
STALE_ATTR_RE = re.compile(r'(?<![\w:.-])(?:msd|lemma)="[^"]*"')

# The id attribute, with optional spaces around "=" and optional quotes.
ID_ATTR_RE = re.compile(r"{} *= *\"?(\d+)\"?".format(ID))


def strip_annotations(content: str) -> str:
    """Return *content* with every ``msd="..."`` / ``lemma="..."`` removed.

    Only the attribute itself is removed; surrounding whitespace is kept.
    """
    return STALE_ATTR_RE.sub("", content)


def inject_annotations(content: str, ids_dict) -> str:
    """Replace each ``ozbo_id`` attribute with its msd/lemma attributes.

    Args:
        content: XML text containing ``ozbo_id=<digits>`` attributes.
        ids_dict: mapping from integer id to a ``(msd, lemma)`` pair.

    Returns:
        The text with every id attribute replaced by
        `` msd=... lemma=...`` (note the leading space).

    Raises:
        KeyError: if an id found in *content* has no entry in *ids_dict*.

    The values are treated as raw text and XML-escaped on insertion, so a
    lemma such as ``&`` or ``"`` cannot produce malformed XML.
    NOTE(review): this assumes the pickled values are not already escaped.
    """

    def annotate(match):
        key = int(match.group(1))
        try:
            msd, lemma = ids_dict[key]
        except KeyError:
            raise KeyError(
                "no msd/lemma entry for {}={}".format(ID, key)
            ) from None
        # quoteattr escapes &, <, > and picks a safe quote character.
        return " msd={} lemma={}".format(
            quoteattr(str(msd)), quoteattr(str(lemma))
        )

    return ID_ATTR_RE.sub(annotate, content)


def main(argv=None):
    """Run the conversion: ``IDS_PICKLE IN_XML OUT_FILE``.

    Args:
        argv: argument list without the program name; defaults to
            ``sys.argv[1:]``.  Arguments beyond the third are ignored.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if len(args) < 3:
        sys.exit("usage: IDS_PICKLE IN_XML OUT_FILE")
    ids_pickle, in_xml, out_file = args[:3]

    # NOTE: unpickling can execute arbitrary code; only use trusted files.
    with open(ids_pickle, "rb") as fp:
        ids_dict = pickle.load(fp)

    # Read explicitly as UTF-8 (the output is written as UTF-8 too) instead
    # of depending on the platform's locale encoding.
    with open(in_xml, "r", encoding="utf-8") as fp:
        content = fp.read()
    print("XML read")

    # remove old msd-s and lemma-s
    content = strip_annotations(content)
    print("removed old msd's lemma's successfully")

    content_out = inject_annotations(content, ids_dict)
    print("added msd's lemma's successfully")

    # Re-parsing doubles as a well-formedness check on the edited text.
    xml_tree = ElementTree.XML(content_out)
    print("reparsed xml, all good!")

    with open(out_file, "wb") as fp:
        fp.write(ElementTree.tostring(xml_tree, encoding='utf8'))


if __name__ == "__main__":
    main()