73 lines
1.8 KiB
Python
73 lines
1.8 KiB
Python
import xml.etree.ElementTree as ElementTree
|
|
import re
|
|
import pathlib
|
|
import sys
|
|
import pickle
|
|
|
|
# Command-line arguments, in order: the folder scanned for XML files whose
# words carry ids, the folder scanned for tagged XML files, and the pickle
# file the collected tags are written to.
if len(sys.argv) < 4:
    # Exit with a usage hint instead of a bare IndexError on a missing argument.
    sys.exit("usage: {} IN_FOLDER_IDS IN_FOLDER_TAGGED OUT_FILE".format(sys.argv[0]))

IN_FOLDER_IDS = sys.argv[1]
IN_FOLDER_TAGGED = sys.argv[2]
OUT_FILE = sys.argv[3]

# TEI default-namespace declaration; it is removed from each document before
# parsing.
REPLACE = "xmlns=\"http://www.tei-c.org/ns/1.0\""

# File number -> raw text of the XML file found under IN_FOLDER_IDS.
ids_files = {}
# Integer word id -> (msd, lemma) taken from the tagged files.
ids_tags = {}

def resolve_ids(id_content, tag_content, num):
    """Record the tag and lemma of every tagged word under its word id.

    Both documents are parsed after removing the ``REPLACE`` namespace
    declaration and every ``xml:`` prefix.  Their ``w`` elements are then
    paired in document order, and for each pair ``ids_tags[int(id)]`` is set
    to the ``(msd, lemma)`` attributes of the tagged element.

    Pairing stops at the first pair whose text differs; a message is printed
    and entries recorded before that point are kept.  If the two documents
    contain a different number of ``w`` elements, a message is printed and
    only the common prefix is paired.

    Args:
        id_content: XML text whose ``w`` elements have an ``id`` attribute
            once the ``xml:`` prefix is removed.
        tag_content: XML text whose ``w`` elements have ``msd`` and ``lemma``
            attributes.
        num: File number; used only in the length-mismatch message.

    Raises:
        xml.etree.ElementTree.ParseError: If either document is not
            well-formed after the textual clean-up.
        KeyError: If a paired ``w`` element lacks ``id``, ``msd`` or ``lemma``.
        ValueError: If an ``id`` attribute is not an integer literal.
    """
    # Dropping the namespace declaration and the "xml:" prefix lets the
    # lookups below use the bare names "w" and "id".  Note that the textual
    # replace also removes "xml:" wherever else it occurs in the document.
    id_tree = ElementTree.XML(id_content.replace(REPLACE, "").replace("xml:", ""))
    tag_tree = ElementTree.XML(tag_content.replace(REPLACE, "").replace("xml:", ""))

    id_words = id_tree.findall(".//w")
    tag_words = tag_tree.findall(".//w")

    # zip() below stops at the shorter list; make that truncation visible.
    if len(id_words) != len(tag_words):
        print("File #{}: {} w tags with ids vs. {} tagged w tags; only the first {} are paired".format(
            num, len(id_words), len(tag_words), min(len(id_words), len(tag_words))))

    for id_w, tag_w in zip(id_words, tag_words):
        id_ = id_w.attrib['id']

        # Differing text means the two documents are out of step, so any
        # further pairing would attach tags to the wrong ids.
        if id_w.text != tag_w.text:
            print("Non-matching w tag #{}: {} != {}".format(id_, id_w.text, tag_w.text) )
            return

        tag = tag_w.attrib['msd']
        lemma = tag_w.attrib['lemma']
        ids_tags[int(id_)] = (tag, lemma)

# Read every XML file below IN_FOLDER_IDS into ids_files, keyed by the number
# at the start of its file name.
for filepath in pathlib.Path(IN_FOLDER_IDS).glob("**/*.xml"):
    if not filepath.is_file():
        continue

    # Path.name works with any path separator, unlike splitting on '/'.
    filename = filepath.name
    number_match = re.match(r"(\d+).*", filename)
    if number_match is None:
        # Without a leading number there is no key to store the file under;
        # skip it instead of aborting the whole run.
        print("Skipping {}: file name does not start with a number".format(filepath))
        continue
    filenum = int(number_match.group(1))

    # Decode explicitly rather than with the locale's default encoding.
    # NOTE(review): assumes the corpus is UTF-8 (the XML default) - confirm.
    with open(str(filepath), "r", encoding="utf-8") as fp:
        ids_files[filenum] = fp.read()

# Walk the tagged XML files in sorted path order and pair each one with the
# id file that has the same leading number.
for filepath in sorted(pathlib.Path(IN_FOLDER_TAGGED).glob("**/*.xml")):
    if not filepath.is_file():
        continue

    # Path.name works with any path separator, unlike splitting on '/'.
    filename = filepath.name
    number_match = re.match(r"(\d+).*", filename)
    if number_match is None:
        # No leading number means no way to find the matching id file.
        print("Skipping {}: file name does not start with a number".format(filepath))
        continue
    filenum = int(number_match.group(1))

    if filenum not in ids_files:
        # Report the missing counterpart instead of dying with a KeyError.
        print("Skipping {}: no id file with number {}".format(filepath, filenum))
        continue

    # Decode explicitly rather than with the locale's default encoding.
    # NOTE(review): assumes the corpus is UTF-8 (the XML default) - confirm.
    with open(str(filepath), "r", encoding="utf-8") as fp:
        # "\r" returns to the start of the line, so progress is shown in place.
        print("\r", filepath, end="\t")
        resolve_ids(ids_files[filenum], fp.read(), filenum)

# Start from the mapping saved by an earlier run, if OUT_FILE already exists,
# so that this run's entries are added to it; on a shared id the new entry wins.
out_path = pathlib.Path(OUT_FILE)
if out_path.is_file():
    with out_path.open("rb") as previous:
        # NOTE: pickle.load can execute arbitrary code while loading; OUT_FILE
        # must only ever be a file produced by this script.
        old_tags = pickle.load(previous)
else:
    old_tags = {}

old_tags.update(ids_tags)
ids_tags = old_tags

# Write the merged mapping back over OUT_FILE.
with open(OUT_FILE, "wb") as out_fp:
    pickle.dump(ids_tags, out_fp)