123 lines
3.5 KiB
Python
123 lines
3.5 KiB
Python
import xml.etree.ElementTree as ElementTree
|
|
import sys
|
|
import re
|
|
|
|
# Path of the XML file to convert, taken from the first command-line argument.
FILE_IN = sys.argv[1]

# Text written at the start of every output file: an XML declaration followed
# by the opening tag of the TEI root element.
TOP = """<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<TEI xmlns="http://www.tei-c.org/ns/1.0" xml:lang="sl">"""

# Name of the source attribute that identifies a word; it is compared between
# consecutive words and textually replaced by "xml:id" when output is written.
ID = "ozbo_id"

# Parse directly from the file instead of decoding it in text mode first, so
# the parser applies the encoding declared by the document rather than the
# platform's default text encoding. ``xml_tree`` is the root element.
xml_tree = ElementTree.parse(FILE_IN).getroot()

# Finished <text> elements, one per converted document, for each output set.
trees_teacher = []
trees_student = []

# <text> elements currently being filled.
out_tree_teacher = ElementTree.Element('text')
out_tree_student = ElementTree.Element('text')

# Paragraph and sentence elements currently being filled; None until the
# first "body" / "st1" element has been seen.
paragraph_teacher = None
paragraph_student = None

sentence_teacher = None
sentence_student = None

# ID of the most recently kept "w1"/"w2" word, used to skip an immediately
# following word that carries the same ID.
last_id = None
# True once the current sentence has received at least one word element.
added_words = False
|
|
|
|
def reset_word(el):
    """Turn a source word element into a plain ``w`` element, in place.

    The tag is renamed to ``w`` and the ``msd`` and ``lemma`` attributes are
    removed; every other attribute is left untouched.

    Args:
        el: The element to modify.

    Returns:
        None. The element is mutated in place.
    """
    el.tag = "w"
    # pop() with a default rather than ``del``: a word that lacks one of the
    # attributes must not abort the whole conversion with a KeyError.
    for name in ("msd", "lemma"):
        el.attrib.pop(name, None)
|
|
|
|
# Walk every descendant of the root in document order and distribute the
# elements over two parallel trees:
#   * "body"  starts a new document (a fresh <text> with one <p>),
#   * "st1"   starts a new sentence (<s>) in both trees,
#   * "w1"/"c1" go to the student sentence only,
#   * "w2"/"c2" go to the teacher sentence only,
#   * "w3"/"c3" go to both sentences,
#   * "S"     goes to each sentence unless that sentence already ends in "S".
# A sentence is attached to its paragraph only if it received at least one
# word element (tracked by ``added_words``).
for el in xml_tree.findall(".//*"):
    tag = el.tag

    if tag == "body":
        if paragraph_teacher is not None:
            # Attach the still-pending sentence to the document it belongs to
            # before that document is closed; otherwise the next "st1" would
            # append it to the paragraph of the following document.
            if added_words:
                paragraph_teacher.append(sentence_teacher)
                paragraph_student.append(sentence_student)
                added_words = False

            out_tree_teacher.append(paragraph_teacher)
            out_tree_student.append(paragraph_student)

            trees_teacher.append(out_tree_teacher)
            trees_student.append(out_tree_student)

            out_tree_teacher = ElementTree.Element('text')
            out_tree_student = ElementTree.Element('text')

        paragraph_teacher = ElementTree.Element('p')
        paragraph_student = ElementTree.Element('p')

    elif tag == 'st1':
        # Attach the previous sentence, but only if it contains words.
        if added_words:
            paragraph_teacher.append(sentence_teacher)
            paragraph_student.append(sentence_student)

        sentence_student = ElementTree.Element('s')
        sentence_teacher = ElementTree.Element('s')
        added_words = False

    elif tag in ("w1", "w2", "w3"):
        # Set before the duplicate check below, so a skipped duplicate still
        # marks the sentence as containing words.
        added_words = True

        if tag == "w1":
            # Skip a word whose ID equals that of the previously kept word.
            if last_id == el.attrib[ID]:
                print("REPEAT...")
                continue

            reset_word(el)
            sentence_student.append(el)
            last_id = el.attrib[ID]

        elif tag == "w2":
            if last_id == el.attrib[ID]:
                continue

            reset_word(el)
            sentence_teacher.append(el)
            last_id = el.attrib[ID]

        else:
            # "w3": the same element object is shared by both sentences;
            # ``last_id`` is intentionally left unchanged here.
            reset_word(el)
            sentence_teacher.append(el)
            sentence_student.append(el)

    elif tag == "c1":
        el.tag = "c"
        sentence_student.append(el)

    elif tag == "c2":
        el.tag = "c"
        sentence_teacher.append(el)

    elif tag == "c3":
        el.tag = "c"
        sentence_student.append(el)
        sentence_teacher.append(el)

    elif tag == "S":
        # Never put two "S" elements directly after one another.
        if len(sentence_student) == 0 or sentence_student[-1].tag != 'S':
            sentence_student.append(el)
        if len(sentence_teacher) == 0 or sentence_teacher[-1].tag != 'S':
            sentence_teacher.append(el)

# The input must have contained at least one "body" and one "st1" element.
assert None not in [paragraph_teacher, paragraph_student, sentence_student, sentence_teacher]

# Flush the last sentence (same "has words" rule as inside the loop), each
# into the paragraph of its own tree, then close the last document.
if added_words:
    paragraph_teacher.append(sentence_teacher)
    paragraph_student.append(sentence_student)
out_tree_teacher.append(paragraph_teacher)
out_tree_student.append(paragraph_student)
trees_teacher.append(out_tree_teacher)
trees_student.append(out_tree_student)
|
|
|
|
|
|
# Write every collected <text> element to "<folder>/<index>.xml", wrapped in
# the TEI header (TOP) and a closing </TEI> tag. The target folders
# ("teacher", "student") are expected to exist already.
for folder, trees in (("teacher", trees_teacher), ("student", trees_student)):
    for idx, text_el in enumerate(trees):
        filename = "{}/{}.xml".format(folder, idx)

        # encoding="unicode" yields a str without an XML declaration, so there
        # is nothing to strip before prepending TOP.
        s = ElementTree.tostring(text_el, encoding="unicode", method="xml")
        # Plain textual substitutions on the serialised markup.
        s = s.replace(ID, "xml:id")
        s = re.sub(r"<lb\s*\/>", "", s)
        s = s.replace('°', "●")

        # Write as UTF-8 explicitly: TOP declares UTF-8 and the text can
        # contain non-ASCII characters, so the platform default must not be
        # relied upon.
        with open(filename, "w", encoding="utf-8") as fp:
            print(TOP + "\n" + s + "</TEI>", file=fp)
|
|
|