Solar-oznacevanje-zacasno/separate.py
2018-12-11 17:49:17 +01:00

123 lines
3.5 KiB
Python

import xml.etree.ElementTree as ElementTree
import sys
import re
# Path of the combined input XML file, taken from the first CLI argument.
FILE_IN = sys.argv[1]

# Header written at the start of every output file; the matching closing
# "</TEI>" tag is added when the file is written.
TOP = """<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<TEI xmlns="http://www.tei-c.org/ns/1.0" xml:lang="sl">"""

# Name of the attribute that carries a word's identifier in the input.
ID = "ozbo_id"

# Read the raw bytes and let the XML parser decode them (XML declaration,
# or UTF-8 by default).  Opening the file in text mode would decode it with
# the locale's default encoding, which corrupts non-ASCII text on systems
# whose locale is not UTF-8.
with open(FILE_IN, "rb") as fp:
    xml_tree = ElementTree.XML(fp.read())

# Finished <text> trees, one per document, for each of the two outputs.
trees_teacher = []
trees_student = []

# <text> trees currently being filled.
out_tree_teacher = ElementTree.Element('text')
out_tree_student = ElementTree.Element('text')

# Open paragraph and sentence of each output; None until the first
# "body" / "st1" element has been seen.
paragraph_teacher = None
paragraph_student = None
sentence_teacher = None
sentence_student = None

# Identifier of the most recently kept w1/w2 word (used to skip repeats).
last_id = None

# True once the open sentence has received at least one word element.
added_words = False
def reset_word(el):
    """Turn a source word element into a plain ``<w>`` element, in place.

    The tag is renamed to ``w`` and the ``msd`` and ``lemma`` attributes are
    removed.  All other attributes are left untouched.

    Args:
        el: The element to normalise; it is modified in place.

    Returns:
        None.
    """
    el.tag = "w"
    # pop() rather than del: a word that carries no "msd" or "lemma"
    # attribute is already in the desired state and must not raise KeyError.
    for name in ("msd", "lemma"):
        el.attrib.pop(name, None)
# Walk every descendant element in document order and route it to the
# student output, the teacher output, or both, depending on its tag:
#   body      -> starts a new document (closing the previous one, if any)
#   st1       -> starts a new sentence
#   w1/c1     -> student only
#   w2/c2     -> teacher only
#   w3/c3     -> both outputs (the same element object is shared)
#   S         -> both outputs, but never twice in a row within a sentence
for el in xml_tree.findall(".//*"):
    tag = el.tag
    if tag == "body":
        if paragraph_teacher is not None:
            # Flush the sentence that is still open into the paragraph of
            # the document being closed.  Without this, the next "st1"
            # would attach it to the *new* document's paragraph, leaking
            # the last sentence of one document into the next.
            if added_words:
                paragraph_teacher.append(sentence_teacher)
                paragraph_student.append(sentence_student)
            out_tree_teacher.append(paragraph_teacher)
            out_tree_student.append(paragraph_student)
            trees_teacher.append(out_tree_teacher)
            trees_student.append(out_tree_student)
            out_tree_teacher = ElementTree.Element('text')
            out_tree_student = ElementTree.Element('text')
        paragraph_teacher = ElementTree.Element('p')
        paragraph_student = ElementTree.Element('p')
        # Every document starts with clean sentence state so that nothing
        # from a previous document can be appended a second time.
        sentence_teacher = ElementTree.Element('s')
        sentence_student = ElementTree.Element('s')
        added_words = False
    elif tag == 'st1':
        # Only sentences that received at least one word are kept.
        if added_words:
            paragraph_teacher.append(sentence_teacher)
            paragraph_student.append(sentence_student)
        sentence_student = ElementTree.Element('s')
        sentence_teacher = ElementTree.Element('s')
        added_words = False
    elif tag in ("w1", "w2", "w3"):
        added_words = True
        if tag == "w1":
            # Skip a word whose id equals that of the previously kept word.
            if last_id == el.attrib[ID]:
                print("REPEAT...")
                continue
            reset_word(el)
            sentence_student.append(el)
            last_id = el.attrib[ID]
        elif tag == "w2":
            if last_id == el.attrib[ID]:
                continue
            reset_word(el)
            sentence_teacher.append(el)
            last_id = el.attrib[ID]
        else:
            reset_word(el)
            sentence_teacher.append(el)
            sentence_student.append(el)
    elif tag == "c1":
        el.tag = "c"
        sentence_student.append(el)
    elif tag == "c2":
        el.tag = "c"
        sentence_teacher.append(el)
    elif tag == "c3":
        el.tag = "c"
        sentence_student.append(el)
        sentence_teacher.append(el)
    elif tag == "S":
        # len() is used deliberately: testing the truth value of an Element
        # is deprecated in ElementTree.
        if len(sentence_student) == 0 or sentence_student[-1].tag != 'S':
            sentence_student.append(el)
        if len(sentence_teacher) == 0 or sentence_teacher[-1].tag != 'S':
            sentence_teacher.append(el)
# Close the last document: the loop only closes a sentence, paragraph or
# document when it meets the *next* one, so whatever is still open at the end
# of the input has to be flushed here.
if any(
    part is None
    for part in (paragraph_teacher, paragraph_student,
                 sentence_student, sentence_teacher)
):
    # Explicit check instead of assert, which is stripped under "python -O".
    raise ValueError(
        "input contains no paragraph or sentence to write "
        "(missing 'body' or 'st1' element)"
    )
paragraph_teacher.append(sentence_teacher)
# The student sentence belongs in the student paragraph; it was previously
# appended to the teacher paragraph, which put it in the wrong output file
# and dropped it from the student one.
paragraph_student.append(sentence_student)
out_tree_teacher.append(paragraph_teacher)
out_tree_student.append(paragraph_student)
trees_teacher.append(out_tree_teacher)
trees_student.append(out_tree_student)
# Serialise every collected <text> tree to "<folder>/<index>.xml".
# The "teacher" and "student" directories are not created here, so they
# must already exist.
for folder, trees in (("teacher", trees_teacher), ("student", trees_student)):
    for idx, text_el in enumerate(trees):
        filename = "{}/{}.xml".format(folder, idx)
        s = ElementTree.tostring(text_el, encoding='utf8', method='xml').decode('utf8')
        # Rename the source id attribute to the TEI one.
        s = s.replace(ID, "xml:id")
        # Drop line-break elements and degree signs from the output.
        s = re.sub(r"<lb\s*\/>", "", s)
        s = s.replace('°', "")
        # tostring() puts its own XML declaration on the first line; cut it
        # off at the first newline because TOP supplies the header instead.
        snl = s.find('\n')
        # TOP declares UTF-8, so write UTF-8 explicitly rather than relying
        # on the locale's default encoding.
        with open(filename, "w", encoding="utf-8") as fp:
            print(TOP + s[snl:] + "</TEI>", file=fp)