"""Split a parallel student/teacher corpus XML file into per-document files.

Usage: run with the path of the input XML as the only argument. For every
``body`` element of the input, one ``<idx>.xml`` file is written to the
``student/`` folder and one to the ``teacher/`` folder.
"""
import os
import re
import sys
import xml.etree.ElementTree as ElementTree

# NOTE(review): as written this header is a single space; presumably it was
# meant to hold the text that precedes every output document -- confirm.
TOP = """ """
# Attribute of the input word elements that is renamed to ``xml:id`` on output.
ID = "ozbo_id"


def reset_word(el):
    """Turn a ``w1``/``w2``/``w3`` element into a plain ``w`` element in place.

    The ``msd`` and ``lemma`` attributes are removed.

    Raises:
        KeyError: if *el* lacks an ``msd`` or a ``lemma`` attribute.
    """
    el.tag = "w"
    del el.attrib["msd"]
    del el.attrib["lemma"]


def _as_text(paragraph):
    """Return a new ``text`` element whose only child is *paragraph*."""
    text = ElementTree.Element('text')
    text.append(paragraph)
    return text


def split_corpus(xml_tree):
    """Split *xml_tree* into parallel teacher and student documents.

    All descendants of *xml_tree* are visited in document order:

    * ``body`` starts a new document (one ``text`` > ``p`` per side),
    * ``st1`` starts a new sentence (``s``) on both sides,
    * ``w1``/``c1`` go to the student side, ``w2``/``c2`` to the teacher
      side and ``w3``/``c3`` to both; words are renamed to ``w`` and
      punctuation to ``c``,
    * ``S`` is added to a side unless the previous token there is also ``S``.

    A ``w1``/``w2`` whose ID equals the ID of the previously accepted
    ``w1``/``w2`` is skipped. A sentence is only kept at an ``st1`` or
    ``body`` boundary if at least one word element was seen in it.

    Returns:
        A ``(trees_teacher, trees_student)`` tuple of lists of ``text``
        elements, one entry per ``body`` of the input.

    Raises:
        ValueError: if the input has no ``body`` or no ``st1`` element.
    """
    trees_teacher = []
    trees_student = []
    paragraph_teacher = paragraph_student = None
    sentence_teacher = sentence_student = None
    last_id = None
    added_words = False

    for el in xml_tree.findall(".//*"):
        tag = el.tag
        if tag == "body":
            if paragraph_teacher is not None:
                if added_words:
                    # Flush the pending sentence into the document it belongs
                    # to; otherwise the next st1 would add it to the new one.
                    paragraph_teacher.append(sentence_teacher)
                    paragraph_student.append(sentence_student)
                    sentence_teacher = ElementTree.Element('s')
                    sentence_student = ElementTree.Element('s')
                    added_words = False
                trees_teacher.append(_as_text(paragraph_teacher))
                trees_student.append(_as_text(paragraph_student))
            paragraph_teacher = ElementTree.Element('p')
            paragraph_student = ElementTree.Element('p')
        elif tag == 'st1':
            if added_words:
                paragraph_teacher.append(sentence_teacher)
                paragraph_student.append(sentence_student)
            sentence_student = ElementTree.Element('s')
            sentence_teacher = ElementTree.Element('s')
            added_words = False
        elif tag in ("w1", "w2", "w3"):
            added_words = True
            if tag == "w1":
                if last_id == el.attrib[ID]:
                    print("REPEAT...")
                    continue
                reset_word(el)
                sentence_student.append(el)
                last_id = el.attrib[ID]
            elif tag == "w2":
                if last_id == el.attrib[ID]:
                    continue
                reset_word(el)
                sentence_teacher.append(el)
                last_id = el.attrib[ID]
            else:
                reset_word(el)
                sentence_teacher.append(el)
                sentence_student.append(el)
        elif tag == "c1":
            el.tag = "c"
            sentence_student.append(el)
        elif tag == "c2":
            el.tag = "c"
            sentence_teacher.append(el)
        elif tag == "c3":
            el.tag = "c"
            sentence_student.append(el)
            sentence_teacher.append(el)
        elif tag == "S":
            # Never emit two consecutive S elements on the same side.
            if len(sentence_student) == 0 or sentence_student[-1].tag != 'S':
                sentence_student.append(el)
            if len(sentence_teacher) == 0 or sentence_teacher[-1].tag != 'S':
                sentence_teacher.append(el)

    # Explicit check instead of ``assert`` so it survives ``python -O``.
    if paragraph_teacher is None or sentence_teacher is None:
        raise ValueError("input contains no body/st1 elements to split")

    # Close the last sentence and the last document. Each sentence goes to
    # the paragraph of its own side.
    paragraph_teacher.append(sentence_teacher)
    paragraph_student.append(sentence_student)
    trees_teacher.append(_as_text(paragraph_teacher))
    trees_student.append(_as_text(paragraph_student))
    return trees_teacher, trees_student


def render_document(xml):
    """Serialise one ``text`` element to the string written to its file.

    The ID attribute name is replaced by ``xml:id``, ``°`` by ``●``, the XML
    declaration line is dropped and ``TOP`` is put in front.
    """
    # With encoding='utf8' the serialiser starts with an XML declaration
    # line; everything from the first newline on is the document itself.
    s = ElementTree.tostring(xml, encoding='utf8', method='xml').decode('utf8')
    s = s.replace(ID, "xml:id")
    # NOTE(review): the pattern is empty, so this substitution returns s
    # unchanged; it looks like the pattern lost its content -- confirm.
    s = re.sub(r"", "", s)
    s = s.replace('°', "●")
    snl = s.find('\n')
    return TOP + s[snl:] + ""


def main(argv=None):
    """Read the input file named by the first argument and write all documents.

    *argv* defaults to ``sys.argv[1:]``; arguments after the first are ignored.
    """
    args = sys.argv[1:] if argv is None else argv
    if not args:
        sys.exit("usage: <script> FILE_IN")
    file_in = args[0]

    # parse() decodes the file according to its XML declaration instead of
    # the locale's default encoding.
    xml_tree = ElementTree.parse(file_in).getroot()
    trees_teacher, trees_student = split_corpus(xml_tree)

    for folder, trees in (("teacher", trees_teacher), ("student", trees_student)):
        os.makedirs(folder, exist_ok=True)
        for idx, xml in enumerate(trees):
            filename = "{}/{}.xml".format(folder, idx)
            # Explicit UTF-8: the output contains non-ASCII characters.
            with open(filename, "w", encoding="utf-8") as fp:
                print(render_document(xml), file=fp)


if __name__ == "__main__":
    main()