commit f67e9f47cf52d0802fe0b1a4ea7a1245c0f67640
Author: Ozbolt Menegatti
Date:   Tue Dec 11 17:49:17 2018 +0100

    Initial commit

diff --git a/README.md b/README.md
new file mode 100644
index 0000000..ce18b23
--- /dev/null
+++ b/README.md
@@ -0,0 +1,28 @@
+# SOLAR oznacevanje ucitelj/ucenec
+
+## Dodaj IDje
+
+"Pametno" doda ozbo\_id k vsem besedam v SOLAR xmlu.
+
+``` python3 add_ids.py SOLAR.xml SOLAR_ID.xml ```
+
+## Loci ucenec/ucitelj
+
+Loci velik xml v skupek ucenec in skupek ucitelj xml-ov, vsak predstavlja \<body\> element iz vhodnega xml-a. Vse datoteke se izpisejo v mapi `student` in `teacher`.
+
+```python3 separate.py SOLAR_ID.xml```
+
+## Uporabi izhod tagger-ja
+
+Denimo da damo mapo `student` skozi taggerja in dobimo izhod v mapi `student-out`. Vsaka mapa ima polno datotek: `0.xml, 1.xml,...`. Zdruzimo ozbo_id informacijo z informacijo taggerja:
+
+```python3 tag_ids.py student student-out tags.p```
+
+To pozenemo prvo za ucenca in potem za ucitelja.
+
+## Nazaj v original datoteko
+
+Sedaj samo poberemo informacije iz `tags.p` in jih damo nazaj v vhodne xml-e.
+
+```python3 merge_back.py tags.p SOLAR_ID.xml SOLAR_OUT.xml```
+
diff --git a/add_ids.xml b/add_ids.xml
new file mode 100644
index 0000000..e4b0d15
--- /dev/null
+++ b/add_ids.xml
@@ -0,0 +1,60 @@
+"""Give every w1/w2/w3 element of a SOLAR XML file an ``ozbo_id`` attribute.
+
+Command line: INPUT_XML OUTPUT_XML.
+"""
+import xml.etree.ElementTree as ElementTree
+import sys
+
+FILE_IN = sys.argv[1]
+FILE_OUT = sys.argv[2]
+
+# Parse from the path so the XML parser decodes the bytes itself instead of
+# relying on the locale's default text encoding.
+xml_tree = ElementTree.parse(FILE_IN).getroot()
+
+SEARCH_FOR = ["u1"]  # NOTE(review): not referenced anywhere in this script
+ID = "ozbo_id" # should be the same in all files!
+
+ctr = 0
+
+
+def add_ctr(el, previous=False):
+    """Set the id of *el*: the last issued id if *previous*, else a new one."""
+    global ctr
+    if previous:
+        el.attrib[ID] = str(ctr - 1)
+    else:
+        el.attrib[ID] = str(ctr)
+        ctr += 1
+
+
+def repeats(el, last):
+    """Return True if *el* has the same text as *last* (None means no word yet)."""
+    # Without the None guard a text-less word following a reset would count
+    # as a repeat and receive id ``ctr - 1`` ("-1" for the very first word).
+    return last is not None and el.text == last.text
+
+
+last_w1 = None
+last_w2 = None
+
+for el in xml_tree.findall(".//*"):
+    if el.tag == "w3":
+        add_ctr(el)
+
+    elif el.tag == "w2":
+        add_ctr(el, repeats(el, last_w2))
+        last_w2 = el
+
+    elif el.tag == "w1":
+        add_ctr(el, repeats(el, last_w1))
+        last_w1 = el
+
+    # an "S" element makes both trackers forget their last word
+    elif el.tag == "S":
+        last_w1 = None
+        last_w2 = None
+
+
+with open(FILE_OUT, "wb") as fp:
+    fp.write(ElementTree.tostring(xml_tree, encoding='utf8', method='xml'))
diff --git a/merge_back.py b/merge_back.py
new file mode 100644
index 0000000..e54b4f8
--- /dev/null
+++ b/merge_back.py
@@ -0,0 +1,51 @@
+"""Write tagger output (msd, lemma) back into the id-annotated XML.
+
+Command line: TAGS_PICKLE INPUT_XML OUTPUT_XML.
+"""
+import xml.etree.ElementTree as ElementTree
+from xml.sax.saxutils import quoteattr
+import pickle
+import sys
+import re
+
+
+ID = "ozbo_id"
+
+IDS_PICKLE = sys.argv[1]
+IN_XML = sys.argv[2]
+OUT_FILE = sys.argv[3]
+
+
+# NOTE: pickle.load can execute arbitrary code; only use pickles you created.
+with open(IDS_PICKLE, "rb") as fp:
+    ids_dict = pickle.load(fp)
+
+with open(IN_XML, "r", encoding="utf-8") as fp:
+    content = fp.read()
+print("XML read")
+
+# remove old msd-s and lemma-s; [^"]* stays inside one attribute value, so
+# values with spaces are removed too and a match can never run past the
+# closing quote into following markup
+msd_matcher = r"\b(?:msd|lemma)=\"[^\"]*\""
+content = re.sub(msd_matcher, '', content)
+print("removed old msd's lemma's successfully")
+
+matcher = r"{} *= *\"?(\d+)\"?".format(ID)
+
+
+def tag_attributes(match):
+    """Return the msd/lemma attributes that replace one matched id attribute."""
+    msd, lemma = ids_dict[int(match.group(1))]
+    # quoteattr escapes markup characters and quotes, keeping the XML well-formed
+    return " msd={} lemma={}".format(quoteattr(msd), quoteattr(lemma))
+
+
+content_out = re.sub(matcher, tag_attributes, content)
+print("added msd's lemma's successfully")
+
+xml_tree = ElementTree.XML(content_out)
+print("reparsed xml, all good!")
+
+with open(OUT_FILE, "wb") as fp:
+    fp.write(ElementTree.tostring(xml_tree, encoding='utf8'))
diff --git a/separate.py b/separate.py
new file mode 100644
index
0000000..a2ba7d9
--- /dev/null
+++ b/separate.py
@@ -0,0 +1,154 @@
+"""Split an id-annotated SOLAR XML into student and teacher files.
+
+Command line: INPUT_XML.  Each "body" element of the input yields one file
+in ``student/`` and one in ``teacher/`` (0.xml, 1.xml, ...).
+"""
+import xml.etree.ElementTree as ElementTree
+import pathlib
+import sys
+import re
+
+FILE_IN = sys.argv[1]
+
+# NOTE(review): as written TOP is a single newline - confirm that no header
+# text is missing here.
+TOP = """
+"""
+
+ID = "ozbo_id"
+
+# Parse from the path so the XML parser decodes the bytes itself instead of
+# relying on the locale's default text encoding.
+xml_tree = ElementTree.parse(FILE_IN).getroot()
+
+trees_teacher = []
+trees_student = []
+
+out_tree_teacher = ElementTree.Element('text')
+out_tree_student = ElementTree.Element('text')
+
+paragraph_teacher = None
+paragraph_student = None
+
+sentence_teacher = None
+sentence_student = None
+
+last_id = None
+added_words = False
+
+
+def reset_word(el):
+    """Rename *el* to ``w`` and strip its msd/lemma attributes."""
+    el.tag = "w"
+    # pop() rather than del: a word that lacks an attribute is not an error
+    el.attrib.pop("msd", None)
+    el.attrib.pop("lemma", None)
+
+
+def flush_sentence():
+    """Store the sentences being built in their paragraphs if they hold words."""
+    global added_words
+    if added_words:
+        paragraph_teacher.append(sentence_teacher)
+        paragraph_student.append(sentence_student)
+    added_words = False
+
+
+for el in xml_tree.findall(".//*"):
+    if el.tag == "body":
+        if paragraph_teacher is not None:
+            # The last sentence of the previous body is still pending; store
+            # it now, otherwise it would land in the new body's paragraph.
+            flush_sentence()
+            out_tree_teacher.append(paragraph_teacher)
+            out_tree_student.append(paragraph_student)
+
+            trees_teacher.append(out_tree_teacher)
+            trees_student.append(out_tree_student)
+
+            out_tree_teacher = ElementTree.Element('text')
+            out_tree_student = ElementTree.Element('text')
+
+            # never keep adding to a sentence that was already stored
+            sentence_student = ElementTree.Element('s')
+            sentence_teacher = ElementTree.Element('s')
+
+        paragraph_teacher = ElementTree.Element('p')
+        paragraph_student = ElementTree.Element('p')
+
+    elif el.tag == 'st1':
+        flush_sentence()
+        sentence_student = ElementTree.Element('s')
+        sentence_teacher = ElementTree.Element('s')
+
+    elif el.tag in ["w1", "w2", "w3"]:
+        added_words = True
+
+        if el.tag == "w1":
+            if last_id == el.attrib[ID]:
+                print("REPEAT...")
+                continue
+
+            reset_word(el)
+            sentence_student.append(el)
+            last_id = el.attrib[ID]
+
+        elif el.tag == "w2":
+            if last_id == el.attrib[ID]:
+                continue
+
+            reset_word(el)
+            sentence_teacher.append(el)
+            last_id = el.attrib[ID]
+        else:
+            # w3 goes into both the teacher and the student sentence
+            reset_word(el)
+            sentence_teacher.append(el)
+            sentence_student.append(el)
+
+    elif el.tag == "c1":
+        el.tag = "c"
+        sentence_student.append(el)
+    elif el.tag == "c2":
+        el.tag = "c"
+        sentence_teacher.append(el)
+    elif el.tag == "c3":
+        el.tag = "c"
+        sentence_student.append(el)
+        sentence_teacher.append(el)
+
+    elif el.tag == "S":
+        # skip an "S" that would directly follow another "S"
+        if len(sentence_student) == 0 or sentence_student[-1].tag != 'S':
+            sentence_student.append(el)
+        if len(sentence_teacher) == 0 or sentence_teacher[-1].tag != 'S':
+            sentence_teacher.append(el)
+
+if paragraph_teacher is None or sentence_teacher is None:
+    # an explicit check instead of assert, which is skipped under -O
+    sys.exit("no body/st1 element found in {}".format(FILE_IN))
+
+flush_sentence()
+out_tree_teacher.append(paragraph_teacher)
+out_tree_student.append(paragraph_student)
+trees_teacher.append(out_tree_teacher)
+trees_student.append(out_tree_student)
+
+
+for folder, trees in [("teacher", trees_teacher), ("student", trees_student)]:
+    # the output folders may not exist yet
+    pathlib.Path(folder).mkdir(exist_ok=True)
+    for idx, xml in enumerate(trees):
+        filename = "{}/{}.xml".format(folder, idx)
+
+        s = ElementTree.tostring(xml, encoding='utf8', method='xml').decode('utf8')
+        s = s.replace(ID, "xml:id")
+        # NOTE(review): the pattern is empty, so this call leaves s unchanged;
+        # confirm which pattern was meant.
+        s = re.sub(r"", "", s)
+        s = s.replace('°', "●")
+
+        # keep everything from the first newline on (drops the first line)
+        snl = s.find('\n')
+        with open(filename, "w", encoding="utf-8") as fp:
+            print(TOP + s[snl:] + "", file=fp)
diff --git a/tag_ids.py b/tag_ids.py
new file mode 100644
index 0000000..73cb1cb
--- /dev/null
+++ b/tag_ids.py
@@ -0,0 +1,92 @@
+"""Collect (msd, lemma) for every word id from the tagger's output.
+
+Command line: IDS_FOLDER TAGGED_FOLDER OUT_PICKLE.
+Results are merged into the pickle if it already exists.
+"""
+import xml.etree.ElementTree as ElementTree
+import re
+import pathlib
+import sys
+import pickle
+
+IN_FOLDER_IDS = sys.argv[1]
+IN_FOLDER_TAGGED = sys.argv[2]
+OUT_FILE = sys.argv[3]
+
+REPLACE = "xmlns=\"http://www.tei-c.org/ns/1.0\""
+
+ids_files = {}
+ids_tags = {}
+
+
+def parse_plain(content):
+    """Parse *content* after removing the TEI namespace and ``xml:`` prefixes."""
+    return ElementTree.XML(content.replace(REPLACE, "").replace("xml:", ""))
+
+
+def resolve_ids(id_content, tag_content, num):
+    """Record (msd, lemma) of each tagged word under the id of its counterpart.
+
+    id_content  -- XML text whose ``w`` elements carry the ids
+    tag_content -- XML text with the same words carrying msd and lemma
+    num         -- file number, used for progress and error messages
+
+    Exits the program with an error when the two files do not line up.
+    """
+    print("\r", num, end="\t")
+    id_words = parse_plain(id_content).findall(".//w")
+    tag_words = parse_plain(tag_content).findall(".//w")
+
+    # explicit check instead of assert, which is skipped under -O
+    if len(id_words) != len(tag_words):
+        sys.exit("file {}: {} words with ids but {} tagged words".format(
+            num, len(id_words), len(tag_words)))
+
+    for id_w, tag_w in zip(id_words, tag_words):
+        if id_w.text != tag_w.text:
+            # a mismatch is a failure: report it and exit with a non-zero status
+            sys.exit("file {}: word mismatch {!r} != {!r}".format(
+                num, id_w.text, tag_w.text))
+
+        ids_tags[int(id_w.attrib['id'])] = (tag_w.attrib['msd'], tag_w.attrib['lemma'])
+
+
+def file_number(filepath):
+    """Return the leading number of the file name, e.g. 12 for ``12.xml``."""
+    found = re.match(r"\d+", filepath.name)
+    if found is None:
+        sys.exit("{}: file name does not start with a number".format(filepath))
+    return int(found.group())
+
+
+for filepath in pathlib.Path(IN_FOLDER_IDS).glob("**/*.xml"):
+    if not filepath.is_file():
+        continue
+
+    with open(str(filepath), "r", encoding="utf-8") as fp:
+        ids_files[file_number(filepath)] = fp.read()
+
+
+for filepath in sorted(pathlib.Path(IN_FOLDER_TAGGED).glob("**/*.xml")):
+    if not filepath.is_file():
+        continue
+
+    filenum = file_number(filepath)
+    with open(str(filepath), "r", encoding="utf-8") as fp:
+        resolve_ids(ids_files[filenum], fp.read(), filenum)
+
+
+old_tags = {}
+if pathlib.Path(OUT_FILE).is_file():
+    # NOTE: pickle.load can execute arbitrary code; only use pickles you created.
+    with open(OUT_FILE, "rb") as fp:
+        old_tags = pickle.load(fp)
+
+
+# freshly collected tags take precedence over the ones already stored
+old_tags.update(ids_tags)
+ids_tags = old_tags
+
+
+with open(OUT_FILE, "wb") as fp:
+    pickle.dump(ids_tags, fp)