Initial commit

2018-12-11 17:49:17 +01:00 · 2018-12-11 17:49:17 +01:00 · f67e9f47cf
commit f67e9f47cf
5 changed files with 317 additions and 0 deletions
--- a/README.md
+++ b/README.md
@ -0,0 +1,28 @@
 # SOLAR oznacevanje ucitelj/ucenec
 ## Dodaj IDje
 "Pametno" doda ozbo\_id k vsem besedam v SOLAR xmlu.
 ``` python3 add_ids.py SOLAR.xml SOLAR_ID.xml ```
 ## Loci ucenec/ucitelj
 Loci velik xml v skupek ucenec in skupek ucitelj xml-ov, vsak predstavlja \<text\> element iz vhodnega xml-a. Vse datoteke se izpisejo v mapi `student` in `teacher`.
 ```python3 separate.py SOLAR_ID.xml```
 ## Uporabi izhod tagger-ja
 Denimo da damo mapo `student` skozi taggerja in dobimo izhod v mapi `student-out`. Vsaka mapa ima polno datotek: `0.xml, 1.xml,...`. Zdruzimo ozbo_id informacijo z informacijo taggerja:
 ```python3 tag_ids.py student student-out tags.p```
 To pozenemo najprej za ucenca in nato za ucitelja.
 ## Nazaj v original datoteko
 Sedaj samo poberemo informacije iz `tags.p` in jih damo nazaj v vhodne xml-e. 
 ```python3 merge_back.py tags.p SOLAR_ID.xml SOLAR_OUT.xml```
--- a/add_ids.xml
+++ b/add_ids.xml
@ -0,0 +1,48 @@
 import xml.etree.ElementTree as ElementTree
 import sys
 # Command-line arguments: the XML to read and the path for the ID-annotated copy.
 FILE_IN = sys.argv[1]
 FILE_OUT = sys.argv[2]
 # Read raw bytes so the XML parser takes the encoding from the document's own
 # declaration instead of the platform's locale default.
 with open(FILE_IN, "rb") as fp:
    xml_tree = ElementTree.XML(fp.read())
 SEARCH_FOR = ["u1"]  # not referenced by the code visible in this script
 ID = "ozbo_id" # should be the same in all files!
 out_xml = ElementTree.Element('top')  # not referenced by the code visible in this script
 # Next identifier to hand out; advanced by add_ctr().
 ctr = 0
 def add_ctr(el, previous=False):
    """Store an identifier in ``el.attrib[ID]`` using the module counter.

    When ``previous`` is false the element receives the current value of
    ``ctr`` and the counter is advanced by one.  When ``previous`` is true
    the element receives ``ctr - 1`` (the value handed out most recently)
    and the counter is left unchanged.
    """
    global ctr
    if not previous:
        el.attrib[ID] = str(ctr)
        ctr += 1
        return
    el.attrib[ID] = str(ctr - 1)
 # Placeholder meaning "no earlier w1/w2 seen since the last reset".
 empty = ElementTree.Element('')
 last_w1 = empty
 last_w2 = empty
 # Visit every descendant of the root and number the word elements.
 for el in xml_tree.findall(".//*"):
    if el.tag == "w3":
        add_ctr(el)
    elif el.tag == "w2":
        # A w2 whose text equals the previously seen w2 reuses the most
        # recently issued ID.  The placeholder never counts as a match, so a
        # text-less word (text is None) cannot "repeat" it and end up with an
        # unrelated or negative ID.
        add_ctr(el, last_w2 is not empty and el.text == last_w2.text)
        last_w2 = el
    elif el.tag == "w1":
        # Same rule as for w2, tracked separately for w1 elements.
        add_ctr(el, last_w1 is not empty and el.text == last_w1.text)
        last_w1 = el
    # reset last_w1 last_w2
    elif el.tag == "S":
        last_w1 = empty
        last_w2 = empty
 # tostring() returns bytes when an encoding is given, hence binary mode.
 with open(FILE_OUT, "wb") as fp:
    fp.write(ElementTree.tostring(xml_tree, encoding='utf8', method='xml'))
--- a/merge_back.py
+++ b/merge_back.py
@ -0,0 +1,44 @@
 import xml.etree.ElementTree as ElementTree
 import pickle
 import sys
 import re
 ID = "ozbo_id"
 # Command-line arguments: pickle with {id: (msd, lemma)}, XML to annotate, output path.
 IDS_PICKLE = sys.argv[1]
 IN_XML = sys.argv[2]
 OUT_FILE = sys.argv[3]
 # NOTE(review): pickle.load can execute arbitrary code; only use pickles
 # produced by this pipeline, never files from an untrusted source.
 with open(IDS_PICKLE, "rb") as fp:
    ids_dict = pickle.load(fp)
 # Decode explicitly instead of relying on the locale default; the
 # ID-annotated XML is presumably the UTF-8 output of add_ids -- confirm.
 with open(IN_XML, "r", encoding="utf-8") as fp:
    content = fp.read()
    print("XML read")
 # remove old msd-s and lemma-s
 # The value is matched as "anything up to the closing quote".  The former
 # \S+ was greedy and could run past the closing quote to a later '"' in the
 # same whitespace-free run, deleting element content along the way.
 msd_matcher = r"(msd|lemma)=\"[^\"]*\""
 content = re.sub(msd_matcher, '', content)
 print("removed old msd's lemma's successfully")
 # Each ID attribute is replaced by the msd/lemma pair stored for that ID.
 matcher = r"{} *= *\"?(\d+)\"?".format(ID)
 content_out = []
 prev_end = 0
 for f in re.finditer(matcher, content):
    # Copy the text between the previous match and this one unchanged.
    content_out.append(content[prev_end:f.start()])
    msd, lemma = ids_dict[int(f.group(1))]
    content_out.append(" msd=\"{}\" lemma=\"{}\"".format(msd, lemma))
    prev_end = f.end()
 content_out.append(content[prev_end:])
 print("added msd's lemma's successfully")
 content_out = "".join(content_out)
 # Re-parse the result so malformed output is detected before it is written.
 xml_tree = ElementTree.XML(content_out)
 print("reparsed xml, all good!")
 with open(OUT_FILE, "wb") as fp:
    fp.write(ElementTree.tostring(xml_tree, encoding='utf8'))
--- a/separate.py
+++ b/separate.py
@ -0,0 +1,122 @@
 import xml.etree.ElementTree as ElementTree
 import sys
 import re
 # Command-line argument: the ID-annotated XML to split.
 FILE_IN = sys.argv[1]
 # Header written at the start of every output file.
 TOP = """<?xml version="1.0" encoding="UTF-8" standalone="no"?>
 <TEI xmlns="http://www.tei-c.org/ns/1.0" xml:lang="sl">"""
 ID = "ozbo_id"
 # Read raw bytes so the XML parser takes the encoding from the document's own
 # declaration instead of the platform's locale default.
 with open(FILE_IN, "rb") as fp:
    xml_tree = ElementTree.XML(fp.read())
 # Finished per-document trees, one list per output folder.
 trees_teacher = []
 trees_student = []
 # Trees, paragraphs and sentences currently being filled.
 out_tree_teacher = ElementTree.Element('text')
 out_tree_student = ElementTree.Element('text')
 paragraph_teacher = None
 paragraph_student = None
 sentence_teacher = None
 sentence_student = None
 # ID of the last w1/w2 that was kept; used to skip immediate repeats.
 last_id = None
 # True once the current sentence has received at least one word element.
 added_words = False
 def reset_word(el):
    """Turn a word element into a plain ``w`` without old annotations.

    The element is modified in place: its tag becomes ``"w"`` and the
    ``msd`` and ``lemma`` attributes are removed.  An element that lacks
    either attribute is accepted instead of raising ``KeyError``.
    """
    el.tag = "w"
    for stale in ("msd", "lemma"):
        el.attrib.pop(stale, None)
 # Walk every descendant of the root and route each element into the teacher
 # and/or student sentence currently being built.
 for el in xml_tree.findall(".//*"):
    if el.tag == "body":
        # Every <body> after the first closes the document collected so far.
        if paragraph_teacher is not None:
            if added_words:
                # Flush the sentence still being collected so it stays in the
                # document it came from instead of being attached to the
                # next document's paragraph by the following 'st1'.
                paragraph_teacher.append(sentence_teacher)
                paragraph_student.append(sentence_student)
                sentence_student = ElementTree.Element('s')
                sentence_teacher = ElementTree.Element('s')
                added_words = False
            out_tree_teacher.append(paragraph_teacher)
            out_tree_student.append(paragraph_student)
            trees_teacher.append(out_tree_teacher)
            trees_student.append(out_tree_student)
            out_tree_teacher = ElementTree.Element('text')
            out_tree_student = ElementTree.Element('text')
        paragraph_teacher = ElementTree.Element('p')
        paragraph_student = ElementTree.Element('p')
    elif el.tag == 'st1':
        # Start of a new sentence: keep the previous one only if it has words.
        if added_words:
            paragraph_teacher.append(sentence_teacher)
            paragraph_student.append(sentence_student)
        sentence_student = ElementTree.Element('s')
        sentence_teacher = ElementTree.Element('s')
        added_words = False
    elif el.tag in ["w1", "w2", "w3"]:
        added_words = True
        if el.tag == "w1":
            # w1 goes to the student side; an immediate repeat of the last
            # kept ID is skipped.
            if last_id == el.attrib[ID]:
                print("REPEAT...")
                continue
            reset_word(el)
            sentence_student.append(el)
            last_id = el.attrib[ID]
        elif el.tag == "w2":
            # w2 goes to the teacher side, with the same repeat rule.
            if last_id == el.attrib[ID]:
                continue
            reset_word(el)
            sentence_teacher.append(el)
            last_id = el.attrib[ID]
        else:
            # w3 goes to both sides.
            reset_word(el)
            sentence_teacher.append(el)
            sentence_student.append(el)
    elif el.tag == "c1":
        el.tag = "c"
        sentence_student.append(el)
    elif el.tag == "c2":
        el.tag = "c"
        sentence_teacher.append(el)
    elif el.tag == "c3":
        el.tag = "c"
        sentence_student.append(el)
        sentence_teacher.append(el)
    elif el.tag == "S":
        # Never place two S elements directly after each other.
        if len(sentence_student) == 0 or sentence_student[-1].tag != 'S':
            sentence_student.append(el)
        if len(sentence_teacher) == 0 or sentence_teacher[-1].tag != 'S':
            sentence_teacher.append(el)
 assert(None not in [paragraph_teacher, paragraph_student, sentence_student, sentence_teacher])
 # Close the last sentence, paragraph and document.  The student sentence
 # belongs in the student paragraph (it used to be appended to the teacher's).
 paragraph_teacher.append(sentence_teacher)
 paragraph_student.append(sentence_student)
 out_tree_teacher.append(paragraph_teacher)
 out_tree_student.append(paragraph_student)
 trees_teacher.append(out_tree_teacher)
 trees_student.append(out_tree_student)
 # Write one numbered file per document into the 'teacher' and 'student' folders.
 for folder, tree in [("teacher", trees_teacher), ("student", trees_student)]:
    for idx, xml in enumerate(tree):
        filename =  "{}/{}.xml".format(folder, idx)
        s = ElementTree.tostring(xml, encoding='utf8', method='xml').decode('utf8')
        s = s.replace(ID, "xml:id")
        s = re.sub(r"<lb\s*\/>", "", s)
        s = s.replace('°', "●")
        # Drop the first line (the XML declaration emitted by tostring); TOP
        # supplies the header instead.
        snl = s.find('\n')
        # The header declares UTF-8, so write UTF-8 regardless of the locale.
        with open(filename, "w", encoding="utf-8") as fp:
            print(TOP + s[snl:] + "</TEI>", file=fp)
--- a/tag_ids.py
+++ b/tag_ids.py
@ -0,0 +1,75 @@
 import xml.etree.ElementTree as ElementTree
 import re
 import pathlib
 import sys
 import pickle
 # Positional arguments: folder with ID-carrying XML files, folder with the
 # tagger's output, and the pickle file that receives the result.
 IN_FOLDER_IDS, IN_FOLDER_TAGGED, OUT_FILE = sys.argv[1], sys.argv[2], sys.argv[3]
 # Namespace declaration that resolve_ids() strips before parsing.
 REPLACE = 'xmlns="http://www.tei-c.org/ns/1.0"'
 # File number -> raw content of the ID-carrying file.
 ids_files = dict()
 # Word ID -> (msd, lemma), filled by resolve_ids().
 ids_tags = dict()
 def resolve_ids(id_content, tag_content, num):
    """Pair each ID-carrying word with its tagged counterpart.

    Both documents are parsed after the TEI namespace declaration and every
    ``xml:`` prefix have been stripped from the text.  Their ``w`` elements
    are walked in parallel and, for each pair, ``ids_tags[int(id)]`` is set
    to the ``(msd, lemma)`` tuple read from the tagged word.

    Args:
        id_content: XML text whose ``w`` elements carry an ``id`` attribute
            (``xml:id`` before the prefix is stripped).
        tag_content: XML text whose ``w`` elements carry ``msd`` and ``lemma``.
        num: Number shown in the progress output.

    Raises:
        AssertionError: If the two documents hold a different number of words.
        SystemExit: With status 1 if the text of a word pair differs.
    """
    print("\r", num, end="\t")
    id_tree = ElementTree.XML(id_content.replace(REPLACE, "").replace("xml:", ""))
    tag_tree = ElementTree.XML(tag_content.replace(REPLACE, "").replace("xml:", ""))
    id_words = id_tree.findall(".//w")
    tag_words = tag_tree.findall(".//w")
    if len(id_words) != len(tag_words):
        # Raised explicitly rather than via ``assert`` so the check survives
        # ``python -O``; otherwise zip() below would silently truncate.
        raise AssertionError(
            "word count mismatch in file {}: {} != {}".format(
                num, len(id_words), len(tag_words)))
    for id_w, tag_w in zip(id_words, tag_words):
        if id_w.text != tag_w.text:
            print(id_w.text, tag_w.text)
            # A mismatch is a failure, so signal it with a non-zero status.
            sys.exit(1)
        ids_tags[int(id_w.attrib['id'])] = (tag_w.attrib['msd'], tag_w.attrib['lemma'])
 # Load every ID-carrying file, keyed by the number its file name starts with.
 for filepath in pathlib.Path(IN_FOLDER_IDS).glob("**/*.xml"):
    if not filepath.is_file():
        continue
    # Path.name works with any path separator, unlike splitting on '/'.
    filename = filepath.name
    filenum = int(re.match(r"(\d+).*", filename).groups()[0])
    # Decode explicitly instead of relying on the locale default; the
    # header written by separate.py declares UTF-8.
    with open(str(filepath), "r", encoding="utf-8") as fp:
        ids_files[filenum] = fp.read()
 # Match every tagged file with the ID-carrying file of the same number.
 for filepath in sorted(pathlib.Path(IN_FOLDER_TAGGED).glob("**/*.xml")):
    if not filepath.is_file():
        continue
    filename = filepath.name
    filenum = int(re.match(r"(\d+).*", filename).groups()[0])
    # NOTE(review): assumes the tagger writes UTF-8 -- confirm.
    with open(str(filepath), "r", encoding="utf-8") as fp:
        resolve_ids(ids_files[filenum], fp.read(), filenum)
 # Merge with an existing result file so that successive runs accumulate;
 # entries from this run win.
 old_tags = {}
 if pathlib.Path(OUT_FILE).is_file():
    # NOTE(review): pickle.load can execute arbitrary code; only use pickles
    # produced by this pipeline, never files from an untrusted source.
    with open(OUT_FILE, "rb") as fp:
        old_tags = pickle.load(fp)
 old_tags.update(ids_tags)
 ids_tags = old_tags
 with open(OUT_FILE, "wb") as fp:
    pickle.dump(ids_tags, fp)