Initial commit

2018-12-11 17:49:17 +01:00 · 2018-12-11 17:49:17 +01:00 · f67e9f47cf
commit f67e9f47cf
5 changed files with 317 additions and 0 deletions
--- a/README.md
+++ b/README.md
@ -0,0 +1,28 @@
+# SOLAR oznacevanje ucitelj/ucenec
+
+## Dodaj IDje
+
+"Pametno" doda ozbo\_id k vsem besedam v SOLAR xmlu.
+
+``` python3 add_ids.py SOLAR.xml SOLAR_ID.xml ```
+
+## Loci ucenec/ucitelj
+
+Loci velik xml v skupek ucenec in skupek ucitelj xml-ov, vsak predstavlja \<text\> element iz vhodnega xml-a. Vse datoteke se izpisejo v mapi `student` in `teacher`.
+
+```python3 separate.py SOLAR_ID.xml```
+
+## Uporabi izhod tagger-ja
+
+Denimo da damo mapo `student` skozi taggerja in dobimo izhod v mapi `student-out`. Vsaka mapa ima polno datotek: `0.xml, 1.xml,...`. Zdruzimo ozbo_id informacijo z informacijo taggerja:
+
+```python3 tag_ids.py student student-out tags.p```
+
+To pozenemo prvo za ucenca in potem za ucitelja.
+
+## Nazaj v original datoteko
+
+Sedaj samo poberemo informacije iz `tags.p` in jih damo nazaj v vhodne xml-e. 
+
+```python3 merge_back.py tags.p SOLAR_ID.xml SOLAR_OUT.xml```
+
--- a/add_ids.xml
+++ b/add_ids.xml
@ -0,0 +1,48 @@
+import xml.etree.ElementTree as ElementTree
+import sys
+
+FILE_IN = sys.argv[1]
+FILE_OUT = sys.argv[2]
+
+with open(FILE_IN, "r") as fp:
+    xml_tree = ElementTree.XML(fp.read())
+
+SEARCH_FOR = ["u1"]
+ID = "ozbo_id" # should be the same in all files!
+
+out_xml = ElementTree.Element('top')
+
+ctr = 0
+def add_ctr(el, previous=False):
+    global ctr
+    if previous:
+        el.attrib[ID] = str(ctr - 1)
+    else:
+        el.attrib[ID] = str(ctr)
+        ctr += 1
+
+empty = ElementTree.Element('')
+last_w1 = empty
+last_w2 = empty
+
+for idx, el in enumerate(xml_tree.findall(".//*")):
+    if el.tag == "w3":
+        add_ctr(el)
+        
+    elif el.tag == "w2":
+        add_ctr(el, el.text == last_w2.text)
+        last_w2 = el
+    
+    elif el.tag == "w1":
+        add_ctr(el, el.text == last_w1.text)
+        last_w1 = el
+        
+    # reset last_w1 lastw2
+    elif el.tag == "S":
+        last_w1 = empty
+        last_w2 = empty
+
+
+with open(FILE_OUT, "wb") as fp:
+    fp.write(ElementTree.tostring(xml_tree, encoding='utf8', method='xml'))
+
--- a/merge_back.py
+++ b/merge_back.py
@ -0,0 +1,44 @@
+import xml.etree.ElementTree as ElementTree
+import pickle
+import sys
+import re
+
+
+ID = "ozbo_id"
+
+IDS_PICKLE = sys.argv[1]
+IN_XML = sys.argv[2]
+OUT_FILE = sys.argv[3]
+
+
+with open(IDS_PICKLE, "rb") as fp:
+    ids_dict = pickle.load(fp)
+
+with open(IN_XML, "r") as fp:
+    content = fp.read()
+    print("XML read")
+
+# remove old msd-s ane lemma-s
+msd_matcher = r"(msd|lemma)=\"\S+\""
+content = re.sub(msd_matcher, '', content)
+print("removed old msd's lemma's successfully")
+
+matcher = r"{} *= *\"?(\d+)\"?".format(ID)
+content_out = []
+prev_end = 0
+
+for f in re.finditer(matcher, content):
+    content_out.append(content[prev_end:f.start()])
+    msd, lemma = ids_dict[int(f.groups()[0])]
+    content_out.append(" msd=\"{}\" lemma=\"{}\"".format(msd, lemma))
+    prev_end = f.end()
+
+content_out.append(content[prev_end:])
+print("added msd's lemma's successfully")
+
+content_out = "".join(content_out)
+xml_tree = ElementTree.XML(content_out)
+print("reparsed xml, all good!")
+
+with open(OUT_FILE, "wb") as fp:
+    fp.write(ElementTree.tostring(xml_tree, encoding='utf8'))
--- a/separate.py
+++ b/separate.py
@ -0,0 +1,122 @@
+import xml.etree.ElementTree as ElementTree
+import sys
+import re
+
+FILE_IN = sys.argv[1]
+
+TOP = """<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<TEI xmlns="http://www.tei-c.org/ns/1.0" xml:lang="sl">"""
+
+ID = "ozbo_id"
+
+with open(FILE_IN, "r") as fp:
+    xml_tree = ElementTree.XML(fp.read())
+   
+trees_teacher = []
+trees_student = []
+
+out_tree_teacher = ElementTree.Element('text')
+out_tree_student = ElementTree.Element('text')
+
+paragraph_teacher = None
+paragraph_student = None
+
+sentence_teacher = None
+sentence_student = None
+
+last_id = None
+added_words = False
+
+def reset_word(el):
+    el.tag = "w"
+    del el.attrib["msd"]
+    del el.attrib["lemma"]
+
+for idx, el in enumerate(xml_tree.findall(".//*")):
+    if el.tag == "body":
+        if paragraph_teacher is not None:
+            out_tree_teacher.append(paragraph_teacher)
+            out_tree_student.append(paragraph_student)
+
+            trees_teacher.append(out_tree_teacher)
+            trees_student.append(out_tree_student)
+
+            out_tree_teacher = ElementTree.Element('text')
+            out_tree_student = ElementTree.Element('text')
+
+        paragraph_teacher = ElementTree.Element('p')
+        paragraph_student = ElementTree.Element('p')
+    
+    elif el.tag == 'st1':
+        if added_words:
+            paragraph_teacher.append(sentence_teacher)
+            paragraph_student.append(sentence_student)
+            
+        sentence_student = ElementTree.Element('s')
+        sentence_teacher = ElementTree.Element('s')
+        added_words = False
+    
+    elif el.tag in ["w1", "w2", "w3"]:
+        added_words = True
+        
+        if el.tag == "w1":
+            if last_id == el.attrib[ID]:
+                print("REPEAT...")
+                continue
+            
+            reset_word(el)
+            sentence_student.append(el)
+            last_id = el.attrib[ID]
+        
+        elif el.tag == "w2":
+            if last_id == el.attrib[ID]:
+                continue
+            
+            reset_word(el)
+            sentence_teacher.append(el)
+            last_id = el.attrib[ID]
+        else:
+            reset_word(el)
+            sentence_teacher.append(el)
+            sentence_student.append(el)
+    
+    elif el.tag == "c1":
+        el.tag = "c"
+        sentence_student.append(el)
+    elif el.tag == "c2":
+        el.tag = "c"
+        sentence_teacher.append(el)
+    elif el.tag == "c3":
+        el.tag = "c"
+        sentence_student.append(el)
+        sentence_teacher.append(el)
+        
+    elif el.tag == "S":
+        if len(sentence_student) == 0 or sentence_student[-1].tag != 'S':
+            sentence_student.append(el)
+        if len(sentence_teacher) == 0 or sentence_teacher[-1].tag != 'S':
+            sentence_teacher.append(el)
+        
+assert(None not in [paragraph_teacher, paragraph_student, sentence_student, sentence_teacher])
+
+paragraph_teacher.append(sentence_teacher)
+paragraph_teacher.append(sentence_student)
+out_tree_teacher.append(paragraph_teacher)
+out_tree_student.append(paragraph_student)
+trees_teacher.append(out_tree_teacher)
+trees_student.append(out_tree_student)
+
+
+for folder, tree in [("teacher", trees_teacher), ("student", trees_student)]:
+    for idx, xml in enumerate(tree):
+        filename =  "{}/{}.xml".format(folder, idx)
+    
+        s = ElementTree.tostring(xml, encoding='utf8', method='xml').decode('utf8')
+        s = s.replace(ID, "xml:id")
+        s = re.sub(r"<lb\s*\/>", "", s)
+        s = s.replace('°', "●")
+
+        snl = s.find('\n')
+        with open(filename, "w") as fp:
+            print(TOP + s[snl:] + "</TEI>", file=fp)
+
--- a/tag_ids.py
+++ b/tag_ids.py
@ -0,0 +1,75 @@
+import xml.etree.ElementTree as ElementTree
+import re
+import pathlib
+import sys
+import pickle
+
+IN_FOLDER_IDS = sys.argv[1]
+IN_FOLDER_TAGGED = sys.argv[2]
+OUT_FILE = sys.argv[3]
+
+REPLACE = "xmlns=\"http://www.tei-c.org/ns/1.0\""
+
+ids_files = {}
+ids_tags = {}
+
+
+def resolve_ids(id_content, tag_content, num):
+    print("\r", num, end="\t")
+    id_tree = ElementTree.XML(id_content.replace(REPLACE, "").replace("xml:", ""))
+    tag_tree = ElementTree.XML(tag_content.replace(REPLACE, "").replace("xml:", ""))
+
+    id_sentences = list(id_tree.findall(".//w"))
+    tag_sentences = list(tag_tree.findall(".//w"))
+
+    id_l = len(id_sentences)
+    tag_l = len(tag_sentences)
+    assert(id_l == tag_l)
+
+    for id_w, tag_w in zip(id_sentences, tag_sentences):
+        if id_w.text != tag_w.text:
+            print(id_w.text, tag_w.text) 
+            exit(0)
+
+        id_ = id_w.attrib['id']
+        tag = tag_w.attrib['msd']
+        lemma = tag_w.attrib['lemma']
+        ids_tags[int(id_)] = (tag, lemma)
+
+
+for filepath in pathlib.Path(IN_FOLDER_IDS).glob("**/*.xml"):
+    if not filepath.is_file():
+        continue
+
+    filename = str(filepath).split('/')[-1]
+    filenum = int(re.match(r"(\d+).*", filename).groups()[0])
+
+    with open(str(filepath), "r") as fp:
+        ids_files[filenum] = fp.read()
+
+
+for filepath in sorted(pathlib.Path(IN_FOLDER_TAGGED).glob("**/*.xml")):
+    if not filepath.is_file():
+        continue
+
+    filename = str(filepath).split('/')[-1]
+    filenum = int(re.match(r"(\d+).*", filename).groups()[0])
+
+    with open(str(filepath), "r") as fp:
+        resolve_ids(ids_files[filenum], fp.read(), filenum)
+
+
+old_tags = {}
+if pathlib.Path(OUT_FILE).is_file():
+    with open(OUT_FILE, "rb") as fp:
+        old_tags = pickle.load(fp)
+
+
+for i, taglemma in ids_tags.items():
+    old_tags[i] = taglemma
+ids_tags = old_tags
+
+
+with open(OUT_FILE, "wb") as fp:
+    pickle.dump(ids_tags, fp)
+