Initial commit
This commit is contained in:
		
						commit
						f67e9f47cf
					
				
							
								
								
									
										28
									
								
								README.md
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										28
									
								
								README.md
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1,28 @@
 | 
			
		||||
# SOLAR oznacevanje ucitelj/ucenec
 | 
			
		||||
 | 
			
		||||
## Dodaj IDje
 | 
			
		||||
 | 
			
		||||
"Pametno" doda ozbo\_id k vsem besedam v SOLAR xmlu.
 | 
			
		||||
 | 
			
		||||
``` python3 add_ids.py SOLAR.xml SOLAR_ID.xml ```
 | 
			
		||||
 | 
			
		||||
## Loci ucenec/ucitelj
 | 
			
		||||
 | 
			
		||||
Loci velik xml v skupek ucenec in skupek ucitelj xml-ov, vsak predstavlja \<text\> element iz vhodnega xml-a. Vse datoteke se izpisejo v mapi `student` in `teacher`.
 | 
			
		||||
 | 
			
		||||
```python3 separate.py SOLAR_ID.xml```
 | 
			
		||||
 | 
			
		||||
## Uporabi izhod tagger-ja
 | 
			
		||||
 | 
			
		||||
Denimo da damo mapo `student` skozi taggerja in dobimo izhod v mapi `student-out`. Vsaka mapa ima polno datotek: `0.xml, 1.xml,...`. Zdruzimo ozbo_id informacijo z informacijo taggerja:
 | 
			
		||||
 | 
			
		||||
```python3 tag_ids.py student student-out tags.p```
 | 
			
		||||
 | 
			
		||||
To pozenemo najprej za ucenca in potem za ucitelja.
 | 
			
		||||
 | 
			
		||||
## Nazaj v original datoteko
 | 
			
		||||
 | 
			
		||||
Sedaj samo poberemo informacije iz `tags.p` in jih damo nazaj v vhodne xml-e. 
 | 
			
		||||
 | 
			
		||||
```python3 merge_back.py tags.p SOLAR_ID.xml SOLAR_OUT.xml```
 | 
			
		||||
 | 
			
		||||
							
								
								
									
										48
									
								
								add_ids.xml
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										48
									
								
								add_ids.xml
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1,48 @@
 | 
			
		||||
import xml.etree.ElementTree as ElementTree
 | 
			
		||||
import sys
 | 
			
		||||
 | 
			
		||||
# Command-line arguments: the XML file to read and the XML file to write.
FILE_IN = sys.argv[1]
FILE_OUT = sys.argv[2]

# Decode explicitly as UTF-8 (this script serialises its result as UTF-8).
# Without ``encoding`` the platform's locale encoding is used, which garbles
# non-ASCII text on systems whose default is not UTF-8.
with open(FILE_IN, "r", encoding="utf-8") as fp:
    xml_tree = ElementTree.XML(fp.read())
 | 
			
		||||
 | 
			
		||||
# NOTE(review): SEARCH_FOR and out_xml are not used by the code shown in this
# script; they are kept so the module-level names stay available.
SEARCH_FOR = ["u1"]
ID = "ozbo_id"  # should be the same in all files!

out_xml = ElementTree.Element('top')

# Next id to hand out; ids are consecutive integers starting at 0.
ctr = 0


def add_ctr(el, previous=False):
    """Write an id into the ``ID`` attribute of *el*.

    With ``previous`` false, the current value of the global counter ``ctr``
    is stored (as a string) and the counter is incremented.

    With ``previous`` true, the most recently issued id (``ctr - 1``) is
    stored again and the counter is left untouched.  If no id has been issued
    yet there is nothing to reuse, so a fresh id is issued instead of writing
    the invalid value "-1".
    """
    global ctr
    if previous and ctr > 0:
        el.attrib[ID] = str(ctr - 1)
        return

    el.attrib[ID] = str(ctr)
    ctr += 1
 | 
			
		||||
 | 
			
		||||
# Sentinel standing for "no previous word seen since the last reset".
empty = ElementTree.Element('')
last_w1 = empty
last_w2 = empty

# Visit every descendant of the root in document order and number the words.
for el in xml_tree.findall(".//*"):
    if el.tag == "w3":
        # w3 words always receive a fresh id.
        add_ctr(el)

    elif el.tag == "w2":
        # Reuse the latest id only when a previous w2 really exists and has
        # the same text.  The sentinel's text is None, so without the
        # identity check a text-less word would wrongly count as a repeat.
        add_ctr(el, last_w2 is not empty and el.text == last_w2.text)
        last_w2 = el

    elif el.tag == "w1":
        # Same rule as for w2, tracked separately.
        add_ctr(el, last_w1 is not empty and el.text == last_w1.text)
        last_w1 = el

    # reset last_w1 and last_w2
    elif el.tag == "S":
        last_w1 = empty
        last_w2 = empty


# tostring() with an explicit encoding returns bytes, hence the binary mode.
with open(FILE_OUT, "wb") as fp:
    fp.write(ElementTree.tostring(xml_tree, encoding='utf8', method='xml'))
 | 
			
		||||
 | 
			
		||||
							
								
								
									
										44
									
								
								merge_back.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										44
									
								
								merge_back.py
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1,44 @@
 | 
			
		||||
import xml.etree.ElementTree as ElementTree
 | 
			
		||||
import pickle
 | 
			
		||||
import sys
 | 
			
		||||
import re
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Name of the attribute that carries the word id.
ID = "ozbo_id"

# Command-line arguments: pickle with the id -> (msd, lemma) mapping, the XML
# to annotate, and the path of the annotated XML to write.
IDS_PICKLE = sys.argv[1]
IN_XML = sys.argv[2]
OUT_FILE = sys.argv[3]

# NOTE: unpickling can execute arbitrary code; only pass pickle files that
# come from a trusted source.
with open(IDS_PICKLE, "rb") as fp:
    ids_dict = pickle.load(fp)

# Decode explicitly as UTF-8 (the result is serialised as UTF-8 as well);
# the locale default encoding would garble non-ASCII text on some systems.
with open(IN_XML, "r", encoding="utf-8") as fp:
    content = fp.read()
    print("XML read")
 | 
			
		||||
 | 
			
		||||
# Remove old msd and lemma attributes from the raw XML text.
# The value is matched as "anything up to the next double quote": a greedy
# \S+ could run past the closing quote into following markup when no
# whitespace intervenes, and it missed values that are empty or contain
# spaces (leaving a duplicate attribute behind once new ones are added).
msd_matcher = r"(msd|lemma)=\"[^\"]*\""
content = re.sub(msd_matcher, '', content)
print("removed old msd's lemma's successfully")
 | 
			
		||||
 | 
			
		||||
from xml.sax.saxutils import quoteattr

# Matches the id attribute (value optionally quoted) and captures the number.
matcher = r"{} *= *\"?(\d+)\"?".format(ID)


def _tag_attributes(match):
    """Return the msd/lemma attribute text that replaces one id attribute.

    The values are looked up in ``ids_dict`` by the captured integer id (a
    missing id raises ``KeyError``).  They are inserted into raw XML text, so
    they are quoted and escaped with ``quoteattr``; otherwise a value
    containing ``&``, ``<`` or a double quote would make the result
    unparsable.
    """
    msd, lemma = ids_dict[int(match.group(1))]
    return " msd={} lemma={}".format(quoteattr(msd), quoteattr(lemma))


# Replace every id attribute with the msd and lemma attributes of that id.
content_out = re.sub(matcher, _tag_attributes, content)
print("added msd's lemma's successfully")

# Re-parse the edited text; this fails loudly if it is not well-formed XML.
xml_tree = ElementTree.XML(content_out)
print("reparsed xml, all good!")

with open(OUT_FILE, "wb") as fp:
    fp.write(ElementTree.tostring(xml_tree, encoding='utf8'))
 | 
			
		||||
							
								
								
									
										122
									
								
								separate.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										122
									
								
								separate.py
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1,122 @@
 | 
			
		||||
import xml.etree.ElementTree as ElementTree
 | 
			
		||||
import sys
 | 
			
		||||
import re
 | 
			
		||||
 | 
			
		||||
# Command-line argument: the XML file to split.
FILE_IN = sys.argv[1]

# Header written at the top of every output file.
TOP = """<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<TEI xmlns="http://www.tei-c.org/ns/1.0" xml:lang="sl">"""

# Name of the attribute that carries the word id.
ID = "ozbo_id"

# Decode explicitly as UTF-8; without ``encoding`` the locale default is
# used, which garbles non-ASCII text on systems where it is not UTF-8.
with open(FILE_IN, "r", encoding="utf-8") as fp:
    xml_tree = ElementTree.XML(fp.read())
 | 
			
		||||
   
 | 
			
		||||
# Finished <text> trees, one list per output folder.
trees_teacher, trees_student = [], []

# The <text> elements currently being filled.
out_tree_teacher = ElementTree.Element('text')
out_tree_student = ElementTree.Element('text')

# Open paragraph and sentence of each side; None until the first one starts.
paragraph_teacher = paragraph_student = None
sentence_teacher = sentence_student = None

# Id of the last w1/w2 word that was kept, and whether the open sentence has
# seen any word yet.
last_id = None
added_words = False
 | 
			
		||||
 | 
			
		||||
def reset_word(el):
    """Turn *el* into a plain ``w`` element without tagger attributes.

    The tag is renamed to ``"w"`` and the ``msd`` and ``lemma`` attributes
    are removed in place.  A word that lacks either attribute is already in
    the desired state, so a missing attribute is ignored rather than raising
    ``KeyError``.  All other attributes are left untouched.
    """
    el.tag = "w"
    el.attrib.pop("msd", None)
    el.attrib.pop("lemma", None)
 | 
			
		||||
 | 
			
		||||
# Visit every descendant of the root in document order and route it to the
# student tree, the teacher tree, or both:
#   w1 / c1 -> student,  w2 / c2 -> teacher,  w3 / c3 / S -> both.
for el in xml_tree.findall(".//*"):
    if el.tag == "body":
        if paragraph_teacher is not None:
            # A previous text is still open.  Flush its pending sentence
            # first; otherwise that sentence would only be appended at the
            # next 'st1' and end up in the paragraph of the *following* text.
            if added_words:
                paragraph_teacher.append(sentence_teacher)
                paragraph_student.append(sentence_student)
                sentence_teacher = ElementTree.Element('s')
                sentence_student = ElementTree.Element('s')
                added_words = False

            out_tree_teacher.append(paragraph_teacher)
            out_tree_student.append(paragraph_student)

            trees_teacher.append(out_tree_teacher)
            trees_student.append(out_tree_student)

            out_tree_teacher = ElementTree.Element('text')
            out_tree_student = ElementTree.Element('text')

        paragraph_teacher = ElementTree.Element('p')
        paragraph_student = ElementTree.Element('p')

    elif el.tag == 'st1':
        # Start of a new sentence: store the previous one if it has words.
        if added_words:
            paragraph_teacher.append(sentence_teacher)
            paragraph_student.append(sentence_student)

        sentence_student = ElementTree.Element('s')
        sentence_teacher = ElementTree.Element('s')
        added_words = False

    elif el.tag in ("w1", "w2", "w3"):
        added_words = True

        if el.tag == "w1":
            # A w1 carrying the id of the last kept word is a repeat: skip it.
            if last_id == el.attrib[ID]:
                print("REPEAT...")
                continue

            reset_word(el)
            sentence_student.append(el)
            last_id = el.attrib[ID]

        elif el.tag == "w2":
            # Same repeat rule as for w1 (without the log line).
            if last_id == el.attrib[ID]:
                continue

            reset_word(el)
            sentence_teacher.append(el)
            last_id = el.attrib[ID]

        else:
            # w3 goes to both sides and does not update last_id.
            reset_word(el)
            sentence_teacher.append(el)
            sentence_student.append(el)

    elif el.tag == "c1":
        el.tag = "c"
        sentence_student.append(el)

    elif el.tag == "c2":
        el.tag = "c"
        sentence_teacher.append(el)

    elif el.tag == "c3":
        el.tag = "c"
        sentence_student.append(el)
        sentence_teacher.append(el)

    elif el.tag == "S":
        # Add the element unless the sentence already ends with an 'S'.
        if len(sentence_student) == 0 or sentence_student[-1].tag != 'S':
            sentence_student.append(el)
        if len(sentence_teacher) == 0 or sentence_teacher[-1].tag != 'S':
            sentence_teacher.append(el)

# The input must have opened at least one paragraph and one sentence.
assert None not in [paragraph_teacher, paragraph_student, sentence_student, sentence_teacher]

# Flush the last sentence, paragraph and text.  Each sentence goes to its own
# side (the student sentence used to be appended to the teacher paragraph).
paragraph_teacher.append(sentence_teacher)
paragraph_student.append(sentence_student)
out_tree_teacher.append(paragraph_teacher)
out_tree_student.append(paragraph_student)
trees_teacher.append(out_tree_teacher)
trees_student.append(out_tree_student)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
import os

# Write every collected <text> tree to "<folder>/<index>.xml".
for folder, tree in [("teacher", trees_teacher), ("student", trees_student)]:
    # Create the output folder on first use instead of failing with
    # FileNotFoundError when it does not exist yet.
    os.makedirs(folder, exist_ok=True)

    for idx, xml in enumerate(tree):
        filename = "{}/{}.xml".format(folder, idx)

        s = ElementTree.tostring(xml, encoding='utf8', method='xml').decode('utf8')
        s = s.replace(ID, "xml:id")
        s = re.sub(r"<lb\s*\/>", "", s)
        s = s.replace('°', "●")

        # Everything before the first newline is dropped and replaced by TOP.
        # NOTE(review): presumably that first line is the XML declaration
        # emitted by tostring() for this encoding - confirm.
        snl = s.find('\n')

        # TOP declares UTF-8 and the text may contain "●", so write UTF-8
        # explicitly instead of relying on the locale default encoding.
        with open(filename, "w", encoding="utf-8") as fp:
            print(TOP + s[snl:] + "</TEI>", file=fp)
 | 
			
		||||
 | 
			
		||||
							
								
								
									
										75
									
								
								tag_ids.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										75
									
								
								tag_ids.py
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1,75 @@
 | 
			
		||||
import xml.etree.ElementTree as ElementTree
 | 
			
		||||
import re
 | 
			
		||||
import pathlib
 | 
			
		||||
import sys
 | 
			
		||||
import pickle
 | 
			
		||||
 | 
			
		||||
# Command-line arguments.
# Folder searched recursively for *.xml files whose words carry ids.
IN_FOLDER_IDS = sys.argv[1]
# Folder searched recursively for *.xml files whose words carry msd/lemma.
IN_FOLDER_TAGGED = sys.argv[2]
# Pickle file the id -> (msd, lemma) mapping is merged into and written to.
OUT_FILE = sys.argv[3]
 | 
			
		||||
 | 
			
		||||
# Default-namespace declaration that is stripped before parsing so elements
# can be found by their plain tag names.
REPLACE = "xmlns=\"http://www.tei-c.org/ns/1.0\""

# File number -> raw content of the matching file in IN_FOLDER_IDS.
ids_files = {}
# Word id -> (msd, lemma); filled by resolve_ids().
ids_tags = {}


def resolve_ids(id_content, tag_content, num):
    """Pair the words of an id file with those of its tagged counterpart.

    Both XML strings are parsed after removing the ``REPLACE`` namespace
    declaration and every ``xml:`` prefix.  Their ``w`` elements are walked
    in parallel and, for each pair, ``ids_tags[int(id)] = (msd, lemma)`` is
    recorded, with the id taken from the first file and msd/lemma from the
    second.

    Args:
        id_content: XML text whose ``w`` elements carry an ``id`` attribute.
        tag_content: XML text whose ``w`` elements carry ``msd`` and ``lemma``.
        num: file number, only printed as a progress indicator.

    Raises:
        AssertionError: if the two files contain different numbers of words.
        SystemExit: with status 1 if two paired words differ in text.
    """
    print("\r", num, end="\t")
    id_tree = ElementTree.XML(id_content.replace(REPLACE, "").replace("xml:", ""))
    tag_tree = ElementTree.XML(tag_content.replace(REPLACE, "").replace("xml:", ""))

    id_words = id_tree.findall(".//w")
    tag_words = tag_tree.findall(".//w")

    # Raised explicitly (same exception type as before) so the check is not
    # stripped when Python runs with -O.
    if len(id_words) != len(tag_words):
        raise AssertionError(
            "file {}: {} words with ids but {} tagged words".format(
                num, len(id_words), len(tag_words)))

    for id_w, tag_w in zip(id_words, tag_words):
        if id_w.text != tag_w.text:
            print(id_w.text, tag_w.text)
            # A mismatch is an error: exit with a failure status, not 0.
            sys.exit(1)

        id_ = id_w.attrib['id']
        tag = tag_w.attrib['msd']
        lemma = tag_w.attrib['lemma']
        ids_tags[int(id_)] = (tag, lemma)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Read every id file and index its content by the number its name starts with.
for filepath in pathlib.Path(IN_FOLDER_IDS).glob("**/*.xml"):
    if not filepath.is_file():
        continue

    # Path.name is the final path component on every platform; splitting the
    # string on '/' does not work with Windows separators.
    filename = filepath.name
    filenum = int(re.match(r"(\d+).*", filename).groups()[0])

    # Decode explicitly as UTF-8 instead of the locale default encoding.
    ids_files[filenum] = filepath.read_text(encoding="utf-8")


# Resolve every tagged file against the id file with the same number.
for filepath in sorted(pathlib.Path(IN_FOLDER_TAGGED).glob("**/*.xml")):
    if not filepath.is_file():
        continue

    filename = filepath.name
    filenum = int(re.match(r"(\d+).*", filename).groups()[0])

    resolve_ids(ids_files[filenum], filepath.read_text(encoding="utf-8"), filenum)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Start from the mapping already stored in OUT_FILE, if that file exists, so
# that successive runs accumulate into one pickle.
# NOTE: unpickling can execute arbitrary code; OUT_FILE must be trusted.
out_path = pathlib.Path(OUT_FILE)
if out_path.is_file():
    with out_path.open("rb") as fp:
        old_tags = pickle.load(fp)
else:
    old_tags = {}

# Entries resolved in this run overwrite stored entries with the same id.
old_tags.update(ids_tags)
ids_tags = old_tags

with out_path.open("wb") as fp:
    pickle.dump(ids_tags, fp)
 | 
			
		||||
 | 
			
		||||
		Loading…
	
		Reference in New Issue
	
	Block a user