Initial commit
This commit is contained in:
commit
f67e9f47cf
28
README.md
Normal file
28
README.md
Normal file
|
@ -0,0 +1,28 @@
|
||||||
|
# SOLAR oznacevanje ucitelj/ucenec
|
||||||
|
|
||||||
|
## Dodaj IDje
|
||||||
|
|
||||||
|
"Pametno" doda ozbo\_id k vsem besedam v SOLAR xmlu.
|
||||||
|
|
||||||
|
``` python3 add_ids.py SOLAR.xml SOLAR_ID.xml ```
|
||||||
|
|
||||||
|
## Loci ucenec/ucitelj
|
||||||
|
|
||||||
|
Loci velik xml v skupek ucenec in skupek ucitelj xml-ov, vsak predstavlja \<text\> element iz vhodnega xml-a. Vse datoteke se izpisejo v mapi `student` in `teacher`.
|
||||||
|
|
||||||
|
```python3 separate.py SOLAR_ID.xml```
|
||||||
|
|
||||||
|
## Uporabi izhod tagger-ja
|
||||||
|
|
||||||
|
Denimo da damo mapo `student` skozi taggerja in dobimo izhod v mapi `student-out`. Vsaka mapa ima polno datotek: `0.xml, 1.xml,...`. Zdruzimo ozbo_id informacijo z informacijo taggerja:
|
||||||
|
|
||||||
|
```python3 tag_ids.py student student-out tags.p```
|
||||||
|
|
||||||
|
To pozenemo najprej za ucenca in nato se za ucitelja (```python3 tag_ids.py teacher teacher-out tags.p```); oba zagona piseta v isti `tags.p`.
|
||||||
|
|
||||||
|
## Nazaj v original datoteko
|
||||||
|
|
||||||
|
Sedaj samo poberemo informacije iz `tags.p` in jih damo nazaj v vhodne xml-e.
|
||||||
|
|
||||||
|
```python3 merge_back.py tags.p SOLAR_ID.xml SOLAR_OUT.xml```
|
||||||
|
|
48
add_ids.xml
Normal file
48
add_ids.xml
Normal file
|
@ -0,0 +1,48 @@
|
||||||
|
import xml.etree.ElementTree as ElementTree
|
||||||
|
import sys
|
||||||
|
|
||||||
|
# Command-line arguments: path of the XML to read and path of the XML to write.
FILE_IN = sys.argv[1]
FILE_OUT = sys.argv[2]

# Let the XML parser read the bytes itself so the encoding declared in the
# document is honoured, instead of decoding the file with the platform's
# locale-dependent default before parsing.
xml_tree = ElementTree.parse(FILE_IN).getroot()

SEARCH_FOR = ["u1"]  # not referenced in the visible part of this script
ID = "ozbo_id"  # should be the same in all files!

out_xml = ElementTree.Element('top')  # not referenced in the visible part of this script

# Next id value to hand out; read and advanced by add_ctr().
ctr = 0
|
||||||
|
def add_ctr(el, previous=False):
    """Store the running counter in the ``ID`` attribute of ``el``.

    When ``previous`` is true the element receives ``ctr - 1`` and the
    counter is left untouched. Otherwise it receives the current counter
    value and the module-level counter is advanced by one.
    """
    global ctr
    if not previous:
        el.attrib[ID] = str(ctr)
        ctr += 1
        return
    el.attrib[ID] = str(ctr - 1)
|
||||||
|
|
||||||
|
# Placeholder "previous word" (its text is None), used until a real w1/w2
# has been seen since the last reset.
empty = ElementTree.Element('')
last_w1 = last_w2 = empty

for el in xml_tree.findall(".//*"):
    tag = el.tag
    if tag == "S":
        # reset last_w1 and last_w2
        last_w1 = last_w2 = empty
    elif tag == "w1":
        # a w1 whose text equals the previous w1's text gets ctr - 1
        add_ctr(el, el.text == last_w1.text)
        last_w1 = el
    elif tag == "w2":
        # same rule as w1, tracked separately for w2 elements
        add_ctr(el, el.text == last_w2.text)
        last_w2 = el
    elif tag == "w3":
        add_ctr(el)

with open(FILE_OUT, "wb") as fp:
    serialized = ElementTree.tostring(xml_tree, encoding='utf8', method='xml')
    fp.write(serialized)
|
||||||
|
|
44
merge_back.py
Normal file
44
merge_back.py
Normal file
|
@ -0,0 +1,44 @@
|
||||||
|
import xml.etree.ElementTree as ElementTree
|
||||||
|
import pickle
|
||||||
|
import sys
|
||||||
|
import re
|
||||||
|
|
||||||
|
|
||||||
|
from xml.sax.saxutils import quoteattr

ID = "ozbo_id"

# Command-line arguments: pickle with {id: (msd, lemma)}, XML carrying the
# ids, and the path of the XML to write.
IDS_PICKLE = sys.argv[1]
IN_XML = sys.argv[2]
OUT_FILE = sys.argv[3]

# NOTE: unpickling can execute arbitrary code; only pass pickles that were
# produced by this pipeline.
with open(IDS_PICKLE, "rb") as fp:
    ids_dict = pickle.load(fp)

# The document is edited as text below, so decode it explicitly instead of
# relying on the locale default.
# NOTE(review): assumes the input is UTF-8, which is what this script itself
# writes at the end - confirm for inputs coming from elsewhere.
with open(IN_XML, "r", encoding="utf-8") as fp:
    content = fp.read()
print("XML read")

# remove old msd-s and lemma-s
# [^"]* (rather than \S+) also matches empty values and values containing
# spaces, which would otherwise survive and clash with the attributes
# added below.
msd_matcher = r"(msd|lemma)=\"[^\"]*\""
content = re.sub(msd_matcher, '', content)
print("removed old msd's lemma's successfully")

# Each id attribute is replaced by the msd/lemma pair stored for that id.
matcher = r"{} *= *\"?(\d+)\"?".format(ID)
content_out = []
prev_end = 0

for f in re.finditer(matcher, content):
    content_out.append(content[prev_end:f.start()])
    msd, lemma = ids_dict[int(f.group(1))]
    # quoteattr escapes &, < and quotes so the text stays well-formed XML
    content_out.append(" msd={} lemma={}".format(quoteattr(msd), quoteattr(lemma)))
    prev_end = f.end()

content_out.append(content[prev_end:])
print("added msd's lemma's successfully")

# Re-parse the edited text; this fails loudly if the edits broke the XML.
content_out = "".join(content_out)
xml_tree = ElementTree.XML(content_out)
print("reparsed xml, all good!")

with open(OUT_FILE, "wb") as fp:
    fp.write(ElementTree.tostring(xml_tree, encoding='utf8'))
|
122
separate.py
Normal file
122
separate.py
Normal file
|
@ -0,0 +1,122 @@
|
||||||
|
import xml.etree.ElementTree as ElementTree
|
||||||
|
import sys
|
||||||
|
import re
|
||||||
|
|
||||||
|
# Command-line argument: path of the XML to split.
FILE_IN = sys.argv[1]

# Header placed in front of every generated file.
TOP = """<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<TEI xmlns="http://www.tei-c.org/ns/1.0" xml:lang="sl">"""

ID = "ozbo_id"

# Let the XML parser read the bytes itself so the encoding declared in the
# document is honoured, instead of decoding the file with the platform's
# locale-dependent default before parsing.
xml_tree = ElementTree.parse(FILE_IN).getroot()

# Finished <text> trees, one per output file.
trees_teacher = []
trees_student = []

# <text> trees currently being filled.
out_tree_teacher = ElementTree.Element('text')
out_tree_student = ElementTree.Element('text')

# Current paragraph and sentence of each side; None until the first one is
# opened by the main loop.
paragraph_teacher = None
paragraph_student = None

sentence_teacher = None
sentence_student = None

# Id of the most recently kept w1/w2 word, and whether the current sentence
# has seen any word yet.
last_id = None
added_words = False
|
||||||
|
|
||||||
|
def reset_word(el):
    """Turn ``el`` into a plain, unannotated ``w`` element in place.

    The tag is renamed to ``"w"`` and the ``msd`` and ``lemma`` attributes
    are removed. An element that lacks either attribute is accepted as-is
    instead of raising ``KeyError``.
    """
    el.tag = "w"
    for stale in ("msd", "lemma"):
        el.attrib.pop(stale, None)
|
||||||
|
|
||||||
|
# Walk every element in document order and route it to the student tree,
# the teacher tree, or both.
for idx, el in enumerate(xml_tree.findall(".//*")):
    if el.tag == "body":
        # A new body starts a new output document: store the one built so far.
        if paragraph_teacher is not None:
            out_tree_teacher.append(paragraph_teacher)
            out_tree_student.append(paragraph_student)

            trees_teacher.append(out_tree_teacher)
            trees_student.append(out_tree_student)

            out_tree_teacher = ElementTree.Element('text')
            out_tree_student = ElementTree.Element('text')

        # NOTE(review): a sentence still pending here (added_words is True)
        # is not attached before the switch, so the next 'st1' appends it to
        # the NEW paragraph - confirm whether that is intended.
        paragraph_teacher = ElementTree.Element('p')
        paragraph_student = ElementTree.Element('p')

    elif el.tag == 'st1':
        # Sentence boundary: keep the finished sentences only if they
        # received at least one word.
        if added_words:
            paragraph_teacher.append(sentence_teacher)
            paragraph_student.append(sentence_student)

        sentence_student = ElementTree.Element('s')
        sentence_teacher = ElementTree.Element('s')
        added_words = False

    elif el.tag in ["w1", "w2", "w3"]:
        added_words = True

        if el.tag == "w1":
            # w1 goes to the student side; skip it if it repeats the last id
            if last_id == el.attrib[ID]:
                print("REPEAT...")
                continue

            reset_word(el)
            sentence_student.append(el)
            last_id = el.attrib[ID]

        elif el.tag == "w2":
            # w2 goes to the teacher side; skip it if it repeats the last id
            if last_id == el.attrib[ID]:
                continue

            reset_word(el)
            sentence_teacher.append(el)
            last_id = el.attrib[ID]
        else:
            # w3 goes to both sides
            reset_word(el)
            sentence_teacher.append(el)
            sentence_student.append(el)

    elif el.tag == "c1":
        el.tag = "c"
        sentence_student.append(el)
    elif el.tag == "c2":
        el.tag = "c"
        sentence_teacher.append(el)
    elif el.tag == "c3":
        el.tag = "c"
        sentence_student.append(el)
        sentence_teacher.append(el)

    elif el.tag == "S":
        # never put two S elements directly after each other
        if len(sentence_student) == 0 or sentence_student[-1].tag != 'S':
            sentence_student.append(el)
        if len(sentence_teacher) == 0 or sentence_teacher[-1].tag != 'S':
            sentence_teacher.append(el)

# The input must have opened at least one paragraph and one sentence.
assert(None not in [paragraph_teacher, paragraph_student, sentence_student, sentence_teacher])

# Store the last sentence, paragraph and document of each side.
paragraph_teacher.append(sentence_teacher)
# The student sentence belongs in the student paragraph; it used to be
# appended to the teacher paragraph, leaking it into the teacher output and
# dropping it from the student output.
paragraph_student.append(sentence_student)
out_tree_teacher.append(paragraph_teacher)
out_tree_student.append(paragraph_student)
trees_teacher.append(out_tree_teacher)
trees_student.append(out_tree_student)
|
||||||
|
|
||||||
|
|
||||||
|
# Write every collected tree to <folder>/<index>.xml. open() does not create
# directories, so the folders must already exist.
for folder, tree in [("teacher", trees_teacher), ("student", trees_student)]:
    for idx, xml in enumerate(tree):
        filename = "{}/{}.xml".format(folder, idx)

        s = ElementTree.tostring(xml, encoding='utf8', method='xml').decode('utf8')
        s = s.replace(ID, "xml:id")
        s = re.sub(r"<lb\s*\/>", "", s)
        s = s.replace('°', "●")

        # Cut everything before the first newline (the XML declaration line
        # emitted by tostring); TOP supplies the declaration instead.
        snl = s.find('\n')
        # TOP declares UTF-8, so write UTF-8 regardless of the locale default.
        with open(filename, "w", encoding="utf-8") as fp:
            print(TOP + s[snl:] + "</TEI>", file=fp)
|
||||||
|
|
75
tag_ids.py
Normal file
75
tag_ids.py
Normal file
|
@ -0,0 +1,75 @@
|
||||||
|
import xml.etree.ElementTree as ElementTree
|
||||||
|
import re
|
||||||
|
import pathlib
|
||||||
|
import sys
|
||||||
|
import pickle
|
||||||
|
|
||||||
|
# Positional arguments: folder with the id-carrying files, folder with the
# tagged files, and the pickle file to write.
IN_FOLDER_IDS = sys.argv[1]
IN_FOLDER_TAGGED = sys.argv[2]
OUT_FILE = sys.argv[3]

# Namespace declaration that is cut out of each document before parsing.
REPLACE = 'xmlns="http://www.tei-c.org/ns/1.0"'

# file number -> raw content of the id-carrying file
ids_files = dict()
# word id -> (msd, lemma)
ids_tags = dict()
|
||||||
|
|
||||||
|
|
||||||
|
def resolve_ids(id_content, tag_content, num):
    """Pair every word of an id-carrying file with its tagged counterpart.

    Both documents are parsed after removing the namespace declaration in
    ``REPLACE`` and every ``xml:`` prefix. Their ``w`` elements are matched
    by position, and for each pair ``(msd, lemma)`` of the tagged word is
    stored in the module-level ``ids_tags`` under the integer ``id`` of the
    id-carrying word.

    Args:
        id_content: XML text of the file that carries the ids.
        tag_content: XML text of the tagged version of the same file.
        num: File number, used for the progress output and error messages.

    Raises:
        SystemExit: With a non-zero status when the two files differ in
            word count or in the text of a word. (The word mismatch used to
            end the run with status 0, i.e. reported as success.)
    """
    print("\r", num, end="\t")
    id_tree = ElementTree.XML(id_content.replace(REPLACE, "").replace("xml:", ""))
    tag_tree = ElementTree.XML(tag_content.replace(REPLACE, "").replace("xml:", ""))

    id_words = id_tree.findall(".//w")
    tag_words = tag_tree.findall(".//w")

    # Explicit check instead of assert, which is skipped under ``python -O``.
    if len(id_words) != len(tag_words):
        sys.exit("file {}: {} words with ids but {} tagged words".format(
            num, len(id_words), len(tag_words)))

    for id_w, tag_w in zip(id_words, tag_words):
        if id_w.text != tag_w.text:
            # sys.exit with a message prints it to stderr and exits with 1
            sys.exit("file {}: word mismatch: {!r} != {!r}".format(
                num, id_w.text, tag_w.text))

        id_ = id_w.attrib['id']
        tag = tag_w.attrib['msd']
        lemma = tag_w.attrib['lemma']
        ids_tags[int(id_)] = (tag, lemma)
|
||||||
|
|
||||||
|
|
||||||
|
# Read every id-carrying file, keyed by the number its file name starts with.
for filepath in pathlib.Path(IN_FOLDER_IDS).glob("**/*.xml"):
    if not filepath.is_file():
        continue

    # Path.name is the last path component on every platform; splitting
    # str(filepath) on '/' does not work where the separator is a backslash.
    filename = filepath.name
    filenum = int(re.match(r"(\d+).*", filename).groups()[0])

    # NOTE(review): read as UTF-8 on the assumption that the files are
    # encoded as their XML header declares - confirm for the actual inputs.
    with open(str(filepath), "r", encoding="utf-8") as fp:
        ids_files[filenum] = fp.read()

# Match every tagged file against the id-carrying file with the same number.
for filepath in sorted(pathlib.Path(IN_FOLDER_TAGGED).glob("**/*.xml")):
    if not filepath.is_file():
        continue

    filename = filepath.name
    filenum = int(re.match(r"(\d+).*", filename).groups()[0])

    with open(str(filepath), "r", encoding="utf-8") as fp:
        resolve_ids(ids_files[filenum], fp.read(), filenum)
|
||||||
|
|
||||||
|
|
||||||
|
# Start from whatever is already stored in OUT_FILE, if that file exists.
old_tags = {}
out_path = pathlib.Path(OUT_FILE)
if out_path.is_file():
    with open(OUT_FILE, "rb") as fp:
        old_tags = pickle.load(fp)

# Entries collected in this run overwrite stored entries with the same id.
old_tags.update(ids_tags)
ids_tags = old_tags

with open(OUT_FILE, "wb") as fp:
    pickle.dump(ids_tags, fp)
|
||||||
|
|
Loading…
Reference in New Issue
Block a user