Browse Source

Initial commit

master
Ozbolt Menegatti 3 years ago
commit
f67e9f47cf
  1. 28
      README.md
  2. 48
      add_ids.xml
  3. 44
      merge_back.py
  4. 122
      separate.py
  5. 75
      tag_ids.py

28
README.md

@ -0,0 +1,28 @@
# SOLAR oznacevanje ucitelj/ucenec
## Dodaj IDje
"Pametno" doda ozbo\_id k vsem besedam v SOLAR xmlu.
``` python3 add_ids.py SOLAR.xml SOLAR_ID.xml ```
## Loci ucenec/ucitelj
Loci velik xml v skupek ucenec in skupek ucitelj xml-ov, vsak predstavlja \<text\> element iz vhodnega xml-a. Vse datoteke se izpisejo v mapi `student` in `teacher`.
```python3 separate.py SOLAR_ID.xml```
## Uporabi izhod tagger-ja
Denimo, da damo mapo `student` skozi taggerja in dobimo izhod v mapi `student-out`. Vsaka mapa ima polno datotek: `0.xml, 1.xml,...`. Zdruzimo ozbo_id informacijo z informacijo taggerja:
```python3 tag_ids.py student student-out tags.p```
To pozenemo prvo za ucenca in potem za ucitelja.
## Nazaj v original datoteko
Sedaj samo poberemo informacije iz `tags.p` in jih damo nazaj v vhodne xml-e.
```python3 merge_back.py tags.p SOLAR_ID.xml SOLAR_OUT.xml```

48
add_ids.xml

@ -0,0 +1,48 @@
import xml.etree.ElementTree as ElementTree
import sys
# Command line: python3 add_ids.py FILE_IN FILE_OUT
if len(sys.argv) < 3:
    sys.exit("usage: python3 add_ids.py FILE_IN FILE_OUT")
FILE_IN = sys.argv[1]
FILE_OUT = sys.argv[2]

# Let the XML parser open the file itself so the encoding named in the XML
# declaration is honoured, rather than the platform's default text encoding.
xml_tree = ElementTree.parse(FILE_IN).getroot()

SEARCH_FOR = ["u1"]  # not referenced anywhere in this script
ID = "ozbo_id"  # should be the same in all files!

# Next id to hand out; advanced by add_ctr().
ctr = 0
def add_ctr(el, previous=False):
    """Write a running id into the ``ID`` attribute of ``el``.

    With ``previous`` false the element receives the current value of the
    module-level counter ``ctr`` and the counter advances by one.  With
    ``previous`` true the element receives ``ctr - 1`` (the id handed out
    most recently) and the counter is left untouched.

    NOTE(review): the indentation of the original was lost; the increment is
    assumed to belong to the fresh-id branch only.
    """
    global ctr
    if not previous:
        el.attrib[ID] = str(ctr)
        ctr += 1
        return
    el.attrib[ID] = str(ctr - 1)
# Last <w1> / <w2> element seen since the most recent <S> element.  None means
# "nothing seen yet"; using None (instead of a dummy element whose text is
# also None) keeps a text-less word from being mistaken for a repeat.
last_w1 = None
last_w2 = None


def _same_text(el, last):
    """Return True when ``last`` exists and has the same text as ``el``."""
    return last is not None and el.text == last.text


# Walk every descendant in document order and number the word elements.
# NOTE(review): a repeat is given ``ctr - 1``, i.e. the id issued most
# recently, which is the id of ``last`` only when no other word was numbered
# in between -- confirm this is the intended heuristic.
for el in xml_tree.findall(".//*"):
    if el.tag == "w3":
        add_ctr(el)
    elif el.tag == "w2":
        add_ctr(el, _same_text(el, last_w2))
        last_w2 = el
    elif el.tag == "w1":
        add_ctr(el, _same_text(el, last_w1))
        last_w1 = el
    elif el.tag == "S":
        # reset last_w1 / last_w2
        last_w1 = None
        last_w2 = None

with open(FILE_OUT, "wb") as fp:
    fp.write(ElementTree.tostring(xml_tree, encoding='utf8', method='xml'))

44
merge_back.py

@ -0,0 +1,44 @@
import xml.etree.ElementTree as ElementTree
import pickle
import sys
import re
from xml.sax.saxutils import escape

ID = "ozbo_id"

# Old annotations to drop.  The value is matched as "everything up to the
# next double quote"; a greedy \S+ could run past the closing quote and eat
# element content whenever a later '"' follows without whitespace in between.
MSD_LEMMA_RE = re.compile(r"\b(?:msd|lemma)=\"[^\"]*\"")
# The id attribute written by add_ids, with or without quotes.
ID_RE = re.compile(r"{} *= *\"?(\d+)\"?".format(ID))


def _attr(value):
    """Escape ``value`` for use inside a double-quoted XML attribute."""
    return escape(str(value), {'"': "&quot;"})


def strip_annotations(content):
    """Return ``content`` with every ``msd="..."`` / ``lemma="..."`` removed."""
    return MSD_LEMMA_RE.sub('', content)


def insert_annotations(content, ids_dict):
    """Replace each id attribute in ``content`` with its msd/lemma pair.

    ``ids_dict`` maps the integer id to a ``(msd, lemma)`` tuple.  The id
    attribute itself is dropped from the output.

    Raises:
        KeyError: if an id found in ``content`` is missing from ``ids_dict``.
    """
    pieces = []
    prev_end = 0
    for found in ID_RE.finditer(content):
        pieces.append(content[prev_end:found.start()])
        msd, lemma = ids_dict[int(found.group(1))]
        pieces.append(" msd=\"{}\" lemma=\"{}\"".format(_attr(msd), _attr(lemma)))
        prev_end = found.end()
    pieces.append(content[prev_end:])
    return "".join(pieces)


def main(argv):
    """Run the merge: ``argv`` is ``[prog, IDS_PICKLE, IN_XML, OUT_FILE]``."""
    if len(argv) < 4:
        sys.exit("usage: python3 merge_back.py IDS_PICKLE IN_XML OUT_FILE")
    ids_pickle, in_xml, out_file = argv[1:4]

    # NOTE: unpickling can execute arbitrary code; only load a tags file that
    # was produced by tag_ids.py.
    with open(ids_pickle, "rb") as fp:
        ids_dict = pickle.load(fp)

    # The id-annotated XML is written as UTF-8 by add_ids, so read it as such
    # instead of relying on the platform default encoding.
    with open(in_xml, "r", encoding="utf-8") as fp:
        content = fp.read()
    print("XML read")

    # remove old msd-s and lemma-s
    content = strip_annotations(content)
    print("removed old msd's lemma's successfully")

    content_out = insert_annotations(content, ids_dict)
    print("added msd's lemma's successfully")

    # Re-parsing doubles as a well-formedness check of the edited text.
    xml_tree = ElementTree.XML(content_out)
    print("reparsed xml, all good!")

    with open(out_file, "wb") as fp:
        fp.write(ElementTree.tostring(xml_tree, encoding='utf8'))


if __name__ == "__main__":
    main(sys.argv)

122
separate.py

@ -0,0 +1,122 @@
import xml.etree.ElementTree as ElementTree
import sys
import re
# Command line: python3 separate.py FILE_IN
if len(sys.argv) < 2:
    sys.exit("usage: python3 separate.py FILE_IN")
FILE_IN = sys.argv[1]

# Header written at the top of every output file in place of the
# serializer's own XML declaration.
TOP = """<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<TEI xmlns="http://www.tei-c.org/ns/1.0" xml:lang="sl">"""
ID = "ozbo_id"

# Let the XML parser open the file itself so the encoding named in the XML
# declaration is honoured, rather than the platform's default text encoding.
xml_tree = ElementTree.parse(FILE_IN).getroot()

# Finished <text> trees, one list per role.
trees_teacher = []
trees_student = []

# <text> currently being filled.
out_tree_teacher = ElementTree.Element('text')
out_tree_student = ElementTree.Element('text')

# Paragraph / sentence currently being filled; None until the first
# <body> / <st1> element is met.
paragraph_teacher = None
paragraph_student = None
sentence_teacher = None
sentence_student = None

last_id = None  # id of the most recently emitted w1/w2 word
added_words = False  # whether the current sentence has received a word
def reset_word(el):
    """Turn a ``w1``/``w2``/``w3`` element into a plain, untagged ``w``.

    The element is modified in place: its tag becomes ``w`` and its ``msd``
    and ``lemma`` attributes are dropped.  A word that lacks either attribute
    is accepted as-is instead of raising ``KeyError``.
    """
    el.tag = "w"
    for name in ("msd", "lemma"):
        el.attrib.pop(name, None)
# Walk every descendant in document order and route it to the student tree,
# the teacher tree, or both:
#   w1 / c1 -> student only      w2 / c2 -> teacher only
#   w3 / c3 -> both              S       -> both (never twice in a row)
for el in xml_tree.findall(".//*"):
    if el.tag == "body":
        if paragraph_teacher is not None:
            # A new text starts.  The sentence still being collected belongs
            # to the text that just ended, so store it there before closing
            # that text; otherwise it would end up in the next text.
            if added_words:
                paragraph_teacher.append(sentence_teacher)
                paragraph_student.append(sentence_student)
                sentence_teacher = ElementTree.Element('s')
                sentence_student = ElementTree.Element('s')
                added_words = False
            out_tree_teacher.append(paragraph_teacher)
            out_tree_student.append(paragraph_student)
            trees_teacher.append(out_tree_teacher)
            trees_student.append(out_tree_student)
            out_tree_teacher = ElementTree.Element('text')
            out_tree_student = ElementTree.Element('text')
        paragraph_teacher = ElementTree.Element('p')
        paragraph_student = ElementTree.Element('p')
    elif el.tag == 'st1':
        # Sentence boundary: keep the finished sentence only if it got words.
        if added_words:
            paragraph_teacher.append(sentence_teacher)
            paragraph_student.append(sentence_student)
        sentence_student = ElementTree.Element('s')
        sentence_teacher = ElementTree.Element('s')
        added_words = False
    elif el.tag in ["w1", "w2", "w3"]:
        added_words = True
        if el.tag == "w1":
            # A word carrying the same id as the previous one is a repeat.
            if last_id == el.attrib[ID]:
                print("REPEAT...")
                continue
            reset_word(el)
            sentence_student.append(el)
            last_id = el.attrib[ID]
        elif el.tag == "w2":
            if last_id == el.attrib[ID]:
                continue
            reset_word(el)
            sentence_teacher.append(el)
            last_id = el.attrib[ID]
        else:
            reset_word(el)
            sentence_teacher.append(el)
            sentence_student.append(el)
    elif el.tag == "c1":
        el.tag = "c"
        sentence_student.append(el)
    elif el.tag == "c2":
        el.tag = "c"
        sentence_teacher.append(el)
    elif el.tag == "c3":
        el.tag = "c"
        sentence_student.append(el)
        sentence_teacher.append(el)
    elif el.tag == "S":
        if len(sentence_student) == 0 or sentence_student[-1].tag != 'S':
            sentence_student.append(el)
        if len(sentence_teacher) == 0 or sentence_teacher[-1].tag != 'S':
            sentence_teacher.append(el)

# Without at least one <body> and one <st1> there is nothing to write.
pending = (paragraph_teacher, paragraph_student, sentence_student, sentence_teacher)
if any(part is None for part in pending):
    sys.exit("no <body>/<st1> element found in {}".format(FILE_IN))

# Close the last sentence, paragraph and text.  Each sentence goes into the
# paragraph of its own role.
paragraph_teacher.append(sentence_teacher)
paragraph_student.append(sentence_student)
out_tree_teacher.append(paragraph_teacher)
out_tree_student.append(paragraph_student)
trees_teacher.append(out_tree_teacher)
trees_student.append(out_tree_student)

# Write one numbered file per text into the "teacher" and "student" folders
# (both folders must already exist).
for folder, trees in (("teacher", trees_teacher), ("student", trees_student)):
    for idx, xml in enumerate(trees):
        filename = "{}/{}.xml".format(folder, idx)
        s = ElementTree.tostring(xml, encoding='utf8', method='xml').decode('utf8')
        s = s.replace(ID, "xml:id")
        s = re.sub(r"<lb\s*\/>", "", s)
        s = s.replace('°', "")
        # Everything up to the first newline is the serializer's XML
        # declaration; TOP takes its place.
        snl = s.find('\n')
        # TOP declares UTF-8, so the file has to be written as UTF-8.
        with open(filename, "w", encoding="utf-8") as fp:
            print(TOP + s[snl:] + "</TEI>", file=fp)

75
tag_ids.py

@ -0,0 +1,75 @@
import xml.etree.ElementTree as ElementTree
import re
import pathlib
import sys
import pickle
# Command line: python3 tag_ids.py IN_FOLDER_IDS IN_FOLDER_TAGGED OUT_FILE
if len(sys.argv) < 4:
    sys.exit("usage: python3 tag_ids.py IN_FOLDER_IDS IN_FOLDER_TAGGED OUT_FILE")
IN_FOLDER_IDS = sys.argv[1]  # folder with the id-carrying XML files
IN_FOLDER_TAGGED = sys.argv[2]  # folder with the tagger's output for them
OUT_FILE = sys.argv[3]  # pickle file that collects id -> (msd, lemma)
# Default-namespace declaration that is stripped before parsing so elements
# can be looked up by their bare tag name.
REPLACE = "xmlns=\"http://www.tei-c.org/ns/1.0\""
ids_files = {}  # file number -> content of the id-carrying file
ids_tags = {}  # word id -> (msd, lemma)


def resolve_ids(id_content, tag_content, num):
    """Pair the words of one id file with the words of its tagged version.

    Both documents are parsed after removing the TEI namespace declaration
    and every ``xml:`` prefix.  Their ``<w>`` elements are walked in parallel
    and, for each pair, ``ids_tags[int(id)] = (msd, lemma)`` is recorded.

    Args:
        id_content: XML text whose ``<w>`` elements carry an ``xml:id``.
        tag_content: XML text whose ``<w>`` elements carry ``msd``/``lemma``.
        num: file number, used for progress output and error messages.

    Raises:
        ValueError: if the two documents contain a different number of words.
        SystemExit: with a non-zero status if a word pair differs in text.
    """
    print("\r", num, end="\t")
    id_tree = ElementTree.XML(id_content.replace(REPLACE, "").replace("xml:", ""))
    tag_tree = ElementTree.XML(tag_content.replace(REPLACE, "").replace("xml:", ""))

    id_words = id_tree.findall(".//w")
    tag_words = tag_tree.findall(".//w")
    if len(id_words) != len(tag_words):
        raise ValueError(
            "file {}: {} words with ids but {} tagged words".format(
                num, len(id_words), len(tag_words)))

    for id_w, tag_w in zip(id_words, tag_words):
        if id_w.text != tag_w.text:
            # A mismatch means the files are out of step; stop with a failure
            # status so the caller does not mistake this for success.
            sys.exit("file {}: word mismatch: {!r} != {!r}".format(
                num, id_w.text, tag_w.text))
        word_id = int(id_w.attrib['id'])
        ids_tags[word_id] = (tag_w.attrib['msd'], tag_w.attrib['lemma'])
def _file_number(path):
    """Return the leading number of ``path``'s file name, or None if absent."""
    found = re.match(r"(\d+)", path.name)
    return int(found.group(1)) if found else None


# Load every id-carrying file, keyed by the number its name starts with.
for filepath in pathlib.Path(IN_FOLDER_IDS).glob("**/*.xml"):
    if not filepath.is_file():
        continue
    filenum = _file_number(filepath)
    if filenum is None:
        print("skipping {}: name does not start with a number".format(filepath))
        continue
    ids_files[filenum] = filepath.read_text(encoding="utf-8")

# Match each tagged file against the id file with the same number.
for filepath in sorted(pathlib.Path(IN_FOLDER_TAGGED).glob("**/*.xml")):
    if not filepath.is_file():
        continue
    filenum = _file_number(filepath)
    if filenum is None:
        print("skipping {}: name does not start with a number".format(filepath))
        continue
    if filenum not in ids_files:
        sys.exit("{}: no file numbered {} in {}".format(
            filepath, filenum, IN_FOLDER_IDS))
    resolve_ids(ids_files[filenum], filepath.read_text(encoding="utf-8"), filenum)

# Merge with the result of an earlier run (if any) so that several runs can
# accumulate into the same OUT_FILE; new entries win.
# NOTE: unpickling can execute arbitrary code; OUT_FILE must only ever be a
# file written by this script.
old_tags = {}
if pathlib.Path(OUT_FILE).is_file():
    with open(OUT_FILE, "rb") as fp:
        old_tags = pickle.load(fp)
old_tags.update(ids_tags)
ids_tags = old_tags

with open(OUT_FILE, "wb") as fp:
    pickle.dump(ids_tags, fp)
Loading…
Cancel
Save