From 51c3899ca3d9c57d1ce8e7361bb824e31fb768ed Mon Sep 17 00:00:00 2001 From: Ozbolt Menegatti Date: Wed, 2 Jan 2019 11:45:13 +0100 Subject: [PATCH] Pokrivam uporabo meta-taggerja in conllu formata. --- .gitignore | 2 + README.md | 6 ++ conllu_to_xml.py | 139 ++++++++++++++++++++++++++++++++++++++ multiple_conllu_to_xml.py | 19 ++++++ 4 files changed, 166 insertions(+) create mode 100644 .gitignore create mode 100644 conllu_to_xml.py create mode 100644 multiple_conllu_to_xml.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..4624315 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +**.pyc +**.xml diff --git a/README.md b/README.md index 286a398..39bdfab 100644 --- a/README.md +++ b/README.md @@ -12,6 +12,12 @@ Loci velik xml v skupek ucenec in skupek ucitelj xml-ov, vsak predstavlja \conllu obstajajo Python scripte nekje (TODO), za pretvorbo nazaj pa je scripta prilozena tu. pozeni (pazi: Python 2.7): + +```python2 multiple_conllu_to_xml.py teacher-out-txt teacher-out-xml``` + ## Uporabi izhod tagger-ja Denimo da damo mapo `student` skozi taggerja in dobimo izhod v mapi `student-out`. Vsaka mapa ima polno datotek: `0.xml, 1.xml,...`. Zdruzimo ozbo_id informacijo z informacijo taggerja: diff --git a/conllu_to_xml.py b/conllu_to_xml.py new file mode 100644 index 0000000..6d67aac --- /dev/null +++ b/conllu_to_xml.py @@ -0,0 +1,139 @@ +#!/usr/bin/python2 +# encoding: utf-8 +# convert conllu to xml tui + +from __future__ import print_function, unicode_literals, division +import re +import sys + +import xml.dom.minidom as minidom +from lxml import etree +import lxml.builder as builder + + +# regexs +re_item = re.compile(r"(\S+)\t(\S+)\t(\S+)") + + +# cannot do node.set('xml:id', 1), this stuff is needed... 
+def set_xml_attr(node, attribute, value):
+    """Set the xml:-namespaced attribute (e.g. xml:id) on an lxml node."""
+    node.attrib['{http://www.w3.org/XML/1998/namespace}' + attribute] = value
+
+
+def pretty_xml(base):
+    """Serialize an lxml element and re-indent it with tabs via minidom."""
+    rough_xml = etree.tostring(base)
+    reparsed = minidom.parseString(rough_xml)
+    return reparsed.toprettyxml(indent='\t')
+
+
+class Sentence(object):
+    """One sentence: an id plus its [token, lemma, xpos] items, in order."""
+
+    def __init__(self, _id):
+        self.id = _id
+        self.items = []
+
+    def add_item(self, token, lemma, xpos):
+        """Append one [token, lemma, xpos] entry to the sentence items."""
+        self.items.append([token, lemma, xpos])
+
+    def as_xml(self):
+        """Return an <s> element with one <w> or <c> child per item."""
+        base = etree.Element('s')
+        set_xml_attr(base, 'id', str(self.id))
+
+        for id_counter, (token, lemma, xpos) in enumerate(self.items, 1):
+            # TODO: support slo/en xpos...
+            # xpos "Z" with a one-character token becomes <c> (no lemma),
+            # except for the three symbols listed, which stay <w>
+            if xpos == "Z" and token not in ["€", "à", "⅛"] and len(token) == 1:
+                to_add = etree.Element('c')
+            else:
+                to_add = etree.Element('w')
+                to_add.set('lemma', lemma)
+
+            to_add.set('msd', xpos)
+            set_xml_attr(to_add, 'id', "{}.{}".format(self.id, id_counter))
+            to_add.text = token
+            base.append(to_add)
+
+        return base
+
+
+def build_xml(sentences):
+    """
+    Build a TEI <body> holding one <div> with one <p> with all sentences.
+
+    sentences: iterable of objects whose as_xml() returns an element.
+    TODO: different langs, div ids, multiple divs,...
+    """
+    body_build = builder.ElementMaker(
+        namespace="http://www.tei-c.org/ns/1.0",
+        nsmap={
+            None: "http://www.tei-c.org/ns/1.0",
+            'xi': "http://www.w3.org/2001/XInclude"})
+
+    # body
+    body = body_build.body()
+    # NOTE(review): 'si' is ISO 639-1 for Sinhala; Slovenian is 'sl' - confirm
+    set_xml_attr(body, 'lang', 'si')
+
+    # only one div in body
+    total_div = etree.Element('div')
+    set_xml_attr(total_div, 'id', 'div.1')
+    body.append(total_div)
+
+    # only one p in div
+    total_p = etree.Element('p')
+    set_xml_attr(total_p, 'id', 'p.1')
+    total_div.append(total_p)
+
+    # put all sentences in p
+    for sentence in sentences:
+        total_p.append(sentence.as_xml())
+
+    return body
+
+
+def main(filein, fileout):
+    """
+    Convert tab-separated tagger output in filein to TEI-like xml in fileout.
+
+    Lines matching re_item (token, xpos, lemma) extend the current sentence;
+    any other line (e.g. blank) closes it and starts the next one.
+    """
+    sentences = []
+    with open(filein, 'r') as fp:
+        sentence = Sentence(0)
+        for line in fp:
+            # this can be optimized for speed, but for me this
+            # is fast enough and much more readable
+            m = re_item.match(line)
+
+            if not m:
+                sentences.append(sentence)
+                sentence = Sentence(len(sentences))
+            else:
+                # Py2 file lines are byte strings: decode once per field
+                token, xpos, lemma = (g.decode('utf-8') for g in m.groups())
+                sentence.add_item(token, lemma, xpos)
+
+    # keep the last sentence when the input lacks a trailing separator line
+    if sentence.items:
+        sentences.append(sentence)
+
+    # generate xml, pretty print it to an output file
+    xml_tree = build_xml(sentences)
+    xml_str = pretty_xml(xml_tree)
+    with open(fileout, 'w') as fp:
+        print(xml_str.encode('utf-8'), file=fp)
+
+
+if __name__ == '__main__':
+    if len(sys.argv) != 3:
+        print("Uporaba: {} vhodna_datoteka izhodna_datoteka"
+              .format(sys.argv[0]), file=sys.stderr)
+        sys.exit(1)
+    main(sys.argv[1], sys.argv[2])
diff --git a/multiple_conllu_to_xml.py b/multiple_conllu_to_xml.py
new file mode 100644
index 0000000..0b4097e
--- /dev/null
+++ b/multiple_conllu_to_xml.py
@@ -0,0 +1,19 @@
+# Convert every *.txt file in folderin to folderout/<name>.xml (Python 2.7).
+from __future__ import print_function
+import sys
+from os import listdir, path
+sys.path.append(path.dirname(path.dirname(path.abspath(__file__))))
+from conllu_to_xml import main as convert_one
+
+if len(sys.argv) != 3:
+    sys.exit("Uporaba: {} vhodna_mapa izhodna_mapa".format(sys.argv[0]))
+folderin, folderout = sys.argv[1:]
+
+for filename in sorted(listdir(folderin)):
+    if not filename.endswith('.txt'):
+        continue
+    # splitext keeps inner dots: 'a.b.txt' -> 'a.b.xml', not 'a.xml'
+    convert_one(path.join(folderin, filename),
+                path.join(folderout, path.splitext(filename)[0] + '.xml'))
+    print("\r{}".format(filename), end="")
+    sys.stdout.flush()