#!/usr/bin/python2
# encoding: utf-8
# convert conllu to xml tei
from __future__ import print_function, unicode_literals, division

import re
import sys
import xml.dom.minidom as minidom

from lxml import etree
import lxml.builder as builder

# regexes
re_item = re.compile(r"(\S+)\t(\S+)\t(\S+)")


# cannot do node.set('xml:id', 1), this stuff is needed...
def set_xml_attr(node, attribute, value):
    node.attrib['{http://www.w3.org/XML/1998/namespace}' + attribute] = value


# actually output readable xml
def pretty_xml(base):
    rough_xml = etree.tostring(base)
    reparsed = minidom.parseString(rough_xml)
    return reparsed.toprettyxml(indent='\t')


class Sentence:
    def __init__(self, _id):
        self.id = _id
        self.items = []

    def add_item(self, token, lemma, xpos):
        """
        Collect token, lemma and xpos of a gram; added to the sentence items.
        """
        self.items.append([token, lemma, xpos])

    def as_xml(self):
        """
        Create the whole xml element for this sentence from the id and items.
        """
        base = etree.Element('s')
        set_xml_attr(base, 'id', str(self.id))
        id_counter = 1
        for item in self.items:
            token, lemma, xpos = item
            # TODO: support slo/en xpos...
            if xpos == "Z" and token not in ["€", "à", "⅛"] and len(token) == 1:
                to_add = etree.Element('c')
            else:
                to_add = etree.Element('w')
            to_add.set('lemma', lemma)
            to_add.set('msd', xpos)
            set_xml_attr(to_add, 'id', "{}.{}".format(self.id, id_counter))
            to_add.text = token
            id_counter += 1
            base.append(to_add)
        return base


def build_xml(sentences):
    """
    Builds xml from a list of sentences.
    TODO: different langs, div ids, multiple divs, ...
    """
    body_build = builder.ElementMaker(
        namespace="http://www.tei-c.org/ns/1.0",
        nsmap={
            None: "http://www.tei-c.org/ns/1.0",
            'xi': "http://www.w3.org/2001/XInclude"})

    # body
    body = body_build.body()
    set_xml_attr(body, 'lang', 'si')

    # only one div in body
    total_div = etree.Element('div')
    set_xml_attr(total_div, 'id', 'div.1')
    body.append(total_div)

    # only one p in div
    total_p = etree.Element('p')
    set_xml_attr(total_p, 'id', 'p.1')
    total_div.append(total_p)

    # put all sentences in div
    for sentence in sentences:
        total_p.append(sentence.as_xml())

    return body


def main(filein, fileout):
    sentences = []
    with open(filein, 'r') as fp:
        sentence = Sentence(0)
        for line in fp:
            # this can be optimized for speed, but for me this
            # is fast enough and much more readable
            m = re_item.match(line)
            if not m:
                sentences.append(sentence)
                sentence = Sentence(len(sentences))
            else:
                token = m.group(1)
                xpos = m.group(2)
                lemma = m.group(3)
                # using stupid Py2 :(
                sentence.add_item(
                    token.decode('utf-8'),
                    lemma.decode('utf-8'),
                    xpos.decode('utf-8'))
        # flush the last sentence if the file does not end with a blank line
        if sentence.items:
            sentences.append(sentence)

    # generate xml, pretty print it to an output file
    xml_tree = build_xml(sentences)
    xml_str = pretty_xml(xml_tree)
    with open(fileout, 'w') as fp:
        print(xml_str.encode('utf-8'), file=fp)


if __name__ == '__main__':
    if len(sys.argv) != 3:
        print("Usage: {} input_file output_file"
              .format(sys.argv[0]), file=sys.stderr)
        exit(1)
    main(sys.argv[1], sys.argv[2])
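
# A minimal usage sketch (an assumption, not part of the original script):
# the input is expected to be a tab-separated file with three columns per
# token, in the order token<TAB>xpos<TAB>lemma (see the group order in
# main()), with sentences separated by blank lines. The script and file
# names below are hypothetical.
#
#   $ python2 conllu2tei.py tagged.tsv tagged.xml
#
# For an input line such as (using a made-up Slovenian MSD tag)
#   hiša	Sozei	hiša
# the first sentence would contain, roughly,
#   <w lemma="hiša" msd="Sozei" xml:id="0.1">hiša</w>
# inside <s xml:id="0">, while a single-character token tagged "Z"
# (other than €, à, ⅛) is emitted as a <c> element instead of <w>.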