You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

139 lines
3.7 KiB

# encoding: utf-8
# Convert CoNLL-U-style tab-separated input to TEI XML
from __future__ import print_function, unicode_literals, division
import re
import sys
import xml.dom.minidom as minidom
from lxml import etree
import lxml.builder as builder
# regexes
# One input row: three whitespace-free fields separated by single tabs.
# main() applies it with .match(), i.e. anchored at the start of the line.
re_item = re.compile(r"(\S+)\t(\S+)\t(\S+)")
# node.set('xml:id', 1) does not work, so the attribute key is spelled in
# Clark notation ("{namespace-uri}local-name") with the reserved XML namespace.
def set_xml_attr(node, attribute, value):
    """Set the ``xml:``-prefixed attribute *attribute* on *node* to *value*.

    Args:
        node: element-like object exposing a mutable ``attrib`` mapping.
        attribute: local attribute name without prefix, e.g. ``'id'``.
        value: attribute value to store.
    """
    node.attrib['{http://www.w3.org/XML/1998/namespace}' + attribute] = value
# actually output readable xml
def pretty_xml(base):
    """Return *base* serialized as tab-indented, human-readable XML text.

    The element is serialized with ``etree.tostring`` and re-parsed with
    minidom, because minidom's ``toprettyxml`` does the indentation.

    Args:
        base: root element to serialize.

    Returns:
        The pretty-printed document, including the XML declaration.
    """
    rough_xml = etree.tostring(base)
    reparsed = minidom.parseString(rough_xml)
    return reparsed.toprettyxml(indent='\t')
class Sentence:
    """One sentence: an id plus its ordered ``[token, lemma, xpos]`` items."""

    def __init__(self, _id):
        """Create an empty sentence identified by *_id*."""
        self.id = _id
        self.items = []

    def add_item(self, token, lemma, xpos):
        """Append one ``[token, lemma, xpos]`` triple to the sentence items."""
        self.items.append([token, lemma, xpos])

    def as_xml(self):
        """Create the whole <s> xml element for this sentence.

        The element gets ``xml:id`` set to the sentence id, and one child per
        item with ``xml:id`` of the form ``<sentence id>.<1-based position>``.

        Returns:
            The populated ``<s>`` element.
        """
        base = etree.Element('s')
        set_xml_attr(base, 'id', str(self.id))

        for id_counter, (token, lemma, xpos) in enumerate(self.items, 1):
            # TODO: support slo/en xpos...
            # NOTE(review): the two empty strings below look like characters
            # lost from the reviewed source; as written they can never equal
            # a one-character token. Restore the intended symbols.
            if xpos == "Z" and token not in ["", "à", ""] and len(token) == 1:
                to_add = etree.Element('c')
            else:
                # NOTE(review): the else branch was missing from the reviewed
                # source; assumed that only <w> carries a lemma -- confirm.
                to_add = etree.Element('w')
                to_add.set('lemma', lemma)

            to_add.set('msd', xpos)
            set_xml_attr(to_add, 'id', "{}.{}".format(self.id, id_counter))
            to_add.text = token
            base.append(to_add)

        return base
def build_xml(sentences):
    """Build the <body> element that holds every sentence.

    The result is ``body > div > p > s*``: a single div and a single
    paragraph contain all sentences in input order.

    TODO: different langs, div ids, multiple divs,...

    Args:
        sentences: iterable of objects whose ``as_xml()`` returns an element.

    Returns:
        The ``<body>`` element.
    """
    # NOTE(review): both namespace URIs were empty strings in the reviewed
    # source. The TEI default namespace and the XInclude namespace (suggested
    # by the 'xi' prefix) are assumed here -- confirm against the consumer.
    body_build = builder.ElementMaker(
        nsmap={
            None: "http://www.tei-c.org/ns/1.0",
            'xi': "http://www.w3.org/2001/XInclude"})

    # body
    body = body_build.body()
    set_xml_attr(body, 'lang', 'si')

    # only one div in body
    total_div = etree.Element('div')
    set_xml_attr(total_div, 'id', 'div.1')
    body.append(total_div)

    # only one p in div
    total_p = etree.Element('p')
    set_xml_attr(total_p, 'id', 'p.1')
    total_div.append(total_p)

    # put all sentences in the paragraph
    for sentence in sentences:
        total_p.append(sentence.as_xml())

    return body
def main(filein, fileout):
    """Convert the tab-separated file *filein* to pretty-printed XML in *fileout*.

    Every line matching ``re_item`` adds one item to the current sentence;
    any other line (e.g. a blank line) closes the current sentence. Sentences
    without items are skipped, and a final sentence that is not followed by
    a separator line is still written.

    Args:
        filein: path of the UTF-8 input file.
        fileout: path of the XML file to write (UTF-8).
    """
    # io.open decodes and encodes UTF-8 the same way on Python 2 and 3, so no
    # manual .decode()/.encode() calls are needed.
    import io

    sentences = []
    sentence = Sentence(0)
    with io.open(filein, 'r', encoding='utf-8') as fp:
        for line in fp:
            # this can be optimized for speed, but for me this
            # is fast enough and much more readable
            m = re_item.match(line)
            if not m:
                # separator line: close the sentence if it has content
                if sentence.items:
                    sentences.append(sentence)
                    sentence = Sentence(len(sentences))
                continue

            # NOTE(review): the group numbers were missing from the reviewed
            # source; the columns are assumed to be token, xpos, lemma in
            # that order -- confirm against the input format.
            token = m.group(1)
            xpos = m.group(2)
            lemma = m.group(3)
            sentence.add_item(token, lemma, xpos)

    # do not lose the last sentence when the file lacks a trailing separator
    if sentence.items:
        sentences.append(sentence)

    # generate xml, pretty print it to an output file
    xml_tree = build_xml(sentences)
    xml_str = pretty_xml(xml_tree)
    if isinstance(xml_str, bytes):
        xml_str = xml_str.decode('utf-8')
    with io.open(fileout, 'w', encoding='utf-8') as fp:
        fp.write(xml_str)
        fp.write('\n')
# Script entry point: expects exactly two arguments, the input file and the
# output file.
if __name__ == '__main__':
    if len(sys.argv) != 3:
        print("Uporaba: {} vhodna_datoteka izhodna_datoteks"
              .format(sys.argv[0]), file=sys.stderr)
        # stop here; otherwise the sys.argv lookups below raise IndexError
        sys.exit(1)

    main(sys.argv[1], sys.argv[2])