diff --git a/MANIFEST.in b/MANIFEST.in
index e3182fb..a3e95e0 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1 +1,3 @@
-include conversion_utils/resources/jos_specifications.pickle
\ No newline at end of file
+include conversion_utils/resources/jos_specifications.pickle
+include conversion_utils/resources/dict.xml
+include conversion_utils/resources/structure_conversions.csv
diff --git a/conversion_utils/conllu_to_tei.py b/conversion_utils/conllu_to_tei.py
new file mode 100644
index 0000000..64f0ba3
--- /dev/null
+++ b/conversion_utils/conllu_to_tei.py
@@ -0,0 +1,291 @@
+import argparse
+import re
+import sys
+
+from lxml import etree
+
+class Sentence:
+    """One sentence: its tokens plus one dependency link per token."""
+
+    def __init__(self, _id, no_ud=False):
+        """Create an empty sentence.
+
+        Args:
+            _id: Sentence identifier used to build ``xml:id`` values.
+            no_ud: When true, ``as_xml`` leaves out the ``msd`` attribute
+                that carries the UPOS tag.
+        """
+        self._id = _id
+        self.items = []
+        self.links = []
+        self.no_ud = no_ud
+
+    def add_item(self, token, lemma, upos, upos_other, xpos, misc):
+        """Append one token to the sentence.
+
+        Args:
+            token: Surface form; becomes the element text.
+            lemma: Lemma; written only on ``w`` elements.
+            upos: Value written after ``UposTag=`` in the ``msd`` attribute.
+            upos_other: Extra text appended to ``msd`` after ``|``;
+                ``'_'`` means there is nothing to append.
+            xpos: Tag written to ``ana`` with the ``mte:`` prefix.
+            misc: Raw MISC column of the CoNLL-U line.
+        """
+        # MISC may hold several '|'-separated features, so look for the flag
+        # among them instead of comparing the whole column.
+        no_space_after = 'SpaceAfter=No' in misc.split('|')
+        self.items.append([token, lemma, upos, upos_other, xpos, no_space_after])
+
+    def add_link(self, link_ref, link_type):
+        """Append a dependency link for the next token position.
+
+        Args:
+            link_ref: Index of the head token as a string; ``'0'`` makes
+                the link point at the sentence itself.
+            link_type: Label written to ``ana`` with the ``jos-syn:`` prefix.
+        """
+        self.links.append([link_ref, link_type])
+
+    def as_xml(self, id_prefix=None):
+        """Build the ``s`` element for this sentence.
+
+        Args:
+            id_prefix: Optional prefix; when truthy the sentence ``xml:id``
+                is ``<id_prefix>.<_id>``, otherwise it is just ``_id``.
+
+        Returns:
+            An ``s`` element with one ``w`` or ``pc`` child per token,
+            followed by a ``linkGrp`` holding one ``link`` per stored link.
+        """
+        if id_prefix:
+            xml_id = id_prefix + '.' + self._id
+        else:
+            xml_id = self._id
+        base = etree.Element('s')
+        set_xml_attr(base, 'id', xml_id)
+
+        # Token ids are 1-based and prefixed with the sentence id.
+        for id_counter, item in enumerate(self.items, start=1):
+            token, lemma, upos, upos_other, xpos, no_space_after = item
+
+            if xpos in {'U', 'Z'}:  # hmm, safe only as long as U is unused in English tagset and Z in Slovenian one
+                to_add = etree.Element('pc')
+            else:
+                to_add = etree.Element('w')
+                to_add.set('lemma', lemma)
+
+            to_add.set('ana', 'mte:' + xpos)
+            if not self.no_ud:
+                if upos_other != '_':
+                    to_add.set('msd', f'UposTag={upos}|{upos_other}')
+                else:
+                    to_add.set('msd', f'UposTag={upos}')
+
+            set_xml_attr(to_add, 'id', "{}.{}".format(xml_id, id_counter))
+            to_add.text = token
+
+            if no_space_after:
+                to_add.set('join', 'right')
+
+            base.append(to_add)
+
+        link_grp = etree.Element('linkGrp')
+        link_grp.set('corresp', '#' + xml_id)
+        link_grp.set('targFunc', 'head argument')
+        link_grp.set('type', 'JOS-SYN')
+        # The n-th stored link belongs to the n-th token (1-based).
+        for link_id, (link_ref, link_type) in enumerate(self.links, start=1):
+            link = etree.Element('link')
+            link.set('ana', 'jos-syn:' + link_type)
+            dependent = '#' + xml_id + '.' + str(link_id)
+            if link_ref == '0':
+                # Head "0" is rendered as a reference to the sentence element.
+                head = '#' + xml_id
+            else:
+                head = '#' + xml_id + '.' + link_ref
+            link.set('target', head + ' ' + dependent)
+            link_grp.append(link)
+        base.append(link_grp)
+        return base
+
+
+class Paragraph:
+    """An ordered list of sentences rendered as a single ``p`` element."""
+
+    def __init__(self, _id):
+        """Create an empty paragraph identified by ``_id``."""
+        self._id = _id
+        self.sentences = []
+
+    def add_sentence(self, sentence):
+        """Append ``sentence`` to the end of the paragraph."""
+        self.sentences.append(sentence)
+
+    def as_xml(self, id_prefix=None):
+        """Return a ``p`` element containing every sentence's XML.
+
+        The paragraph ``xml:id`` is ``<id_prefix>.<_id>`` when ``id_prefix``
+        is truthy, otherwise ``_id`` alone.
+        """
+        xml_id = (id_prefix + '.' + self._id) if id_prefix else self._id
+
+        paragraph = etree.Element('p')
+        set_xml_attr(paragraph, 'id', xml_id)
+
+        # Sentences are given the caller's prefix, not this paragraph's id.
+        paragraph.extend(
+            sentence.as_xml(id_prefix=id_prefix) for sentence in self.sentences
+        )
+        return paragraph
+
+
+class TeiDocument:
+    """A document: an id plus the paragraphs that make up its body."""
+
+    def __init__(self, _id, paragraphs=None):
+        """Create a document.
+
+        Args:
+            _id: Document identifier, or ``None`` for a document without one.
+            paragraphs: Optional list of paragraphs. The list is stored as
+                given (not copied); when omitted, each document gets its
+                own fresh list.
+        """
+        self._id = _id
+        # A literal list default would be shared by every document created
+        # without the argument, so the empty list is built per instance.
+        self.paragraphs = [] if paragraphs is None else paragraphs
+
+    def as_xml(self):
+        """Build the ``TEI`` root element for this document.
+
+        Returns:
+            A ``TEI`` element with a ``teiHeader`` followed by
+            ``text/body`` holding the paragraphs. The header records, per
+            tag in ``p``, ``s``, ``pc``, ``w``, how many such elements occur
+            under ``text``.
+        """
+        root = etree.Element('TEI')
+        # The namespace is written as a plain attribute; the elements built
+        # here are not namespace-qualified, which is why the un-prefixed
+        # XPath count below matches them.
+        root.set('xmlns', 'http://www.tei-c.org/ns/1.0')
+        set_xml_attr(root, 'lang', 'sl')
+
+        xml_id = self._id
+        if xml_id is not None:
+            set_xml_attr(root, 'id', xml_id)
+
+        tei_header = etree.SubElement(root, 'teiHeader')
+
+        text = etree.SubElement(root, 'text')
+        body = etree.SubElement(text, 'body')
+        for para in self.paragraphs:
+            body.append(para.as_xml(id_prefix=xml_id))
+
+        # The usage counts need the finished body, so the header content is
+        # filled in only after the paragraphs have been appended.
+        encoding_desc = etree.SubElement(tei_header, 'encodingDesc')
+        tags_decl = etree.SubElement(encoding_desc, 'tagsDecl')
+        namespace = etree.SubElement(tags_decl, 'namespace')
+        namespace.set('name', 'http://www.tei-c.org/ns/1.0')
+        for tag in ['p', 's', 'pc', 'w']:
+            count = int(text.xpath('count(.//{})'.format(tag)))
+            tag_usage = etree.SubElement(namespace, 'tagUsage')
+            tag_usage.set('gi', tag)
+            tag_usage.set('occurs', str(count))
+        return root
+
+    def add_paragraph(self, paragraph):
+        """Append ``paragraph`` to the document body."""
+        self.paragraphs.append(paragraph)
+
+
+def build_tei_etrees(documents):
+    """Return the ``as_xml()`` result of each document, in input order."""
+    return [doc.as_xml() for doc in documents]
+
+
+def set_xml_attr(node, attribute, value):
+    """Set ``attribute`` on ``node`` under the reserved XML namespace."""
+    qualified_name = '{http://www.w3.org/XML/1998/namespace}' + attribute
+    node.attrib[qualified_name] = value
+
+
+def parse_metaline(line):
+    """Split a comment line of the form ``# key = value``.
+
+    Args:
+        line: The raw comment line, with or without a trailing newline.
+
+    Returns:
+        A ``(key, value)`` tuple. ``key`` is the text before the first
+        ``=`` with every ``#`` removed and surrounding whitespace stripped.
+        ``value`` is the stripped text after the first ``=``, or ``None``
+        when the line has no ``=`` or nothing but whitespace follows it.
+    """
+    raw_key, _, raw_val = line.partition('=')
+    key = raw_key.replace('#', '').strip()
+    # An empty value is reported as None, never '', so that callers testing
+    # "is not None" do not end up writing an empty identifier.
+    val = raw_val.strip() or None
+    return (key, val)
+
+
+def is_metaline(line):
+    """Tell whether ``line`` starts with ``# <something> =``."""
+    return re.match('# .+ =.*', line) is not None
+
+
+def construct_tei_documents(conllu_lines):
+    """Group CoNLL-U lines into ``TeiDocument`` objects.
+
+    A ``# newdoc id = ...`` line closes the current document and starts a
+    new one; a ``# newpar id = ...`` line closes the current paragraph.
+    ``# sent_id`` lines and every non-blank line that is not a metadata
+    line are buffered for the current paragraph; other metadata lines are
+    dropped.
+
+    Args:
+        conllu_lines: Iterable of lines (an open file works).
+
+    Returns:
+        A list of ``TeiDocument`` objects; documents without any paragraph
+        are not emitted.
+    """
+    documents = []
+
+    doc_id = None
+    document_paragraphs = []
+
+    para_id = None
+    para_buffer = []
+
+    for line in conllu_lines:
+        if is_metaline(line):
+            key, val = parse_metaline(line)
+            if key == 'newdoc id':
+                if para_buffer:
+                    document_paragraphs.append(construct_paragraph(para_id, para_buffer))
+                    # Clear the buffer: otherwise the lines just flushed
+                    # would be emitted a second time, as the first
+                    # paragraph of the document that starts here.
+                    para_buffer = []
+                if document_paragraphs:
+                    documents.append(
+                        TeiDocument(doc_id, document_paragraphs))
+                    document_paragraphs = []
+                doc_id = val
+            elif key == 'newpar id':
+                if para_buffer:
+                    document_paragraphs.append(construct_paragraph(para_id, para_buffer))
+                    para_buffer = []
+                para_id = val
+            elif key == 'sent_id':
+                # Kept in the buffer so the paragraph builder can split
+                # the lines into sentences.
+                para_buffer.append(line)
+        else:
+            if not line.isspace():
+                para_buffer.append(line)
+
+    # Flush whatever is still pending once the input is exhausted.
+    if para_buffer:
+        document_paragraphs.append(construct_paragraph(para_id, para_buffer))
+
+    if document_paragraphs:
+        documents.append(
+            TeiDocument(doc_id, document_paragraphs))
+
+    return documents
+
+
+def construct_paragraph(para_id, conllu_lines):
+    """Build a ``Paragraph`` from the buffered lines of one paragraph.
+
+    Each ``# sent_id = ...`` line starts a new sentence; the non-blank,
+    non-metadata lines that follow it belong to that sentence.
+    """
+    paragraph = Paragraph(para_id)
+
+    current_id = None
+    pending = []
+
+    for line in conllu_lines:
+        if not is_metaline(line):
+            if not line.isspace():
+                pending.append(line)
+            continue
+
+        key, val = parse_metaline(line)
+        if key != 'sent_id':
+            continue
+        # A new sentence id closes the sentence collected so far.
+        if pending:
+            paragraph.add_sentence(construct_sentence(current_id, pending))
+            pending = []
+        current_id = val
+
+    if pending:
+        paragraph.add_sentence(construct_sentence(current_id, pending))
+
+    return paragraph
+
+
+def construct_sentence(sent_id, lines):
+    """Build a ``Sentence`` from the token lines of one sentence.
+
+    Args:
+        sent_id: Identifier given to the new sentence.
+        lines: Tab-separated token lines; lines starting with ``#`` and
+            blank lines are skipped.
+
+    Returns:
+        A ``Sentence`` with one item and one link per token line.
+
+    Raises:
+        IndexError: If a token line has fewer than ten tab-separated fields.
+    """
+    sentence = Sentence(sent_id)
+    for line in lines:
+        # "not line.strip()" also covers the empty string, which
+        # str.isspace() does not report as blank.
+        if line.startswith('#') or not line.strip():
+            continue
+        # Drop the whole line terminator, including a carriage return, so
+        # the last column (MISC) is compared without stray characters.
+        columns = line.rstrip('\r\n').split('\t')
+
+        sentence.add_item(
+            columns[1],  # token
+            columns[2],  # lemma
+            columns[3],  # upos
+            columns[5],  # upos_other
+            columns[4],  # xpos
+            columns[9])  # misc
+
+        sentence.add_link(
+            columns[6],  # index of the head token
+            columns[7])  # link label
+    return sentence
+
+
+def construct_tei_etrees(conllu_lines):
+    """Parse CoNLL-U lines and return one XML root element per document."""
+    return [doc.as_xml() for doc in construct_tei_documents(conllu_lines)]
+
+
+def convert_file(input_file_name, output_file_name):
+    """Convert a CoNLL-U file to a TEI XML file.
+
+    Only the first document found in the input is written.
+
+    Args:
+        input_file_name: Path of the CoNLL-U file, read as UTF-8.
+        output_file_name: Path of the XML file to write.
+
+    Raises:
+        IndexError: If the input yields no document at all.
+    """
+    # The context manager closes the input even when parsing fails.
+    with open(input_file_name, 'r', encoding='utf-8') as input_file:
+        root = construct_tei_etrees(input_file)[0]
+
+    tree = etree.ElementTree(root)
+    tree.write(output_file_name, pretty_print=True, encoding='utf-8')
+
+
+if __name__ == '__main__':
+    # Command-line entry point: convert every matching CoNLL-U file and
+    # write the resulting TEI documents one after another.
+    from glob import glob
+
+    parser = argparse.ArgumentParser(description='Convert CoNNL-U to TEI.')
+    parser.add_argument('files', nargs='+', help='CoNNL-U file')
+    parser.add_argument('-o', '--out-file', dest='out', default=None,
+                        help='Write output to file instead of stdout.')
+
+    args = parser.parse_args()
+
+    if args.out:
+        f_out = open(args.out, 'w', encoding='utf-8')
+    else:
+        f_out = sys.stdout
+
+    try:
+        for arg in args.files:
+            # Each argument may itself be a glob pattern.
+            for f in glob(arg):
+                with open(f, 'r', encoding='utf-8') as conllu_f:
+                    tei_etrees = construct_tei_etrees(conllu_f)
+                for tei_etree in tei_etrees:
+                    f_out.write(etree.tostring(tei_etree, pretty_print=True, encoding='utf-8').decode())
+    finally:
+        # Close only a file opened here; stdout is left alone.
+        if f_out is not sys.stdout:
+            f_out.close()
diff --git a/conversion_utils/resources/dict.xml b/conversion_utils/resources/dict.xml
new file mode 100644
index 0000000..ad872d3
--- /dev/null
+++ b/conversion_utils/resources/dict.xml
@@ -0,0 +1,14 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/conversion_utils/resources/structure_conversions.csv b/conversion_utils/resources/structure_conversions.csv
new file mode 100644
index 0000000..07a3dd8
--- /dev/null
+++ b/conversion_utils/resources/structure_conversions.csv
@@ -0,0 +1,193 @@
+gramrel|struktura|Name_ID|DSB|Simon|Simon-1|NSSS|Pov-REFL|Pov-NEG|Zgled|LBS|Opomba-1|Opomba-2|Opomba-3|Flip
+S_G-inf|gbz Inf-GBZ|gg-ggn|12|50||12|||uspeti VIDETI|LBS-071|brez povratnih|||
+S_nedoločnik|GBZ Inf-gbz|gg-ggn|12|50||12|||USPETI videti|LBS-071|brez povratnih|||
+S_%s_g2|GBZ %s sbz2|gg-d-s2|13|34||13|||ODGOVORITI brez razmisleka|LBS-044|brez povratnega|||
+S_%s_x_g2|gbz %s SBZ2|gg-d-s2|13|34||13|||skočiti s STOLA|LBS-044|brez povratnega|||
+S_%s_g4|GBZ %s sbz4|gg-d-s4|14|36||14|||VNAŠATI v telo|LBS-046|brez povratnega|||
+S_%s_x_g4|gbz %s SBZ4|gg-d-s4|14|36||14|||voziti v ŠOLO|LBS-046|brez povratnega|||
+S_%s_x_g5|gbz %s SBZ5|gg-d-s5|15|37||15|||očistiti po UPORABI|LBS-047|brez povratnega|||
+S_%s_g5|GBZ %s sbz5|gg-d-s5|15|37||15|||UŽIVATI v vožnji|LBS-047|brez povratnega|||
+S_%s_x_g6|gbz %s SBZ6|gg-d-s6|16|38||16|||hoditi z BERGLAMI|LBS-048|brez povratnega|glej zgoraj opombo v vrstici 17, isti problem, drugi prvi del||
+S_%s_g6|GBZ %s sbz6|gg-d-s6|16|38||16|||VZGAJATI z ljubeznijo|LBS-048|brez povratnega|||
+S_%s_g3|GBZ %s sbz3|gg-d-s3|17|35||17|||RAVNATI proti vesti|LBS-045|brez povratnega|||
+S_%s_x_g3|gbz %s SBZ3|gg-d-s3|17|35||17|||steči proti VRATOM|LBS-045|brez povratnega|||
+S_kakšen-g?|gbz PBZ1|gg-p1|18|16||18|||pozirati GOL|LBS-019|brez povratnega|||
+S_kakšen-p?|GBZ pbz1|gg-p1|18|16||18|||TEČI bos|LBS-019|brez povratnega|||
+S_je_za_g|gbz RBZ||20|||||||LBS-023||združiti s strukturo 43||43
+S_kako-kdaj-za_g?|GBZ rbz||20|||||||LBS-023||združiti s strukturo 43||43
+S_v_rodil|GBZ sbz2||21|6||||||LBS-006||nova struktura 108||
+S_koga-česa|gbz SBZ2||21|6||||||LBS-006||nova struktura 108||
+S_v_dajal|GBZ sbz3|gg-s3|22|7||22|||SPREGOVORITI ljudstvu|LBS-007|brez povratnega|||
+S_komu-čemu|gbz SBZ3|gg-s3|22|7||22|||zaupati VODSTVU|LBS-007|brez povratnega|||
+S_koga-kaj|gbz SBZ4|gg-s4|23|8||23|||pogrniti MIZO|LBS-008|brez povratnega|||
+S_v_tožil|GBZ sbz4|gg-s4|23|8||23|||STISNITI sok|LBS-008|brez povratnega|||
+S_zanikani|Neg-gbz SBZ2|l-gg-s2|24|11|2|24||23|ne prepuščati ZRAKA|LBS-011||||
+S_z_nikalnim|Neg-GBZ sbz2|l-gg-s2|24|11|2|24||23|ne STISNITI zavore |LBS-011||||
+S_G-neg-inf|Neg-gbz Inf-GBZ|l-gg-ggn|25|51||25||12|ne ZAČETI teči|LBS-072|brez povratnih|pojavlja se tudi Neg-gbz Inf GBZ (Neg z veliko)||
+S_%s_x_p2|pbz0 %s SBZ2|p0-d-s2|26|39||26|||izdelan iz BOMBAŽA|LBS-054||||
+S_%s_p2|PBZ0 %s sbz2|p0-d-s2|26|39||26|||VELJAVEN do decembra|LBS-054||||
+S_%s_x_p4|pbz0 %s SBZ4|p0-d-s4|27|41||27|||alergičen na SONCE|LBS-056||||
+S_%s_p4|PBZ0 %s sbz4|p0-d-s4|27|41||27|||VAREN za uživanje|LBS-056||||
+S_%s_x_p5|pbz0 %s SBZ5|p0-d-s5|28|42||28|||prepričan o USPEHU|LBS-057||||
+S_%s_p5|PBZ0 %s sbz5|p0-d-s5|28|42||28|||UTRUJEN po tekmi|LBS-057||||
+S_%s_p6|PBZ0 %s sbz6|p0-d-s6|29|43||29|||IZVOLJEN z večino|LBS-058||||
+S_%s_x_p6|pbz0 %s SBZ6|p0-d-s6|29|43||29|||razočaran nad ODLOČITVIJO|LBS-058|glej zgoraj opombo v vrstici 17, isti problem, drugi prvi del|||
+S_priredje|PBZ0 in/ali pbz0|p0-vp-p0|30|64||30|||DOBER in/ali SLAB|LBS-099||||
+S_%s_x_p3|pbz0 %s SBZ3|p0-d-s3|31|40||31|||nagnjen k ZLORABI|LBS-055||||
+S_%s_p3|PBZ0 %s sbz3|p0-d-s3|31|40||31|||NAMENJEN proti domu|LBS-055||||
+S_primera|PBZ0 kot sbz0|p0-vd-s0|32|21||32|||LAČEN kot volk / utrujen kot PES|LBS-025||||
+S_je_za_p|pbz0 RBZ||33|||||||LBS-028||združiti s strukturo 46||46
+S_kako-kdaj-za_p?|PBZ0 rbz||33|||||||LBS-028||združiti s strukturo 46||46
+S_kakšen?|pbz0 SBZ0|p0-s0|34|1||34|||bela ZASTAVA|LBS-001||||
+S_kdo-kaj?|PBZ0 sbz0|p0-s0|34|1||34|||RDEČA jagoda|LBS-001||||
+S_p-koga-česa|pbz0 SBZ2|p0-s2|35|2||35|||[biti] obtožen UTAJE|LBS-002||||
+S_v_rodil-p|PBZ0 sbz2|p0-s2|35|2||35|||[biti] VESEL uspeha|LBS-002||||
+S_v_dajal-p|PBZ0 sbz3|p0-s3|36|3||36|||[biti] NAMENJEN vzgoji|LBS-003||||
+S_p-komu-čemu|pbz0 SBZ3|p0-s3|36|3||36|||[biti] zvest GOSPODARJU|LBS-003||||
+S_oba-v-rod|PBZ2 sbz2|p2-s2|37|4||37|||[biti] DOBRE volje|LBS-004||||
+S_oba-v-rod|pbz2 SBZ2|p2-s2|37|4||37|||[biti] dobrega SRCA|LBS-004||||
+S_%s_r2|RBZ %s sbz2|r-d-s2|38|44||38|||DANES do polnoči|LBS-064||||
+S_%s_r3|RBZ %s sbz3|r-d-s3|39|45||39|||VČERAJ proti večeru|LBS-065||||
+S_%s_r5|RBZ %s sbz5|r-d-s5|40|47||40|||VIDNO v zadregi|LBS-067||||
+S_%s_r4|RBZ %s sbz4|r-d-s4|41|46||41|||MALO za šalo|LBS-066||||
+S_%s_r6|RBZ %s sbz6|r-d-s6|42|48||42|||JUTRI pred odhodom|LBS-068|||vse so strukture|
+S_kako-kdaj_g?|rbz GBZ|r-gg|43|19||43|||čvrsto STISNITI|LBS-022|brez povratnega|||
+S_je_pred_g|RBZ gbz|r-gg|43|19||43|||DEBELO gledati|LBS-022|brez povratnega|||
+S_priredje|RBZ in/ali rbz|r-vp-r|44|67||44|||VČERAJ in/ali DANES|LBS-099||samo prvega zapišemo z velikimi črkami||
+S_primera|RBZ kot sbz0|r-vd-s0|45|22||45|||TIHO kot miš / mrzlo kot VRAG|LBS-025||||
+S_kako-kdaj_p?|rbz PBZ0|r-p0|46|24||46|||dovolj ZREL|LBS-027||sprememba v pridevniško lemo||
+S_je_pred_p|RBZ pbz0|r-p0|46|24||46|||RESNIČNO izjemen|LBS-027||sprememba v pridevniško lemo||
+S_količinski|rbz SBZ2|r-s2|47|28||47|||malo MOŽNOSTI|LBS-033||||
+S_količina_ob-s|RBZ sbz2|r-s2|47|28||47|||VELIKO ljudi|LBS-033||||
+S_%s_x_s2|sbz0 %s SBZ2|s0-d-s2|48|29||48|||dan brez AVTOMOBILA|LBS-034||||
+S_%s_s2|SBZ0 %s sbz2|s0-d-s2|48|29||48|||LISTINA iz spisa |LBS-034||||
+S_%s_s3|SBZ0 %s sbz3|s0-d-s3|49|30||49|||DOPOLNILO k zahvali |LBS-035||||
+S_%s_x_s3|sbz0 %s SBZ3|s0-d-s3|49|30||49|||poziv k ODSTOPU|LBS-035||||
+S_%s_s6|SBZ0 %s sbz6|s0-d-s6|50|33||50|||IZBOR med kandidati|LBS-038|"pri predlogu z se pojavljajo tudi zapisi ""SBZ0 s/z sbz6"" in ""SBZ0 z/s sbz6"", ker je ""s"" variantna oblika od ""z"" (lemma od obeh je ""z"")"|povsem enako kot za s/z velja za h/k (lemma od obeh je k)||
+S_%s_x_s6|sbz0 %s SBZ6|s0-d-s6|50|33||50|||odnos s PARTNERJEM|LBS-038|glej zgoraj opombo v vrstici 17, isti problem, drugi prvi del|||
+S_%s_s4|SBZ0 %s sbz4|s0-d-s4|51|31||51|||OBESEK za ključ|LBS-036||||
+S_%s_x_s4|sbz0 %s SBZ4|s0-d-s4|51|31||51|||predavanje na TEMO|LBS-036||||
+S_%s_s5|SBZ0 %s sbz5|s0-d-s5|52|32||52|||OTOK ob obali|LBS-037||||
+S_%s_x_s5|sbz0 %s SBZ5|s0-d-s5|52|32||52|||pesmica o SREČI|LBS-037||||
+S_v_rodil-s|SBZ0 sbz2|s0-s2|53|13||53|||PRANJE denarja|LBS-016||||
+S_s-koga-česa|sbz0 SBZ2|s0-s2|53|13||53|||utaja DAVKOV|LBS-016||||
+S_v_dajal-s|SBZ0 sbz3|s0-s3|54|14||54|||PISMO predsedniku|LBS-017||||
+S_s-komu-čemu|sbz0 SBZ3|s0-s3|54|14||54|||zahvala SPONZORJU|LBS-017||||
+S_prislov_prislov|rbz RBZ|r-r|55|25||55|||zares IZJEMNO / ENAKO besno|LBS-030||||
+S_s_prislovom|s prislovom||56|||||||LBS-012||brisati tudi podatke||
+S_s_prislovom|s prislovom||56|||||||LBS-012||brisati tudi podatke||
+S_Vez_P1|sbz1 Vez-gbz PBZ1|s1-gp-p1|57|5||57|||hruška je ZRELA|LBS-005||||
+S_S1_Vez_P|SBZ1 Vez-gbz pbz1|s1-gp-p1|57|5||57|||REZULTATI so dobri|LBS-005||||
+||gg-zp-s3||7||68|22||izneveriti se tradiciji|LBS-007|samo povratni (nova)|||
+||gg-zp-s4||8||69|23||ogledati si posnetek|LBS-008|samo povratni (nova)|||
+S_osebek_od|SBZ1 gbz|s1-gg||9||70|||PANIKA zavlada|LBS-009|brez povratnega|||
+S_osebek_je|sbz1 GBZ|s1-gg||9||70|||večina RAZUME |LBS-009|brez povratnega|||
+||s1-zp-gg||9||71|70||človek se USTRAŠI|LBS-009|samo povratni (nova)|||
+S_n-osebek_je|sbz1 Neg-GBZ|s1-l-gg||10|1|72||70|voda ne TEČE|LBS-010|brez povratnega|||
+S_n-osebek_od|SBZ1 neg-gbz|s1-l-gg||10|1|72||70|ČLOVEK ne vidi |LBS-010|brez povratnega|||
+||s1-zp-l-gg||10|1|73|72|70|vernik se ne BRIJE |LBS-010|samo povratni (nova)|||
+S_zanikani|Neg-gbz SBZ2|ggz-s2||11|1|74||23|ne hoteti ODGOVORA|LBS-011|brez povratnega|||
+S_z_nikalnim|Neg-GBZ sbz2|ggz-s2||11|1|74||23|ne IMETI znanja|LBS-011|brez povratnega|||
+||l-gg-zp-s2||11|1|75|24|23|ne ZVITI si noge|LBS-011|samo povratni (nova)|||
+S_sam-im_sam|sbz0 SBZ0|s0-s0||12||76|||angina PECTORIS|LBS-014||||
+S_im_sam-sam|SBZ0 sbz0|s0-s0||12||76|||ČLOVEK pajek|LBS-014||||
+S_Vez_S|SBZ1 Vez-gbz sbz1|s0-gp-s1||15||77|||STRIC je partizan|LBS-018||||
+||gg-zp-p1||16||78|18||zdeti se premagan|LBS-019|samo povratni (nova)|||
+S_kakšnega-p|Vez-gbz PBZ4|gg-p4|19|17||19|||pustiti SAMEGA|LBS-020|brez povratnega|to je v resnici številka 19 v DSB|SPREMLJATI REZULTAT!|
+S_kakšnega-g?|Vez-GBZ pbz4|gg-p4|19|17||19|||VIDETI nasmejanega|LBS-020|brez povratnega|to je v resnici številka 19 v DSB|SPREMLJATI REZULTAT!|
+||gg-zp-p4||17||80|19||počutiti se osamljenega|LBS-020|samo povratni (nova)|||
+||r-zp-gg||19||81|43||močno se prestrašiti|LBS-022|samo povratni (nova)|||
+S_primera|GBZ kot sbz0|gg-vd-s0||20||82|||BRUHATI kot vidra|LBS-025|brez povratnega|||
+||gg-zp-vd-s0||20||83|82||pojavljati se kot mora|LBS-025|samo povratni (nova)|||
+S_primera|SBZ0 kot sbz0|s0-vd-s0||23||84|||ZDRAVJE kot vrednota / vrednost kot OSNOVA|LBS-025||NE IZPISUJE 'KOT'!||
+S_kako-kdaj_r|RBZ Vez-gbz pbz1|r-p1||26||85|||[biti] objavljen včeraj|LBS-031||||
+||gg-zp-d-s2||34||86|13||pripeljati se do banke|LBS-044|samo povratni (nova)|||
+||gg-zp-d-s3||35||87|17||odločiti se kljub nasprotovanju|LBS-045|samo povratni (nova)|||
+||gg-zp-d-s4||36||88|14||voziti se v službo|LBS-047|samo povratni (nova)|||
+||gg-zp-d-s5||37||89|15||kopati se v morju|LBS-047|samo povratni (nova)|||
+||gg-zp-d-s6||38||90|16||pripeljati se z mopedom|LBS-048|samo povratni (nova)|||
+S_%s_r|sbz0 %s RBZ|s0-d-r||49||91|||prijatelj za VEDNO|LBS-069|||popravljeno|
+||gg-zp-ggn||||92|12||prizadevati si poiskati|LBS-071|samo povratni prvi glagol (nova)|||
+||gg-ggn-zp||||93|12||utegniti zaplesti se|LBS-071|samo povratni drugi glagol (nova)|||
+||gg-zp-ggn-zp||||94|92/93||odločiti se prodati se|LBS-071|samo povratni oba glagola (nova)|||
+||l-gg-zp-ggn||51||95|25|92|ne dati se predvideti|LBS-072|samo povratni prvi glagol (nova)|||
+||l-gg-ggn-zp||51||96|25|93|ne uspeti udeležiti se|LBS-072|samo povratni drugi glagol (nova)|||
+||l-gg-zp-ggn-zp||51||97|25|94|ne bati se pokazati se|LBS-072|samo povratni oba glagola (nova)|||
+S_R-inf|rbz Vez-gbz Inf-GBZ|r-ggn||52||98|||[biti] bolje PRESEKATI|LBS-073|brez povratnega|||
+||r-ggn-zp||52||99|98||[biti] bolje smejati se|LBS-073|samo povratni (nova)|||
+S_P-inf|pbz1 Vez-gbz Inf-GBZ|p1-ggn||54||100|||[biti] prisiljen ZAPRETI|LBS-075|brez povratnega|||
+||p1-ggn-zp||54||101|100||[biti] pripravljen zadolžiti se|LBS-075|samo povratni (nova)|||
+S_S-inf|sbz1 Vez-gbz Inf-GBZ|s1-ggn||56||102|||pravica POČETI |LBS-077|brez povratnega|||
+||s1-ggn-zp||56||103|||pravica odločati se|LBS-077|samo povratni (nova)|||
+S_namenilnik|gbz Nam-GBZ|gg-ggm||58||104|||iti PLAVAT|LBS-079|brez povratnega|||
+||gg-ggm-zp||58||105|||iti ogledat si|LBS-079|samo povratni (nova)|||
+S_priredje|priredje|s0-vp-s0||65||106|||VINO in/ali PIVO|LBS-099||"to najdemo tudi kot ""SBZ0 in sbz0"", ""SBZ0 in/ali sbz0"""||
+S_priredje|GBZ in/ali gbz|gg-vp-gg||66||107|||GOVORITI in/ali ŠEPETATI|LBS-099||"ta struktura je lahko ""GBZ in gbz"", ""GBZ ali gbz"", ""GBZ in/ali gbz"", pa tudi ""GBZ in/ali GBZ"" (oba elementa zapisana z veliko)"|za pretvorbo damo samo prvega z velikimi črkami|
+||gg-zp-s2||6||108|||BATI se maščevanja|LBS-006|samo povratni (nova)|bivša struktura 21||
+||gg-zp-s2||6||108|||želeti si ZDRAVJA|LBS-006|samo povratni (nova)|bivša struktura 22||
+S_n-osebek_je|sbz1 Neg-GBZ|s1-ggz||10|2||||država NIMA |LBS-010|odstranimo?|||
+S_n-osebek_od|SBZ1 neg-gbz|s1-ggz||10|2||||ČLOVEK nima|LBS-010|odstranimo?|||
+S_neg-kakšnega-g?|Neg-GBZ pbz2|||18|1/2||||ne RAZUMETI prebranega / ne IMETI zaposlenih|LBS-021|odstranimo?|||
+S_neg-kakšnega-p|Neg-gbz PBZ2|||18|1/2||||ne želeti SLABEGA / ne hoteti SLABEGA|LBS-021|odstranimo?|||
+S_kdo-kaj_r|sbz1 Vez-gbz RBZ|||27|||||vzdušje je IZJEMNO|LBS-032|odstranimo|NESMISELNI REZULTATI - PRIDEVNIK, ponavlja NSSS id-57||
+S_R-neg-inf|rbz Neg-Vez-gbz Inf-GBZ|||53|1||||ne (biti) lahko ZAPUSTITI|LBS-074|odstranimo|||
+S_P-neg-inf|pbz1 Neg-Vez-gbz Inf-GBZ|||55|1||||ne (biti) dolžen OBJAVITI|LBS-076|odstranimo|||
+S_S-neg-inf|sbz1 Neg-Vez-gbz Inf-GBZ|||57|1||||ne (vzeti) časa POISKATI|LBS-078|odstranimo|||
+S_biti_s2|vez-gbz SBZ2|||59|||||biti brez VOLJE|LBS-082|odstranimo|to pokriva id 34||
+S_biti_s3|vez-gbz SBZ3|||60|||||biti proti VOLJi|LBS-083|odstranimo|to pokriva id 35||
+S_biti_s4|vez-gbz SBZ4|||61|||||biti na VOLJO|LBS-084|odstranimo|to pokriva id 36||
+S_biti_s5|vez-gbz SBZ5|||62|||||biti po VOLJI|LBS-085|odstranimo|to pokriva id 37||
+S_biti_s6|vez-gbz SBZ6|||63|||||biti pod VPLIVOM|LBS-086|odstranimo|to pokriva id 38||
+|PBZ0 gbz|||||||||***||||
+S_vezni|gbz SBZ1|||||||||LBS-013||||
+S_s-kdo-kaj|sbz0 SBZ1|||||||||LBS-015||||
+S_v_imen-s|SBZ0 sbz1|||||||||LBS-015||||
+S_veznik_enob|SBZ0 Odv|||||||||LBS-024||||
+S_veznik_enob|RBZ Odv|||||||||LBS-024||||
+S_veznik_enob|PBZ0 Odv|||||||||LBS-024||||
+S_veznik_enob|GBZ Odv|||||||||LBS-024||||
+S_simile|primera|||||||||LBS-025|"replicira drugo stran ""primera"""|||
+S_predl-pred|s predlogom|||||||||LBS-026||||
+V_biti_videti|biti videti %s|||||||||LBS-029||ta je pri pridevniku||
+V_biti_videti|biti videti|||||||||LBS-033||bi moralo biti struktura||
+S_%(3.lempos)_x_s2||||||||||LBS-039| |||
+S_%(3.lempos)_x_s3||||||||||LBS-040| |||
+S_%(3.lempos)_x_s4||||||||||LBS-041| |||
+S_%(3.lempos)_x_s5||||||||||LBS-042| |||
+S_%(3.lempos)_x_s6||||||||||LBS-043| |||
+S_%(3.lempos)_x_g2||||||||||LBS-049| |||
+S_%(3.lempos)_x_g3||||||||||LBS-050| |||
+S_%(3.lempos)_x_g4||||||||||LBS-051| |||
+S_%(3.lempos)_x_g5||||||||||LBS-052| |||
+S_%(3.lempos)_x_g6||||||||||LBS-053| |||
+S_%(3.lempos)_x_p2||||||||||LBS-059| |||
+S_%(3.lempos)_x_p3||||||||||LBS-060| |||
+S_%(3.lempos)_x_p4||||||||||LBS-061| |||
+S_%(3.lempos)_x_p5||||||||||LBS-062| |||
+S_%(3.lempos)_x_p6||||||||||LBS-063| |||
+S_nedoločnik|SBZ0 Inf-gbz|||||||||LBS-070||||
+S_nedoločnik|RBZ Inf-gbz|||||||||LBS-070||||
+S_nedoločnik|PBZ0 Inf-gbz|||||||||LBS-070||||
+S_G_GInf_O4|gbz Inf-gbz SBZ4|||||||||LBS-080||||
+S_Vez_Inf_S|SBZ1 vez-gbz Inf-gbz|||||||||LBS-081 |||ok|
+O_z_lastnim_imenom|pogosto z lastnim imenom|||||||||LBS-087||manjka|ok|
+O_s_števili|pogosto s števili|||||||||LBS-088||"manjka (je mogooče ""pogosto s števili"")?"||
+V_S_V_O3_O4|kdo/kaj G komu kaj|||||||||LBS-089||||
+V_S_V_O3_O2|kdo/kaj G komu koga/česa|||||||||LBS-090||||
+V_S_V_O4_predl|kdo/kaj G koga/kaj + predlog|||||||||LBS-091||||
+V_S_V_O3|kdo/kaj G komu|||||||||LBS-092||||
+V_gl_Cit|cit GBZ|||||||||LBS-093||tu je bil prej vzorec, spremenjeno v struktura, ker je tako v bazi||
+O_povratni_se|povratni (se)|||||||||LBS-094||||
+O_povratni_si|povratni (si)|||||||||LBS-095||||
+O_nedoločnik_cs|pogosto v nedoločniku|||||||||LBS-096||||
+O_tretja_oseba|pogosto v tretji osebi|||||||||LBS-097||||
+V_lahko_G|mod-rbz GBZ|||||||||LBS-098||ta je pri glagolu||
+V_lahko_G|mod-rbz GBZ|||||||||LBS-098||tu je bil prej vzorec, spremenjeno v struktura, ker je tako v bazi||
+S_z_vezajem|rbz1-RBZ0|||||||||LBS-100||||
+S_z_vezajem|pbz1{-}PBZ0|||||||||LBS-100||možna je tudi verzija PBZ1{-}pbz0||
+O_zanikanje|pogosto zanikan|||||||||LBS-101||||
+O_količina|s količino|||||||||LBS-102||||
+S_veznik_dvob|dvobesedni veznik|||||||||LBS-103||||
+S_d_sam_d|z dvema predlogoma|||||||||LBS-104||struktura ali vzorec?||
+S_gl_K_sam|predlog s števnikom|||||||||LBS-105||je tole sploh pravilno zapisano?||
+S_gl_K_sam|zveze s števniki|||||||||LBS-105||Simon, tu je treba pogledati š (je čudno napisan)||
diff --git a/conversion_utils/tei_to_dictionary.py b/conversion_utils/tei_to_dictionary.py
new file mode 100644
index 0000000..89992cb
--- /dev/null
+++ b/conversion_utils/tei_to_dictionary.py
@@ -0,0 +1,59 @@
+import argparse
+import lxml.etree as lxml
+
+from conversion_utils.utils import xpath_find, get_xml_id, TEI_NAMESPACE_QUALIFIER
+
+def get_parsed_unit_string(parsed_unit):
+    """Return the text of all ``tei:w``/``tei:pc`` children as one string.
+
+    A space follows every element except those with ``join="right"``;
+    leading and trailing whitespace is stripped from the result.
+    """
+    pieces = []
+    for element in xpath_find(parsed_unit, 'tei:w|tei:pc'):
+        pieces.append(element.text)
+        if element.get('join') != 'right':
+            pieces.append(' ')
+    return ''.join(pieces).strip()
+
+# Convert a TEI file into a dictionary XML file: one <entry> is written for
+# every tei:s element found under tei:text/tei:body/tei:p of the input.
+# input_file_name is parsed with lxml; output_file_name receives the
+# pretty-printed result encoded as UTF-8.
+def convert(input_file_name, output_file_name):
+
+ output_root = lxml.Element('dictionary')
+
+ parser = lxml.XMLParser(remove_blank_text=True)
+ input_root = lxml.parse(input_file_name, parser).getroot()
+ parsed_units = xpath_find(input_root, 'tei:text/tei:body/tei:p/tei:s')
+
+ for parsed_unit in parsed_units:
+ entry = lxml.SubElement(output_root, 'entry')
+ entry.set('sid', get_xml_id(parsed_unit))
+ head = lxml.SubElement(entry, 'head')
+ headword = lxml.SubElement(head, 'headword')
+ # The headword text is the unit's surface string, not a lemmatised form.
+ lemma_text = get_parsed_unit_string(parsed_unit)
+ lemma = lxml.SubElement(headword, 'lemma')
+ lemma.text = lemma_text
+ lexical_unit = lxml.SubElement(head, 'lexicalUnit')
+ tokens = xpath_find(parsed_unit, 'tei:w|tei:pc')
+ # Exactly one token gives type "single"; any other count gives "MWE"
+ # with one numbered <component> per token.
+ if (len(tokens) == 1):
+ token = tokens[0]
+ lexical_unit.set('type', 'single')
+ lexeme = lxml.SubElement(lexical_unit, 'lexeme')
+ # The lemma attribute is copied only from tei:w tokens.
+ if (token.tag == TEI_NAMESPACE_QUALIFIER + 'w'):
+ lexeme.set('lemma', token.get('lemma'))
+ # msd is the ana value with its first len('mte:') characters cut off;
+ # the prefix itself is not checked.
+ lexeme.set('msd', token.get('ana')[len('mte:'):])
+ lexeme.text = token.text
+ else:
+ lexical_unit.set('type', 'MWE')
+ for (index, token) in enumerate(tokens, start=1):
+ component = lxml.SubElement(lexical_unit, 'component')
+ component.set('num', str(index))
+ lexeme = lxml.SubElement(component, 'lexeme')
+ if (token.tag == TEI_NAMESPACE_QUALIFIER + 'w'):
+ lexeme.set('lemma', token.get('lemma'))
+ lexeme.set('msd', token.get('ana')[len('mte:'):])
+ lexeme.text = token.text
+ # NOTE(review): indentation is lost in this copy, so it is unclear whether
+ # structure_id is set for every unit or only for MWEs - confirm. Because
+ # of str(), a missing attribute is written as the text 'None'.
+ lexical_unit.set('structure_id', str(parsed_unit.get('structure_id')))
+ body = lxml.SubElement(entry, 'body')
+ # senseList is created empty; nothing in this function fills it.
+ senseList = lxml.SubElement(body, 'senseList')
+
+ output_tree = lxml.ElementTree(output_root)
+ output_tree.write(output_file_name, encoding='UTF-8', pretty_print=True)
+
+
+if (__name__ == '__main__'):
+    # Command-line entry point: read the two file names and run convert().
+    arg_parser = argparse.ArgumentParser(description='Convert TEI to dictionary xml.')
+    arg_parser.add_argument('-infile', type=str, help='Input TEI xml')
+    arg_parser.add_argument('-outfile', type=str, help='Output xml in standard cjvt schema')
+    arguments = arg_parser.parse_args()
+    # The file names live on the parsed namespace; no module-level variables
+    # with these names exist until they are assigned here.
+    input_file_name = arguments.infile
+    output_file_name = arguments.outfile
+
+    convert(input_file_name, output_file_name)
diff --git a/conversion_utils/translate_conllu_jos.py b/conversion_utils/translate_conllu_jos.py
new file mode 100644
index 0000000..b84894e
--- /dev/null
+++ b/conversion_utils/translate_conllu_jos.py
@@ -0,0 +1,49 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+
+import argparse
+import codecs
+import lxml.etree as lxml
+from importlib_resources import files
+
+from conversion_utils.jos_msds_and_properties import Converter, Msd
+
+def get_syn_map():
+    """Load the label mapping from the packaged ``dict.xml`` resource.
+
+    Returns:
+        A dict that maps the ``en`` attribute of every ``syns/syn`` element
+        to that element's ``sl`` attribute.
+    """
+    dict_resource = files('conversion_utils.resources').joinpath('dict.xml')
+    # Opening through the resource itself, in binary mode, lets the XML
+    # parser handle decoding, and the handle is closed even if parsing fails.
+    with dict_resource.open('rb') as dict_file:
+        root = lxml.parse(dict_file).getroot()
+    return {syn.get('en'): syn.get('sl') for syn in root.xpath('syns/syn')}
+
+def translate(input_file_name, output_file_name):
+    """Translate the msd and dependency-label columns of a CoNLL-U file.
+
+    Lines that do not split into exactly ten tab-separated columns are
+    copied unchanged. On the remaining lines, column 5 is replaced by the
+    code of its msd translated from ``'en'`` to ``'sl'``, and column 8 is
+    replaced through the mapping returned by ``get_syn_map``.
+
+    Args:
+        input_file_name: Path of the CoNLL-U file to read (UTF-8).
+        output_file_name: Path of the CoNLL-U file to write (UTF-8).
+
+    Raises:
+        KeyError: If a dependency label is missing from the mapping.
+    """
+    syn_map = get_syn_map()
+    converter = Converter()
+
+    # The input is opened first so a missing input does not leave behind a
+    # truncated output file; both handles are closed on any exit path.
+    with open(input_file_name, 'r', encoding='utf-8') as input_file, \
+            open(output_file_name, 'w', encoding='utf-8') as output_file:
+        for line in input_file:
+            columns = line.strip().split('\t')
+            if len(columns) != 10:
+                output_file.write(line)
+                continue
+            columns[4] = converter.translate_msd(Msd(columns[4], 'en'), 'sl').code
+            columns[7] = syn_map[columns[7]]
+            output_file.write('\t'.join(columns) + '\n')
+
+
+if (__name__ == '__main__'):
+    # Command-line entry point: read the two file names and run translate().
+    cli = argparse.ArgumentParser(description='Translate JOS msds and dependency labels.')
+    for flag, help_text in (('-infile', 'Input conllu'), ('-outfile', 'Output conllu')):
+        cli.add_argument(flag, type=str, help=help_text)
+    options = cli.parse_args()
+
+    translate(options.infile, options.outfile)
diff --git a/conversion_utils/utils.py b/conversion_utils/utils.py
index a204321..dfd750d 100644
--- a/conversion_utils/utils.py
+++ b/conversion_utils/utils.py
@@ -1,4 +1,5 @@
TEI_NAMESPACE = 'http://www.tei-c.org/ns/1.0'
+TEI_NAMESPACE_QUALIFIER = '{' + TEI_NAMESPACE + '}'
XML_ID_ATTRIBUTE_NAME = '{http://www.w3.org/XML/1998/namespace}id'
def xpath_find(element,expression):