diff --git a/MANIFEST.in b/MANIFEST.in
index e3182fb..a3e95e0 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1 +1,3 @@
-include conversion_utils/resources/jos_specifications.pickle
\ No newline at end of file
+include conversion_utils/resources/jos_specifications.pickle
+include conversion_utils/resources/dict.xml
+include conversion_utils/resources/structure_conversions.csv
diff --git a/conversion_utils/conllu_to_tei.py b/conversion_utils/conllu_to_tei.py
new file mode 100644
index 0000000..64f0ba3
--- /dev/null
+++ b/conversion_utils/conllu_to_tei.py
@@ -0,0 +1,305 @@
+import argparse
+import re
+import sys
+
+from lxml import etree
+
+
+class Sentence:
+    """A CoNLL-U sentence: token rows plus one dependency link per token."""
+
+    def __init__(self, _id, no_ud=False):
+        self._id = _id
+        self.items = []
+        self.links = []
+        # When true, as_xml() leaves the 'msd' attribute off every token.
+        self.no_ud = no_ud
+
+    def add_item(self, token, lemma, upos, upos_other, xpos, misc):
+        """Store one token row; misc is the raw CoNLL-U MISC column."""
+        # MISC is a '|'-separated list, so test membership rather than
+        # equality: 'SpaceAfter=No|Foo=Bar' must still glue the token.
+        no_space_after = 'SpaceAfter=No' in misc.split('|')
+        self.items.append([token, lemma, upos, upos_other, xpos, no_space_after])
+
+    def add_link(self, link_ref, link_type):
+        """Store the head index (a string) and relation label of the next token."""
+        self.links.append([link_ref, link_type])
+
+    def as_xml(self, id_prefix=None):
+        """Return the <s> element: <w>/<pc> tokens followed by a JOS-SYN <linkGrp>."""
+        if id_prefix:
+            xml_id = id_prefix + '.' + self._id
+        else:
+            xml_id = self._id
+        base = etree.Element('s')
+        set_xml_attr(base, 'id', xml_id)
+
+        # Token ids are 1-based positions within the sentence.
+        for id_counter, item in enumerate(self.items, start=1):
+            token, lemma, upos, upos_other, xpos, no_space_after = item
+
+            if xpos in {'U', 'Z'}:  # hmm, safe only as long as U is unused in English tagset and Z in Slovenian one
+                to_add = etree.Element('pc')
+            else:
+                to_add = etree.Element('w')
+                to_add.set('lemma', lemma)
+
+            to_add.set('ana', 'mte:' + xpos)
+            if not self.no_ud:
+                if upos_other != '_':
+                    to_add.set('msd', f'UposTag={upos}|{upos_other}')
+                else:
+                    to_add.set('msd', f'UposTag={upos}')
+
+            set_xml_attr(to_add, 'id', "{}.{}".format(xml_id, id_counter))
+            to_add.text = token
+
+            if no_space_after:
+                to_add.set('join', 'right')
+
+            base.append(to_add)
+
+        link_grp = etree.Element('linkGrp')
+        link_grp.set('corresp', '#' + xml_id)
+        link_grp.set('targFunc', 'head argument')
+        link_grp.set('type', 'JOS-SYN')
+        for link_id, (link_ref, link_type) in enumerate(self.links, start=1):
+            link = etree.Element('link')
+            link.set('ana', 'jos-syn:' + link_type)
+            # Head '0' is the root, so the link points at the sentence itself.
+            if link_ref == '0':
+                head = '#' + xml_id
+            else:
+                head = '#' + xml_id + '.' + link_ref
+            link.set('target', head + ' #' + xml_id + '.' + str(link_id))
+            link_grp.append(link)
+        base.append(link_grp)
+        return base
+
+
+class Paragraph:
+    """An ordered list of Sentence objects sharing one paragraph id."""
+
+    def __init__(self, _id):
+        self._id = _id
+        self.sentences = []
+
+    def add_sentence(self, sentence):
+        self.sentences.append(sentence)
+
+    def as_xml(self, id_prefix=None):
+        """Return the <p> element holding every sentence of the paragraph."""
+        if id_prefix:
+            xml_id = id_prefix + '.' + self._id
+        else:
+            xml_id = self._id
+
+        p = etree.Element('p')
+        set_xml_attr(p, 'id', xml_id)
+
+        # Sentences receive the incoming prefix, not this paragraph's id.
+        for sent in self.sentences:
+            p.append(sent.as_xml(id_prefix=id_prefix))
+        return p
+
+
+class TeiDocument:
+    """A TEI document: an optional id and its paragraphs."""
+
+    def __init__(self, _id, paragraphs=None):
+        self._id = _id
+        # Build a fresh list per instance; a list created in the signature
+        # is shared, so add_paragraph() would leak into other documents.
+        self.paragraphs = [] if paragraphs is None else paragraphs
+
+    def as_xml(self):
+        """Return the <TEI> root with a tag-usage header and the text body."""
+        root = etree.Element('TEI')
+        root.set('xmlns', 'http://www.tei-c.org/ns/1.0')
+        set_xml_attr(root, 'lang', 'sl')
+
+        xml_id = self._id
+        if xml_id is not None:
+            set_xml_attr(root, 'id', xml_id)
+
+        tei_header = etree.SubElement(root, 'teiHeader')
+
+        text = etree.SubElement(root, 'text')
+        body = etree.SubElement(text, 'body')
+        for para in self.paragraphs:
+            body.append(para.as_xml(id_prefix=xml_id))
+
+        # The header is filled last so the counts can be read off the body.
+        encoding_desc = etree.SubElement(tei_header, 'encodingDesc')
+        tags_decl = etree.SubElement(encoding_desc, 'tagsDecl')
+        namespace = etree.SubElement(tags_decl, 'namespace')
+        namespace.set('name', 'http://www.tei-c.org/ns/1.0')
+        for tag in ['p', 's', 'pc', 'w']:
+            count = int(text.xpath('count(.//{})'.format(tag)))
+            tag_usage = etree.SubElement(namespace, 'tagUsage')
+            tag_usage.set('gi', tag)
+            tag_usage.set('occurs', str(count))
+        return root
+
+    def add_paragraph(self, paragraph):
+        self.paragraphs.append(paragraph)
+
+
+def build_tei_etrees(documents):
+    """Return the lxml root element of every document, in order."""
+    return [document.as_xml() for document in documents]
+
+
+def set_xml_attr(node, attribute, value):
+    """Set an attribute in the XML namespace, e.g. xml:id or xml:lang."""
+    node.attrib['{http://www.w3.org/XML/1998/namespace}' + attribute] = value
+
+
+def parse_metaline(line):
+    """Split '# key = value' into (key, value); value is None without '=' or when only whitespace follows it."""
+    tokens = line.split('=', 1)
+    key = tokens[0].replace('#', '').strip()
+    if len(tokens) > 1 and not tokens[1].isspace():
+        val = tokens[1].strip()
+    else:
+        val = None
+    return (key, val)
+
+
+def is_metaline(line):
+    """Return True for comment lines of the form '# key = value'."""
+    return re.match('# .+ =.*', line) is not None
+
+
+def construct_tei_documents(conllu_lines):
+    """Group CoNLL-U lines into a list of TeiDocument objects.
+
+    '# newdoc id' opens a document and '# newpar id' a paragraph. Of the
+    other metalines only '# sent_id' is kept.
+    """
+    documents = []
+
+    doc_id = None
+    document_paragraphs = []
+
+    para_id = None
+    para_buffer = []
+
+    for line in conllu_lines:
+        if is_metaline(line):
+            key, val = parse_metaline(line)
+            if key == 'newdoc id':
+                if para_buffer:
+                    document_paragraphs.append(construct_paragraph(para_id, para_buffer))
+                    # Clear the buffer here too; otherwise the flushed lines
+                    # are emitted a second time inside the next document.
+                    para_buffer = []
+                if document_paragraphs:
+                    documents.append(TeiDocument(doc_id, document_paragraphs))
+                    document_paragraphs = []
+                doc_id = val
+            elif key == 'newpar id':
+                if para_buffer:
+                    document_paragraphs.append(construct_paragraph(para_id, para_buffer))
+                    para_buffer = []
+                para_id = val
+            elif key == 'sent_id':
+                para_buffer.append(line)
+        elif line.strip():
+            para_buffer.append(line)
+
+    if para_buffer:
+        document_paragraphs.append(construct_paragraph(para_id, para_buffer))
+
+    if document_paragraphs:
+        documents.append(TeiDocument(doc_id, document_paragraphs))
+
+    return documents
+
+
+def construct_paragraph(para_id, conllu_lines):
+    """Build a Paragraph, starting a new sentence at every '# sent_id' line."""
+    para = Paragraph(para_id)
+
+    sent_id = None
+    sent_buffer = []
+
+    for line in conllu_lines:
+        if is_metaline(line):
+            key, val = parse_metaline(line)
+            if key == 'sent_id':
+                if sent_buffer:
+                    para.add_sentence(construct_sentence(sent_id, sent_buffer))
+                    sent_buffer = []
+                sent_id = val
+        elif line.strip():
+            sent_buffer.append(line)
+
+    if sent_buffer:
+        para.add_sentence(construct_sentence(sent_id, sent_buffer))
+
+    return para
+
+
+def construct_sentence(sent_id, lines):
+    """Build a Sentence from the tab-separated token rows in lines."""
+    sentence = Sentence(sent_id)
+    for line in lines:
+        if line.startswith('#') or not line.strip():
+            continue
+        tokens = line.rstrip('\r\n').split('\t')
+        token = tokens[1]
+        lemma = tokens[2]
+        upos = tokens[3]
+        xpos = tokens[4]
+        upos_other = tokens[5]
+        depparse_link = tokens[6]
+        depparse_link_name = tokens[7]
+        misc = tokens[9]
+
+        sentence.add_item(token, lemma, upos, upos_other, xpos, misc)
+        sentence.add_link(depparse_link, depparse_link_name)
+    return sentence
+
+
+def construct_tei_etrees(conllu_lines):
+    """Parse CoNLL-U lines and return one TEI root element per document."""
+    documents = construct_tei_documents(conllu_lines)
+    return build_tei_etrees(documents)
+
+
+def convert_file(input_file_name, output_file_name):
+    """Write the first document of a CoNLL-U file to output_file_name as TEI XML.
+
+    Raises IndexError when the input contains no document.
+    """
+    with open(input_file_name, 'r', encoding='utf-8') as input_file:
+        root = construct_tei_etrees(input_file)[0]
+
+    tree = etree.ElementTree(root)
+    tree.write(output_file_name, pretty_print=True, encoding='utf-8')
+
+
+if __name__ == '__main__':
+    from glob import glob
+
+    parser = argparse.ArgumentParser(description='Convert CoNLL-U to TEI.')
+    parser.add_argument('files', nargs='+', help='CoNLL-U file')
+    parser.add_argument('-o', '--out-file', dest='out', default=None,
+                        help='Write output to file instead of stdout.')
+
+    args = parser.parse_args()
+
+    f_out = open(args.out, 'w', encoding='utf-8') if args.out else sys.stdout
+    try:
+        for arg in args.files:
+            for f in glob(arg):
+                with open(f, 'r', encoding='utf-8') as conllu_f:
+                    tei_etrees = construct_tei_etrees(conllu_f)
+                for tei_etree in tei_etrees:
+                    f_out.write(etree.tostring(tei_etree, pretty_print=True, encoding='utf-8').decode())
+    finally:
+        # Close only what was opened here; stdout belongs to the caller.
+        if f_out is not sys.stdout:
+            f_out.close()
diff --git a/conversion_utils/resources/dict.xml b/conversion_utils/resources/dict.xml
new file mode 100644
index 0000000..ad872d3
--- /dev/null
+++ b/conversion_utils/resources/dict.xml
@@ -0,0 +1,14 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/conversion_utils/resources/structure_conversions.csv b/conversion_utils/resources/structure_conversions.csv
new file mode 100644
index 0000000..07a3dd8
--- /dev/null
+++ b/conversion_utils/resources/structure_conversions.csv
@@ -0,0 +1,193 @@
+gramrel|struktura|Name_ID|DSB|Simon|Simon-1|NSSS|Pov-REFL|Pov-NEG|Zgled|LBS|Opomba-1|Opomba-2|Opomba-3|Flip
+S_G-inf|gbz Inf-GBZ|gg-ggn|12|50||12|||uspeti VIDETI|LBS-071|brez povratnih|||
+S_nedoločnik|GBZ Inf-gbz|gg-ggn|12|50||12|||USPETI videti|LBS-071|brez povratnih|||
+S_%s_g2|GBZ %s sbz2|gg-d-s2|13|34||13|||ODGOVORITI brez
razmisleka|LBS-044|brez povratnega||| +S_%s_x_g2|gbz %s SBZ2|gg-d-s2|13|34||13|||skočiti s STOLA|LBS-044|brez povratnega||| +S_%s_g4|GBZ %s sbz4|gg-d-s4|14|36||14|||VNAŠATI v telo|LBS-046|brez povratnega||| +S_%s_x_g4|gbz %s SBZ4|gg-d-s4|14|36||14|||voziti v ŠOLO|LBS-046|brez povratnega||| +S_%s_x_g5|gbz %s SBZ5|gg-d-s5|15|37||15|||očistiti po UPORABI|LBS-047|brez povratnega||| +S_%s_g5|GBZ %s sbz5|gg-d-s5|15|37||15|||UŽIVATI v vožnji|LBS-047|brez povratnega||| +S_%s_x_g6|gbz %s SBZ6|gg-d-s6|16|38||16|||hoditi z BERGLAMI|LBS-048|brez povratnega|glej zgoraj opombo v vrstici 17, isti problem, drugi prvi del|| +S_%s_g6|GBZ %s sbz6|gg-d-s6|16|38||16|||VZGAJATI z ljubeznijo|LBS-048|brez povratnega||| +S_%s_g3|GBZ %s sbz3|gg-d-s3|17|35||17|||RAVNATI proti vesti|LBS-045|brez povratnega||| +S_%s_x_g3|gbz %s SBZ3|gg-d-s3|17|35||17|||steči proti VRATOM|LBS-045|brez povratnega||| +S_kakšen-g?|gbz PBZ1|gg-p1|18|16||18|||pozirati GOL|LBS-019|brez povratnega||| +S_kakšen-p?|GBZ pbz1|gg-p1|18|16||18|||TEČI bos|LBS-019|brez povratnega||| +S_je_za_g|gbz RBZ||20|||||||LBS-023||združiti s strukturo 43||43 +S_kako-kdaj-za_g?|GBZ rbz||20|||||||LBS-023||združiti s strukturo 43||43 +S_v_rodil|GBZ sbz2||21|6||||||LBS-006||nova struktura 108|| +S_koga-česa|gbz SBZ2||21|6||||||LBS-006||nova struktura 108|| +S_v_dajal|GBZ sbz3|gg-s3|22|7||22|||SPREGOVORITI ljudstvu|LBS-007|brez povratnega||| +S_komu-čemu|gbz SBZ3|gg-s3|22|7||22|||zaupati VODSTVU|LBS-007|brez povratnega||| +S_koga-kaj|gbz SBZ4|gg-s4|23|8||23|||pogrniti MIZO|LBS-008|brez povratnega||| +S_v_tožil|GBZ sbz4|gg-s4|23|8||23|||STISNITI sok|LBS-008|brez povratnega||| +S_zanikani|Neg-gbz SBZ2|l-gg-s2|24|11|2|24||23|ne prepuščati ZRAKA|LBS-011|||| +S_z_nikalnim|Neg-GBZ sbz2|l-gg-s2|24|11|2|24||23|ne STISNITI zavore |LBS-011|||| +S_G-neg-inf|Neg-gbz Inf-GBZ|l-gg-ggn|25|51||25||12|ne ZAČETI teči|LBS-072|brez povratnih|pojavlja se tudi Neg-gbz Inf GBZ (Neg z veliko)|| +S_%s_x_p2|pbz0 %s SBZ2|p0-d-s2|26|39||26|||izdelan iz 
BOMBAŽA|LBS-054|||| +S_%s_p2|PBZ0 %s sbz2|p0-d-s2|26|39||26|||VELJAVEN do decembra|LBS-054|||| +S_%s_x_p4|pbz0 %s SBZ4|p0-d-s4|27|41||27|||alergičen na SONCE|LBS-056|||| +S_%s_p4|PBZ0 %s sbz4|p0-d-s4|27|41||27|||VAREN za uživanje|LBS-056|||| +S_%s_x_p5|pbz0 %s SBZ5|p0-d-s5|28|42||28|||prepričan o USPEHU|LBS-057|||| +S_%s_p5|PBZ0 %s sbz5|p0-d-s5|28|42||28|||UTRUJEN po tekmi|LBS-057|||| +S_%s_p6|PBZ0 %s sbz6|p0-d-s6|29|43||29|||IZVOLJEN z večino|LBS-058|||| +S_%s_x_p6|pbz0 %s SBZ6|p0-d-s6|29|43||29|||razočaran nad ODLOČITVIJO|LBS-058|glej zgoraj opombo v vrstici 17, isti problem, drugi prvi del||| +S_priredje|PBZ0 in/ali pbz0|p0-vp-p0|30|64||30|||DOBER in/ali SLAB|LBS-099|||| +S_%s_x_p3|pbz0 %s SBZ3|p0-d-s3|31|40||31|||nagnjen k ZLORABI|LBS-055|||| +S_%s_p3|PBZ0 %s sbz3|p0-d-s3|31|40||31|||NAMENJEN proti domu|LBS-055|||| +S_primera|PBZ0 kot sbz0|p0-vd-s0|32|21||32|||LAČEN kot volk / utrujen kot PES|LBS-025|||| +S_je_za_p|pbz0 RBZ||33|||||||LBS-028||združiti s strukturo 46||46 +S_kako-kdaj-za_p?|PBZ0 rbz||33|||||||LBS-028||združiti s strukturo 46||46 +S_kakšen?|pbz0 SBZ0|p0-s0|34|1||34|||bela ZASTAVA|LBS-001|||| +S_kdo-kaj?|PBZ0 sbz0|p0-s0|34|1||34|||RDEČA jagoda|LBS-001|||| +S_p-koga-česa|pbz0 SBZ2|p0-s2|35|2||35|||[biti] obtožen UTAJE|LBS-002|||| +S_v_rodil-p|PBZ0 sbz2|p0-s2|35|2||35|||[biti] VESEL uspeha|LBS-002|||| +S_v_dajal-p|PBZ0 sbz3|p0-s3|36|3||36|||[biti] NAMENJEN vzgoji|LBS-003|||| +S_p-komu-čemu|pbz0 SBZ3|p0-s3|36|3||36|||[biti] zvest GOSPODARJU|LBS-003|||| +S_oba-v-rod|PBZ2 sbz2|p2-s2|37|4||37|||[biti] DOBRE volje|LBS-004|||| +S_oba-v-rod|pbz2 SBZ2|p2-s2|37|4||37|||[biti] dobrega SRCA|LBS-004|||| +S_%s_r2|RBZ %s sbz2|r-d-s2|38|44||38|||DANES do polnoči|LBS-064|||| +S_%s_r3|RBZ %s sbz3|r-d-s3|39|45||39|||VČERAJ proti večeru|LBS-065|||| +S_%s_r5|RBZ %s sbz5|r-d-s5|40|47||40|||VIDNO v zadregi|LBS-067|||| +S_%s_r4|RBZ %s sbz4|r-d-s4|41|46||41|||MALO za šalo|LBS-066|||| +S_%s_r6|RBZ %s sbz6|r-d-s6|42|48||42|||JUTRI pred odhodom|LBS-068|||vse so strukture| 
+S_kako-kdaj_g?|rbz GBZ|r-gg|43|19||43|||čvrsto STISNITI|LBS-022|brez povratnega||| +S_je_pred_g|RBZ gbz|r-gg|43|19||43|||DEBELO gledati|LBS-022|brez povratnega||| +S_priredje|RBZ in/ali rbz|r-vp-r|44|67||44|||VČERAJ in/ali DANES|LBS-099||samo prvega zapišemo z velikimi črkami|| +S_primera|RBZ kot sbz0|r-vd-s0|45|22||45|||TIHO kot miš / mrzlo kot VRAG|LBS-025|||| +S_kako-kdaj_p?|rbz PBZ0|r-p0|46|24||46|||dovolj ZREL|LBS-027||sprememba v pridevniško lemo|| +S_je_pred_p|RBZ pbz0|r-p0|46|24||46|||RESNIČNO izjemen|LBS-027||sprememba v pridevniško lemo|| +S_količinski|rbz SBZ2|r-s2|47|28||47|||malo MOŽNOSTI|LBS-033|||| +S_količina_ob-s|RBZ sbz2|r-s2|47|28||47|||VELIKO ljudi|LBS-033|||| +S_%s_x_s2|sbz0 %s SBZ2|s0-d-s2|48|29||48|||dan brez AVTOMOBILA|LBS-034|||| +S_%s_s2|SBZ0 %s sbz2|s0-d-s2|48|29||48|||LISTINA iz spisa |LBS-034|||| +S_%s_s3|SBZ0 %s sbz3|s0-d-s3|49|30||49|||DOPOLNILO k zahvali |LBS-035|||| +S_%s_x_s3|sbz0 %s SBZ3|s0-d-s3|49|30||49|||poziv k ODSTOPU|LBS-035|||| +S_%s_s6|SBZ0 %s sbz6|s0-d-s6|50|33||50|||IZBOR med kandidati|LBS-038|"pri predlogu z se pojavljajo tudi zapisi ""SBZ0 s/z sbz6"" in ""SBZ0 z/s sbz6"", ker je ""s"" variantna oblika od ""z"" (lemma od obeh je ""z"")"|povsem enako kot za s/z velja za h/k (lemma od obeh je k)|| +S_%s_x_s6|sbz0 %s SBZ6|s0-d-s6|50|33||50|||odnos s PARTNERJEM|LBS-038|glej zgoraj opombo v vrstici 17, isti problem, drugi prvi del||| +S_%s_s4|SBZ0 %s sbz4|s0-d-s4|51|31||51|||OBESEK za ključ|LBS-036|||| +S_%s_x_s4|sbz0 %s SBZ4|s0-d-s4|51|31||51|||predavanje na TEMO|LBS-036|||| +S_%s_s5|SBZ0 %s sbz5|s0-d-s5|52|32||52|||OTOK ob obali|LBS-037|||| +S_%s_x_s5|sbz0 %s SBZ5|s0-d-s5|52|32||52|||pesmica o SREČI|LBS-037|||| +S_v_rodil-s|SBZ0 sbz2|s0-s2|53|13||53|||PRANJE denarja|LBS-016|||| +S_s-koga-česa|sbz0 SBZ2|s0-s2|53|13||53|||utaja DAVKOV|LBS-016|||| +S_v_dajal-s|SBZ0 sbz3|s0-s3|54|14||54|||PISMO predsedniku|LBS-017|||| +S_s-komu-čemu|sbz0 SBZ3|s0-s3|54|14||54|||zahvala SPONZORJU|LBS-017|||| +S_prislov_prislov|rbz 
RBZ|r-r|55|25||55|||zares IZJEMNO / ENAKO besno|LBS-030|||| +S_s_prislovom|s prislovom||56|||||||LBS-012||brisati tudi podatke|| +S_s_prislovom|s prislovom||56|||||||LBS-012||brisati tudi podatke|| +S_Vez_P1|sbz1 Vez-gbz PBZ1|s1-gp-p1|57|5||57|||hruška je ZRELA|LBS-005|||| +S_S1_Vez_P|SBZ1 Vez-gbz pbz1|s1-gp-p1|57|5||57|||REZULTATI so dobri|LBS-005|||| +||gg-zp-s3||7||68|22||izneveriti se tradiciji|LBS-007|samo povratni (nova)||| +||gg-zp-s4||8||69|23||ogledati si posnetek|LBS-008|samo povratni (nova)||| +S_osebek_od|SBZ1 gbz|s1-gg||9||70|||PANIKA zavlada|LBS-009|brez povratnega||| +S_osebek_je|sbz1 GBZ|s1-gg||9||70|||večina RAZUME |LBS-009|brez povratnega||| +||s1-zp-gg||9||71|70||človek se USTRAŠI|LBS-009|samo povratni (nova)||| +S_n-osebek_je|sbz1 Neg-GBZ|s1-l-gg||10|1|72||70|voda ne TEČE|LBS-010|brez povratnega||| +S_n-osebek_od|SBZ1 neg-gbz|s1-l-gg||10|1|72||70|ČLOVEK ne vidi |LBS-010|brez povratnega||| +||s1-zp-l-gg||10|1|73|72|70|vernik se ne BRIJE |LBS-010|samo povratni (nova)||| +S_zanikani|Neg-gbz SBZ2|ggz-s2||11|1|74||23|ne hoteti ODGOVORA|LBS-011|brez povratnega||| +S_z_nikalnim|Neg-GBZ sbz2|ggz-s2||11|1|74||23|ne IMETI znanja|LBS-011|brez povratnega||| +||l-gg-zp-s2||11|1|75|24|23|ne ZVITI si noge|LBS-011|samo povratni (nova)||| +S_sam-im_sam|sbz0 SBZ0|s0-s0||12||76|||angina PECTORIS|LBS-014|||| +S_im_sam-sam|SBZ0 sbz0|s0-s0||12||76|||ČLOVEK pajek|LBS-014|||| +S_Vez_S|SBZ1 Vez-gbz sbz1|s0-gp-s1||15||77|||STRIC je partizan|LBS-018|||| +||gg-zp-p1||16||78|18||zdeti se premagan|LBS-019|samo povratni (nova)||| +S_kakšnega-p|Vez-gbz PBZ4|gg-p4|19|17||19|||pustiti SAMEGA|LBS-020|brez povratnega|to je v resnici številka 19 v DSB|SPREMLJATI REZULTAT!| +S_kakšnega-g?|Vez-GBZ pbz4|gg-p4|19|17||19|||VIDETI nasmejanega|LBS-020|brez povratnega|to je v resnici številka 19 v DSB|SPREMLJATI REZULTAT!| +||gg-zp-p4||17||80|19||počutiti se osamljenega|LBS-020|samo povratni (nova)||| +||r-zp-gg||19||81|43||močno se prestrašiti|LBS-022|samo povratni (nova)||| 
+S_primera|GBZ kot sbz0|gg-vd-s0||20||82|||BRUHATI kot vidra|LBS-025|brez povratnega||| +||gg-zp-vd-s0||20||83|82||pojavljati se kot mora|LBS-025|samo povratni (nova)||| +S_primera|SBZ0 kot sbz0|s0-vd-s0||23||84|||ZDRAVJE kot vrednota / vrednost kot OSNOVA|LBS-025||NE IZPISUJE 'KOT'!|| +S_kako-kdaj_r|RBZ Vez-gbz pbz1|r-p1||26||85|||[biti] objavljen včeraj|LBS-031|||| +||gg-zp-d-s2||34||86|13||pripeljati se do banke|LBS-044|samo povratni (nova)||| +||gg-zp-d-s3||35||87|17||odločiti se kljub nasprotovanju|LBS-045|samo povratni (nova)||| +||gg-zp-d-s4||36||88|14||voziti se v službo|LBS-047|samo povratni (nova)||| +||gg-zp-d-s5||37||89|15||kopati se v morju|LBS-047|samo povratni (nova)||| +||gg-zp-d-s6||38||90|16||pripeljati se z mopedom|LBS-048|samo povratni (nova)||| +S_%s_r|sbz0 %s RBZ|s0-d-r||49||91|||prijatelj za VEDNO|LBS-069|||popravljeno| +||gg-zp-ggn||||92|12||prizadevati si poiskati|LBS-071|samo povratni prvi glagol (nova)||| +||gg-ggn-zp||||93|12||utegniti zaplesti se|LBS-071|samo povratni drugi glagol (nova)||| +||gg-zp-ggn-zp||||94|92/93||odločiti se prodati se|LBS-071|samo povratni oba glagola (nova)||| +||l-gg-zp-ggn||51||95|25|92|ne dati se predvideti|LBS-072|samo povratni prvi glagol (nova)||| +||l-gg-ggn-zp||51||96|25|93|ne uspeti udeležiti se|LBS-072|samo povratni drugi glagol (nova)||| +||l-gg-zp-ggn-zp||51||97|25|94|ne bati se pokazati se|LBS-072|samo povratni oba glagola (nova)||| +S_R-inf|rbz Vez-gbz Inf-GBZ|r-ggn||52||98|||[biti] bolje PRESEKATI|LBS-073|brez povratnega||| +||r-ggn-zp||52||99|98||[biti] bolje smejati se|LBS-073|samo povratni (nova)||| +S_P-inf|pbz1 Vez-gbz Inf-GBZ|p1-ggn||54||100|||[biti] prisiljen ZAPRETI|LBS-075|brez povratnega||| +||p1-ggn-zp||54||101|100||[biti] pripravljen zadolžiti se|LBS-075|samo povratni (nova)||| +S_S-inf|sbz1 Vez-gbz Inf-GBZ|s1-ggn||56||102|||pravica POČETI |LBS-077|brez povratnega||| +||s1-ggn-zp||56||103|||pravica odločati se|LBS-077|samo povratni (nova)||| +S_namenilnik|gbz 
Nam-GBZ|gg-ggm||58||104|||iti PLAVAT|LBS-079|brez povratnega||| +||gg-ggm-zp||58||105|||iti ogledat si|LBS-079|samo povratni (nova)||| +S_priredje|priredje|s0-vp-s0||65||106|||VINO in/ali PIVO|LBS-099||"to najdemo tudi kot ""SBZ0 in sbz0"", ""SBZ0 in/ali sbz0"""|| +S_priredje|GBZ in/ali gbz|gg-vp-gg||66||107|||GOVORITI in/ali ŠEPETATI|LBS-099||"ta struktura je lahko ""GBZ in gbz"", ""GBZ ali gbz"", ""GBZ in/ali gbz"", pa tudi ""GBZ in/ali GBZ"" (oba elementa zapisana z veliko)"|za pretvorbo damo samo prvega z velikimi črkami| +||gg-zp-s2||6||108|||BATI se maščevanja|LBS-006|samo povratni (nova)|bivša struktura 21|| +||gg-zp-s2||6||108|||želeti si ZDRAVJA|LBS-006|samo povratni (nova)|bivša struktura 22|| +S_n-osebek_je|sbz1 Neg-GBZ|s1-ggz||10|2||||država NIMA |LBS-010|odstranimo?||| +S_n-osebek_od|SBZ1 neg-gbz|s1-ggz||10|2||||ČLOVEK nima|LBS-010|odstranimo?||| +S_neg-kakšnega-g?|Neg-GBZ pbz2|||18|1/2||||ne RAZUMETI prebranega / ne IMETI zaposlenih|LBS-021|odstranimo?||| +S_neg-kakšnega-p|Neg-gbz PBZ2|||18|1/2||||ne želeti SLABEGA / ne hoteti SLABEGA|LBS-021|odstranimo?||| +S_kdo-kaj_r|sbz1 Vez-gbz RBZ|||27|||||vzdušje je IZJEMNO|LBS-032|odstranimo|NESMISELNI REZULTATI - PRIDEVNIK, ponavlja NSSS id-57|| +S_R-neg-inf|rbz Neg-Vez-gbz Inf-GBZ|||53|1||||ne (biti) lahko ZAPUSTITI|LBS-074|odstranimo||| +S_P-neg-inf|pbz1 Neg-Vez-gbz Inf-GBZ|||55|1||||ne (biti) dolžen OBJAVITI|LBS-076|odstranimo||| +S_S-neg-inf|sbz1 Neg-Vez-gbz Inf-GBZ|||57|1||||ne (vzeti) časa POISKATI|LBS-078|odstranimo||| +S_biti_s2|vez-gbz SBZ2|||59|||||biti brez VOLJE|LBS-082|odstranimo|to pokriva id 34|| +S_biti_s3|vez-gbz SBZ3|||60|||||biti proti VOLJi|LBS-083|odstranimo|to pokriva id 35|| +S_biti_s4|vez-gbz SBZ4|||61|||||biti na VOLJO|LBS-084|odstranimo|to pokriva id 36|| +S_biti_s5|vez-gbz SBZ5|||62|||||biti po VOLJI|LBS-085|odstranimo|to pokriva id 37|| +S_biti_s6|vez-gbz SBZ6|||63|||||biti pod VPLIVOM|LBS-086|odstranimo|to pokriva id 38|| +|PBZ0 gbz|||||||||***|||| +S_vezni|gbz 
SBZ1|||||||||LBS-013|||| +S_s-kdo-kaj|sbz0 SBZ1|||||||||LBS-015|||| +S_v_imen-s|SBZ0 sbz1|||||||||LBS-015|||| +S_veznik_enob|SBZ0 Odv|||||||||LBS-024|||| +S_veznik_enob|RBZ Odv|||||||||LBS-024|||| +S_veznik_enob|PBZ0 Odv|||||||||LBS-024|||| +S_veznik_enob|GBZ Odv|||||||||LBS-024|||| +S_simile|primera|||||||||LBS-025|"replicira drugo stran ""primera"""||| +S_predl-pred|s predlogom|||||||||LBS-026|||| +V_biti_videti|biti videti %s|||||||||LBS-029||ta je pri pridevniku|| +V_biti_videti|biti videti|||||||||LBS-033||bi moralo biti struktura|| +S_%(3.lempos)_x_s2||||||||||LBS-039| ||| +S_%(3.lempos)_x_s3||||||||||LBS-040| ||| +S_%(3.lempos)_x_s4||||||||||LBS-041| ||| +S_%(3.lempos)_x_s5||||||||||LBS-042| ||| +S_%(3.lempos)_x_s6||||||||||LBS-043| ||| +S_%(3.lempos)_x_g2||||||||||LBS-049| ||| +S_%(3.lempos)_x_g3||||||||||LBS-050| ||| +S_%(3.lempos)_x_g4||||||||||LBS-051| ||| +S_%(3.lempos)_x_g5||||||||||LBS-052| ||| +S_%(3.lempos)_x_g6||||||||||LBS-053| ||| +S_%(3.lempos)_x_p2||||||||||LBS-059| ||| +S_%(3.lempos)_x_p3||||||||||LBS-060| ||| +S_%(3.lempos)_x_p4||||||||||LBS-061| ||| +S_%(3.lempos)_x_p5||||||||||LBS-062| ||| +S_%(3.lempos)_x_p6||||||||||LBS-063| ||| +S_nedoločnik|SBZ0 Inf-gbz|||||||||LBS-070|||| +S_nedoločnik|RBZ Inf-gbz|||||||||LBS-070|||| +S_nedoločnik|PBZ0 Inf-gbz|||||||||LBS-070|||| +S_G_GInf_O4|gbz Inf-gbz SBZ4|||||||||LBS-080|||| +S_Vez_Inf_S|SBZ1 vez-gbz Inf-gbz|||||||||LBS-081 |||ok| +O_z_lastnim_imenom|pogosto z lastnim imenom|||||||||LBS-087||manjka|ok| +O_s_števili|pogosto s števili|||||||||LBS-088||"manjka (je mogooče ""pogosto s števili"")?"|| +V_S_V_O3_O4|kdo/kaj G komu kaj|||||||||LBS-089|||| +V_S_V_O3_O2|kdo/kaj G komu koga/česa|||||||||LBS-090|||| +V_S_V_O4_predl|kdo/kaj G koga/kaj + predlog|||||||||LBS-091|||| +V_S_V_O3|kdo/kaj G komu|||||||||LBS-092|||| +V_gl_Cit|cit GBZ|||||||||LBS-093||tu je bil prej vzorec, spremenjeno v struktura, ker je tako v bazi|| +O_povratni_se|povratni (se)|||||||||LBS-094|||| +O_povratni_si|povratni 
(si)|||||||||LBS-095||||
+O_nedoločnik_cs|pogosto v nedoločniku|||||||||LBS-096||||
+O_tretja_oseba|pogosto v tretji osebi|||||||||LBS-097||||
+V_lahko_G|mod-rbz GBZ|||||||||LBS-098||ta je pri glagolu||
+V_lahko_G|mod-rbz GBZ|||||||||LBS-098||tu je bil prej vzorec, spremenjeno v struktura, ker je tako v bazi||
+S_z_vezajem|rbz1-RBZ0|||||||||LBS-100||||
+S_z_vezajem|pbz1{-}PBZ0|||||||||LBS-100||možna je tudi verzija PBZ1{-}pbz0||
+O_zanikanje|pogosto zanikan|||||||||LBS-101||||
+O_količina|s količino|||||||||LBS-102||||
+S_veznik_dvob|dvobesedni veznik|||||||||LBS-103||||
+S_d_sam_d|z dvema predlogoma|||||||||LBS-104||struktura ali vzorec?||
+S_gl_K_sam|predlog s števnikom|||||||||LBS-105||je tole sploh pravilno zapisano?||
+S_gl_K_sam|zveze s števniki|||||||||LBS-105||Simon, tu je treba pogledati š (je čudno napisan)||
diff --git a/conversion_utils/tei_to_dictionary.py b/conversion_utils/tei_to_dictionary.py
new file mode 100644
index 0000000..89992cb
--- /dev/null
+++ b/conversion_utils/tei_to_dictionary.py
@@ -0,0 +1,64 @@
+import argparse
+import lxml.etree as lxml
+
+from conversion_utils.utils import xpath_find, get_xml_id, TEI_NAMESPACE_QUALIFIER
+
+
+def get_parsed_unit_string(parsed_unit):
+    """Return the sentence text, with no space after tokens marked join="right"."""
+    elements = xpath_find(parsed_unit, 'tei:w|tei:pc')
+    pieces = [e.text if e.get('join') == 'right' else e.text + ' ' for e in elements]
+    return ''.join(pieces).strip()
+
+
+def _fill_lexeme(lexeme, token):
+    """Copy lemma (words only), msd and surface form from a TEI token."""
+    if token.tag == TEI_NAMESPACE_QUALIFIER + 'w':
+        lexeme.set('lemma', token.get('lemma'))
+    # Drop the 'mte:' prefix of the ana attribute to get the bare msd code.
+    lexeme.set('msd', token.get('ana')[len('mte:'):])
+    lexeme.text = token.text
+
+
+def convert(input_file_name, output_file_name):
+    """Write one dictionary <entry> for every TEI sentence of the input file."""
+    output_root = lxml.Element('dictionary')
+
+    parser = lxml.XMLParser(remove_blank_text=True)
+    input_root = lxml.parse(input_file_name, parser).getroot()
+    parsed_units = xpath_find(input_root, 'tei:text/tei:body/tei:p/tei:s')
+
+    for parsed_unit in parsed_units:
+        entry = lxml.SubElement(output_root, 'entry')
+        entry.set('sid', get_xml_id(parsed_unit))
+        head = lxml.SubElement(entry, 'head')
+        headword = lxml.SubElement(head, 'headword')
+        lemma = lxml.SubElement(headword, 'lemma')
+        lemma.text = get_parsed_unit_string(parsed_unit)
+        lexical_unit = lxml.SubElement(head, 'lexicalUnit')
+        tokens = xpath_find(parsed_unit, 'tei:w|tei:pc')
+        if len(tokens) == 1:
+            lexical_unit.set('type', 'single')
+            _fill_lexeme(lxml.SubElement(lexical_unit, 'lexeme'), tokens[0])
+        else:
+            lexical_unit.set('type', 'MWE')
+            for index, token in enumerate(tokens, start=1):
+                component = lxml.SubElement(lexical_unit, 'component')
+                component.set('num', str(index))
+                _fill_lexeme(lxml.SubElement(component, 'lexeme'), token)
+        lexical_unit.set('structure_id', str(parsed_unit.get('structure_id')))
+        body = lxml.SubElement(entry, 'body')
+        lxml.SubElement(body, 'senseList')
+
+    output_tree = lxml.ElementTree(output_root)
+    output_tree.write(output_file_name, encoding='UTF-8', pretty_print=True)
+
+
+if __name__ == '__main__':
+    arg_parser = argparse.ArgumentParser(description='Convert TEI to dictionary xml.')
+    arg_parser.add_argument('-infile', type=str, help='Input TEI xml')
+    arg_parser.add_argument('-outfile', type=str, help='Output xml in standard cjvt schema')
+    arguments = arg_parser.parse_args()
+    # Pass the parsed values; bare input_file_name/output_file_name are
+    # not defined in this scope and raised NameError.
+    convert(arguments.infile, arguments.outfile)
diff --git a/conversion_utils/translate_conllu_jos.py b/conversion_utils/translate_conllu_jos.py
new file mode 100644
index 0000000..b84894e
--- /dev/null
+++ b/conversion_utils/translate_conllu_jos.py
@@ -0,0 +1,51 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+
+import argparse
+import codecs
+import lxml.etree as lxml
+from importlib_resources import files
+
+from conversion_utils.jos_msds_and_properties import Converter, Msd
+
+
+def get_syn_map():
+    """Return the dependency-label map read from the packaged dict.xml ('en' -> 'sl')."""
+    dict_file_name = files('conversion_utils.resources').joinpath('dict.xml')
+    # Binary mode lets the XML parser decode the file itself.
+    with open(dict_file_name, 'rb') as dict_file:
+        root = lxml.parse(dict_file).getroot()
+    return {syn.get('en'): syn.get('sl') for syn in root.xpath('syns/syn')}
+
+
+def translate(input_file_name, output_file_name):
+    """Copy a CoNLL-U file, translating column 5 (msd) and column 8 (label) from 'en' to 'sl'.
+
+    Lines that do not split into exactly ten tab-separated columns are copied
+    unchanged. Raises KeyError for a label missing from the map.
+    """
+    syn_map = get_syn_map()
+    converter = Converter()
+
+    with open(input_file_name, 'r', encoding='utf-8') as input_file, \
+            open(output_file_name, 'w', encoding='utf-8') as output_file:
+        for line in input_file:
+            columns = line.strip().split('\t')
+            if len(columns) != 10:
+                output_file.write(line)
+            else:
+                columns[4] = converter.translate_msd(Msd(columns[4], 'en'), 'sl').code
+                columns[7] = syn_map[columns[7]]
+                output_file.write('\t'.join(columns) + '\n')
+
+
+if __name__ == '__main__':
+
+    arg_parser = argparse.ArgumentParser(description='Translate JOS msds and dependency labels.')
+    arg_parser.add_argument('-infile', type=str, help='Input conllu')
+    arg_parser.add_argument('-outfile', type=str, help='Output conllu')
+    arguments = arg_parser.parse_args()
+    input_file_name = arguments.infile
+    output_file_name = arguments.outfile
+
+    translate(input_file_name, output_file_name)
diff --git a/conversion_utils/utils.py b/conversion_utils/utils.py
index a204321..dfd750d 100644
--- a/conversion_utils/utils.py
+++ b/conversion_utils/utils.py
@@ -1,4 +1,5 @@
 TEI_NAMESPACE = 'http://www.tei-c.org/ns/1.0'
+TEI_NAMESPACE_QUALIFIER = '{' + TEI_NAMESPACE + '}'
 XML_ID_ATTRIBUTE_NAME = '{http://www.w3.org/XML/1998/namespace}id'
 
 def xpath_find(element,expression):