conll2009 output for kres

2019-02-13 08:49:37 +01:00
parent 825f67a054
commit d3ebf82ba0
11 changed files with 1995 additions and 470 deletions
@@ -0,0 +1,60 @@
+import pkg_resources
+import pandas
+import sys
+
+# msd mappings from slo to ang
+class Msdmap():
+    """Translate Slovene MSD (morphosyntactic description) tags to English.
+
+    Holds a part-of-speech lookup list and a pandas table loaded from the
+    ``msd/msd-human-sl.tbl`` resource that ships next to this module.
+    """
+
+    def __init__(self):
+        # http://nl.ijs.si/ME/V4/msd/html/msd.categories-sl.html
+        # Row layout: (category (sl), code (sl), category (en), code (en)).
+        self.pos_slo_ang = [
+            ("samostalnik", "S", "Noun", "N"),
+            ("glagol", "G", "Verb", "V"),
+            ("pridevnik", "P", "Adjective", "A"),
+            ("prislov", "R", "Adverb", "R"),
+            ("zaimek", "Z", "Pronoun", "P"),
+            ("števnik", "K", "Numeral", "M"),
+            ("predlog", "D", "Adposition", "S"),
+            ("veznik", "V", "Conjunction", "C"),
+            ("členek", "L", "Particle", "Q"),
+            ("medmet", "M", "Interjection", "I"),
+            ("okrajšava", "O", "Abbreviation", "Y"),
+            ("neuvrščeno", "N", "Residual", "X"),
+        ]
+
+        table_name = "msd-human-sl.tbl"
+        table_path = pkg_resources.resource_filename(__name__, "msd/" + table_name)
+        # Load from the resolved resource path (not the bare file name, which
+        # only resolves when the working directory contains the table), and
+        # pass the separator by keyword since newer pandas releases accept
+        # only the path positionally.
+        self.msd_table = pandas.read_csv(
+            table_path,
+            sep="\t",
+            names=["id", "eng_msd", "eng_long", "slo_msd", "slo_long", "slo_very_long"]
+        )
+
+    def slo_msd_to_eng_long(self, slo_msd):
+        """Return the ``eng_long`` value of the first row whose ``slo_msd`` matches.
+
+        Raises IndexError when no row of the table matches ``slo_msd``.
+        """
+        matches = self.msd_table[self.msd_table["slo_msd"] == slo_msd]
+        return matches["eng_long"].values[0]
+
+    def pos_slo_ang_map(self, col, query):
+        """Return the first part-of-speech row whose column ``col`` equals ``query``.
+
+        Raises ValueError when no row matches.
+        """
+        for pos in self.pos_slo_ang:
+            if pos[col] == query:
+                return pos
+        raise ValueError("Wrong part of speech.")
+
+    def msd_from_slo(self, msd):
+        """Split a Slovene MSD tag into its category row and attribute rows.
+
+        The first character selects the part of speech; each remaining
+        character other than "-" is looked up via ``pos_val_map``.
+
+        NOTE(review): ``pos_val_map`` is not defined in this class as shown
+        here, so this call fails with AttributeError for any tag that has
+        attribute characters -- confirm where the helper is meant to live.
+        """
+        pos = self.pos_slo_ang_map(1, msd[0])
+        category = pos[2]
+        attr = [self.pos_val_map(category, 1, m)
+                for m in msd[1:] if m != "-"]
+        return (pos, attr)
+
+
+if __name__ == "__main__":
+    # Smoke test: print each sample Slovene MSD tag followed by the long
+    # English description found in the table, then an empty line.
+    msdmap = Msdmap()
+    for sample in ("Soser", "Ppnzmm", "Gp-d-mz"):
+        print(sample)
+        print(msdmap.slo_msd_to_eng_long(sample))
+        print()
@@ -1,315 +0,0 @@
-# msd mappings from slo to ang
-class Msdmap():
-    """Hard-coded lookup tables translating Slovene MSD tag codes to English."""
-
-    def __init__(self):
-        # http://nl.ijs.si/ME/V4/msd/html/msd.categories-sl.html
-        # Row layout: (category (sl), code (sl), category (en), code (en)).
-        self.pos_slo_ang = [
-            ("samostalnik", "S", "Noun", "N"),
-            ("glagol", "G", "Verb", "V"),
-            ("pridevnik", "P", "Adjective", "A"),
-            ("prislov", "R", "Adverb", "R"),
-            ("zaimek", "Z", "Pronoun", "P"),
-            ("števnik", "K", "Numeral", "M"),
-            ("predlog", "D", "Adposition", "S"),
-            ("veznik", "V", "Conjunction", "C"),
-            ("členek", "L", "Particle", "Q"),
-            ("medmet", "M", "Interjection", "I"),
-            ("okrajšava", "O", "Abbreviation", "Y"),
-            ("neuvrščeno", "N", "Residual", "X"),
-        ]
-
-        # http://nl.ijs.si/ME/V4/msd/html/msd.values-sl.html
-        # col:
-        # (Value (sl), Code (sl), Attribute (sl), Category (sl),
-        #   Value (en), Code (en), Attribute (en), Category (en))
-        # pos_val_map() filters on index 7 (English category) and then on a
-        # caller-chosen column.
-        self.pos_val = [
-            ("arabski", "a", "zapis", "števnik",
-                "digit", "d", "Form", "Numeral"),
-            ("besedni", "b", "zapis", "števnik",
-                "letter", "l", "Form", "Numeral"),
-            ("celostni", "c", "vrsta", "zaimek",
-                "general", "g", "Type", "Pronoun"),
-            ("da", "d", "določnost", "pridevnik",
-                "yes", "y", "Definiteness", "Adjective"),
-            ("da", "d", "določnost", "števnik",
-                "yes", "y", "Definiteness", "Numeral"),
-            ("da", "d", "živost", "samostalnik",
-                "yes", "y", "Animate", "Noun"),
-            ("dajalnik", "d", "sklon", "predlog",
-                "dative", "d", "Case", "Adposition"),
-            ("dajalnik", "d", "sklon", "pridevnik",
-                "dative", "d", "Case", "Adjective"),
-            ("dajalnik", "d", "sklon", "samostalnik",
-                "dative", "d", "Case", "Noun"),
-            ("dajalnik", "d", "sklon", "zaimek",
-                "dative", "d", "Case", "Pronoun"),
-            ("dajalnik", "d", "sklon", "števnik",
-                "dative", "d", "Case", "Numeral"),
-            ("deležje", "d", "vrsta", "prislov",
-                "participle", "r", "Type", "Adverb"),
-            ("deležnik", "d", "oblika", "glagol",
-                "participle", "p", "VForm", "Verb"),
-            ("deležniški", "d", "vrsta", "pridevnik",
-                "participle", "p", "Type", "Adjective"),
-            ("dovršni", "d", "vid", "glagol",
-                "perfective", "e", "Aspect", "Verb"),
-            ("druga", "d", "oseba", "glagol",
-                "second", "2", "Person", "Verb"),
-            ("druga", "d", "oseba", "zaimek",
-                "second", "2", "Person", "Pronoun"),
-            ("drugi", "d", "vrsta", "števnik",
-                "special", "s", "Type", "Numeral"),
-            ("dvojina", "d", "število", "glagol",
-                "dual", "d", "Number", "Verb"),
-            ("dvojina", "d", "število", "pridevnik",
-                "dual", "d", "Number", "Adjective"),
-            ("dvojina", "d", "število", "samostalnik",
-                "dual", "d", "Number", "Noun"),
-            ("dvojina", "d", "število", "zaimek",
-                "dual", "d", "Number", "Pronoun"),
-            ("dvojina", "d", "število", "števnik",
-                "dual", "d", "Number", "Numeral"),
-            ("dvojina", "d", "število_svojine", "zaimek",
-                "dual", "d", "Owner_Number", "Pronoun"),
-            ("dvovidski", "v", "vid", "glagol",
-                "biaspectual", "b", "Aspect", "Verb"),
-            ("ednina", "e", "število", "glagol",
-                "singular", "s", "Number", "Verb"),
-            ("ednina", "e", "število", "pridevnik",
-                "singular", "s", "Number", "Adjective"),
-            ("ednina", "e", "število", "samostalnik",
-                "singular", "s", "Number", "Noun"),
-            ("ednina", "e", "število", "zaimek",
-                "singular", "s", "Number", "Pronoun"),
-            ("ednina", "e", "število", "števnik",
-                "singular", "s", "Number", "Numeral"),
-            ("ednina", "e", "število_svojine", "zaimek",
-                "singular", "s", "Owner_Number", "Pronoun"),
-            ("glavni", "g", "vrsta", "glagol",
-                "main", "m", "Type", "Verb"),
-            ("glavni", "g", "vrsta", "števnik",
-                "cardinal", "c", "Type", "Numeral"),
-            ("imenovalnik", "i", "sklon", "predlog",
-                "nominative", "n", "Case", "Adposition"),
-            ("imenovalnik", "i", "sklon", "pridevnik",
-                "nominative", "n", "Case", "Adjective"),
-            ("imenovalnik", "i", "sklon", "samostalnik",
-                "nominative", "n", "Case", "Noun"),
-            ("imenovalnik", "i", "sklon", "zaimek",
-                "nominative", "n", "Case", "Pronoun"),
-            ("imenovalnik", "i", "sklon", "števnik",
-                "nominative", "n", "Case", "Numeral"),
-            ("kazalni", "k", "vrsta", "zaimek",
-                "demonstrative", "d", "Type", "Pronoun"),
-            ("klitična", "k", "naslonskost", "zaimek",
-                "yes", "y", "Clitic", "Pronoun"),
-            ("lastno_ime", "l", "vrsta", "samostalnik",
-                "proper", "p", "Type", "Noun"),
-            ("mestnik", "m", "sklon", "predlog",
-                "locative", "l", "Case", "Adposition"),
-            ("mestnik", "m", "sklon", "pridevnik",
-                "locative", "l", "Case", "Adjective"),
-            ("mestnik", "m", "sklon", "samostalnik",
-                "locative", "l", "Case", "Noun"),
-            ("mestnik", "m", "sklon", "zaimek",
-                "locative", "l", "Case", "Pronoun"),
-            ("mestnik", "m", "sklon", "števnik",
-                "locative", "l", "Case", "Numeral"),
-            ("množina", "m", "število", "glagol",
-                "plural", "p", "Number", "Verb"),
-            ("množina", "m", "število", "pridevnik",
-                "plural", "p", "Number", "Adjective"),
-            ("množina", "m", "število", "samostalnik",
-                "plural", "p", "Number", "Noun"),
-            ("množina", "m", "število", "zaimek",
-                "plural", "p", "Number", "Pronoun"),
-            ("množina", "m", "število", "števnik",
-                "plural", "p", "Number", "Numeral"),
-            ("množina", "m", "število_svojine", "zaimek",
-                "plural", "p", "Owner_Number", "Pronoun"),
-            ("moški", "m", "spol", "glagol",
-                "masculine", "m", "Gender", "Verb"),
-            ("moški", "m", "spol", "pridevnik",
-                "masculine", "m", "Gender", "Adjective"),
-            ("moški", "m", "spol", "samostalnik",
-                "masculine", "m", "Gender", "Noun"),
-            ("moški", "m", "spol", "zaimek",
-                "masculine", "m", "Gender", "Pronoun"),
-            ("moški", "m", "spol", "števnik",
-                "masculine", "m", "Gender", "Numeral"),
-            ("moški", "m", "spol_svojine", "zaimek",
-                "masculine", "m", "Owner_Gender", "Pronoun"),
-            ("namenilnik", "m", "oblika", "glagol",
-                "supine", "u", "VForm", "Verb"),
-            ("navezna", "z", "naslonskost", "zaimek",
-                "bound", "b", "Clitic", "Pronoun"),
-            ("ne", "n", "določnost", "pridevnik",
-                "no", "n", "Definiteness", "Adjective"),
-            ("ne", "n", "določnost", "števnik",
-                "no", "n", "Definiteness", "Numeral"),
-            ("ne", "n", "živost", "samostalnik",
-                "no", "n", "Animate", "Noun"),
-            ("nedoločeno", "n", "stopnja", "pridevnik",
-                "positive", "p", "Degree", "Adjective"),
-            ("nedoločeno", "n", "stopnja", "prislov",
-                "positive", "p", "Degree", "Adverb"),
-            ("nedoločni", "n", "vrsta", "zaimek",
-                "indefinite", "i", "Type", "Pronoun"),
-            ("nedoločnik", "n", "oblika", "glagol",
-                "infinitive", "n", "VForm", "Verb"),
-            ("nedovršni", "n", "vid", "glagol",
-                "progressive", "p", "Aspect", "Verb"),
-            ("nezanikani", "n", "nikalnost", "glagol",
-                "no", "n", "Negative", "Verb"),
-            ("nikalni", "l", "vrsta", "zaimek",
-                "negative", "z", "Type", "Pronoun"),
-            ("občno_ime", "o", "vrsta", "samostalnik",
-                "common", "c", "Type", "Noun"),
-            ("orodnik", "o", "sklon", "predlog",
-                "instrumental", "i", "Case", "Adposition"),
-            ("orodnik", "o", "sklon", "pridevnik",
-                "instrumental", "i", "Case", "Adjective"),
-            ("orodnik", "o", "sklon", "samostalnik",
-                "instrumental", "i", "Case", "Noun"),
-            ("orodnik", "o", "sklon", "zaimek",
-                "instrumental", "i", "Case", "Pronoun"),
-            ("orodnik", "o", "sklon", "števnik",
-                "instrumental", "i", "Case", "Numeral"),
-            ("osebni", "o", "vrsta", "zaimek",
-                "personal", "p", "Type", "Pronoun"),
-            ("oziralni", "z", "vrsta", "zaimek",
-                "relative", "r", "Type", "Pronoun"),
-            ("podredni", "d", "vrsta", "veznik",
-                "subordinating", "s", "Type", "Conjunction"),
-            ("pogojnik", "g", "oblika", "glagol",
-                "conditional", "c", "VForm", "Verb"),
-            ("pomožni", "p", "vrsta", "glagol",
-                "auxiliary", "a", "Type", "Verb"),
-            ("povratni", "p", "vrsta", "zaimek",
-                "reflexive", "x", "Type", "Pronoun"),
-            ("presežnik", "s", "stopnja", "pridevnik",
-                "superlative", "s", "Degree", "Adjective"),
-            ("presežnik", "s", "stopnja", "prislov",
-                "superlative", "s", "Degree", "Adverb"),
-            ("prihodnjik", "p", "oblika", "glagol",
-                "future", "f", "VForm", "Verb"),
-            ("primernik", "p", "stopnja", "pridevnik",
-                "comparative", "c", "Degree", "Adjective"),
-            ("primernik", "r", "stopnja", "prislov",
-                "comparative", "c", "Degree", "Adverb"),
-            ("priredni", "p", "vrsta", "veznik",
-                "coordinating", "c", "Type", "Conjunction"),
-            ("program", "p", "vrsta", "neuvrščeno",
-                "program", "p", "Type", "Residual"),
-            ("prva", "p", "oseba", "glagol",
-                "first", "1", "Person", "Verb"),
-            ("prva", "p", "oseba", "zaimek",
-                "first", "1", "Person", "Pronoun"),
-            ("rimski", "r", "zapis", "števnik",
-                "roman", "r", "Form", "Numeral"),
-            ("rodilnik", "r", "sklon", "predlog",
-                "genitive", "g", "Case", "Adposition"),
-            ("rodilnik", "r", "sklon", "pridevnik",
-                "genitive", "g", "Case", "Adjective"),
-            ("rodilnik", "r", "sklon", "samostalnik",
-                "genitive", "g", "Case", "Noun"),
-            ("rodilnik", "r", "sklon", "zaimek",
-                "genitive", "g", "Case", "Pronoun"),
-            ("rodilnik", "r", "sklon", "števnik",
-                "genitive", "g", "Case", "Numeral"),
-            ("sedanjik", "s", "oblika", "glagol",
-                "present", "r", "VForm", "Verb"),
-            ("splošni", "p", "vrsta", "pridevnik",
-                "general", "g", "Type", "Adjective"),
-            ("splošni", "s", "vrsta", "prislov",
-                "general", "g", "Type", "Adverb"),
-            ("srednji", "s", "spol", "glagol",
-                "neuter", "n", "Gender", "Verb"),
-            ("srednji", "s", "spol", "pridevnik",
-                "neuter", "n", "Gender", "Adjective"),
-            ("srednji", "s", "spol", "samostalnik",
-                "neuter", "n", "Gender", "Noun"),
-            ("srednji", "s", "spol", "zaimek",
-                "neuter", "n", "Gender", "Pronoun"),
-            ("srednji", "s", "spol", "števnik",
-                "neuter", "n", "Gender", "Numeral"),
-            ("srednji", "s", "spol_svojine", "zaimek",
-                "neuter", "n", "Owner_Gender", "Pronoun"),
-            ("svojilni", "s", "vrsta", "pridevnik",
-                "possessive", "s", "Type", "Adjective"),
-            ("svojilni", "s", "vrsta", "zaimek",
-                "possessive", "s", "Type", "Pronoun"),
-            ("tipkarska", "t", "vrsta", "neuvrščeno",
-                "typo", "t", "Type", "Residual"),
-            ("tožilnik", "t", "sklon", "predlog",
-                "accusative", "a", "Case", "Adposition"),
-            ("tožilnik", "t", "sklon", "pridevnik",
-                "accusative", "a", "Case", "Adjective"),
-            ("tožilnik", "t", "sklon", "samostalnik",
-                "accusative", "a", "Case", "Noun"),
-            ("tožilnik", "t", "sklon", "zaimek",
-                "accusative", "a", "Case", "Pronoun"),
-            ("tožilnik", "t", "sklon", "števnik",
-                "accusative", "a", "Case", "Numeral"),
-            ("tretja", "t", "oseba", "glagol",
-                "third", "3", "Person", "Verb"),
-            ("tretja", "t", "oseba", "zaimek",
-                "third", "3", "Person", "Pronoun"),
-            ("tujejezično", "j", "vrsta", "neuvrščeno",
-                "foreign", "f", "Type", "Residual"),
-            ("velelnik", "v", "oblika", "glagol",
-                "imperative", "m", "VForm", "Verb"),
-            ("vprašalni", "v", "vrsta", "zaimek",
-                "interrogative", "q", "Type", "Pronoun"),
-            ("vrstilni", "v", "vrsta", "števnik",
-                "ordinal", "o", "Type", "Numeral"),
-            ("zaimkovni", "z", "vrsta", "števnik",
-                "pronominal", "p", "Type", "Numeral"),
-            ("zanikani", "d", "nikalnost", "glagol",
-                "yes", "y", "Negative", "Verb"),
-            ("ženski", "z", "spol", "glagol",
-                "feminine", "f", "Gender", "Verb"),
-            ("ženski", "z", "spol", "pridevnik",
-                "feminine", "f", "Gender", "Adjective"),
-            ("ženski", "z", "spol", "samostalnik",
-                "feminine", "f", "Gender", "Noun"),
-            ("ženski", "z", "spol", "zaimek",
-                "feminine", "f", "Gender", "Pronoun"),
-            ("ženski", "z", "spol", "števnik",
-                "feminine", "f", "Gender", "Numeral"),
-            ("ženski", "z", "spol_svojine", "zaimek",
-                "feminine", "f", "Owner_Gender", "Pronoun"),
-        ]
-
-    def pos_slo_ang_map(self, col, query):
-        """Return the first part-of-speech row whose column ``col`` equals ``query``.
-
-        Raises ValueError when no row matches.
-        """
-        for pos in self.pos_slo_ang:
-            if pos[col] == query:
-                return pos
-        raise ValueError("Wrong part of speech.")
-
-    def pos_val_map(self, en_category, col, query):
-        """Return the first ``pos_val`` row for ``en_category`` whose column ``col`` equals ``query``.
-
-        On a miss, the three arguments are printed to stdout after a "---"
-        line and ValueError is raised.
-        """
-        for pos in self.pos_val:
-            if pos[7] == en_category and pos[col] == query:
-                return pos
-        # Debug output identifying the failed lookup before the error is raised.
-        print("---")
-        print(en_category)
-        print(col)
-        print(query)
-        raise ValueError("Wrong part of speech value.")
-
-    def msd_from_slo(self, msd):
-        """Decode a Slovene MSD tag into ``(category_row, [attribute_rows])``.
-
-        The first character selects the part of speech by its Slovene code;
-        every later character except "-" is looked up by Slovene code within
-        that category.
-
-        NOTE(review): the lookup ignores the character's position in the tag,
-        so the first table row with a matching category and code wins even
-        when several attributes share that code -- confirm this is intended.
-        """
-        pos = self.pos_slo_ang_map(1, msd[0])
-        category = pos[2]
-        attr = [self.pos_val_map(category, 1, m)
-                for m in msd[1:] if m != "-"]
-        return pos, attr
-
-
-if __name__ == "__main__":
-    # Manual check: decode a few sample Slovene MSD tags and print each result.
-    # A fresh Msdmap is built for every sample.
-    test_msds = ["Soser", "Ppnzmm", "Gp-d-mz"]
-    for msd in test_msds:
-        decoded = Msdmap().msd_from_slo(msd)
-        print(decoded)
@@ -1,144 +0,0 @@
-#!/usr/bin/python3
-
-from __future__ import print_function, unicode_literals, division
-import sys
-import os
-import re
-import pickle
-from pathlib import Path
-
-try:
-    from lxml import etree as ElementTree
-except ImportError:
-    import xml.etree.ElementTree as ElementTree
-
-
-# attributes
-# XML attribute names; only LEMMA_ATTR is read by the code visible in this file.
-ID_ATTR = "id"
-LEMMA_ATTR = "lemma"
-ANA_ATTR = "ana"
-
-
-# tags
-# XML element names. Sentence.handle_word treats S_TAG as a space marker,
-# recurses into SEG_TAG, and emits a token row for WORD_TAG and C_TAG;
-# convert_file keeps only children tagged SENTENCE_TAG.
-SENTENCE_TAG = 's'
-BIBL_TAG = 'bibl'
-PARAGRAPH_TAG = 'p'
-PC_TAG = 'pc'
-WORD_TAG = 'w'
-C_TAG = 'c'
-S_TAG = 'S'
-SEG_TAG = 'seg'
-
-
-class Sentence:
-    """One sentence element flattened into token rows plus its surface text.
-
-    ``words`` holds one ``[id, token, lemma, xpos]`` list per emitted token and
-    ``text`` accumulates the token texts and space markers in document order.
-    """
-
-    def __init__(self, sentence, s_id):
-        self.id = s_id
-        self.words = []
-        self.text = ""
-
-        for child in sentence:
-            self.handle_word(child)
-
-    def handle_word(self, word):
-        """Process one child element of the sentence (recursing into segments)."""
-        tag = word.tag
-
-        # space marker: only contributes a blank to the running text
-        if tag == S_TAG:
-            assert(word.text is None)
-            self.text += ' '
-            return
-
-        # segment: handle every contained element in turn
-        # (open question from the author: is this the right treatment?)
-        if tag == SEG_TAG:
-            for inner in word:
-                self.handle_word(inner)
-            return
-
-        # any tag other than a word or a character element is ignored
-        # (open question from the author: are there other tags to handle?)
-        if tag != WORD_TAG and tag != C_TAG:
-            return
-
-        # 1-based position of this token within the sentence
-        position = str(len(self.words) + 1)
-        token = word.text
-
-        # words must carry a lemma; other tokens use their own text
-        if tag == WORD_TAG:
-            lemma = word.get(LEMMA_ATTR)
-            assert(lemma is not None)
-        else:
-            lemma = token
-
-        # tag from the msd attribute, with fixed overrides
-        xpos = word.get('msd')
-        if tag == C_TAG:
-            xpos = "Z"
-        elif xpos in ("Gp-ppdzn", "Gp-spmzd"):
-            xpos = "N"
-        elif xpos is None:
-            # missing msd: report the sentence id and keep going
-            print(self.id)
-
-        row = ['F{}.{}'.format(self.id, position), token, lemma, xpos]
-        self.words.append(row)
-        self.text += token
-
-
-    def to_conllu(self):
-        """Return one tab-joined line per token; None fields become "_".
-
-        No "# sent_id" / "# text" header lines are produced.
-        """
-        return [
-            '\t'.join('_' if field is None else field for field in row)
-            for row in self.words
-        ]
-
-def convert_file(in_file, out_file):
-    """Convert one XML file into tab-separated token lines written to ``out_file``.
-
-    The first default-namespace declaration and every " xml:" attribute prefix
-    are stripped before parsing. Each sentence found under ``body/p`` is
-    rendered through ``Sentence.to_conllu`` and followed by an empty line.
-
-    Raises RuntimeError when no output lines were produced.
-    """
-    print("Nalaganje xml: {}".format(in_file))
-    with open(str(in_file), 'r') as source:
-        raw = source.read()
-        without_ns = re.sub(' xmlns="[^"]+"', '', raw, count=1)
-        xml_tree = ElementTree.XML(without_ns.replace(' xml:', ' '))
-
-    print("Pretvarjanje TEI -> TSV-U ...")
-    lines = []
-
-    paragraph_no = 0
-    for paragraph in xml_tree.iterfind('.//body/p'):
-        paragraph_no += 1
-        # sentences are numbered from 1 within each paragraph; other children
-        # do not consume a number
-        sentence_elements = [el for el in paragraph if el.tag == SENTENCE_TAG]
-        for sentence_no, element in enumerate(sentence_elements, 1):
-            converted = Sentence(element, "{}.{}".format(paragraph_no, sentence_no))
-            lines += converted.to_conllu()
-            lines.append('')  # empty line between sentences
-
-    if not lines:
-        raise RuntimeError("Nobenih stavkov najdenih")
-
-    print("Zapisovanje izhodne datoteke: {}".format(out_file))
-    legacy_python = sys.version_info < (3, 0)
-    with open(out_file, 'w') as target:
-        for line in lines:
-            # Python 2 needs bytes when printing to a file opened this way
-            print(line.encode('utf-8') if legacy_python else line, file=target)
-
-
-if __name__ == "__main__":
-    # Input: folder of TEI files, msds are encoded as msd="Z"
-    # Output: just a folder
-    #
-    # Arguments: <in_folder> <out_folder> <num_processes>
-    in_folder, out_folder = sys.argv[1], sys.argv[2]
-    num_processes = int(sys.argv[3])
-
-    in_out = []
-    # every *.xml below in_folder becomes <out_folder>/<name without .xml>.txt
-    for filename in Path(in_folder).rglob("*.xml"):
-        stem = filename.name[:-4]
-        out_file = out_folder + "/" + stem + ".txt"
-        convert_file(filename, out_file)
@@ -122,30 +122,35 @@ def to_conll09(sentence_entry):
    for token in sentence_entry["tokens"]:
        if token[0] != "w":
            continue
+        t_id = token[1]
        msd = msdm.msd_from_slo(token[4])
-        fprd = fillpred("todo", "todo")
+        fprd = fillpred("TODO", "todo")
+
+        """
+        print(t_id)
+        print("msd:")
        print(msd)
        print(token)
        print(sentence_entry["links"])
-        t_id = token[1]
-        print(t_id)
+        """
+
        #            1   3   4   5   6   7   8   9  10  11  12  13  14
        out_str += "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(
            t_id,  # id
            token[2],  # form
            token[3],  # lemma
            token[3],  # plemma
-            "todo",  # pos (TODO)
-            "todo",  # ppos (TODO)
-            "todo",  # feat (TODO)
-            "todo",  # pfeat (TODO)
+            msd[0][3],  # pos
+            msd[0][3],  # ppos
+            "|".join([msd[0][2]] + [el[4] for el in msd[1]]),  # feat
+            "|".join([msd[0][2]] + [el[4] for el in msd[1]]),  # pfeat
            sentence_entry["links"][t_id][2],  # head
            sentence_entry["links"][t_id][2],  # phead
-            sentence_entry["links"][t_id][1],  # deprel
-            sentence_entry["links"][t_id][1],  # pdeprel
+            sentence_entry["links"][t_id][0],  # deprel
+            sentence_entry["links"][t_id][0],  # pdeprel
            fprd,  # fillpred
            (token[3] if fprd == "Y" else "_"),  # pred
-            "todo"  # apredn...
        )
    out_str += "\n"
+    print(out_str)
    return out_str