Added some functions for compatibility with valency, fixed readme and fixed some minor bugs.

parent 1b0e6a27eb
commit 01b08667d2

README.md (12 changed lines)
@@ -26,18 +26,22 @@ pip install -r requirements.txt
 # Running
 
 ```bash
-python3 wani.py <LOCATION TO STRUCTURES> <EXTRACTION TEXT> --out <RESULTS FILE> --sloleks_db <PATH TO SLOLEKS DB>
-python3 wani.py ../data/Kolokacije_strukture_JOS-32-representation_3D_08.xml ../data/ssj500k-sl.body.small.xml --out ../data/izhod.csv --sloleks_db luka:akul:superdb_small:127.0.0.1 --collocation_sentence_map_dest ../data/collocation_sentence_mapper --db /mnt/tmp/mysql-wani-ssj500k
+python3 wani.py <LOCATION TO STRUCTURES> <EXTRACTION TEXT> --out <RESULTS FILE>
 ```
 
 ## Most important optional parameters
 
 ### --sloleks_db
+This parameter may be used if you have access to the Sloleks database. It is useful when lemma_fallback would otherwise appear in the results file: if the database is available, the script looks up the correct replacement in it.
+
 To use this, sqlalchemy has to be installed as well.
-PATH TO SLOLEKS DB
+The parameter has to include the database information in the following order:
+
+<DB_USERNAME>:<DB_PASSWORD>:<DB_NAME>:<DB_URL>
 
 ### --collocation_sentence_map_dest
-../data/collocation_sentence_mapper
+If a value is given for this parameter (it should be a string path to a directory), files are generated that link collocation ids to sentence ids.
 
 ### --db
 This is the path to a file that will contain an sqlite database with internal states. It is used to save internal states in case the code gets modified.
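The colon-separated value documented for `--sloleks_db` maps naturally onto an SQLAlchemy connection URL. The sketch below is only an illustration of that format, assuming a MySQL backend and the `pymysql` driver; the helper name and dialect are not taken from the repository:

```python
from sqlalchemy import create_engine


def sloleks_engine(arg):
    # arg follows <DB_USERNAME>:<DB_PASSWORD>:<DB_NAME>:<DB_URL>,
    # e.g. "luka:akul:superdb_small:127.0.0.1"
    username, password, db_name, host = arg.split(':')
    # dialect and driver are assumptions, not necessarily the project's choice
    return create_engine(f"mysql+pymysql://{username}:{password}@{host}/{db_name}")


engine = sloleks_engine("luka:akul:superdb_small:127.0.0.1")
```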
@@ -135,6 +135,7 @@ class Component:
         # matches for every component in links from this component
         to_ret = []
+
 
         # need to get all links that match
         for next, link, order in self.next_element:
             next_links = word.get_links(link)
@@ -146,6 +147,9 @@ class Component:
             if not order.match(word, next_word):
                 continue
 
+            if word.lemma == 'aktivirati' and next_word.text == 'potomcih':
+                a = 0
+
             match = next.match(next_word)
 
             if match is not None:
@@ -82,7 +82,7 @@ class AllFormatter(Formatter):
         word = words[idx]
         return [word.id, word.text, word.lemma, word.msd]
 
-    def content_right(self, _freq):
+    def content_right(self, _freq, variable_word_order=None):
         return []
 
     def group(self):
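Giving `content_right` a defaulted `variable_word_order=None` parameter keeps old call sites working while allowing newer callers to pass a word order. A standalone sketch of that pattern (class name and return values are illustrative, not the project's):

```python
class ExampleFormatter:
    def content_right(self, freq, variable_word_order=None):
        # old callers keep passing only the frequency
        if variable_word_order is None:
            return [str(freq)]
        # newer callers can additionally pass the order of the variable component
        return [str(freq), "-".join(str(i) for i in variable_word_order)]


print(ExampleFormatter().content_right(42))          # ['42']
print(ExampleFormatter().content_right(42, [2, 1]))  # ['42', '2-1']
```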
@@ -8,7 +8,7 @@ def get_lemma_features(et):
 
     result = {}
     for pos in lf.iter('POS'):
-        rgx_list = MorphologyRegex(pos).rgx
+        rgx_list = MorphologyRegex(pos).rgxs[0]
         rgx_str = ""
         for position in rgx_list:
             if position == ".":
@@ -10,7 +10,10 @@ class RestrictionType(Enum):
     MatchAll = 2
 
 
-def determine_ppb(rgx):
+def determine_ppb(rgxs):
+    if len(rgxs) != 1:
+        return 0
+    rgx = rgxs[0]
     if rgx[0] in ("A", "N", "R"):
         return 0
     elif rgx[0] == "V":
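With the list-based signature, `determine_ppb` only ranks restrictions that expand to a single POS alternative; anything else falls back to 0. A minimal self-contained illustration of that contract (the verb branch returns a placeholder value here, since its real return value lies outside this hunk):

```python
def determine_ppb_sketch(rgxs):
    if len(rgxs) != 1:
        return 0          # several (or zero) POS alternatives: no single POS to rank
    rgx = rgxs[0]
    if rgx[0] in ("A", "N", "R"):
        return 0
    elif rgx[0] == "V":
        return 1          # placeholder; the real value is not shown in the diff
    return 0


print(determine_ppb_sketch([["N", ".", "."]]))         # 0
print(determine_ppb_sketch([["N", "."], ["V", "."]]))  # 0, more than one alternative
```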
@@ -27,7 +30,7 @@ def determine_ppb(rgx):
 
 class MorphologyRegex:
     def __init__(self, restriction):
-        self.min_msd_length = 1
+        # self.min_msd_length = 1
 
         restr_dict = {}
         for feature in restriction:
@@ -44,45 +47,75 @@ class MorphologyRegex:
             restr_dict[key] = (value, match_type)
 
         assert 'POS' in restr_dict
-        category = restr_dict['POS'][0].capitalize()
-        cat_code = CODES[category]
-        rgx = [cat_code] + ['.'] * 10
+
+        # handle multiple word types
+        if '|' in restr_dict['POS'][0]:
+            categories = restr_dict['POS'][0].split('|')
+        else:
+            categories = [restr_dict['POS'][0]]
+
+        self.rgxs = []
+        self.re_objects = []
+        self.min_msd_lengths = []
 
         del restr_dict['POS']
 
-        for attribute, (value, typ) in restr_dict.items():
-            index = TAGSET[cat_code].index(attribute.lower())
-            assert index >= 0
-
-            if '|' in value:
-                match = "".join(CODES[val] for val in value.split('|'))
-            else:
-                match = CODES[value]
-
-            match = "[{}{}]".format("" if typ else "^", match)
-            rgx[index + 1] = match
-
-            if typ:
-                self.min_msd_length = max(index + 1, self.min_msd_length)
-
-        # strip rgx
-        for i in reversed(range(len(rgx))):
-            if rgx[i] == '.':
-                rgx = rgx[:-1]
-            else:
-                break
-
-        self.re_objects = [re.compile(r) for r in rgx]
-        self.rgx = rgx
+        for category in categories:
+            min_msd_length = 1
+            category = category.capitalize()
+            cat_code = CODES[category]
+            rgx = [cat_code] + ['.'] * 10
+
+            for attribute, (value, typ) in restr_dict.items():
+                if attribute.lower() not in TAGSET[cat_code]:
+                    continue
+                index = TAGSET[cat_code].index(attribute.lower())
+                assert index >= 0
+
+                if '|' in value:
+                    match = "".join(CODES[val] for val in value.split('|'))
+                else:
+                    match = CODES[value]
+
+                match = "[{}{}]".format("" if typ else "^", match)
+                rgx[index + 1] = match
+
+                if typ:
+                    min_msd_length = max(index + 1, min_msd_length)
+
+            # strip rgx
+            for i in reversed(range(len(rgx))):
+                if rgx[i] == '.':
+                    rgx = rgx[:-1]
+                else:
+                    break
+
+            self.re_objects.append([re.compile(r) for r in rgx])
+            self.rgxs.append(rgx)
+            self.min_msd_lengths.append(min_msd_length)
+
+        # self.re_objects = [re.compile(r) for r in rgx]
+        # self.rgx = rgx
 
     def __call__(self, text):
-        if len(text) <= self.min_msd_length:
-            return False
-
-        for c, r in zip(text, self.re_objects):
-            if not r.match(c):
-                return False
-        return True
+        # if len(text) <= self.min_msd_length:
+        #     return False
+        # if len(self.rgxs[0]) > 1 and len(self.rgxs) > 1:
+        #     a = 1
+        for i, re_object in enumerate(self.re_objects):
+            if len(text) <= self.min_msd_lengths[i]:
+                continue
+            match = True
+
+            for c, r in zip(text, re_object):
+                if not r.match(c):
+                    match = False
+                    break
+            if match:
+                return True
+        return False
 
 
 class LexisRegex:
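The reworked `MorphologyRegex` keeps one list of per-position regexes per POS alternative and reports a match as soon as any alternative fits the MSD tag. The standalone sketch below mirrors that matching loop with made-up codes and lengths; it does not use the project's CODES or TAGSET tables:

```python
import re

# one per-position regex list per POS alternative (codes are made up)
alternatives = [
    [re.compile(c) for c in ("N", ".", "s")],  # e.g. a noun alternative
    [re.compile(c) for c in ("V", ".")],       # e.g. a verb alternative
]
min_msd_lengths = [1, 1]


def msd_matches(msd):
    # a tag matches if every checked position of at least one alternative fits
    for i, alt in enumerate(alternatives):
        if len(msd) <= min_msd_lengths[i]:
            continue
        if all(r.match(c) for c, r in zip(msd, alt)):
            return True
    return False


print(msd_matches("Ncs"))  # True: the noun alternative fits
print(msd_matches("Afp"))  # False: no alternative fits
```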
@@ -111,7 +144,7 @@ class Restriction:
         if restriction_type == "morphology":
             self.type = RestrictionType.Morphology
             self.matcher = MorphologyRegex(list(restriction_tag))
-            self.ppb = determine_ppb(self.matcher.rgx)
+            self.ppb = determine_ppb(self.matcher.rgxs)
 
         elif restriction_type == "lexis":
             self.type = RestrictionType.Lexis
@@ -134,7 +134,7 @@ if __name__ == '__main__':
                         action='store_true')
 
     parser.add_argument('--load-sloleks',
-                        help='Tells weather sloleks is loaded into memory at the beginning of processing or not.',
+                        help='Tells whether sloleks is loaded into memory at the beginning of processing or not. Should be in',
                         action='store_true')
 
     parser.add_argument('--sort-by',
@@ -36,7 +36,9 @@ class Word:
         self.lemma = lemma
         self.msd = MSD_TRANSLATE[msd] if do_msd_translate else msd
         self.id = wid
+        self.idi = None
         self.text = text
+        self.glue = ''
 
         self.links = defaultdict(list)
 