Added some functions for compatibility with valency, fixed readme and fixed some minor bugs.

i2198
Luka 4 years ago
parent 1b0e6a27eb
commit 01b08667d2

@ -26,18 +26,22 @@ pip install -r requirements.txt
# Running
```bash
python3 wani.py <LOCATION TO STRUCTURES> <EXTRACTION TEXT> --out <RESULTS FILE> --sloleks_db <PATH TO SLOLEKS DB>
python3 wani.py ../data/Kolokacije_strukture_JOS-32-representation_3D_08.xml ../data/ssj500k-sl.body.small.xml --out ../data/izhod.csv --sloleks_db luka:akul:superdb_small:127.0.0.1 --collocation_sentence_map_dest ../data/collocation_sentence_mapper --db /mnt/tmp/mysql-wani-ssj500k
python3 wani.py <LOCATION TO STRUCTURES> <EXTRACTION TEXT> --out <RESULTS FILE>
```
## Most important optional parameters
### --sloleks_db
This parameter may be used if you have access to sloleks_db. The parameter is useful when lemma_fallback would be shown in the results file, because if you have sloleks_db the script looks into this database to find the correct replacement.
To use this, sqlalchemy has to be installed as well.
PATH TO SLOLEKS DB
This parameter has to include information about the database in the following order:
<DB_USERNAME>:<DB_PASSWORD>:<DB_NAME>:<DB_URL>
### --collocation_sentence_map_dest
../data/collocation_sentence_mapper
If a value for this parameter exists (it should be a string path to a directory), files will be generated that include links between collocation ids and sentence ids.
### --db
This is the path to a file which will contain an sqlite database with internal states. It is used to save internal states in case the code gets modified.

@ -135,6 +135,7 @@ class Component:
# matches for every component in links from this component
to_ret = []
# need to get all links that match
for next, link, order in self.next_element:
next_links = word.get_links(link)
@ -146,6 +147,9 @@ class Component:
if not order.match(word, next_word):
continue
if word.lemma == 'aktivirati' and next_word.text == 'potomcih':
a = 0
match = next.match(next_word)
if match is not None:

@ -82,7 +82,7 @@ class AllFormatter(Formatter):
word = words[idx]
return [word.id, word.text, word.lemma, word.msd]
def content_right(self, _freq):
def content_right(self, _freq, variable_word_order=None):
return []
def group(self):

@ -8,7 +8,7 @@ def get_lemma_features(et):
result = {}
for pos in lf.iter('POS'):
rgx_list = MorphologyRegex(pos).rgx
rgx_list = MorphologyRegex(pos).rgxs[0]
rgx_str = ""
for position in rgx_list:
if position == ".":

@ -10,7 +10,10 @@ class RestrictionType(Enum):
MatchAll = 2
def determine_ppb(rgx):
def determine_ppb(rgxs):
if len(rgxs) != 1:
return 0
rgx = rgxs[0]
if rgx[0] in ("A", "N", "R"):
return 0
elif rgx[0] == "V":
@ -27,7 +30,7 @@ def determine_ppb(rgx):
class MorphologyRegex:
def __init__(self, restriction):
self.min_msd_length = 1
# self.min_msd_length = 1
restr_dict = {}
for feature in restriction:
@ -44,45 +47,75 @@ class MorphologyRegex:
restr_dict[key] = (value, match_type)
assert 'POS' in restr_dict
category = restr_dict['POS'][0].capitalize()
cat_code = CODES[category]
rgx = [cat_code] + ['.'] * 10
# handle multiple word types
if '|' in restr_dict['POS'][0]:
categories = restr_dict['POS'][0].split('|')
else:
categories = [restr_dict['POS'][0]]
self.rgxs = []
self.re_objects = []
self.min_msd_lengths = []
del restr_dict['POS']
for attribute, (value, typ) in restr_dict.items():
index = TAGSET[cat_code].index(attribute.lower())
assert index >= 0
for category in categories:
min_msd_length = 1
category = category.capitalize()
cat_code = CODES[category]
rgx = [cat_code] + ['.'] * 10
if '|' in value:
match = "".join(CODES[val] for val in value.split('|'))
else:
match = CODES[value]
match = "[{}{}]".format("" if typ else "^", match)
rgx[index + 1] = match
for attribute, (value, typ) in restr_dict.items():
if attribute.lower() not in TAGSET[cat_code]:
continue
index = TAGSET[cat_code].index(attribute.lower())
assert index >= 0
if typ:
self.min_msd_length = max(index + 1, self.min_msd_length)
if '|' in value:
match = "".join(CODES[val] for val in value.split('|'))
else:
match = CODES[value]
# strip rgx
for i in reversed(range(len(rgx))):
if rgx[i] == '.':
rgx = rgx[:-1]
else:
break
match = "[{}{}]".format("" if typ else "^", match)
rgx[index + 1] = match
self.re_objects = [re.compile(r) for r in rgx]
self.rgx = rgx
if typ:
min_msd_length = max(index + 1, min_msd_length)
# strip rgx
for i in reversed(range(len(rgx))):
if rgx[i] == '.':
rgx = rgx[:-1]
else:
break
self.re_objects.append([re.compile(r) for r in rgx])
self.rgxs.append(rgx)
self.min_msd_lengths.append(min_msd_length)
# self.re_objects = [re.compile(r) for r in rgx]
# self.rgx = rgx
def __call__(self, text):
if len(text) <= self.min_msd_length:
return False
for c, r in zip(text, self.re_objects):
if not r.match(c):
return False
return True
# if len(text) <= self.min_msd_length:
# return False
# if len(self.rgxs[0]) > 1 and len(self.rgxs) > 1:
# a = 1
for i, re_object in enumerate(self.re_objects):
if len(text) <= self.min_msd_lengths[i]:
continue
match = True
for c, r in zip(text, re_object):
if not r.match(c):
match = False
break
if match:
return True
return False
class LexisRegex:
@ -111,7 +144,7 @@ class Restriction:
if restriction_type == "morphology":
self.type = RestrictionType.Morphology
self.matcher = MorphologyRegex(list(restriction_tag))
self.ppb = determine_ppb(self.matcher.rgx)
self.ppb = determine_ppb(self.matcher.rgxs)
elif restriction_type == "lexis":
self.type = RestrictionType.Lexis

@ -134,7 +134,7 @@ if __name__ == '__main__':
action='store_true')
parser.add_argument('--load-sloleks',
help='Tells whether sloleks is loaded into memory at the beginning of processing or not.',
help='Tells whether sloleks is loaded into memory at the beginning of processing or not. Should be in',
action='store_true')
parser.add_argument('--sort-by',

@ -36,7 +36,9 @@ class Word:
self.lemma = lemma
self.msd = MSD_TRANSLATE[msd] if do_msd_translate else msd
self.id = wid
self.idi = None
self.text = text
self.glue = ''
self.links = defaultdict(list)

Loading…
Cancel
Save