diff --git a/README.md b/README.md index 762db3c..0e4b9dd 100644 --- a/README.md +++ b/README.md @@ -26,18 +26,22 @@ pip install -r requirements.txt # Running ```bash -python3 wani.py --out --sloleks_db -python3 wani.py ../data/Kolokacije_strukture_JOS-32-representation_3D_08.xml ../data/ssj500k-sl.body.small.xml --out ../data/izhod.csv --sloleks_db luka:akul:superdb_small:127.0.0.1 --collocation_sentence_map_dest ../data/collocation_sentence_mapper --db /mnt/tmp/mysql-wani-ssj500k +python3 wani.py --out ``` ## Most important optional parameters ### --sloleks_db +This parameter may be used if you have access to sloleks_db. It is useful when lemma_fallback would otherwise be shown in the results file: if sloleks_db is available, the script looks into this database to find the correct replacement. + To use this sqlalchemy has to be installed as well. -PATH TO SLOLEKS DB + +This parameter has to include information about the database in the following order: + +USERNAME:PASSWORD:DATABASE_NAME:HOST ### --collocation_sentence_map_dest -../data/collocation_sentence_mapper +If a value for this parameter is given (it should be a string path to a directory), files will be generated that include links between collocation ids and sentence ids. ### --db This is path to file which will contain sqlite database with internal states. Used to save internal states in case code gets modified. 
diff --git a/src/component.py b/src/component.py index 05035ad..918d8fa 100644 --- a/src/component.py +++ b/src/component.py @@ -135,6 +135,7 @@ class Component: # matches for every component in links from this component to_ret = [] + # need to get all links that match for next, link, order in self.next_element: next_links = word.get_links(link) @@ -146,6 +147,9 @@ class Component: if not order.match(word, next_word): continue + if word.lemma == 'aktivirati' and next_word.text == 'potomcih': + a = 0 + match = next.match(next_word) if match is not None: diff --git a/src/formatter.py b/src/formatter.py index ad11c65..26f4f12 100644 --- a/src/formatter.py +++ b/src/formatter.py @@ -82,7 +82,7 @@ class AllFormatter(Formatter): word = words[idx] return [word.id, word.text, word.lemma, word.msd] - def content_right(self, _freq): + def content_right(self, _freq, variable_word_order=None): return [] def group(self): diff --git a/src/lemma_features.py b/src/lemma_features.py index 5650fa4..a11ce49 100644 --- a/src/lemma_features.py +++ b/src/lemma_features.py @@ -8,7 +8,7 @@ def get_lemma_features(et): result = {} for pos in lf.iter('POS'): - rgx_list = MorphologyRegex(pos).rgx + rgx_list = MorphologyRegex(pos).rgxs[0] rgx_str = "" for position in rgx_list: if position == ".": diff --git a/src/restriction.py b/src/restriction.py index f521f2f..199ad07 100644 --- a/src/restriction.py +++ b/src/restriction.py @@ -10,7 +10,10 @@ class RestrictionType(Enum): MatchAll = 2 -def determine_ppb(rgx): +def determine_ppb(rgxs): + if len(rgxs) != 1: + return 0 + rgx = rgxs[0] if rgx[0] in ("A", "N", "R"): return 0 elif rgx[0] == "V": @@ -27,7 +30,7 @@ def determine_ppb(rgx): class MorphologyRegex: def __init__(self, restriction): - self.min_msd_length = 1 + # self.min_msd_length = 1 restr_dict = {} for feature in restriction: @@ -44,45 +47,75 @@ class MorphologyRegex: restr_dict[key] = (value, match_type) assert 'POS' in restr_dict - category = restr_dict['POS'][0].capitalize() - 
cat_code = CODES[category] - rgx = [cat_code] + ['.'] * 10 + + # handle multiple word types + if '|' in restr_dict['POS'][0]: + categories = restr_dict['POS'][0].split('|') + else: + categories = [restr_dict['POS'][0]] + + self.rgxs = [] + self.re_objects = [] + self.min_msd_lengths = [] del restr_dict['POS'] - for attribute, (value, typ) in restr_dict.items(): - index = TAGSET[cat_code].index(attribute.lower()) - assert index >= 0 + for category in categories: + min_msd_length = 1 + category = category.capitalize() + cat_code = CODES[category] + rgx = [cat_code] + ['.'] * 10 + - if '|' in value: - match = "".join(CODES[val] for val in value.split('|')) - else: - match = CODES[value] - match = "[{}{}]".format("" if typ else "^", match) - rgx[index + 1] = match + for attribute, (value, typ) in restr_dict.items(): + if attribute.lower() not in TAGSET[cat_code]: + continue + index = TAGSET[cat_code].index(attribute.lower()) + assert index >= 0 - if typ: - self.min_msd_length = max(index + 1, self.min_msd_length) + if '|' in value: + match = "".join(CODES[val] for val in value.split('|')) + else: + match = CODES[value] - # strip rgx - for i in reversed(range(len(rgx))): - if rgx[i] == '.': - rgx = rgx[:-1] - else: - break + match = "[{}{}]".format("" if typ else "^", match) + rgx[index + 1] = match - self.re_objects = [re.compile(r) for r in rgx] - self.rgx = rgx + if typ: + min_msd_length = max(index + 1, min_msd_length) + + # strip rgx + for i in reversed(range(len(rgx))): + if rgx[i] == '.': + rgx = rgx[:-1] + else: + break + + self.re_objects.append([re.compile(r) for r in rgx]) + self.rgxs.append(rgx) + self.min_msd_lengths.append(min_msd_length) + + # self.re_objects = [re.compile(r) for r in rgx] + # self.rgx = rgx def __call__(self, text): - if len(text) <= self.min_msd_length: - return False - - for c, r in zip(text, self.re_objects): - if not r.match(c): - return False - return True + # if len(text) <= self.min_msd_length: + # return False + # if 
len(self.rgxs[0]) > 1 and len(self.rgxs) > 1: + # a = 1 + for i, re_object in enumerate(self.re_objects): + if len(text) <= self.min_msd_lengths[i]: + continue + match = True + + for c, r in zip(text, re_object): + if not r.match(c): + match = False + break + if match: + return True + return False class LexisRegex: @@ -111,7 +144,7 @@ class Restriction: if restriction_type == "morphology": self.type = RestrictionType.Morphology self.matcher = MorphologyRegex(list(restriction_tag)) - self.ppb = determine_ppb(self.matcher.rgx) + self.ppb = determine_ppb(self.matcher.rgxs) elif restriction_type == "lexis": self.type = RestrictionType.Lexis diff --git a/src/wani.py b/src/wani.py index df6d1fe..31aa71f 100644 --- a/src/wani.py +++ b/src/wani.py @@ -134,7 +134,7 @@ if __name__ == '__main__': action='store_true') parser.add_argument('--load-sloleks', - help='Tells weather sloleks is loaded into memory at the beginning of processing or not.', + help='Tells weather sloleks is loaded into memory at the beginning of processing or not. Should be in', action='store_true') parser.add_argument('--sort-by', diff --git a/src/word.py b/src/word.py index 53f4036..212331f 100644 --- a/src/word.py +++ b/src/word.py @@ -36,7 +36,9 @@ class Word: self.lemma = lemma self.msd = MSD_TRANSLATE[msd] if do_msd_translate else msd self.id = wid + self.idi = None self.text = text + self.glue = '' self.links = defaultdict(list)