def is_fixed_restriction_order(self, match):
    """Tell whether the matched words appear in component order.

    The check is only active when ``self.fixed_restriction_order`` is
    truthy; otherwise every match is accepted.

    Args:
        match: Mapping from component key to the matched word. Keys are
            strings convertible with ``int`` (the key ``'#'`` is skipped),
            and every value exposes an ``int_id`` attribute
            (presumably the word's position in the sentence; confirm
            against the word class).

    Returns:
        ``True`` when the check is disabled, or when the numeric component
        keys never decrease once the words are ordered by ``int_id``;
        ``False`` as soon as a component key is smaller than the one
        before it.

    Raises:
        ValueError: If a key other than ``'#'`` cannot be converted to int.
    """
    if not self.fixed_restriction_order:
        return True

    # Order the component keys by the int_id of the word they matched.
    # A plain list is used on purpose: the result then never depends on
    # dict insertion order, which is only a language guarantee from
    # Python 3.7 onwards.
    keys_by_word_order = [
        key
        for key, _word in sorted(match.items(), key=lambda item: item[1].int_id)
    ]

    previous_component = -1
    for key in keys_by_word_order:
        if key == '#':
            # '#' is not a numeric component key, so it takes no part
            # in the order check.
            continue
        component = int(key)
        if component < previous_component:
            return False
        previous_component = component

    return True
match_store.add_matches(matches) @@ -155,6 +157,9 @@ if __name__ == '__main__': help='Separator in output file', default="\t") parser.add_argument('--ignore-punctuations', help="Sort in reversed ored", action='store_true') + parser.add_argument('--fixed-restriction-order', + help='If used, words have to be in the same order as components.', + action='store_true') args = parser.parse_args() logging.basicConfig(stream=sys.stderr, level=args.verbose.upper())