diff --git a/src/representation.py b/src/representation.py index 2b5ebf2..93fe828 100644 --- a/src/representation.py +++ b/src/representation.py @@ -55,18 +55,25 @@ class WordFormAnyCR(ComponentRepresentation): sorted_words = sorted( set(words_counter), key=lambda x: -words_counter.count(x) + (sum(ord(l) for l in x[1]) / 1e5 if x[1] is not None else .5)) + # so let's go through all words, sorted by frequency for word_msd, word_lemma in sorted_words: - for agr in self.agreement: - if not agr.match(word_msd): - break - else: + # check if agreements match + agreements_matched = [agr.match(word_msd) for agr in self.agreement] + + # if we are at the last "backup word", then confirm matches + # that worked for this one and return + if word_lemma is None: + for agr, matched in zip(self.agreement, agreements_matched): + if matched: + agr.confirm_match() + return None + + # if all agreements match, we win! + if all(agreements_matched): for agr in self.agreement: agr.confirm_match() - if word_lemma is None: - return None - else: - return text_forms[(word_msd, word_lemma)] + return text_forms[(word_msd, word_lemma)] class WordFormMsdCR(WordFormAnyCR): def __init__(self, *args):