diff --git a/wani.py b/wani.py index ab99345..65d7600 100644 --- a/wani.py +++ b/wani.py @@ -381,80 +381,6 @@ class ComponentRendition: representations[agr][0].agreement.append(rep) - # representations = { - # c.idx: [[], None] if c.representation.isit(Rendition.WordForm) else [True, ""] - # for c in structure.components - # } - # found_agreements = {} - - # def render_form(component_id, lst, backup_word): - # if backup_word is not None: - # lst.append(backup_word) - - # text_forms = {} - # msd_lemma_txt_triplets = Counter([(w.msd, w.lemma, w.text) for w in lst]) - # for (msd, lemma, text), _n in reversed(msd_lemma_txt_triplets.most_common()): - # text_forms[(msd, lemma)] = text - - # lst_ctr = [] - # for word in lst: - # lst_ctr.append((word.msd, word.lemma)) - # sorted_lst = sorted(set(lst_ctr), key=lst.count) - - # for word_msd, word_lemma in sorted_lst: - # if component_id in found_agreements: - # other_component_id, other_word, agreements, other_texts = found_agreements[component_id] - # agr = are_agreements_ok(word_msd, other_word.lemma, other_word.msd, agreements, other_texts) - # if agr is None: - # continue - - # matches.representations[other_component_id] = agr - - # if word_lemma is not None: - # matches.representations[component_id] = text_forms[(msd, lemma)] #word_renderer.render(word_lemma, word_msd) - - # break - - # def are_agreements_ok(w1_msd, ow_lemma, ow_msd, agreements, ow_texts): - # for w2_msd, w2_txt in word_renderer.available_words(ow_lemma, ow_texts): - # if ow_msd[0] != w2_msd[0]: - # continue - - # if check_agreement(w1_msd, w2_msd, agreements): - # return w2_txt - - - # def check_agreement(msd1, msd2, agreements): - # for agr_case in agreements: - # t1 = msd1[0] - # # if not in msd, some strange msd was tries, skipping... - # if agr_case not in TAGSET[t1]: - # logging.warning("Cannot do agreement: {} for msd {} not found!".format(agr_case, msd1)) - # return False - - # v1 = TAGSET[t1].index(agr_case) - # # if none specified: nedolocnik, always agrees - # if v1 + 1 >= len(msd1): - # continue - # # first is uppercase, not in TAGSET - # m1 = msd1[v1 + 1] - - # # REPEAT (not DRY!) - # t2 = msd2[0] - # if agr_case not in TAGSET[t2]: - # logging.warning("Cannot do agreement: {} for msd {} not found!".format(agr_case, msd2)) - # return False - # v2 = TAGSET[t2].index(agr_case) - # if v2 + 1 >= len(msd2): - # continue - # m2 = msd2[v2 + 1] - - # # match! - # if '-' not in [m1, m2] and m1 != m2: - # return False - - # return True - for words in matches.matches: # first pass, check everything but agreements for w_id, w in words.items(): @@ -462,43 +388,6 @@ class ComponentRendition: component_representations = representations[component.idx] for representation in component_representations: representation.add_word(w) - - # if rep.isit(Rendition.Lemma): - # representations[w_id][0] = False - # representations[w_id][1] = w.lemma - # elif rep.isit(Rendition.Lexis): - # representations[w_id][0] = False - # representations[w_id][1] = rep.more - # elif rep.isit(Rendition.Unknown): - # representations[w_id][0] = False - # representations[w_id][1] = "" - - # # it HAS to be word_form now - # else: - # assert(rep.isit(Rendition.WordForm)) - # wf_type, more = rep.more - # add = True - - # if wf_type is WordFormSelection.Msd: - # add = check_msd(w, more) - # func = render_form - # elif wf_type is WordFormSelection.All: - # func = render_all - # elif wf_type is WordFormSelection.Any: - # func = render_form - # else: - # assert(wf_type is WordFormSelection.Agreement) - # other_w, agreements = more - # if other_w not in found_agreements: - # found_agreements[other_w] = (w_id, w, agreements, []) - - # found_agreements[other_w][-1].append((w.msd, w.text)) - # func = lambda *x: None - - # representations[w_id][1] = func - # if add: - # representations[w_id][0].append(w) - for cid, reps in representations.items(): for rep in reps: @@ -513,25 +402,6 @@ class ComponentRendition: else: matches.representations[cid] = " ".join(("" if r is None else r) for r in reps) - # # just need to set representation to first group, - # # but in correct order, agreements last! - # representation_sorted_words = [] - # for w_id, w in matches.matches[0].items(): - # rep = component.representation - # if rep.isit(Rendition.WordForm) and rep.more[0] is WordFormSelection.Agreement: - # representation_sorted_words.append((w_id, w)) - # else: - # representation_sorted_words.insert(0, (w_id, w)) - - # for w_id, w in representation_sorted_words: - # data = representations[w_id] - # if type(data[1]) is str: - # matches.representations[w_id] = None if data[0] else data[1] - # else: - # backup_msd = word_renderer.get_lemma_msd(w.lemma) - # backup_word = lemma_only_word(backup_msd) - # data[1](str(w_id), data[0], backup_word) - def __str__(self): return str(self.rendition) @@ -1329,8 +1199,7 @@ class ColocationIds: def match_file(words, structures): matches = {s.id: [] for s in structures} - for idx, s in tqdm(list(enumerate(structures))): - # logging.info("{}/{}: {:7s}".format(idx, len(structures), s.id)) + for s in tqdm(structures): for w in words: mhere = s.match(w) logging.debug(" GOT: {}".format(len(mhere)))