diff --git a/wani.py b/wani.py index 319ebc0..d64e1a9 100644 --- a/wani.py +++ b/wani.py @@ -126,19 +126,6 @@ class RestrictionType(Enum): Lexis = 1 MatchAll = 2 - -class Rendition(Enum): - Lemma = 0 - WordForm = 1 - Lexis = 2 - Unknown = 3 - -class WordFormSelection(Enum): - All = 0 - Msd = 1 - Agreement = 2 - Any = 3 - class Order(Enum): FromTo = 0 ToFrom = 1 @@ -171,14 +158,163 @@ class Order(Enum): else: raise NotImplementedError("Should not be here: Order match") + +class ComponentRepresentation: + def __init__(self, data, word_renderer): + self.data = data + self.word_renderer = word_renderer + + self.words = [] + self.rendition_text = None + self.agreement = None + + def get_agreement(self): + return None + + def add_word(self, word): + self.words.append(word) + + def render(self): + if self.rendition_text is None: + logging.debug(type(self)) + self.rendition_text = self._render() + + def rendition(self): + return "" if self.rendition_text is None else self.rendition_text + + def _render(self): + raise NotImplementedError("Not implemented for class: {}".format(type(self))) + +class LemmaCR(ComponentRepresentation): + def _render(self): + return self.words[0].lemma if len(self.words) > 0 else None + +class LexisCR(ComponentRepresentation): + def _render(self): + return self.data + +class WordFormAllCR(ComponentRepresentation): + def _render(self): + txt = "/".join(set([w.text for w in set(self.words)])) if len(self.words) > 0 else None + return txt + +class WordFormAnyCR(ComponentRepresentation): + def _render(self): + text_forms = {} + msd_lemma_txt_triplets = Counter([(w.msd, w.lemma, w.text) for w in self.words]) + for (msd, lemma, text), _n in reversed(msd_lemma_txt_triplets.most_common()): + text_forms[(msd, lemma)] = text + + words_counter = [] + for word in self.words: + words_counter.append((word.msd, word.lemma)) + sorted_words = sorted(set(words_counter), key=words_counter.count) + + for word_msd, word_lemma in sorted_words: + if self.agreement is not None: 
+ if self.agreement.match(word_msd): + if word_lemma is None: + return None + else: + return text_forms[(word_msd, word_lemma)] + +class WordFormMsdCR(WordFormAnyCR): + def __init__(self, *args): + super().__init__(*args) + self.backup_word = None + + def check_msd(self, word): + selectors = self.data + for key, value in selectors.items(): + t = word.msd[0] + v = TAGSET[t].index(key.lower()) + f1 = word.msd[v + 1] + f2 = CODES[value] + + if '-' not in [f1, f2] and f1 != f2: + return False + + return True + pass + + def add_word(self, word): + if self.backup_word is None: + msd = self.word_renderer.get_lemma_msd(word.lemma, word.msd) + WordLemma = namedtuple('WordLemmaOnly', 'msd most_frequent_text lemma text') + self.backup_word = WordLemma(msd=msd, most_frequent_text=lambda *x: None, lemma=None, text=None) + + if self.check_msd(word): + super().add_word(word) + + def _render(self): + self.words.append(self.backup_word) + return super()._render() + +class WordFormAgreementCR(ComponentRepresentation): + def __init__(self, data, word_renderer): + super().__init__(data, word_renderer) + self.agree_with, self.data = self.data + + def get_agreement(self): + return self.agree_with + + def match(self, word_msd): + word_category = self.words[0].msd[0] + word_lemma = self.words[0].lemma + agreements = self.data + + existing = [(w.msd, w.text) for w in self.words] + + for candidate_msd, candidate_text in self.word_renderer.available_words(word_lemma, existing): + if word_category != candidate_msd[0]: + continue + + if WordFormAgreementCR.check_agreement(word_msd, candidate_msd, agreements): + self.rendition_text = candidate_text + return True + + return False + + @staticmethod + def check_agreement(msd1, msd2, agreements): + for agr_case in agreements: + t1 = msd1[0] + # if not in msd, some strange msd was tries, skipping... 
+ if agr_case not in TAGSET[t1]: + logging.warning("Cannot do agreement: {} for msd {} not found!".format(agr_case, msd1)) + return False + + v1 = TAGSET[t1].index(agr_case) + # if none specified: nedolocnik, always agrees + if v1 + 1 >= len(msd1): + continue + # first is uppercase, not in TAGSET + m1 = msd1[v1 + 1] + + # REPEAT (not DRY!) + t2 = msd2[0] + if agr_case not in TAGSET[t2]: + logging.warning("Cannot do agreement: {} for msd {} not found!".format(agr_case, msd2)) + return False + v2 = TAGSET[t2].index(agr_case) + if v2 + 1 >= len(msd2): + continue + m2 = msd2[v2 + 1] + + # match! + if '-' not in [m1, m2] and m1 != m2: + return False + + return True + + def render(self): + pass + + class ComponentRendition: def __init__(self): self.more = None - self.rendition = Rendition.Unknown - - def _set_rendition(self, r): - assert(self.rendition is Rendition.Unknown) - self.rendition = r + self.representation_factory = ComponentRepresentation def _set_more(self, m): self.more = m @@ -186,190 +322,205 @@ class ComponentRendition: def add_feature(self, feature): if 'rendition' in feature: if feature['rendition'] == "lemma": - self._set_rendition(Rendition.Lemma) + self.representation_factory = LemmaCR elif feature['rendition'] == "word_form": - self._set_rendition(Rendition.WordForm) - self._set_more((WordFormSelection.Any, None)) + # just by default, changes with selection + self.representation_factory = WordFormAnyCR elif feature['rendition'] == "lexis": - self._set_rendition(Rendition.Lexis) - self._set_more(feature['string']) + self.representation_factory = LexisCR + self.more = feature['string'] else: raise NotImplementedError("Representation rendition: {}".format(feature)) elif 'selection' in feature: if feature['selection'] == "msd": - selectors = {k: v for k, v in feature.items() if k != 'selection'} - self._set_more((WordFormSelection.Msd, selectors)) + self.representation_factory = WordFormMsdCR + self.more = {k: v for k, v in feature.items() if k != 
'selection'} elif feature['selection'] == "all": - self._set_more((WordFormSelection.All, None)) + self.representation_factory = WordFormAllCR elif feature['selection'] == 'agreement': assert(feature['head'][:4] == 'cid_') assert(feature['msd'] is not None) - - self._set_more((WordFormSelection.Agreement, - (feature['head'][4:], feature['msd'].split('+')))) + self.representation_factory = WordFormAgreementCR + self.more = (feature['head'][4:], feature['msd'].split('+')) else: raise NotImplementedError("Representation selection: {}".format(feature)) else: return None - def isit(self, rendition): - return self.rendition is rendition + def cr_instance(self, word_renderer): + return self.representation_factory(self.more, word_renderer) @staticmethod def set_representations(matches, structure, word_renderer): - representations = { - c.idx: [[], None] if c.representation.isit(Rendition.WordForm) else [True, ""] - for c in structure.components - } - found_agreements = {} - word_component_id = {} - - def render_all(component_id, lst, _bw): - rep = "/".join(set([w.text for w in set(lst)])) if len(lst) > 0 else None - matches.representations[component_id] = rep + representations = {} + for c in structure.components: + representations[c.idx] = [] + for rep in c.representation: + representations[c.idx].append(rep.cr_instance(word_renderer)) - def render_form(component_id, lst, backup_word): - if backup_word is not None: - lst.append(backup_word) - - text_forms = {} - msd_lemma_txt_triplets = Counter([(w.msd, w.lemma, w.text) for w in lst]) - for (msd, lemma, text), _n in reversed(msd_lemma_txt_triplets.most_common()): - text_forms[(msd, lemma)] = text - - lst_ctr = [] - for word in lst: - lst_ctr.append((word.msd, word.lemma)) - sorted_lst = sorted(set(lst_ctr), key=lst.count) - - for word_msd, word_lemma in sorted_lst: - if component_id in found_agreements: - other_component_id, other_word, agreements, other_texts = found_agreements[component_id] - agr = 
are_agreements_ok(word_msd, other_word.lemma, other_word.msd, agreements, other_texts) - if agr is None: - continue - - matches.representations[other_component_id] = agr - - if word_lemma is not None: - matches.representations[component_id] = text_forms[(msd, lemma)] #word_renderer.render(word_lemma, word_msd) - - break - - def are_agreements_ok(w1_msd, ow_lemma, ow_msd, agreements, ow_texts): - for w2_msd, w2_txt in word_renderer.available_words(ow_lemma, ow_texts): - if ow_msd[0] != w2_msd[0]: + for cid, reps in representations.items(): + for rep in reps: + agr = rep.get_agreement() + if agr is None: continue - if check_agreement(w1_msd, w2_msd, agreements): - return w2_txt + if len(representations[agr]) != 1: + n = len(representations[agr]) + raise NotImplementedError( + "Structure {}: ".format(structure.id) + + "component {} has agreement".format(cid) + + " with component {}".format(agr) + + ", however there are {} (!= 1) representations".format(n) + + " of component {}!".format(agr)) - def check_msd(word, selectors): - for key, value in selectors.items(): - t = word.msd[0] - v = TAGSET[t].index(key.lower()) - f1 = word.msd[v + 1] - f2 = CODES[value] + representations[agr][0].agreement = rep - if '-' not in [f1, f2] and f1 != f2: - return False + # representations = { + # c.idx: [[], None] if c.representation.isit(Rendition.WordForm) else [True, ""] + # for c in structure.components + # } + # found_agreements = {} - return True + # def render_form(component_id, lst, backup_word): + # if backup_word is not None: + # lst.append(backup_word) + + # text_forms = {} + # msd_lemma_txt_triplets = Counter([(w.msd, w.lemma, w.text) for w in lst]) + # for (msd, lemma, text), _n in reversed(msd_lemma_txt_triplets.most_common()): + # text_forms[(msd, lemma)] = text + + # lst_ctr = [] + # for word in lst: + # lst_ctr.append((word.msd, word.lemma)) + # sorted_lst = sorted(set(lst_ctr), key=lst.count) + + # for word_msd, word_lemma in sorted_lst: + # if component_id in 
found_agreements: + # other_component_id, other_word, agreements, other_texts = found_agreements[component_id] + # agr = are_agreements_ok(word_msd, other_word.lemma, other_word.msd, agreements, other_texts) + # if agr is None: + # continue + + # matches.representations[other_component_id] = agr + + # if word_lemma is not None: + # matches.representations[component_id] = text_forms[(msd, lemma)] #word_renderer.render(word_lemma, word_msd) + + # break - def check_agreement(msd1, msd2, agreements): - for agr_case in agreements: - t1 = msd1[0] - # if not in msd, some strange msd was tries, skipping... - if agr_case not in TAGSET[t1]: - logging.warning("Cannot do agreement: {} for msd {} not found!".format(agr_case, msd1)) - return False + # def are_agreements_ok(w1_msd, ow_lemma, ow_msd, agreements, ow_texts): + # for w2_msd, w2_txt in word_renderer.available_words(ow_lemma, ow_texts): + # if ow_msd[0] != w2_msd[0]: + # continue - v1 = TAGSET[t1].index(agr_case) - # if none specified: nedolocnik, always agrees - if v1 + 1 >= len(msd1): - continue - # first is uppercase, not in TAGSET - m1 = msd1[v1 + 1] + # if check_agreement(w1_msd, w2_msd, agreements): + # return w2_txt - # REPEAT (not DRY!) - t2 = msd2[0] - if agr_case not in TAGSET[t2]: - logging.warning("Cannot do agreement: {} for msd {} not found!".format(agr_case, msd2)) - return False - v2 = TAGSET[t2].index(agr_case) - if v2 + 1 >= len(msd2): - continue - m2 = msd2[v2 + 1] + + # def check_agreement(msd1, msd2, agreements): + # for agr_case in agreements: + # t1 = msd1[0] + # # if not in msd, some strange msd was tries, skipping... + # if agr_case not in TAGSET[t1]: + # logging.warning("Cannot do agreement: {} for msd {} not found!".format(agr_case, msd1)) + # return False - # match! 
- if '-' not in [m1, m2] and m1 != m2: - return False + # v1 = TAGSET[t1].index(agr_case) + # # if none specified: nedolocnik, always agrees + # if v1 + 1 >= len(msd1): + # continue + # # first is uppercase, not in TAGSET + # m1 = msd1[v1 + 1] - return True + # # REPEAT (not DRY!) + # t2 = msd2[0] + # if agr_case not in TAGSET[t2]: + # logging.warning("Cannot do agreement: {} for msd {} not found!".format(agr_case, msd2)) + # return False + # v2 = TAGSET[t2].index(agr_case) + # if v2 + 1 >= len(msd2): + # continue + # m2 = msd2[v2 + 1] + + # # match! + # if '-' not in [m1, m2] and m1 != m2: + # return False + + # return True for words in matches.matches: # first pass, check everything but agreements for w_id, w in words.items(): component = structure.get_component(w_id) - rep = component.representation - word_component_id[w.id] = w_id - - if rep.isit(Rendition.Lemma): - representations[w_id][0] = False - representations[w_id][1] = w.lemma - elif rep.isit(Rendition.Lexis): - representations[w_id][0] = False - representations[w_id][1] = rep.more - elif rep.isit(Rendition.Unknown): - representations[w_id][0] = False - representations[w_id][1] = "" + component_representations = representations[component.idx] + for representation in component_representations: + representation.add_word(w) - # it HAS to be word_form now - else: - assert(rep.isit(Rendition.WordForm)) - wf_type, more = rep.more - add = True + # if rep.isit(Rendition.Lemma): + # representations[w_id][0] = False + # representations[w_id][1] = w.lemma + # elif rep.isit(Rendition.Lexis): + # representations[w_id][0] = False + # representations[w_id][1] = rep.more + # elif rep.isit(Rendition.Unknown): + # representations[w_id][0] = False + # representations[w_id][1] = "" + + # # it HAS to be word_form now + # else: + # assert(rep.isit(Rendition.WordForm)) + # wf_type, more = rep.more + # add = True - if wf_type is WordFormSelection.Msd: - add = check_msd(w, more) - func = render_form - elif wf_type is 
WordFormSelection.All: - func = render_all - elif wf_type is WordFormSelection.Any: - func = render_form - else: - assert(wf_type is WordFormSelection.Agreement) - other_w, agreements = more - if other_w not in found_agreements: - found_agreements[other_w] = (w_id, w, agreements, []) + # if wf_type is WordFormSelection.Msd: + # add = check_msd(w, more) + # func = render_form + # elif wf_type is WordFormSelection.All: + # func = render_all + # elif wf_type is WordFormSelection.Any: + # func = render_form + # else: + # assert(wf_type is WordFormSelection.Agreement) + # other_w, agreements = more + # if other_w not in found_agreements: + # found_agreements[other_w] = (w_id, w, agreements, []) - found_agreements[other_w][-1].append((w.msd, w.text)) - func = lambda *x: None + # found_agreements[other_w][-1].append((w.msd, w.text)) + # func = lambda *x: None - representations[w_id][1] = func - if add: - representations[w_id][0].append(w) + # representations[w_id][1] = func + # if add: + # representations[w_id][0].append(w) - # just need to set representation to first group, - # but in correct order, agreements last! 
- representation_sorted_words = [] - for w_id, w in matches.matches[0].items(): - rep = component.representation - if rep.isit(Rendition.WordForm) and rep.more[0] is WordFormSelection.Agreement: - representation_sorted_words.append((w_id, w)) - else: - representation_sorted_words.insert(0, (w_id, w)) + for cid, reps in representations.items(): + for rep in reps: + rep.render() - for w_id, w in representation_sorted_words: - data = representations[w_id] - if type(data[1]) is str: - matches.representations[w_id] = None if data[0] else data[1] - else: - backup_msd = word_renderer.get_lemma_msd(w.lemma) - backup_word = lemma_only_word(backup_msd) - data[1](str(w_id), data[0], backup_word) + for cid, reps in representations.items(): + rep = " ".join(rep.rendition() for rep in reps) + matches.representations[cid] = rep + + # # just need to set representation to first group, + # # but in correct order, agreements last! + # representation_sorted_words = [] + # for w_id, w in matches.matches[0].items(): + # rep = component.representation + # if rep.isit(Rendition.WordForm) and rep.more[0] is WordFormSelection.Agreement: + # representation_sorted_words.append((w_id, w)) + # else: + # representation_sorted_words.insert(0, (w_id, w)) + + # for w_id, w in representation_sorted_words: + # data = representations[w_id] + # if type(data[1]) is str: + # matches.representations[w_id] = None if data[0] else data[1] + # else: + # backup_msd = word_renderer.get_lemma_msd(w.lemma) + # backup_word = lemma_only_word(backup_msd) + # data[1](str(w_id), data[0], backup_word) def __str__(self): return str(self.rendition) @@ -519,7 +670,7 @@ class Component: self.idx = idx self.restriction = None self.next_element = [] - self.representation = ComponentRendition() + self.representation = [] self.selection = {} self.iter_ctr = 0 @@ -541,8 +692,11 @@ class Component: raise RuntimeError("Unreachable") def set_representation(self, representation): - for feature in representation: - 
self.representation.add_feature(feature.attrib) + for rep in representation: + crend = ComponentRendition() + for feature in rep: + crend.add_feature(feature.attrib) + self.representation.append(crend) def find_next(self, deps, comps, restrs, reprs): to_ret = [] @@ -721,21 +875,17 @@ class SyntacticStructure: return st def add_representation(self, n, rep_el, forms): - if rep_el.tag == "representation_and": - rep_el = rep_el[0] - logging.warning("Only using first reprentation in representation_and in structure {}".format(self.id)) - assert(rep_el.tag == "representation") + to_add = [] for el in rep_el: assert(el.tag == "feature") - if 'rendition' in el.attrib: - forms[n].append(el) - elif 'selection' in el.attrib: - forms[n].append(el) + if 'rendition' in el.attrib or 'selection' in el.attrib: + to_add.append(el) else: logging.warning("Strange representation feature in structure {}. Skipping" .format(self.id)) continue + forms[n].append(to_add) def __str__(self): comp_str = "\n".join(str(comp) for comp in self.components) @@ -892,16 +1042,17 @@ class Word: return word_renderer.render(self.lemma, self.msd) class WordMsdRenderer: - def __init__(self): + def __init__(self, lemma_features): self.all_words = [] self.rendered_words = {} self.frequent_words = {} self.lemma_msd = {} + self.lemma_features = lemma_features def add_words(self, words): self.all_words.extend(words) - def generate_renders(self, lemma_features): + def generate_renders(self): data = defaultdict(lambda: defaultdict(list)) for w in self.all_words: data[w.lemma][w.msd].append(w.text) @@ -926,11 +1077,12 @@ class WordMsdRenderer: for (msd, txt), n in sorted(freq_words.items(), key=lambda x: -x[1]): self.frequent_words[lemma].append((msd, txt, n)) + lf = self.lemma_features for lemma in self.lemma_msd.keys(): cmsd = self.lemma_msd[lemma] - if cmsd[0] in lemma_features: + if cmsd[0] in lf: self.lemma_msd[lemma] = "".join( - l1 if l1 != "-" else l2 for l1, l2 in zip(lemma_features[cmsd[0]], cmsd) + l1 if 
l1 != "-" else l2 for l1, l2 in zip(lf[cmsd[0]], cmsd) + ) @staticmethod @@ -952,7 +1104,7 @@ class WordMsdRenderer: def available_words(self, lemma, existing_texts): counted_texts = Counter(existing_texts) - for (msd, text), n in counted_texts.most_common(): + for (msd, text), _n in counted_texts.most_common(): yield (msd, text) if lemma in self.frequent_words: @@ -960,11 +1112,17 @@ class WordMsdRenderer: if (msd, text) not in counted_texts: yield (msd, text) - def get_lemma_msd(self, lemma): - if lemma in self.lemma_msd and self.lemma_msd[lemma][0] != '-': - return self.lemma_msd[lemma] + def get_lemma_msd(self, lemma, word_msd): + # should be here, since we collect every lemmas + lemma_msd = self.lemma_msd[lemma] + + if lemma_msd[0] == '-': + if word_msd[0] in self.lemma_features: + return self.lemma_features[word_msd[0]] + else: + return '-' else: - return None + return lemma_msd def is_root_id(id_): return len(id_.split('.')) == 3 @@ -1200,6 +1358,7 @@ class ColocationIds: idx = 1 for _1, sm in tqdm(self.data.items()): ComponentRendition.set_representations(sm, components_dict[sm.structure_id], word_renderer) + logging.debug(idx) idx += 1 @@ -1227,7 +1386,7 @@ def main(input_file, structures_file, args): logging.debug(str(s)) colocation_ids = ColocationIds() - word_renderer = WordMsdRenderer() + word_renderer = WordMsdRenderer(lemma_msds) # if True: # with open("match_word.p", "rb") as fp: @@ -1279,13 +1438,14 @@ def main(input_file, structures_file, args): word_renderer.add_words(words) # get word renders for lemma/msd - word_renderer.generate_renders(lemma_msds) - # figure out representations! - colocation_ids.set_representations(structures, word_renderer) + word_renderer.generate_renders() + if args.output: + # figure out representations! 
+ colocation_ids.set_representations(structures, word_renderer) + Writer.make_output_writer(args).write_out(structures, colocation_ids) if args.all: Writer.make_all_writer(args).write_out(structures, colocation_ids) - Writer.make_output_writer(args).write_out(structures, colocation_ids) logging.debug([(k, len(v)) for k, v in matches.items()]) logging.debug(sum(len(v) for _, v in matches.items()))