Work to fix #757-104 and #757-89

for word_form all, duplicates are now removed
for word_form msd, word forms are now taken from the collocation itself, not from the whole corpus
determining a more specific msd for agreements, so that the backup-lemma representation gets a better match
for agreements, candidates are now ordered by the collocation's own number of occurrences, not the global one
removed a bit of debug code
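The ordering change in the agreements bullet comes down to counting (msd, text) pairs inside the collocation's own matches and offering those first, most frequent first, before falling back to corpus-wide frequencies. A minimal sketch of that idea, with invented Slovene word forms (the real logic lives in available_words in the diff below):

    from collections import Counter

    # invented (msd, text) pairs seen inside one collocation's matches
    local = [("Ncfsn", "roka"), ("Ncfsn", "roka"), ("Ncfsg", "roke")]
    # invented corpus-wide list, already sorted by global frequency
    corpus_wide = [("Ncfsg", "roke"), ("Ncfsn", "roka"), ("Ncfpd", "rokam")]

    counted = Counter(local)
    # collocation-local pairs first, most frequent first ...
    candidates = [pair for pair, _n in counted.most_common()]
    # ... then global candidates that were not already seen locally
    candidates += [pair for pair in corpus_wide if pair not in counted]
    print(candidates)
    # [('Ncfsn', 'roka'), ('Ncfsg', 'roke'), ('Ncfpd', 'rokam')]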
Ozbolt Menegatti 2019-05-29 20:22:22 +02:00
parent 4c2b5f2b13
commit 307007218d

wani.py (101 changed lines)

@@ -1,7 +1,7 @@
 from xml.etree import ElementTree
 import re
 from enum import Enum
-from collections import defaultdict, namedtuple
+from collections import defaultdict, namedtuple, Counter
 import sys
 import logging
 import argparse
@@ -218,7 +218,7 @@ class ComponentRendition:
         return self.rendition is rendition

     @staticmethod
-    def set_representations(matches, structure, word_renderer, lemma_msds):
+    def set_representations(matches, structure, word_renderer):
         representations = {
             c.idx: [[], None] if c.representation.isit(Rendition.WordForm) else [True, ""]
             for c in structure.components
@@ -227,42 +227,39 @@ class ComponentRendition:
         word_component_id = {}

         def render_all(component_id, lst, _bw):
-            rep = "/".join([w.text for w in set(lst)]) if len(lst) > 0 else None
+            rep = "/".join(set([w.text for w in set(lst)])) if len(lst) > 0 else None
             matches.representations[component_id] = rep

         def render_form(component_id, lst, backup_word):
             if backup_word is not None:
                 lst.append(backup_word)

+            text_forms = {}
+            msd_lemma_txt_triplets = Counter([(w.msd, w.lemma, w.text) for w in lst])
+            for (msd, lemma, text), _n in reversed(msd_lemma_txt_triplets.most_common()):
+                text_forms[(msd, lemma)] = text
+
             lst_ctr = []
             for word in lst:
                 lst_ctr.append((word.msd, word.lemma))
             sorted_lst = sorted(set(lst_ctr), key=lst.count)

-            if len(lst) > 3:
-                a = 3
-
             for word_msd, word_lemma in sorted_lst:
                 if component_id in found_agreements:
-                    other_component_id, other_word, agreements = found_agreements[component_id]
-                    agr = are_agreements_ok(word_msd, other_word.lemma, other_word.msd, agreements)
+                    other_component_id, other_word, agreements, other_texts = found_agreements[component_id]
+                    agr = are_agreements_ok(word_msd, other_word.lemma, other_word.msd, agreements, other_texts)
                     if agr is None:
                         continue
                     matches.representations[other_component_id] = agr

                 if word_lemma is not None:
-                    matches.representations[component_id] = word_renderer.render(word_lemma, word_msd)
+                    matches.representations[component_id] = text_forms[(word_msd, word_lemma)]  # word_renderer.render(word_lemma, word_msd)
                 break

-            # othw = are_agreements_ok(word, found_agreements)
-            # if othw is not None:
-            #     matches.representations[word_component_id[othw.id]] = othw.most_frequent_text(word_renderer)
-            # return
-
-        def are_agreements_ok(w1_msd, ow_lemma, ow_msd, agreements):
-            for w2_msd, w2_txt in word_renderer.available_words(ow_lemma):
+        def are_agreements_ok(w1_msd, ow_lemma, ow_msd, agreements, ow_texts):
+            for w2_msd, w2_txt in word_renderer.available_words(ow_lemma, ow_texts):
                 if ow_msd[0] != w2_msd[0]:
                     continue
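The new block in render_form picks, for every (msd, lemma) pair, the text that occurs most often in the collocation itself: most_common() is iterated in reverse, so rarer texts are written first and the most frequent one overwrites them last. A standalone sketch of just that trick, with toy triplets:

    from collections import Counter

    # toy (msd, lemma, text) triplets standing in for the words of one match list
    triplets = [
        ("Ncmsn", "pes", "Pes"),
        ("Ncmsn", "pes", "pes"),
        ("Ncmsn", "pes", "pes"),
    ]

    text_forms = {}
    # ascending frequency: the most frequent text for each key is assigned last
    for (msd, lemma, text), _n in reversed(Counter(triplets).most_common()):
        text_forms[(msd, lemma)] = text

    print(text_forms)  # {('Ncmsn', 'pes'): 'pes'}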
@@ -333,22 +330,22 @@ class ComponentRendition:
             else:
                 assert(rep.isit(Rendition.WordForm))
                 wf_type, more = rep.more
+                add = True

                 if wf_type is WordFormSelection.Msd:
                     add = check_msd(w, more)
                     func = render_form
                 elif wf_type is WordFormSelection.All:
-                    add = True
                     func = render_all
                 elif wf_type is WordFormSelection.Any:
-                    add = True
                     func = render_form
                 else:
                     assert(wf_type is WordFormSelection.Agreement)
                     other_w, agreements = more
-                    found_agreements[other_w] = (w_id, w, agreements)
-                    add = True
+                    if other_w not in found_agreements:
+                        found_agreements[other_w] = (w_id, w, agreements, [])
+                    found_agreements[other_w][-1].append((w.msd, w.text))
                     func = lambda *x: None

                 representations[w_id][1] = func
@@ -370,7 +367,8 @@ class ComponentRendition:
             if type(data[1]) is str:
                 matches.representations[w_id] = None if data[0] else data[1]
             else:
-                backup_word = lemma_only_word(lemma_msds[w.msd[0]]) if w.msd[0] in lemma_msds else None
+                backup_msd = word_renderer.get_lemma_msd(w.lemma)
+                backup_word = lemma_only_word(backup_msd)
                 data[1](str(w_id), data[0], backup_word)

     def __str__(self):
@@ -853,8 +851,11 @@ def get_msd(comp):
         raise NotImplementedError("MSD?")

 def lemma_only_word(msd):
-    WordLemma = namedtuple('WordLemmaOnly', 'msd most_frequent_text lemma')
-    return WordLemma(msd=msd, most_frequent_text=lambda *x: None, lemma=None)
+    if msd is None:
+        return None
+    else:
+        WordLemma = namedtuple('WordLemmaOnly', 'msd most_frequent_text lemma text')
+        return WordLemma(msd=msd, most_frequent_text=lambda *x: None, lemma=None, text=None)

 class Word:
     def __init__(self, xml, do_msd_translate):
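With the None guard in lemma_only_word, callers can pass the possibly empty result of get_lemma_msd straight through: no usable lemma msd simply means no backup word. A quick standalone check of both branches (the function body is copied from the diff above):

    from collections import namedtuple

    def lemma_only_word(msd):
        if msd is None:
            return None
        WordLemma = namedtuple('WordLemmaOnly', 'msd most_frequent_text lemma text')
        return WordLemma(msd=msd, most_frequent_text=lambda *x: None, lemma=None, text=None)

    assert lemma_only_word(None) is None            # nothing to fall back on
    assert lemma_only_word("Ncm--").msd == "Ncm--"  # lemma-only stand-in word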
@@ -895,11 +896,12 @@ class WordMsdRenderer:
         self.all_words = []
         self.rendered_words = {}
         self.frequent_words = {}
+        self.lemma_msd = {}

     def add_words(self, words):
         self.all_words.extend(words)

-    def generate_renders(self):
+    def generate_renders(self, lemma_features):
         data = defaultdict(lambda: defaultdict(list))
         for w in self.all_words:
             data[w.lemma][w.msd].append(w.text)
@@ -907,6 +909,7 @@ class WordMsdRenderer:
         for lemma, ld in data.items():
             self.rendered_words[lemma] = {}
             freq_words = defaultdict(int)
+            common_msd = "*" * 10

             for msd, texts in ld.items():
                 rep = max(set(texts), key=texts.count)
@@ -914,22 +917,54 @@ class WordMsdRenderer:
                 for txt in texts:
                     freq_words[(msd, txt)] += 1

+                common_msd = self.merge_msd(common_msd, msd)
+
+            self.lemma_msd[lemma] = common_msd
+
             self.frequent_words[lemma] = []
             for (msd, txt), n in sorted(freq_words.items(), key=lambda x: -x[1]):
                 self.frequent_words[lemma].append((msd, txt, n))

+        for lemma in self.lemma_msd.keys():
+            cmsd = self.lemma_msd[lemma]
+            if cmsd[0] in lemma_features:
+                self.lemma_msd[lemma] = "".join(
+                    l1 if l1 != "-" else l2 for l1, l2 in zip(lemma_features[cmsd[0]], cmsd)
+                )
+
+    @staticmethod
+    def merge_msd(common_msd, new_msd):
+        def merge_letter(l1, l2):
+            if l1 == "*":
+                return l2
+            elif l1 != l2:
+                return "-"
+            else:
+                return l1
+
+        return "".join(merge_letter(l1, l2) for l1, l2 in zip(common_msd, new_msd))
+
     def render(self, lemma, msd):
         if lemma in self.rendered_words:
             if msd in self.rendered_words[lemma]:
                 return self.rendered_words[lemma][msd][0]

-    def available_words(self, lemma):
+    def available_words(self, lemma, existing_texts):
+        counted_texts = Counter(existing_texts)
+        for (msd, text), n in counted_texts.most_common():
+            yield (msd, text)
+
         if lemma in self.frequent_words:
-            # print("--")
             for msd, text, _ in self.frequent_words[lemma]:
-                # print(lemma, msd, text, _)
-                yield (msd, text)
+                if (msd, text) not in counted_texts:
+                    yield (msd, text)
+
+    def get_lemma_msd(self, lemma):
+        if lemma in self.lemma_msd and self.lemma_msd[lemma][0] != '-':
+            return self.lemma_msd[lemma]
+        else:
+            return None

 def is_root_id(id_):
     return len(id_.split('.')) == 3
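merge_msd folds every msd observed for a lemma into one string: "*" slots take the first letter seen, positions that agree keep their letter, and disagreements become "-". generate_renders then overlays the per-POS lemma_features template on top (the template's letter wins unless it is "-"), which is what makes the backup-lemma msd more specific. A standalone sketch of the fold, with invented noun msds:

    def merge_msd(common_msd, new_msd):
        def merge_letter(l1, l2):
            if l1 == "*":
                return l2   # first msd seen for this position
            elif l1 != l2:
                return "-"  # feature varies across the lemma's forms
            else:
                return l1
        return "".join(merge_letter(l1, l2) for l1, l2 in zip(common_msd, new_msd))

    common = "*" * 10
    for msd in ("Ncmsn", "Ncmsg", "Ncmpn"):  # invented msds of one noun lemma
        common = merge_msd(common, msd)
    print(common)  # Ncm-- : number and case vary, so they stay unspecified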
@@ -1160,13 +1195,11 @@ class ColocationIds:
             if group:
                 break

-    def set_representations(self, structures, word_renderer, lemma_msds):
+    def set_representations(self, structures, word_renderer):
         components_dict = {structure.id: structure for structure in structures}
         idx = 1
         for _1, sm in tqdm(self.data.items()):
-            if idx == 120:
-                a = 3
-            ComponentRendition.set_representations(sm, components_dict[sm.structure_id], word_renderer, lemma_msds)
+            ComponentRendition.set_representations(sm, components_dict[sm.structure_id], word_renderer)
             idx += 1
@@ -1246,9 +1279,9 @@ def main(input_file, structures_file, args):
         word_renderer.add_words(words)

     # get word renders for lemma/msd
-    word_renderer.generate_renders()
+    word_renderer.generate_renders(lemma_msds)
     # figure out representations!
-    colocation_ids.set_representations(structures, word_renderer, lemma_msds)
+    colocation_ids.set_representations(structures, word_renderer)

     if args.all:
         Writer.make_all_writer(args).write_out(structures, colocation_ids)