Work to fix #757-104 and #757-89

- word_form all: duplicate word forms are now removed
- word_form msd: word forms now come from the collocation itself, not from the whole corpus
- determining a more specific msd for agreements, so that the backup-lemma representation gets a better match
- agreements are now ordered by the collocation's own number of occurrences, not the global count
- removed a bit of debug code
parent 4c2b5f2b13
commit 307007218d

wani.py (101 lines changed)
@@ -1,7 +1,7 @@
 from xml.etree import ElementTree
 import re
 from enum import Enum
-from collections import defaultdict, namedtuple
+from collections import defaultdict, namedtuple, Counter
 import sys
 import logging
 import argparse
@@ -218,7 +218,7 @@ class ComponentRendition:
         return self.rendition is rendition

     @staticmethod
-    def set_representations(matches, structure, word_renderer, lemma_msds):
+    def set_representations(matches, structure, word_renderer):
         representations = {
             c.idx: [[], None] if c.representation.isit(Rendition.WordForm) else [True, ""]
             for c in structure.components
@@ -227,42 +227,39 @@ class ComponentRendition:
         word_component_id = {}

         def render_all(component_id, lst, _bw):
-            rep = "/".join([w.text for w in set(lst)]) if len(lst) > 0 else None
+            rep = "/".join(set([w.text for w in set(lst)])) if len(lst) > 0 else None
             matches.representations[component_id] = rep

         def render_form(component_id, lst, backup_word):
             if backup_word is not None:
                 lst.append(backup_word)

+            text_forms = {}
+            msd_lemma_txt_triplets = Counter([(w.msd, w.lemma, w.text) for w in lst])
+            for (msd, lemma, text), _n in reversed(msd_lemma_txt_triplets.most_common()):
+                text_forms[(msd, lemma)] = text
+
             lst_ctr = []
             for word in lst:
                 lst_ctr.append((word.msd, word.lemma))
             sorted_lst = sorted(set(lst_ctr), key=lst.count)

-            if len(lst) > 3:
-                a = 3
-
             for word_msd, word_lemma in sorted_lst:
                 if component_id in found_agreements:
-                    other_component_id, other_word, agreements = found_agreements[component_id]
-                    agr = are_agreements_ok(word_msd, other_word.lemma, other_word.msd, agreements)
+                    other_component_id, other_word, agreements, other_texts = found_agreements[component_id]
+                    agr = are_agreements_ok(word_msd, other_word.lemma, other_word.msd, agreements, other_texts)
                     if agr is None:
                         continue

                     matches.representations[other_component_id] = agr

                 if word_lemma is not None:
-                    matches.representations[component_id] = word_renderer.render(word_lemma, word_msd)
+                    matches.representations[component_id] = text_forms[(msd, lemma)] #word_renderer.render(word_lemma, word_msd)

                 break

-        # othw = are_agreements_ok(word, found_agreements)
-        # if othw is not None:
-        #     matches.representations[word_component_id[othw.id]] = othw.most_frequent_text(word_renderer)
-        # return
-
-        def are_agreements_ok(w1_msd, ow_lemma, ow_msd, agreements):
-            for w2_msd, w2_txt in word_renderer.available_words(ow_lemma):
+        def are_agreements_ok(w1_msd, ow_lemma, ow_msd, agreements, ow_texts):
+            for w2_msd, w2_txt in word_renderer.available_words(ow_lemma, ow_texts):
                 if ow_msd[0] != w2_msd[0]:
                     continue
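Two details in this hunk carry the first two commit-message points. In render_all, set(lst) only deduplicates Word objects; the diff suggests Word has no value-based __eq__, so two distinct tokens with the same surface form could still yield output like "miza/miza", hence the extra set() over the texts. In render_form, reversed(most_common()) walks the (msd, lemma, text) triplets from rarest to most frequent, so the last write into text_forms for each (msd, lemma) key is its most frequent text within the collocation. A standalone sketch of both, with a hypothetical stand-in Word and made-up forms:

    from collections import Counter

    class Word:                                  # stand-in: identity-based hashing,
        def __init__(self, msd, lemma, text):    # like a class without __eq__
            self.msd, self.lemma, self.text = msd, lemma, text

    lst = [Word("Ncfsn", "miza", "miza"),
           Word("Ncfsn", "miza", "miza"),        # same surface form twice
           Word("Ncfsn", "miza", "mizah")]       # rarer, hypothetical variant

    # render_all's fix: deduplicate texts, not Word objects
    print("/".join(set([w.text for w in set(lst)])))   # e.g. "miza/mizah"

    # render_form's trick: rare-to-frequent overwrites keep the most
    # frequent text per (msd, lemma) key
    triplets = Counter([(w.msd, w.lemma, w.text) for w in lst])
    text_forms = {}
    for (msd, lemma, text), _n in reversed(triplets.most_common()):
        text_forms[(msd, lemma)] = text
    print(text_forms)                            # {('Ncfsn', 'miza'): 'miza'}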
@@ -333,22 +330,22 @@ class ComponentRendition:
             else:
                 assert(rep.isit(Rendition.WordForm))
                 wf_type, more = rep.more
+                add = True

                 if wf_type is WordFormSelection.Msd:
                     add = check_msd(w, more)
                     func = render_form
                 elif wf_type is WordFormSelection.All:
-                    add = True
                     func = render_all
                 elif wf_type is WordFormSelection.Any:
-                    add = True
                     func = render_form
                 else:
                     assert(wf_type is WordFormSelection.Agreement)
                     other_w, agreements = more
-                    found_agreements[other_w] = (w_id, w, agreements)
+                    if other_w not in found_agreements:
+                        found_agreements[other_w] = (w_id, w, agreements, [])

-                    add = True
+                    found_agreements[other_w][-1].append((w.msd, w.text))
                     func = lambda *x: None

             representations[w_id][1] = func
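The Agreement branch now accumulates candidates instead of overwriting a single (w_id, w, agreements) tuple: the entry for the target component is created once, and every matched word appends its (msd, text) pair to the trailing list, which later reaches are_agreements_ok as ow_texts. Schematically, with hypothetical component ids and forms:

    found_agreements = {}
    for w_msd, w_text in [("Agpfpn", "lepe"), ("Agpfsn", "lepa")]:  # hypothetical
        if "c2" not in found_agreements:
            found_agreements["c2"] = ("c1", None, [], [])  # (w_id, word, agreements, texts)
        found_agreements["c2"][-1].append((w_msd, w_text))
    print(found_agreements["c2"][-1])            # both candidates retained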
@@ -370,7 +367,8 @@ class ComponentRendition:
             if type(data[1]) is str:
                 matches.representations[w_id] = None if data[0] else data[1]
             else:
-                backup_word = lemma_only_word(lemma_msds[w.msd[0]]) if w.msd[0] in lemma_msds else None
+                backup_msd = word_renderer.get_lemma_msd(w.lemma)
+                backup_word = lemma_only_word(backup_msd)
                 data[1](str(w_id), data[0], backup_word)

     def __str__(self):
@@ -853,8 +851,11 @@ def get_msd(comp):
     raise NotImplementedError("MSD?")

 def lemma_only_word(msd):
-    WordLemma = namedtuple('WordLemmaOnly', 'msd most_frequent_text lemma')
-    return WordLemma(msd=msd, most_frequent_text=lambda *x: None, lemma=None)
+    if msd is None:
+        return None
+    else:
+        WordLemma = namedtuple('WordLemmaOnly', 'msd most_frequent_text lemma text')
+        return WordLemma(msd=msd, most_frequent_text=lambda *x: None, lemma=None, text=None)

 class Word:
     def __init__(self, xml, do_msd_translate):
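lemma_only_word now tolerates a missing msd, which is exactly what the new get_lemma_msd returns when nothing usable was derived for a lemma; the None propagates, so render_form simply skips appending a backup word. A quick check of both paths, with the function copied from the hunk above and a made-up msd:

    from collections import namedtuple

    def lemma_only_word(msd):
        if msd is None:
            return None
        else:
            WordLemma = namedtuple('WordLemmaOnly', 'msd most_frequent_text lemma text')
            return WordLemma(msd=msd, most_frequent_text=lambda *x: None, lemma=None, text=None)

    print(lemma_only_word(None))                 # None -> no backup word is appended
    print(lemma_only_word("Ncf--").msd)          # Ncf-- -> usable backup word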
@@ -895,11 +896,12 @@ class WordMsdRenderer:
         self.all_words = []
         self.rendered_words = {}
         self.frequent_words = {}
+        self.lemma_msd = {}

     def add_words(self, words):
         self.all_words.extend(words)

-    def generate_renders(self):
+    def generate_renders(self, lemma_features):
         data = defaultdict(lambda: defaultdict(list))
         for w in self.all_words:
             data[w.lemma][w.msd].append(w.text)
@@ -907,6 +909,7 @@
         for lemma, ld in data.items():
             self.rendered_words[lemma] = {}
             freq_words = defaultdict(int)
+            common_msd = "*" * 10

             for msd, texts in ld.items():
                 rep = max(set(texts), key=texts.count)
@@ -914,22 +917,54 @@

             for txt in texts:
                 freq_words[(msd, txt)] += 1

+                common_msd = self.merge_msd(common_msd, msd)
+
+            self.lemma_msd[lemma] = common_msd
+
             self.frequent_words[lemma] = []
             for (msd, txt), n in sorted(freq_words.items(), key=lambda x: -x[1]):
                 self.frequent_words[lemma].append((msd, txt, n))

+        for lemma in self.lemma_msd.keys():
+            cmsd = self.lemma_msd[lemma]
+            if cmsd[0] in lemma_features:
+                self.lemma_msd[lemma] = "".join(
+                    l1 if l1 != "-" else l2 for l1, l2 in zip(lemma_features[cmsd[0]], cmsd)
+                )
+
+    @staticmethod
+    def merge_msd(common_msd, new_msd):
+        def merge_letter(l1, l2):
+            if l1 == "*":
+                return l2
+            elif l1 != l2:
+                return "-"
+            else:
+                return l1
+
+        return "".join(merge_letter(l1, l2) for l1, l2 in zip(common_msd, new_msd))
+
     def render(self, lemma, msd):
         if lemma in self.rendered_words:
             if msd in self.rendered_words[lemma]:
                 return self.rendered_words[lemma][msd][0]

-    def available_words(self, lemma):
+    def available_words(self, lemma, existing_texts):
+        counted_texts = Counter(existing_texts)
+        for (msd, text), n in counted_texts.most_common():
+            yield (msd, text)
+
         if lemma in self.frequent_words:
-            # print("--")
             for msd, text, _ in self.frequent_words[lemma]:
-                # print(lemma, msd, text, _)
-                yield (msd, text)
+                if (msd, text) not in counted_texts:
+                    yield (msd, text)

+    def get_lemma_msd(self, lemma):
+        if lemma in self.lemma_msd and self.lemma_msd[lemma][0] != '-':
+            return self.lemma_msd[lemma]
+        else:
+            return None

 def is_root_id(id_):
     return len(id_.split('.')) == 3
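merge_msd folds every msd observed for a lemma into a single pattern: against the "*" * 10 seed from the previous hunk the first msd is adopted wholesale, positions where all msds agree keep their letter, and any position that ever disagrees collapses to "-" (get_lemma_msd then rejects lemmas whose first letter, the word category, was lost). A worked example with the function copied from the hunk above and hypothetical noun msds:

    def merge_msd(common_msd, new_msd):
        def merge_letter(l1, l2):
            if l1 == "*":
                return l2
            elif l1 != l2:
                return "-"
            else:
                return l1
        return "".join(merge_letter(l1, l2) for l1, l2 in zip(common_msd, new_msd))

    common = "*" * 10
    for msd in ("Ncfsn", "Ncfsa", "Ncfdn"):      # hypothetical inflected forms
        common = merge_msd(common, msd)
    print(common)                                # Ncf--: gender kept, case/number lost

The reworked available_words in the same hunk implements the ordering point from the commit message: it first yields the collocation's own (msd, text) pairs by their local frequency (Counter.most_common), and only then falls back to the corpus-wide frequent_words list, skipping pairs the collocation already supplied.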
@@ -1160,13 +1195,11 @@ class ColocationIds:
             if group:
                 break

-    def set_representations(self, structures, word_renderer, lemma_msds):
+    def set_representations(self, structures, word_renderer):
         components_dict = {structure.id: structure for structure in structures}
         idx = 1
         for _1, sm in tqdm(self.data.items()):
-            if idx == 120:
-                a = 3
-
-            ComponentRendition.set_representations(sm, components_dict[sm.structure_id], word_renderer, lemma_msds)
+            ComponentRendition.set_representations(sm, components_dict[sm.structure_id], word_renderer)
             idx += 1
@ -1246,9 +1279,9 @@ def main(input_file, structures_file, args):
|
||||||
word_renderer.add_words(words)
|
word_renderer.add_words(words)
|
||||||
|
|
||||||
# get word renders for lemma/msd
|
# get word renders for lemma/msd
|
||||||
word_renderer.generate_renders()
|
word_renderer.generate_renders(lemma_msds)
|
||||||
# figure out representations!
|
# figure out representations!
|
||||||
colocation_ids.set_representations(structures, word_renderer, lemma_msds)
|
colocation_ids.set_representations(structures, word_renderer)
|
||||||
|
|
||||||
if args.all:
|
if args.all:
|
||||||
Writer.make_all_writer(args).write_out(structures, colocation_ids)
|
Writer.make_all_writer(args).write_out(structures, colocation_ids)
|
||||||
|
|
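After these call-site changes the relevant part of main() reduces to the pipeline below (a sketch only: the WordMsdRenderer constructor and the code building words, lemma_msds, structures and colocation_ids are not part of this diff). Deriving per-lemma msds now happens inside generate_renders, so lemma_msds no longer needs to be threaded through set_representations:

    word_renderer = WordMsdRenderer()            # constructor args not shown in this diff
    word_renderer.add_words(words)               # corpus tokens collected earlier
    word_renderer.generate_renders(lemma_msds)   # renders + per-lemma msd derivation
    colocation_ids.set_representations(structures, word_renderer)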
Loading…
Reference in New Issue
Block a user