lemma/msd representations now global! Need to also use for agreements
This commit is contained in:
parent d14efff709
commit e99ba59908

wani.py (35 changed lines)
@@ -217,7 +217,7 @@ class ComponentRendition:
         return self.rendition is rendition

     @staticmethod
-    def set_representations(matches, structure):
+    def set_representations(matches, structure, word_renderer):
         representations = {
             c.idx: [[], None] if c.representation.isit(Rendition.WordForm) else [True, ""]
             for c in structure.components
@@ -236,11 +236,8 @@ class ComponentRendition:
         for word in sorted_lst:
             othw = are_agreements_ok(word, representations_to_check)
             if othw is not None:
-                if doprint:
-                    print("AOK", othw.text, othw)
-
-                matches.representations[word_component_id[othw.id]] = othw.text
-                matches.representations[word_component_id[word.id]] = word.text
+                matches.representations[word_component_id[othw.id]] = othw.most_frequent_text(word_renderer)
+                matches.representations[word_component_id[word.id]] = word.most_frequent_text(word_renderer)
                 return

 def are_agreements_ok(word, words_to_try):
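This hunk swaps the matched token's literal surface form (othw.text, word.text) for a corpus-wide lookup through most_frequent_text, which a later hunk adds to Word. A minimal sketch of the behavioural difference, with stand-in classes; only the method names come from the diff, everything else here is illustrative:

from collections import Counter

class StubRenderer:
    """Maps (lemma, msd) to the most frequent surface form seen anywhere."""
    def __init__(self, corpus):
        counts = Counter((w.lemma, w.msd, w.text) for w in corpus)
        self.best = {}
        for (lemma, msd, text), _n in counts.most_common():
            # most_common is sorted by count, so the first form seen wins
            self.best.setdefault((lemma, msd), text)

    def render(self, lemma, msd):
        return self.best.get((lemma, msd))

class StubWord:
    def __init__(self, text, lemma, msd):
        self.text, self.lemma, self.msd = text, lemma, msd

    def most_frequent_text(self, word_renderer):
        return word_renderer.render(self.lemma, self.msd)

corpus = [StubWord("pes", "pes", "Ncmsn"), StubWord("pes", "pes", "Ncmsn"),
          StubWord("pès", "pes", "Ncmsn")]
renderer = StubRenderer(corpus)
token = corpus[2]
print(token.text)                          # "pès"  (old behaviour: this token)
print(token.most_frequent_text(renderer))  # "pes"  (new behaviour: global form)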
@@ -858,17 +855,20 @@ class Word:
             self.links[link].extend(self.links[l])

         return self.links[link]

+    def most_frequent_text(self, word_renderer):
+        return word_renderer.render(self.lemma, self.msd)
+


 class WordMsdRenderer:
     def __init__(self):
         self.all_words = []
         self.rendered_words = {}

-    def add_word(self, word):
-        self.all_words.append(word)
+    def add_words(self, words):
+        self.all_words.extend(words)

     def generate_renders(self):
-        data = defaultdict(lambda: defaultdict([]))
+        data = defaultdict(lambda: defaultdict(list))
         for w in self.all_words:
             data[w.lemma][w.msd].append(w.text)
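The diff stops at the grouping step of generate_renders (and also fixes a bug: defaultdict([]) would raise TypeError because the default factory must be callable, hence defaultdict(list)). A plausible completion, under the assumption, suggested by the commit message and by most_frequent_text, that the renderer keeps the most frequent surface form per lemma/msd pair; the Counter step and render() are guesses, not code from this commit:

from collections import Counter, defaultdict

class WordMsdRendererSketch:
    def __init__(self):
        self.all_words = []
        self.rendered_words = {}

    def add_words(self, words):
        self.all_words.extend(words)

    def generate_renders(self):
        # group every surface form by lemma, then by msd (as in the diff)
        data = defaultdict(lambda: defaultdict(list))
        for w in self.all_words:
            data[w.lemma][w.msd].append(w.text)
        # assumed continuation: keep the most frequent form per (lemma, msd)
        for lemma, by_msd in data.items():
            for msd, texts in by_msd.items():
                self.rendered_words[(lemma, msd)] = Counter(texts).most_common(1)[0][0]

    def render(self, lemma, msd):
        # Word.most_frequent_text delegates here; None for unseen pairs
        return self.rendered_words.get((lemma, msd))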
@@ -1114,10 +1114,10 @@ class ColocationIds:
         if group:
             break

-    def set_representations(self, structures):
+    def set_representations(self, structures, word_renderer):
         components_dict = {structure.id: structure for structure in structures}
         for _1, sm in self.data.items():
-            ComponentRendition.set_representations(sm, components_dict[sm.structure_id])
+            ComponentRendition.set_representations(sm, components_dict[sm.structure_id], word_renderer)


 def match_file(words, structures):
@ -1144,6 +1144,7 @@ def main(input_file, structures_file, args):
|
||||||
logging.debug(str(s))
|
logging.debug(str(s))
|
||||||
|
|
||||||
colocation_ids = ColocationIds()
|
colocation_ids = ColocationIds()
|
||||||
|
word_renderer = WordMsdRenderer()
|
||||||
|
|
||||||
if args.parallel:
|
if args.parallel:
|
||||||
num_parallel = int(args.parallel)
|
num_parallel = int(args.parallel)
|
||||||
@@ -1170,22 +1171,28 @@ def main(input_file, structures_file, args):
             # fancy interface to wait for threads to finish
             for id_input in executor.map(func, [i for i, _ in enumerate(args.input)]):
                 with open("{}/{}.p".format(tmpdirname, id_input), "rb") as fp:
-                    matches = pickle.load(fp)
+                    words, matches = pickle.load(fp)

                 colocation_ids.add_matches(matches)
+                word_renderer.add_words(words)

     else:
         for words in load_files(args):
             matches = match_file(words, structures)
             # just save to temporary file, used for children of a parallel process
+            # MUST NOT have more than one file
             if args.match_to_file is not None:
                 with open(args.match_to_file, "wb") as fp:
-                    pickle.dump(matches, fp)
+                    pickle.dump((words, matches), fp)
                     return
             else:
                 colocation_ids.add_matches(matches)
+                word_renderer.add_words(words)

+    # get word renders for lemma/msd
+    word_renderer.generate_renders()
     # figure out representations!
-    colocation_ids.set_representations(structures)
+    colocation_ids.set_representations(structures, word_renderer)

     if args.all:
         Writer.make_all_writer(args).write_out(structures, colocation_ids)
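Both pickle sites now carry the pair (words, matches), so the parent process can feed each child's words into the shared WordMsdRenderer before generate_renders runs. A minimal round-trip sketch of that handoff; the file name and payloads are made up:

import os
import pickle
import tempfile

words, matches = ["word1", "word2"], {"1": ["match"]}

path = os.path.join(tempfile.mkdtemp(), "0.p")
with open(path, "wb") as fp:
    pickle.dump((words, matches), fp)   # child side: dump the pair

with open(path, "rb") as fp:
    words, matches = pickle.load(fp)    # parent side: unpack in the same order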