From e99ba599082bd9bd867d421b5bda39f8e8b06a93 Mon Sep 17 00:00:00 2001 From: Ozbolt Menegatti Date: Wed, 22 May 2019 11:55:51 +0200 Subject: [PATCH] lemma/msd representations now global! Need to also use for agreements --- wani.py | 35 +++++++++++++++++++++-------------- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/wani.py b/wani.py index 8120cd1..3511de4 100644 --- a/wani.py +++ b/wani.py @@ -217,7 +217,7 @@ class ComponentRendition: return self.rendition is rendition @staticmethod - def set_representations(matches, structure): + def set_representations(matches, structure, word_renderer): representations = { c.idx: [[], None] if c.representation.isit(Rendition.WordForm) else [True, ""] for c in structure.components @@ -236,11 +236,8 @@ class ComponentRendition: for word in sorted_lst: othw = are_agreements_ok(word, representations_to_check) if othw is not None: - if doprint: - print("AOK", othw.text, othw) - - matches.representations[word_component_id[othw.id]] = othw.text - matches.representations[word_component_id[word.id]] = word.text + matches.representations[word_component_id[othw.id]] = othw.most_frequent_text(word_renderer) + matches.representations[word_component_id[word.id]] = word.most_frequent_text(word_renderer) return def are_agreements_ok(word, words_to_try): @@ -858,17 +855,20 @@ class Word: self.links[link].extend(self.links[l]) return self.links[link] + + def most_frequent_text(self, word_renderer): + return word_renderer.render(self.lemma, self.msd) class WordMsdRenderer: def __init__(self): self.all_words = [] self.rendered_words = {} - def add_word(self, word): - self.all_words.append(word) + def add_words(self, words): + self.all_words.extend(words) def generate_renders(self): - data = defaultdict(lambda: defaultdict([])) + data = defaultdict(lambda: defaultdict(list)) for w in self.all_words: data[w.lemma][w.msd].append(w.text) @@ -1114,10 +1114,10 @@ class ColocationIds: if group: break - def set_representations(self, structures): + def set_representations(self, structures, word_renderer): components_dict = {structure.id: structure for structure in structures} for _1, sm in self.data.items(): - ComponentRendition.set_representations(sm, components_dict[sm.structure_id]) + ComponentRendition.set_representations(sm, components_dict[sm.structure_id], word_renderer) def match_file(words, structures): @@ -1144,6 +1144,7 @@ def main(input_file, structures_file, args): logging.debug(str(s)) colocation_ids = ColocationIds() + word_renderer = WordMsdRenderer() if args.parallel: num_parallel = int(args.parallel) @@ -1170,22 +1171,28 @@ def main(input_file, structures_file, args): # fancy interface to wait for threads to finish for id_input in executor.map(func, [i for i, _ in enumerate(args.input)]): with open("{}/{}.p".format(tmpdirname, id_input), "rb") as fp: - matches = pickle.load(fp) + words, matches = pickle.load(fp) + colocation_ids.add_matches(matches) + word_renderer.add_words(words) else: for words in load_files(args): matches = match_file(words, structures) # just save to temporary file, used for children of a parallel process + # MUST NOT have more than one file if args.match_to_file is not None: with open(args.match_to_file, "wb") as fp: - pickle.dump(matches, fp) + pickle.dump((words, matches), fp) return else: colocation_ids.add_matches(matches) + word_renderer.add_words(words) + # get word renders for lemma/msd + word_renderer.generate_renders() # figure out representations! - colocation_ids.set_representations(structures) + colocation_ids.set_representations(structures, word_renderer) if args.all: Writer.make_all_writer(args).write_out(structures, colocation_ids)