From e99ba599082bd9bd867d421b5bda39f8e8b06a93 Mon Sep 17 00:00:00 2001
From: Ozbolt Menegatti <ozbolt.menegatti@gmail.com>
Date: Wed, 22 May 2019 11:55:51 +0200
Subject: [PATCH] lemma/msd representations now global! Need to also use for
 agreements

---
 wani.py | 35 +++++++++++++++++++++--------------
 1 file changed, 21 insertions(+), 14 deletions(-)

diff --git a/wani.py b/wani.py
index 8120cd1..3511de4 100644
--- a/wani.py
+++ b/wani.py
@@ -217,7 +217,7 @@ class ComponentRendition:
         return self.rendition is rendition
     
     @staticmethod
-    def set_representations(matches, structure):
+    def set_representations(matches, structure, word_renderer):
         representations = {
             c.idx: [[], None] if c.representation.isit(Rendition.WordForm) else [True, ""]
             for c in structure.components
@@ -236,11 +236,8 @@ class ComponentRendition:
             for word in sorted_lst:
                 othw = are_agreements_ok(word, representations_to_check)
                 if othw is not None:
-                    if doprint:
-                        print("AOK", othw.text, othw)
-
-                    matches.representations[word_component_id[othw.id]] = othw.text
-                    matches.representations[word_component_id[word.id]] = word.text
+                    matches.representations[word_component_id[othw.id]] = othw.most_frequent_text(word_renderer)
+                    matches.representations[word_component_id[word.id]] = word.most_frequent_text(word_renderer)
                     return
         
         def are_agreements_ok(word, words_to_try):
@@ -858,17 +855,20 @@ class Word:
                 self.links[link].extend(self.links[l])
 
         return self.links[link]
+    
+    def most_frequent_text(self, word_renderer):
+        return word_renderer.render(self.lemma, self.msd)
 
 class WordMsdRenderer:
     def __init__(self):
         self.all_words = []
         self.rendered_words = {}
     
-    def add_word(self, word):
-        self.all_words.append(word)
+    def add_words(self, words):
+        self.all_words.extend(words)
     
     def generate_renders(self):
-        data = defaultdict(lambda: defaultdict([]))
+        data = defaultdict(lambda: defaultdict(list))
         for w in self.all_words:
             data[w.lemma][w.msd].append(w.text)
 
@@ -1114,10 +1114,10 @@ class ColocationIds:
                 if group:
                     break
 
-    def set_representations(self, structures):
+    def set_representations(self, structures, word_renderer):
         components_dict = {structure.id: structure for structure in structures}
         for _1, sm in self.data.items():
-            ComponentRendition.set_representations(sm, components_dict[sm.structure_id])
+            ComponentRendition.set_representations(sm, components_dict[sm.structure_id], word_renderer)
 
 
 def match_file(words, structures):
@@ -1144,6 +1144,7 @@ def main(input_file, structures_file, args):
         logging.debug(str(s))
 
     colocation_ids = ColocationIds()
+    word_renderer = WordMsdRenderer()
 
     if args.parallel:
         num_parallel = int(args.parallel)
@@ -1170,22 +1171,28 @@ def main(input_file, structures_file, args):
                 # fancy interface to wait for threads to finish
                 for id_input in executor.map(func, [i for i, _ in enumerate(args.input)]):
                     with open("{}/{}.p".format(tmpdirname, id_input), "rb") as fp:
-                        matches = pickle.load(fp)
+                        words, matches = pickle.load(fp)
+
                     colocation_ids.add_matches(matches)
+                    word_renderer.add_words(words)
 
     else:
         for words in load_files(args):
             matches = match_file(words, structures)
             # just save to temporary file, used for children of a parallel process
+            # the input MUST NOT contain more than one file: we return right after dumping the first one
             if args.match_to_file is not None:
                 with open(args.match_to_file, "wb") as fp:
-                    pickle.dump(matches, fp)
+                    pickle.dump((words, matches), fp)
                     return
             else:
                 colocation_ids.add_matches(matches)
+                word_renderer.add_words(words)
 
+    # get word renders for lemma/msd
+    word_renderer.generate_renders()
     # figure out representations!
-    colocation_ids.set_representations(structures)
+    colocation_ids.set_representations(structures, word_renderer)
 
     if args.all:
         Writer.make_all_writer(args).write_out(structures, colocation_ids)