From de3e52c57cdaa737285d6a956dbc2492c31a7867 Mon Sep 17 00:00:00 2001
From: Luka <krsnik.luka92@gmail.com>
Date: Fri, 10 Jul 2020 13:43:52 +0200
Subject: [PATCH] Changed output document to reflect most frequent word order

---
 src/formatter.py      | 20 +++++++++++---------
 src/representation.py |  2 ++
 src/wani.py           | 13 +++++++++++++
 src/writer.py         | 21 +++++++++++++++++----
 4 files changed, 43 insertions(+), 13 deletions(-)

diff --git a/src/formatter.py b/src/formatter.py
index 932062f..363ad21 100644
--- a/src/formatter.py
+++ b/src/formatter.py
@@ -34,7 +34,7 @@ class Formatter:
 
 class OutNoStatFormatter(Formatter):
     def additional_init(self):
-        self.representation = ""
+        self.representation = {}
 
     def header_repeat(self):
         return ["Lemma", "Representative_form", "RF_msd", "RF_scenario"]
@@ -49,16 +49,18 @@ class OutNoStatFormatter(Formatter):
 
         rep = representations[idx]
         if rep is None:
-            self.representation += " " + word.lemma
+            self.representation[idx] = word.lemma
             return [word.lemma, word.lemma, "", "lemma_fallback"]
         else:
-            self.representation += " " + rep
+            self.representation[idx] = rep
             return [word.lemma, rep, word.msd, "ok"]
 
-    def content_right(self, freq):
-        rep = re.sub(' +', ' ', self.representation.strip())
+    def content_right(self, freq, best_word_order=None):
+        if best_word_order is None:
+            best_word_order = sorted(self.representation.keys())
+        rep = ' '.join([self.representation[o] for o in best_word_order if o in self.representation])
         result = [rep, str(freq)]
-        self.representation = ""
+        self.representation = {}
         return result
 
     def group(self):
@@ -181,13 +183,13 @@ class OutFormatter(Formatter):
     def header_right(self):
         return self.f1.header_right() + self.f2.header_right()
 
-    def content_repeat(self, words, representations, idx, sidx):
+    def content_repeat(self, words, representations, idx, sidx, best_word_order=None):
         cr1 = self.f1.content_repeat(words, representations, idx, sidx)
         cr2 = self.f2.content_repeat(words, representations, idx, sidx)
         return cr1 + cr2
 
-    def content_right(self, freq):
-        return self.f1.content_right(freq) + self.f2.content_right(freq)
+    def content_right(self, freq, best_word_order=None):
+        return self.f1.content_right(freq, best_word_order) + self.f2.content_right(freq)
 
     def group(self):
         return self.f1.group() and self.f2.group()
diff --git a/src/representation.py b/src/representation.py
index 70d0e66..31bdeff 100644
--- a/src/representation.py
+++ b/src/representation.py
@@ -113,6 +113,8 @@ class WordFormMsdCR(WordFormAnyCR):
         for key, value in selectors.items():
             t = word_msd[0]
             v = TAGSET[t].index(key.lower())
+            if v + 1 >= len(word_msd):
+                return False
             f1 = word_msd[v + 1]
             f2 = CODES[value]
 
diff --git a/src/wani.py b/src/wani.py
index 9eeb9cf..b386697 100644
--- a/src/wani.py
+++ b/src/wani.py
@@ -38,6 +38,19 @@ def match_file(words, structures, postprocessor):
                 colocation_id = tuple(colocation_id)
 
                 matches[s].append((match, colocation_id))
+                # for key, val in matches.items():
+                #     if key.id == '15':
+                #         for el in val:
+                #             if el[0]['1'].lemma == 'biti' and el[0]['2'].lemma == 'po' and el[0]['3'].lemma == 'mnenje':
+                #                 word_id = '.'.join(words[0].id.split('.')[:-1])
+                #                 print(f"ID: {'.'.join(words[0].id.split('.')[:-1])}")
+                #                 print(' '.join([w.text for w in words if '.'.join(w.id.split('.')[:-1]) == word_id]))
+
+                # if s.id == '15':
+                #     if match['1'].lemma == 'biti' and match['2'].lemma == 'po' and match['3'].lemma == 'mnenje':
+                #         word_id = '.'.join(match['1'].id.split('.')[:-1])
+                #         print(f"ID: {word_id}")
+                #         print(' '.join([w.text for w in words if '.'.join(w.id.split('.')[:-1]) == word_id]))
 
     return matches
 
diff --git a/src/writer.py b/src/writer.py
index 8695de4..613cc18 100644
--- a/src/writer.py
+++ b/src/writer.py
@@ -71,21 +71,26 @@ class Writer:
         return sorted(rows, key=key, reverse=self.sort_order)
 
     def write_header(self, file_handler):
-        file_handler.write(", ".join(self.header()) + "\n")
+        file_handler.write(",".join(self.header()) + "\n")
 
     def write_out_worker(self, file_handler, structure, colocation_ids):
         rows = []
         components = structure.components
-
         for match in progress(colocation_ids.get_matches_for(structure), "Writing matches: {}".format(structure.id)):
             if len(match) < self.min_frequency:
                 continue
 
             self.formatter.new_match(match)
 
+            best_word_order = self.find_best_word_order(match.matches)
+
             for words in match.matches:
                 to_write = []
 
+                # TODO: instead of enumerating components below, first iterate over all words in match.matches, compare
+                #  word.int_id, take the most frequent order, and append the remaining indices up to len(components).
+
+
                 for idx, _comp in enumerate(components):
                     idx = str(idx + 1)
                     if idx not in words:
@@ -100,7 +105,7 @@ class Writer:
                 to_write = [structure.id] + to_write + [match.match_id]
 
                 # header_right
-                to_write.extend(self.formatter.content_right(len(match)))
+                to_write.extend(self.formatter.content_right(len(match), best_word_order))
                 rows.append(to_write)
 
                 if self.formatter.group():
@@ -108,7 +113,7 @@ class Writer:
 
         if rows != []:
             rows = self.sorted_rows(rows)
-            file_handler.write("\n".join([", ".join(row) for row in rows]) + "\n")
+            file_handler.write("\n".join([",".join(row) for row in rows]) + "\n")
             file_handler.flush()
 
     def write_out(self, structures, colocation_ids):
@@ -141,3 +146,11 @@ class Writer:
 
         if not self.multiple_output:
             fp_close(fp)
+
+    @staticmethod
+    def find_best_word_order(matches):
+        """Return the most frequent order of component keys when each match is sorted by int_id."""
+        tally = {}
+        for order in (tuple(sorted(w, key=lambda k: w[k].int_id)) for w in matches):
+            tally[order] = tally.get(order, 0) + 1
+        return max(tally, key=tally.get)