From 6a221ae8fedd7dab5270b60940c2b9f09a17006a Mon Sep 17 00:00:00 2001
From: Ozbolt Menegatti <ozbolt.menegatti@gmail.com>
Date: Fri, 25 Jan 2019 11:58:40 +0100
Subject: [PATCH] Fixes for msd length matching and pc matching

Also includes some cleanup and fixes the output formatting.
---
 msd_translate.py |  3 +-
 wani.py          | 80 ++++++++++++++++++++++++------------------------
 2 files changed, 42 insertions(+), 41 deletions(-)

diff --git a/msd_translate.py b/msd_translate.py
index ccab003..016f918 100644
--- a/msd_translate.py
+++ b/msd_translate.py
@@ -1900,4 +1900,5 @@ MSD_TRANSLATE = {
     "N": "X",
     "Nj": "Xf",
     "Nt": "Xt",
-    "Np": "Xp"}
+    "Np": "Xp",
+    "U": "N"}
diff --git a/wani.py b/wani.py
index 5da191d..0c9d8cc 100644
--- a/wani.py
+++ b/wani.py
@@ -8,6 +8,9 @@ import logging
 from msd_translate import MSD_TRANSLATE
 
 
+MAX_NUM_COMPONENTS = 5
+
+
 STAVKI = sys.argv[1]
 STRUKTURE = sys.argv[2]
 FILE_OUT = sys.argv[3]
@@ -215,6 +218,7 @@ def build_morphology_regex(restriction):
     rgx = [cat_code] + CATEGORY_BASES[cat_code]
 
     del restr_dict['POS']
+    min_msd_length = 1
 
     for attribute, (value, typ) in restr_dict.items():
         index = TAGSET[cat_code].index(attribute.lower())
@@ -228,7 +232,13 @@ def build_morphology_regex(restriction):
         match = "[{}{}]".format("" if typ else "^", match)
         rgx[index + 1] = match
 
+        if typ:
+            min_msd_length = max(index + 1, min_msd_length)
+
     def matcher(text):
+        if len(text) <= min_msd_length:
+            return False
+
         for c, r in zip(text, rgx):
             if not re.match(r, c):
                 return False
@@ -443,16 +453,6 @@ class Component:
 
         logging.debug("SELF MATCH({}: {} -> {}".format(self.idx, word.text, matched))
 
-        # check with status
-        # if self.status is ComponentStatus.Optional:
-        #     if not matched:
-        #         # nothing to add, but still good...
-        #         return {}
-        # elif self.status is ComponentStatus.Forbidden:
-        #     # forbiddent is handled at return stage in _match_next
-        #     # just process normally...
-        #     pass
-
         # recurse to next
         if not matched:
             return None
@@ -465,12 +465,13 @@ class Component:
 
         # need to get all links that match
         for next, link in self.next_element:
-            logging.debug("FIND LINKS FOR: {} -> {}".format(self.idx, next.idx))
+            next_links = word.get_links(link) 
+            logging.debug("FIND LINKS FOR: {} -> {}: #{}".format(self.idx, next.idx, len(next_links)))
             to_ret.append([])
 
             # good flag
             good = next.status != ComponentStatus.Required
-            for next_word in word.get_links(link):
+            for next_word in next_links:
                 logging.debug("link: {}: {} -> {}".format(link, word.id, next_word.id))
                 match = next.match(next_word)
 
@@ -679,6 +680,11 @@ class Word:
 
         assert(None not in (self.id, self.lemma, self.msd))
 
+    @staticmethod
+    def pcWord(pc):
+        pc.set('lemma', pc.text)
+        return Word(pc)
+
     def add_link(self, link, to):
         self.links[link].append(to)
 
@@ -700,14 +706,11 @@ def load_corpus(filename):
         xmlstring = xmlstring.replace(' xml:', ' ')
         et = ElementTree.XML(xmlstring)
 
-    root_words = set()
     words = {}
     for w in et.iter("w"):
         words[w.get('id')] = Word(w)
-
-    pcs = set()
     for pc in et.iter("pc"):
-        pcs.add(pc.get('id'))
+        words[pc.get('id')] = Word.pcWord(pc)
 
     for l in et.iter("link"):
         if 'dep' in l.keys():
@@ -723,29 +726,20 @@ def load_corpus(filename):
 
         if lfrom in words:
             if is_root_id(lfrom):
-                logging.error("NOO: ", lfrom, file=sys.stderr)
+                logging.error("NOO: ", lfrom)
                 sys.exit(1)
 
             if dest in words:
                 next_word = words[dest]
                 words[lfrom].add_link(ana, next_word)
-
-        # catch links from root
-        elif is_root_id(lfrom):
-            root_words.add(dest)
-
-        # catch links from <pc> :S
-        elif lfrom in pcs:
-            logging.warning(str(("link from <pc>: ", lfrom)))
+            else:
+                logging.error("Unknown id: {}".format(dest))
+                sys.exit(1)
 
         else:
             # strange errors, just skip...
             pass
 
-    no_root_words = [w for k, w in words.items() if k in root_words]
-    missing = root_words - set(w.id for w in no_root_words)
-    # what should i do with this I forgot :(
-
     return list(words.values())
 
 
@@ -774,15 +768,16 @@ def main():
             logging.debug("  GOT: {}".format(len(mhere)))
             for match, reason in mhere: 
                 matches[s.id].append((match, reason))
+
     print("")
 
-    header = [
-            "Structure_ID", "Component_ID", "Token_ID", "Word_form", 
-            "Lemma", "Msd", "Representative_form_1", "Component_ID", 
-            "Token_ID", "Word_form", "Lemma", "Msd", "Representative_form_2", 
-            "Collocation_ID", "Joint_representative_form"]
-    csv = [", ".join(header)]
+    header = ["Structure_ID"]
+    for i in range(MAX_NUM_COMPONENTS):
+        header.extend("C{}_{}".format(i + 1, thd) for thd in 
+                ["Token_ID", "Word_form", "Lemma", "Msd", "Representative_form"])
+    header.extend(["Collocation_ID", "Joint_representative_form"])
 
+    csv = [", ".join(header)]
     colocation_ids = {}
 
     for s in structures:
@@ -790,28 +785,33 @@ def main():
 
         for m, reason in ms:
             colocation_id = [s.id]
-            to_print = [s.id]
+            to_print = []
 
             m_sorted = defaultdict(lambda: None, m.items())
             for idx, comp in enumerate(s.components):
                 idx = str(idx + 1)
                 if idx not in m_sorted:
-                    to_print.extend([idx, "", "", "", "", ""])
+                    to_print.extend(["", "", "", "", ""])
                 else:
                     w = m_sorted[idx]
                     # if comp.render_word(m_sorted[idx]) is not None:
                     if True:
-                        to_print.extend([idx, w.id, w.text, w.lemma, w.msd, ""])
+                        to_print.extend([w.id, w.text, w.lemma, w.msd, ""])
                         colocation_id.append(w.lemma)
 
             colocation_id = tuple(colocation_id)
             if colocation_id in colocation_ids:
                 cid = colocation_ids[colocation_id]
             else:
-                cid = len(colocation_ids)
+                cid = len(colocation_ids) + 1
                 colocation_ids[colocation_id] = cid
-                
+
+            to_print = [s.id] + to_print
+            length = 1 + MAX_NUM_COMPONENTS * 5
+            # pad with empty fields so every row has the same number of columns
+            to_print.extend([""] * (length - len(to_print))) 
             to_print.extend([str(cid), ""])
+
             csv.append(", ".join(to_print))