From 06435aa3a2237dfe381ffcaa2cdee2e42c27cb4f Mon Sep 17 00:00:00 2001
From: Luka <krsnik.luka92@gmail.com>
Date: Fri, 9 Oct 2020 15:18:52 +0200
Subject: [PATCH] Added options for "modra"

---
 luscenje_struktur/component.py           | 27 +++++++-----------------
 luscenje_struktur/loader.py              | 10 +++++++--
 luscenje_struktur/msd_translate.py       |  2 +-
 luscenje_struktur/restriction.py         |  9 +-------
 luscenje_struktur/syntactic_structure.py | 14 +++++++++---
 luscenje_struktur/word.py                |  8 ++++++-
 luscenje_struktur/word_stats.py          |  2 ++
 luscenje_struktur/writer.py              | 27 ++++++++++++++----------
 wani.py                                  |  2 ++
 9 files changed, 56 insertions(+), 45 deletions(-)

diff --git a/luscenje_struktur/component.py b/luscenje_struktur/component.py
index c162278..b5ce25a 100644
--- a/luscenje_struktur/component.py
+++ b/luscenje_struktur/component.py
@@ -38,7 +38,7 @@ class Component:
         self.status = status
         self.name = name
         self.idx = idx
-        self.restrictions = []
+        self.restrictions = [Restriction(None)] if 'restriction' in info else []
         self.next_element = []
         self.representation = []
         self.selection = {}
@@ -104,24 +104,13 @@ class Component:
             if len(cmatch) == 0:
                 continue
 
-            # if more than one match found for particular component
-            elif len(cmatch) > 1:
-                # if more than one match in multiple components, NOPE!
-                if len(to_ret) > 1:
-                    logging.warning("Strange multiple match: {}".format(
-                        str([w.id for w in cmatch[0].values()])))
-
-                    for tr in to_ret:
-                        tr.update(cmatch[0])
-                    continue
-
-                # yeah, so we have found more than one match, =>
-                # more than one element in to_ret
-                to_ret = [{**dict(to_ret[0]), **m} for m in cmatch]
-
-            else:
-                for tr in to_ret:
-                    tr.update(cmatch[0])
+            # build a new to_ret: extend every existing partial match with each match of this component
+            new_to_ret = []
+            for tr in to_ret:
+                # avoid reusing words: skip a match whose words are all already present in this to_ret entry
+                new_to_ret.extend([{**dict(tr), **m} for m in cmatch if any([m_v not in dict(tr).values() for m_v in m.values()])])
+            to_ret = new_to_ret
+            del new_to_ret
 
         return to_ret
 
diff --git a/luscenje_struktur/loader.py b/luscenje_struktur/loader.py
index 77fdd34..8da969b 100644
--- a/luscenje_struktur/loader.py
+++ b/luscenje_struktur/loader.py
@@ -102,6 +102,8 @@ def load_csv(filename, compressed):
         line_split = line_fixed.split("\t")
 
         if line_split[1] == "1" and len(words) > 0:
+            # adding fake word
+            words['0'] = Word('', '', '0', '', False, True)
             sentence_end(bad_sentence)
             bad_sentence = False
             links = []
@@ -114,9 +116,11 @@ def load_csv(filename, compressed):
         full_id = "{}.{}".format(sid, wid)
 
         words[wid] = Word(lemma, msd, full_id, text, True)
-        if link_src != '0':
-            links.append((link_src, wid, link_type))
+        # links whose source is '0' are no longer skipped (was: if link_src != '0')
+        links.append((link_src, wid, link_type))
 
+    # adding fake word
+    words['0'] = Word('', '', '0', '', False, True)
     sentence_end(bad_sentence)
     return result
 
@@ -189,6 +193,8 @@ def file_sentence_generator(et, skip_id_check, do_msd_translate, pc_tag):
     words = {}
     sentences = list(et.iter('s'))
     for sentence in progress(sentences, "load-text"):
+        # create fake root word
+        words[sentence.get('id')] = Word.fake_root_word(sentence.get('id'))
         for w in sentence.iter("w"):
             words[w.get('id')] = Word.from_xml(w, do_msd_translate)
         for pc in sentence.iter(pc_tag):
diff --git a/luscenje_struktur/msd_translate.py b/luscenje_struktur/msd_translate.py
index 24ab90b..6b526e6 100644
--- a/luscenje_struktur/msd_translate.py
+++ b/luscenje_struktur/msd_translate.py
@@ -1911,4 +1911,4 @@ MSD_TRANSLATE = {
     "Ne": "Ne",
     "Nh": "Nh",
     "Na": "Na",
-    "U": "N"}
+    "U": "Z"}
diff --git a/luscenje_struktur/restriction.py b/luscenje_struktur/restriction.py
index afd1da4..af4dd68 100644
--- a/luscenje_struktur/restriction.py
+++ b/luscenje_struktur/restriction.py
@@ -95,17 +95,10 @@ class MorphologyRegex:
             self.re_objects.append([re.compile(r) for r in rgx])
             self.rgxs.append(rgx)
             self.min_msd_lengths.append(min_msd_length)
-
-        # self.re_objects = [re.compile(r) for r in rgx]
-        # self.rgx = rgx
     
     def __call__(self, text):
-        # if len(text) <= self.min_msd_length:
-        #     return False
-        # if len(self.rgxs[0]) > 1 and len(self.rgxs) > 1:
-        #     a = 1
         for i, re_object in enumerate(self.re_objects):
-            if len(text) <= self.min_msd_lengths[i]:
+            if len(text) < self.min_msd_lengths[i]:
                 continue
             match = True
 
diff --git a/luscenje_struktur/syntactic_structure.py b/luscenje_struktur/syntactic_structure.py
index aa6ff7e..49a5a81 100644
--- a/luscenje_struktur/syntactic_structure.py
+++ b/luscenje_struktur/syntactic_structure.py
@@ -10,6 +10,7 @@ class SyntacticStructure:
         self.id = None
         self.lbs = None
         self.components = []
+        self.fake_root_included = False
 
     @staticmethod
     def from_xml(xml, no_stats):
@@ -44,8 +45,14 @@ class SyntacticStructure:
                     raise NotImplementedError("Unknown definition: {} in structure {}"
                                               .format(el.tag, st.id))
 
-        fake_root_component = Component({'cid': '#', 'type': 'other'})
-        st.components = fake_root_component.find_next(deps, comps, restrs, forms)
+        fake_root_component = Component({'cid': '#', 'type': 'other', 'restriction': None})
+        fake_root_component_children = fake_root_component.find_next(deps, comps, restrs, forms)
+        # every dep whose value is 'modra' points to the artificial root - fake_root_component
+        if any([dep[2] == 'modra' for dep in deps]):
+            st.fake_root_included = True
+            st.components = [fake_root_component] + fake_root_component_children
+        else:
+            st.components = fake_root_component_children
 
         if not no_stats:
             st.determine_core2w()
@@ -112,7 +119,8 @@ def build_structures(args):
             continue
 
         structures.append(to_append)
-        max_num_components = max(max_num_components, len(to_append.components))
+        to_append_len = len(to_append.components) if not to_append.fake_root_included else len(to_append.components) - 1
+        max_num_components = max(max_num_components, to_append_len)
     
     lemma_features = get_lemma_features(et)
     return structures, lemma_features, max_num_components
diff --git a/luscenje_struktur/word.py b/luscenje_struktur/word.py
index d168070..bf5c423 100644
--- a/luscenje_struktur/word.py
+++ b/luscenje_struktur/word.py
@@ -32,13 +32,14 @@ class WordDummy:
 
 
 class Word:
-    def __init__(self, lemma, msd, wid, text, do_msd_translate):
+    def __init__(self, lemma, msd, wid, text, do_msd_translate, fake_word=False):
         self.lemma = lemma
         self.msd = MSD_TRANSLATE[msd] if do_msd_translate else msd
         self.id = wid
         self.idi = None
         self.text = text
         self.glue = ''
+        self.fake_word = fake_word
 
         self.links = defaultdict(list)
 
@@ -74,6 +75,11 @@ class Word:
         pc.set('msd', "N" if do_msd_translate else "U")
         return Word.from_xml(pc, do_msd_translate)
 
+    @staticmethod
+    def fake_root_word(sentence_id):
+        wid = sentence_id
+        return Word('', '', wid, '', False, True)
+
     def add_link(self, link, to):
         self.links[link].append(to)
 
diff --git a/luscenje_struktur/word_stats.py b/luscenje_struktur/word_stats.py
index 7ca87a4..64472e5 100644
--- a/luscenje_struktur/word_stats.py
+++ b/luscenje_struktur/word_stats.py
@@ -25,6 +25,8 @@ class WordStats:
 
     def add_words(self, words):
         for w in progress(words, "adding-words"):
+            if w.fake_word:
+                continue
             params = {'lemma': w.lemma, 'msd': w.msd, 'text': w.text}
             res = self.db.execute("""UPDATE UniqWords SET frequency=frequency + 1
                 WHERE lemma=:lemma AND msd=:msd AND text=:text""", params)
diff --git a/luscenje_struktur/writer.py b/luscenje_struktur/writer.py
index c6c14b8..7e9a035 100644
--- a/luscenje_struktur/writer.py
+++ b/luscenje_struktur/writer.py
@@ -16,23 +16,23 @@ class Writer:
     @staticmethod
     def make_output_writer(args, num_components, colocation_ids, word_renderer):
         params = Writer.other_params(args)
-        return Writer(args.out, num_components, OutFormatter(colocation_ids, word_renderer), args.collocation_sentence_map_dest, params)
+        return Writer(args.out, num_components, OutFormatter(colocation_ids, word_renderer), args.collocation_sentence_map_dest, params, args.separator)
 
     @staticmethod
     def make_output_no_stat_writer(args, num_components, colocation_ids, word_renderer):
         params = Writer.other_params(args)
-        return Writer(args.out_no_stat, num_components, OutNoStatFormatter(colocation_ids, word_renderer), args.collocation_sentence_map_dest, params)
+        return Writer(args.out_no_stat, num_components, OutNoStatFormatter(colocation_ids, word_renderer), args.collocation_sentence_map_dest, params, args.separator)
 
     @staticmethod
     def make_all_writer(args, num_components, colocation_ids, word_renderer):
-        return Writer(args.all, num_components, AllFormatter(colocation_ids, word_renderer), args.collocation_sentence_map_dest, None)
+        return Writer(args.all, num_components, AllFormatter(colocation_ids, word_renderer), args.collocation_sentence_map_dest, None, args.separator)
 
     @staticmethod
     def make_stats_writer(args, num_components, colocation_ids, word_renderer):
         params = Writer.other_params(args)
-        return Writer(args.stats, num_components, StatsFormatter(colocation_ids, word_renderer), args.collocation_sentence_map_dest, params)
+        return Writer(args.stats, num_components, StatsFormatter(colocation_ids, word_renderer), args.collocation_sentence_map_dest, params, args.separator)
 
-    def __init__(self, file_out, num_components, formatter, collocation_sentence_map_dest, params):
+    def __init__(self, file_out, num_components, formatter, collocation_sentence_map_dest, params, separator):
         # TODO FIX THIS
         self.collocation_sentence_map_dest = collocation_sentence_map_dest
         if params is None:
@@ -49,6 +49,7 @@ class Writer:
         self.num_components = num_components
         self.output_file = file_out
         self.formatter = formatter
+        self.separator = separator
 
     def header(self):
         repeating_cols = self.formatter.header_repeat()
@@ -78,7 +79,7 @@ class Writer:
         return sorted(rows, key=key, reverse=self.sort_order)
 
     def write_header(self, file_handler):
-        file_handler.write(",".join(self.header()) + "\n")
+        file_handler.write(self.separator.join(self.header()) + "\n")
 
     def write_out_worker(self, file_handler, structure, colocation_ids, col_sent_map):
         rows = []
@@ -99,12 +100,16 @@ class Writer:
             for words in match.matches:
                 to_write = []
 
-                for idx, _comp in enumerate(components):
-                    idx = str(idx + 1)
-                    if idx not in words:
+                idx = 1
+                for _comp in components:
+                    if _comp.idx == '#':
+                        continue
+                    idx_s = str(idx)
+                    idx += 1
+                    if idx_s not in words:
                         to_write.extend([""] * self.formatter.length())
                     else:
-                        to_write.extend(self.formatter.content_repeat(words, match.representations, idx, structure.id))
+                        to_write.extend(self.formatter.content_repeat(words, match.representations, idx_s, structure.id))
 
                 # make them equal size
                 to_write.extend([""] * (self.num_components * self.formatter.length() - len(to_write)))
@@ -121,7 +126,7 @@ class Writer:
 
         if rows != []:
             rows = self.sorted_rows(rows)
-            file_handler.write("\n".join([",".join(row) for row in rows]) + "\n")
+            file_handler.write("\n".join([self.separator.join(row) for row in rows]) + "\n")
             file_handler.flush()
 
     def write_out(self, structures, colocation_ids):
diff --git a/wani.py b/wani.py
index 021f7a2..1ba4b7a 100644
--- a/wani.py
+++ b/wani.py
@@ -151,6 +151,8 @@ if __name__ == '__main__':
 
     parser.add_argument('--pc-tag',
                         help='Tag for separators, usually pc or c', default="pc")
+    parser.add_argument('--separator',
+                        help='Separator in output file', default="\t")
 
     args = parser.parse_args()
     logging.basicConfig(stream=sys.stderr, level=args.verbose.upper())