accommodating for #773

2019-01-19 22:42:51 +01:00
parent 106db9394e
commit cddeb9c4e4
1 changed files with 521 additions and 116 deletions
@@ -3,12 +3,14 @@ import re
 from enum import Enum
 from collections import defaultdict
 import sys
+import logging

 from msd_translate import MSD_TRANSLATE


 STAVKI = sys.argv[1]
-STRUKTURE = sys.argv[2] # "Kolokacije_strukture_09_new-system.xml"
+STRUKTURE = sys.argv[2]
+FILE_OUT = sys.argv[3]

 CODES = {
    "Noun": "N",
@@ -98,41 +100,96 @@ TAGSET = {
 }

 CATEGORY_BASES = {
-    "N": ['.', '.', '.', '.', '.?'],
-    "V": ['.', '.', '.', '.', '.?', '.?', '.?'],
-    "A": ['.', '.', '.', '.', '.', '.?'],
-    "R": ['.', '.?'],
-    "P": ['. ', '.', '.', '.', '.', '.', '.?', '.?'],
-    "M": ['.', '.', '.', '.?', '.?', '.?'],
-    "S": ['.'],
-    "C": ['.'],
+    "N": ['.'] * 5,
+    "V": ['.'] * 7,
+    "A": ['.'] * 6,
+    "R": ['.'] * 2,
+    "P": ['.'] * 6,
+    "M": ['.'] * 6,
+    "S": ['.'] * 1,
+    "C": ['.'] * 1,
    "Q": [],
    "I": [],
    "Y": [],
-    "X": ['.?']
+    "X": ['.'] * 1
 }


 class RestrictionType(Enum):
    Morphology = 0
    Lexis = 1
+    MatchAll = 2


-class ComponentLevel(Enum):
+class Rendition(Enum):
    Lemma = 0
    WordForm = 1
+    Unknown = 2
+
+class ComponentRendition:
+    def __init__(self, rendition=Rendition.Unknown):
+        self.word_form = {}
+        self.rendition = rendition
+
+    def render(self, word):
+        if self.rendition == Rendition.Lemma:
+            return word.lemma
+        elif self.rendition == Rendition.WordForm:
+            return word.text
+        elif self.rendition == Rendition.Unknown:
+            return None
+        else:
+            raise RuntimeError("Unknown rendition: {}".format(self.rendition))
+    
+    def __str__(self):
+        return str(self.rendition)
+
+
+# don't know...
+class StructureSelection(Enum):
+    All = 0
+    Frequency = 1
+
+class ComponentRepresentation:
+    def new(s):
+        if 'rendition' in s:
+            if s['rendition'] == "lemma":
+                return ComponentRendition(Rendition.Lemma)
+            elif s['rendition'] == "word_form":
+                return ComponentRendition(Rendition.WordForm)
+            else:
+                raise NotImplementedError("Rendition: {}".format(s))
+        elif 'selection' in s:
+            if s['selection'] == "frequency":
+                return StructureSelection.Frequency
+            elif s['selection'] == "all":
+                return StructureSelection.All
+            else:
+                return {s['selection']: s['value']}
+        else:
+            raise NotImplementedError("Representation: {}".format(s))
+
+
+class ComponentStatus(Enum):
+    Optional = 0
+    Required = 1
+    Forbidden = 2
+
+    def __str__(self):
+        if self == ComponentStatus.Optional:
+            return "?"
+        elif self == ComponentStatus.Required:
+            return "!"
+        else: #Forbidden
+            return "X"


 def get_level(restriction):
    for feature in restriction:
        if "level" in feature.keys():
            lvl = feature.get("level")
-            if lvl == "lemma":
-                return ComponentLevel.Lemma
-            elif lvl == "word_form":
-                return ComponentLevel.WordForm
-            else:
-                continue
+        else:
+            continue

    raise RuntimeError("Unreachable!")

@@ -140,28 +197,44 @@ def get_level(restriction):
 def build_morphology_regex(restriction):
    restr_dict = {}
    for feature in restriction:
-        restr_dict.update(feature.items())
+        feature_dict = dict(feature.items())
+
+        match_type = True
+        if "filter" in feature_dict:
+            assert(feature_dict['filter'] == "negative")
+            match_type = False
+            del feature_dict['filter']
+
+        assert(len(feature_dict) == 1)
+        key, value = next(iter(feature_dict.items()))
+        restr_dict[key] = (value, match_type)

    assert('POS' in restr_dict)
-    category = restr_dict['POS'].capitalize()
+    category = restr_dict['POS'][0].capitalize()
    cat_code = CODES[category]
    rgx = [cat_code] + CATEGORY_BASES[cat_code]

    del restr_dict['POS']
-    del restr_dict['level']

-    for attribute, value in restr_dict.items():
+    for attribute, (value, typ) in restr_dict.items():
        index = TAGSET[cat_code].index(attribute.lower())
        assert(index >= 0)

        if '|' in value:
-            match = '[' + "".join(CODES[val] for val in value.split('|')) + ']'
+            match = "".join(CODES[val] for val in value.split('|'))
        else:
            match = CODES[value]

+        match = "[{}{}]".format("" if typ else "^", match)
        rgx[index + 1] = match

-    return re.compile("".join(rgx))
+    def matcher(text):
+        for c, r in zip(text, rgx):
+            if not re.match(r, c):
+                return False
+        return True
+
+    return " ".join(rgx), matcher


 def build_lexis_regex(restriction):
@@ -169,18 +242,27 @@ def build_lexis_regex(restriction):
    for feature in restriction:
        restr_dict.update(feature.items())

-    return re.compile(restr_dict['lemma'])
+    assert("lemma" in restr_dict)
+    match_list = restr_dict['lemma'].split('|')
+
+    return match_list, lambda text: text in match_list


 class Restriction:
    def __init__(self, restriction_tag):
+        if restriction_tag is None:
+            self.type = RestrictionType.MatchAll
+            self.matcher = None
+            self.present = None
+            return
+        
        restriction_type = restriction_tag.get('type')
        if restriction_type == "morphology":
            self.type = RestrictionType.Morphology
-            self.matcher = build_morphology_regex(list(restriction_tag))
+            self.present, self.matcher = build_morphology_regex(list(restriction_tag))
        elif restriction_type == "lexis":
            self.type = RestrictionType.Lexis
-            self.matcher = build_lexis_regex(list(restriction_tag))
+            self.present, self.matcher = build_lexis_regex(list(restriction_tag))
        else:
            raise NotImplementedError()

@@ -189,155 +271,380 @@ class Restriction:
            match_to = word.msd
        elif self.type == RestrictionType.Lexis:
            match_to = word.lemma
+        elif self.type == RestrictionType.MatchAll:
+            return True
        else:
            raise RuntimeError("Unreachable!")

-        return self.matcher.match(match_to)
+        return self.matcher(match_to)

    def __str__(self):
-        return "({:s} {})".format(str(self.type).split('.')[1], self.matcher)
+        return "({:s} {})".format(str(self.type).split('.')[1], self.present)

    def __repr__(self):
        return str(self)


 class Component:
-    def __init__(self, name, idx):
-        assert(idx is not None)
+    def __init__(self, info):
+        idx = info['cid']
+        name = info['name'] if 'name' in info else None

-        self.name = name if name is not None else ""  # for printing...
+        if 'status' not in info:
+            status = ComponentStatus.Required
+        elif info['status'] == 'forbidden':
+            status = ComponentStatus.Forbidden
+        elif info['status'] == 'obligatory':
+            status = ComponentStatus.Required
+        elif info['status'] == 'optional':
+            status = ComponentStatus.Optional
+        else:
+            raise NotImplementedError("strange status: {}".format(info['status']))
+
+        self.status = status
+        self.name = name
        self.idx = idx
        self.restriction = None
        self.next_element = []
-        self.level = None
+        self.rendition = ComponentRendition()
+        self.selection = {}

        self.iter_ctr = 0

-    def word_to_str(self, word):
-        if self.level == ComponentLevel.Lemma:
-            return word.lemma, word.msd
-        elif self.level == ComponentLevel.WordForm:
-            return word.text, word.msd
-        else:
-            raise RuntimeError("Unreachable")
-
-    def __iter__(self):
-        self.iter_ctr = 0
-        return self
-
-    def __next__(self):
-        if self.iter_ctr < len(self.next_element):
-            to_ret = self.next_element[self.iter_ctr]
-            self.iter_ctr += 1
-            return to_ret
-        else:
-            raise StopIteration
+    def render_word(self, word):
+        return self.rendition.render(word)

    def add_next(self, next_component, link_label):
        self.next_element.append((next_component, link_label))

    def set_restriction(self, restrictions_tag):
-        if restrictions_tag.tag == "restriction":
+        if restrictions_tag is None:
+            self.restriction = Restriction(None)
+
+        elif restrictions_tag.tag == "restriction":
            self.restriction = Restriction(restrictions_tag)
-            self.level = get_level(restrictions_tag)

        elif restrictions_tag.tag == "restriction_or":
            self.restriction = [Restriction(el) for el in restrictions_tag]
-            self.level = get_level(restrictions_tag[0])
-
-            # same level for every restriction for now and only or available
-            levels = [get_level(el) for el in restrictions_tag]
-            assert(len(set(levels)) == 1)

        else:
            raise RuntimeError("Unreachable")

-    def find_next(self, deps, comps, restrs):
+    def set_representation(self, representation):
+        cr = None
+        if representation is not None:
+            self.representation = []
+
+            for feature in representation:
+                f = ComponentRepresentation.new(dict(feature.attrib))
+
+                if type(f) is StructureSelection:
+                    assert(cr is None)
+                    cr = f
+                elif type(f) is ComponentRendition:
+                    self.rendition = f
+                elif type(f) is dict:
+                    self.selection.update(f)
+                else:
+                    raise RuntimeError("Unreachable: {}".format(f))
+
+        return cr
+
+    def find_next(self, deps, comps, restrs, reprs):
+        representation = StructureSelection.All
+
+        to_ret = []
        for d in deps:
            if d[0] == self.idx:
                _, idx, dep_label = d

-                next_component = Component(comps[idx], idx)
+                next_component = Component(comps[idx])
                next_component.set_restriction(restrs[idx])
+                r1 = next_component.set_representation(reprs[idx])
+                to_ret.append(next_component)

                self.add_next(next_component, dep_label)
-                next_component.find_next(deps, comps, restrs)
+                others, r2 = next_component.find_next(deps, comps, restrs, reprs)
+                to_ret.extend(others)
+
+                if StructureSelection.Frequency in (r1, r2):
+                    representation = StructureSelection.Frequency
+
+        return to_ret, representation
+
+    def name_str(self):
+        return "_" if self.name is None else self.name
+

    def __str__(self):
-        el = "({:10} {})".format(self.name, str(self.restriction))
-        for next, link in self:
-            el += "\n{:10} -- {:10} --> {}".format(self.name, link, str(next))
+        n = self.name_str()
+        return "{:s}) {:7s}:{} [{}] :{}".format(
+                self.idx, n, self.status, self.restriction, self.rendition)
+
+    def tree(self):
+        el = []
+        for next, link in self.next_element:
+            el.append("{:3} -- {:5} --> {:3}".format(self.idx, link, next.idx))
+            el.extend(next.tree())
        return el

    def __repr__(self):
        return str(self)

    def match(self, word):
+        m1 = self._match_self(word)
+        if m1 is None:
+            return None
+
+        mn = self._match_next(word)
+        if mn is None:
+            return None
+        
+        to_ret = [m1]
+        for cmatch in mn:
+            # if good match but nothing to add, just continue
+            if len(cmatch) == 0:
+                continue
+
+            # if more than one match found for particular component
+            elif len(cmatch) > 1:
+                logging.debug("MULTIPLE: {}, {}".format(self.idx, cmatch))
+                # if more than one match in multiple components, NOPE!
+                if len(to_ret) > 1:
+                    logging.warning("Strange multiple match: {}".format(
+                        str([w.id for w in cmatch[0].values()])))
+
+                    for tr in to_ret:
+                        tr.update(cmatch[0])
+                    continue
+
+                # yeah, so we have found more than one match, =>
+                # more than one element in to_ret
+                to_ret = [{**dict(to_ret[0]), **m} for m in cmatch]
+
+            else:
+                for tr in to_ret:
+                    tr.update(cmatch[0])
+
+        logging.debug("MA: {}".format(str(to_ret)))
+        return to_ret
+
+    def _match_self(self, word):
        matched = None

        # matching
        if type(self.restriction) is list:
            for restr in self.restriction:
                matched = restr.match(word)
-                if matched is not None:
+                if matched: # match either
                    break
        else:
            matched = self.restriction.match(word)

-        # recurse to next
-        if matched:
-            to_ret = [self.word_to_str(word)]
+        logging.debug("SELF MATCH({}: {} -> {}".format(self.idx, word.text, matched))

-            for next, link in self:
-                # need to get all links that match
-                for next_word in word.get_links(link):
-                    match = next.match(next_word)
-                    # if matches, return
-                    if match is not None:
-                        to_ret.extend(match)
+        # check with status
+        # if self.status is ComponentStatus.Optional:
+        #     if not matched:
+        #         # nothing to add, but still good...
+        #         return {}
+        # elif self.status is ComponentStatus.Forbidden:
+        #     # forbiddent is handled at return stage in _match_next
+        #     # just process normally...
+        #     pass
+
+        # recurse to next
+        if not matched:
+            return None
+        else:
+            return {self.idx: word}
+
+    def _match_next(self, word):
+        # matches for every component in links from this component
+        to_ret = []
+
+        # need to get all links that match
+        for next, link in self.next_element:
+            logging.debug("FIND LINKS FOR: {} -> {}".format(self.idx, next.idx))
+            to_ret.append([])
+
+            # good flag
+            good = next.status != ComponentStatus.Required
+            for next_word in word.get_links(link):
+                logging.debug("link: {}: {} -> {}".format(link, word.id, next_word.id))
+                match = next.match(next_word)
+
+                if match is not None:
+                    # special treatement for forbidden
+                    if next.status == ComponentStatus.Forbidden:
+                        good = False
                        break

-                # if none matched, nothing found!
-                else:
-                    return None
+                    else:
+                        assert(type(match) is list)
+                        to_ret[-1].extend(match)
+                        good = True

-            return to_ret
+            # if none matched, nothing found!
+            if not good:
+                logging.debug("BAD")
+                return None

-        # return None...
+        return to_ret


 class SyntacticStructure:
    def __init__(self):
-        self.root_component = Component("", 'root')
        self.id = None
        self.lbs = None
+        self.agreements = []
+        self.components = []
+        self.selection = StructureSelection.All

    @staticmethod
    def from_xml(xml):
        st = SyntacticStructure()
        st.id = xml.get('id')
        st.lbs = xml.get('LBS')
+        
+        if float(st.id.replace('-','.')) >= 17:
+            return None

-        components, system = list(xml)
-        dependencies, restrictions = list(system)
+        assert(len(list(xml)) == 1)
+        system = next(iter(xml))

        assert(system.get('type') == 'JOS')
+        components, dependencies, definitions = list(system)

        deps = [ (dep.get('from'), dep.get('to'), dep.get('label')) for dep in dependencies ]
-        comps = { comp.get('cid'): comp.get('name') for comp in components }
-        restrs = { r.get('cid'): next(iter(r)) for r in restrictions }
+        comps = { comp.get('cid'): dict(comp.items()) for comp in components }

-        st.root_component.find_next(deps, comps, restrs)
-        st.root_component = list(st.root_component)[0][0]  # get first next
+        restrs, forms = {}, {}

+        for comp in definitions:
+            n = comp.get('cid')
+            restrs[n] = None
+            forms[n] = None
+
+            for el in comp:
+                if el.tag.startswith("restriction"):
+                    assert(restrs[n] is None)
+                    restrs[n] = el
+                elif el.tag.startswith("representation"):
+                    st.add_representation(n, el, forms)
+                else:
+                    raise NotImplementedError("definition??")
+
+        fake_root_component = Component({'cid': '#', 'type': 'other'})
+        st.components, st.selection = fake_root_component.find_next(deps, comps, restrs, forms)
        return st

+    def add_representation(self, n, el, forms):
+        if el.tag == "representation":
+            els = [el]
+        elif el.tag == "representation_and":
+            els = list(el)
+        else:
+            raise NotImplementedError("repr what?: {}".format(el.tag))
+        
+        for el in els:
+            if el.get('basic') == 'form':
+                assert(forms[n] is None)
+                forms[n] = el
+            elif el.get('basic') == "agreement":
+                self.add_agreement(n, el)
+            else:
+                raise NotImplementedError("representation?: {}".format(el.tag))
+
+    def add_agreement(self, n, el):
+        assert(el.get('head')[:4] == 'cid_')
+
+        n1 = n
+        n2 = el.get('head')[4:]
+        agreement_str = next(iter(el)).get('agreement')
+
+        self.agreements.append({
+            'n1': n1,
+            'n2': n2,
+            'match': agreement_str.split('|')})
+
    def __str__(self):
-        arrow = "root       -- modra      --> "
-        return "{} LBS {}\n------\n{}{}".format(self.id, self.lbs, arrow, str(self.root_component))
+        comp_str = "\n".join(str(comp) for comp in self.components)
+
+        agrs = "\n".join("({} -[{}]- {}) ".format(
+            a['n1'], "|".join(a['match']), a['n2']) for a in self.agreements)
+
+        links_str = "\n".join(self.components[0].tree())
+
+        return "{} LBS {}\nCOMPONENTS\n{}\nAGREEMENTS\n{}\nLINKS\n{}\n{}".format(
+                self.id, self.lbs, comp_str, agrs, links_str, "-" * 40)
+
+    def get_component(self, idx):
+        for c in self.components:
+            if c.idx == idx:
+                return c
+        raise RuntimeError("Unknown component id: {}".format(idx))
+
+    def check_agreements(self, match):
+        for agr in self.agreements:
+            w1 = match[agr['n1']]
+            w2 = match[agr['n2']]
+
+            for agr_case in agr['match']:
+                t1 = w1.msd[0]
+                v1 = TAGSET[t1].index(agr_case)
+                assert(v1 >= 0)
+                # if none specified: nedolocnik, always agrees
+                if v1 + 1 >= len(w1.msd): 
+                    continue 
+                # first is uppercase, not in TAGSET
+                m1 = w1.msd[v1 + 1]
+
+                # REPEAT (not DRY!)
+                t2 = w2.msd[0]
+                v2 = TAGSET[t2].index(agr_case)
+                assert(v2 >= 0)
+                if v2 + 1 >= len(w2.msd): 
+                    continue 
+                m2 = w2.msd[v2 + 1]
+
+                # match!
+                if '-' not in [m1, m2] and m1 != m2:
+                    return False
+
+        return True
+
+    def check_form(self, match):
+        for midx, w in match.items():
+            c = self.get_component(midx)
+            for key, value in c.selection.items():
+                t = w.msd[0]
+                v = TAGSET[t].index(key.lower())
+                f1 = w.msd[v + 1]
+                f2 = CODES[value]
+                
+                if '-' not in [f1, f2] and f1 != f2:
+                    return False
+
+        return True

    def match(self, word):
-        return self.root_component.match(word)
+        matches = self.components[0].match(word)
+        if matches is None:
+            return []
+
+        to_ret = []
+        for m in matches:
+            if not self.check_agreements(m):
+                bad = "Agreement"
+            elif not self.check_form(m):
+                bad = "Form"
+            else:
+                bad = "OK"
+
+            to_ret.append((m, bad))
+
+        return to_ret


 def build_structures(filename):
@@ -345,14 +652,27 @@ def build_structures(filename):
    with open(filename, 'r') as fp:
        et = ElementTree.XML(fp.read())
        for structure in et.iter('syntactic_structure'):
-            structures.append(SyntacticStructure.from_xml(structure))
+            to_append = SyntacticStructure.from_xml(structure)
+            if to_append is None:
+                continue
+            structures.append(to_append)
    return structures


+def get_msd(comp):
+    d = dict(comp.items())
+    if 'msd' in d:
+        return d['msd']
+    elif 'ana' in d:
+        return d['ana'][4:]
+    else:
+        logging.error(d, file=sys.stderr)
+        raise NotImplementedError("MSD?")
+
 class Word:
    def __init__(self, xml):
        self.lemma = xml.get('lemma')
-        self.msd = MSD_TRANSLATE[xml.get('msd')]
+        self.msd = MSD_TRANSLATE[get_msd(xml)]
        self.id = xml.get('id')
        self.text = xml.text
        self.links = defaultdict(list)
@@ -370,6 +690,10 @@ class Word:
        return self.links[link]


+def is_root_id(id_):
+    return len(id_.split('.')) == 3
+
+
 def load_corpus(filename):
    with open(filename, 'r') as fp:
        xmlstring = re.sub(' xmlns="[^"]+"', '', fp.read(), count=1)
@@ -381,20 +705,38 @@ def load_corpus(filename):
    for w in et.iter("w"):
        words[w.get('id')] = Word(w)

+    pcs = set()
+    for pc in et.iter("pc"):
+        pcs.add(pc.get('id'))
+
    for l in et.iter("link"):
-        assert('dep' in l.keys() and 'from' in l.keys() and 'afun' in l.keys())
+        if 'dep' in l.keys():
+            ana = l.get('afun')
+            lfrom = l.get('from')
+            dest = l.get('dep')
+        else:
+            ana = l.get('ana')
+            if ana[:4] != 'syn:': # dont bother...
+                continue
+            ana = ana[4:]
+            lfrom, dest = l.get('target').replace('#', '').split()

-        lfrom = l.get('from')
        if lfrom in words:
-            assert(not lfrom.endswith('.0'))
-            next_word_id = l.get('dep')
-            if next_word_id in words:
-                next_word = words[next_word_id]
-                words[l.get('from')].add_link(l.get('afun'), next_word)
+            if is_root_id(lfrom):
+                logging.error("NOO: ", lfrom, file=sys.stderr)
+                sys.exit(1)

-        # catch modra links from root
-        elif lfrom[-1] == '0' and l.get('afun') == 'modra':
-            root_words.add(l.get('dep'))
+            if dest in words:
+                next_word = words[dest]
+                words[lfrom].add_link(ana, next_word)
+
+        # catch links from root
+        elif is_root_id(lfrom):
+            root_words.add(dest)
+
+        # catch links from <pc> :S
+        elif lfrom in pcs:
+            logging.warning(str(("link from <pc>: ", lfrom)))

        else:
            # strange errors, just skip...
@@ -408,8 +750,6 @@ def load_corpus(filename):


 def main():
-    words = load_corpus(STAVKI)
-
    import time
    t = time.time()

@@ -417,21 +757,86 @@ def main():
    for s in structures:
        print(s)

-    num_matches = 0
-    for w in words:
-        for s in structures:
-            m = s.match(w)
-            if m is not None:
-                num_matches += 1
-                print(s.id, m)
+    # words = load_corpus(STAVKI)
+    import pickle
+    # with open("words.p", "wb") as fp:
+    #     pickle.dump(words, fp)
+    with open("words.p", "rb") as fp:
+        words = pickle.load(fp)

+    print("MATCHES...")
+    matches = {s.id: [] for s in structures}
+
+    for idx, s in enumerate(structures):
+        print("\r{}/{}: {:7s}".format(idx, len(structures), s.id)) #, end="")
+        for w in words:
+            mhere = s.match(w)
+            logging.debug("  GOT: {}".format(len(mhere)))
+            for match, reason in mhere: 
+                matches[s.id].append((match, reason))
+    print("")
+
+    header = [
+            "Structure_ID", "Component_ID", "Token_ID", "Word_form", 
+            "Lemma", "Msd", "Representative_form_1", "Component_ID", 
+            "Token_ID", "Word_form", "Lemma", "Msd", "Representative_form_2", 
+            "Collocation_ID", "Joint_representative_form"]
+    csv = [", ".join(header)]
+
+    colocation_ids = {}
+
+    for s in structures:
+        ms = matches[s.id]
+
+        for m, reason in ms:
+            colocation_id = [s.id]
+            to_print = [s.id]
+
+            m_sorted = defaultdict(lambda: None, m.items())
+            for idx, comp in enumerate(s.components):
+                idx = str(idx + 1)
+                if idx not in m_sorted:
+                    to_print.extend([idx, "", "", "", "", ""])
+                else:
+                    w = m_sorted[idx]
+                    # if comp.render_word(m_sorted[idx]) is not None:
+                    if True:
+                        to_print.extend([idx, w.id, w.text, w.lemma, w.msd, ""])
+                        colocation_id.append(w.lemma)
+
+            colocation_id = tuple(colocation_id)
+            if colocation_id in colocation_ids:
+                cid = colocation_ids[colocation_id]
+            else:
+                cid = len(colocation_ids)
+                colocation_ids[colocation_id] = cid
+                
+            to_print.extend([str(cid), ""])
+            csv.append(", ".join(to_print))
+
+
+    with open(FILE_OUT, "w") as fp:
+        print("\n".join(csv), file=fp)
+
+        # groups = defaultdict(int)
+        # for m, reason in ms:
+        #     if reason != "OK":
+        #         continue
+        #     lemmas = [(n, w.lemma) for n, w in m.items()]
+        #     lemmas = tuple(sorted(lemmas, key=lambda x: x[0]))
+        #     groups[lemmas] += 1
+
+        # print(s.id)
+        # print(groups)
+
+
+    print("")
    print("TIME", time.time() - t)
-    print(num_matches)
-
+    print([(k, len(v)) for k, v in matches.items()])
+    print(sum(len(v) for _, v in matches.items()))

 if __name__ == '__main__':
+    logging.basicConfig(level=logging.INFO)
    main()

-
-
-
+# 6, 7: examples of false matches?