Compare commits
No commits in common. "master" and "sql-join-test" have entirely different histories.
master
...
sql-join-t
5
.gitignore
vendored
5
.gitignore
vendored
|
@ -1,5 +1,4 @@
|
||||||
*.xml
|
*.xml
|
||||||
!collocation-structures.xml
|
|
||||||
*.tbl
|
*.tbl
|
||||||
*.csv
|
*.csv
|
||||||
*.pdf
|
*.pdf
|
||||||
|
@ -8,9 +7,5 @@
|
||||||
.vscode
|
.vscode
|
||||||
__pycache__
|
__pycache__
|
||||||
|
|
||||||
run.sh
|
|
||||||
prev
|
prev
|
||||||
old
|
old
|
||||||
data
|
|
||||||
venv
|
|
||||||
issue992/output
|
|
||||||
|
|
75
README.md
75
README.md
|
@ -9,78 +9,3 @@ Potrebne datoteke:
|
||||||
Priporocam: pypy3 paket za hitrejse poganjanje.
|
Priporocam: pypy3 paket za hitrejse poganjanje.
|
||||||
|
|
||||||
Primer uporabe: `python3 wani.py ssj500k.xml Kolokacije_strukture.xml izhod.csv`
|
Primer uporabe: `python3 wani.py ssj500k.xml Kolokacije_strukture.xml izhod.csv`
|
||||||
|
|
||||||
# About
|
|
||||||
|
|
||||||
This script was developed to extract collocations from text in TEI format. Collocations are extracted and presented based on rules provided in structure file (example in `collocation-structures.xml`).
|
|
||||||
|
|
||||||
# Setup
|
|
||||||
|
|
||||||
Script may be run via python3 or pypy3. We suggest usage of virtual environments.
|
|
||||||
|
|
||||||
```bash
|
|
||||||
pip install -r requirements.txt
|
|
||||||
```
|
|
||||||
|
|
||||||
|
|
||||||
# Running
|
|
||||||
|
|
||||||
```bash
|
|
||||||
python3 wani.py <LOCATION TO STRUCTURES> <EXTRACTION TEXT> --out <RESULTS FILE>
|
|
||||||
```
|
|
||||||
|
|
||||||
## Most important optional parameters
|
|
||||||
|
|
||||||
### --sloleks_db
|
|
||||||
This parameter may be used if you have access to sloleks_db. It is useful when `lemma_fallback` would otherwise be shown in the results file: with sloleks_db available, the script looks into this database to find the correct replacement.
|
|
||||||
|
|
||||||
To use this sqlalchemy has to be installed as well.
|
|
||||||
|
|
||||||
This parameter has to include information about database in following order:
|
|
||||||
|
|
||||||
<DB_USERNAME>:<DB_PASSWORD>:<DB_NAME>:<DB_URL>
|
|
||||||
|
|
||||||
### --collocation_sentence_map_dest
|
|
||||||
If value for this parameter exists (it should be string path to directory), files will be generated that include links between collocation ids and sentence ids.
|
|
||||||
|
|
||||||
### --db
|
|
||||||
This is path to file which will contain sqlite database with internal states. Used to save internal states in case code gets modified.
|
|
||||||
|
|
||||||
We suggest to put this sqlite file in RAM for faster execution. To do this follow these instructions:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
sudo mkdir /mnt/tmp
|
|
||||||
sudo mount -t tmpfs tmpfs /mnt/tmp
|
|
||||||
```
|
|
||||||
|
|
||||||
If running on big corpora (e.g. Gigafida), keep the database in RAM:
|
|
||||||
```bash
|
|
||||||
sudo mkdir /mnt/tmp
|
|
||||||
sudo mount -t tmpfs tmpfs /mnt/tmp
|
|
||||||
sudo mount -o remount,size=110G,noexec,nosuid,nodev,noatime /mnt/tmp
|
|
||||||
```
|
|
||||||
|
|
||||||
Pass path to specific file when running `wani.py`. For example:
|
|
||||||
```bash
|
|
||||||
python3 wani.py ... --db /mnt/tmp/mysql-wani-ssj500k ...
|
|
||||||
```
|
|
||||||
|
|
||||||
### --multiple-output
|
|
||||||
Used when we want multiple output files (one file per structure_id).
|
|
||||||
|
|
||||||
|
|
||||||
## Instructions for running on big files (ie. Gigafida)
|
|
||||||
|
|
||||||
Suggested running with saved mysql file in tmpfs. Instructions:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
sudo mkdir /mnt/tmp
|
|
||||||
sudo mount -t tmpfs tmpfs /mnt/tmp
|
|
||||||
```
|
|
||||||
|
|
||||||
If running on big corpora (e.g. Gigafida), keep the database in RAM:
|
|
||||||
```bash
|
|
||||||
sudo mkdir /mnt/tmp
|
|
||||||
sudo mount -t tmpfs tmpfs /mnt/tmp
|
|
||||||
sudo mount -o remount,size=110G,noexec,nosuid,nodev,noatime /mnt/tmp
|
|
||||||
```
|
|
File diff suppressed because it is too large
Load Diff
|
@ -1,49 +0,0 @@
|
||||||
import argparse
|
|
||||||
import os
|
|
||||||
import sys
|
|
||||||
import tqdm
|
|
||||||
import logging
|
|
||||||
|
|
||||||
# Lemmas whose rows should be split out into per-lemma files.
good_lemmas = ["absurd", "absurdnost", "akuten", "akutno", "alkohol", "alkoholen", "aluminijast", "ananas", "aplikacija", "aplikativen", "aranžma", "arbiter", "armada", "avtomatičen", "avtomatiziran", "babica", "bajen", "bajka", "bakren", "bambusov", "barvan", "barvanje", "baseballski", "bazar", "bazičen", "belina", "bezgov", "bičati", "bife", "bilka", "biomasa", "biotop", "birma", "bivol", "blago", "blaženost", "bliskavica", "bobnič", "bolha", "bolnišnica", "bor", "borov", "borovničev", "brati", "briljant", "briti", "brusiti", "bučanje", "cikličen", "civilizacija", "dopust", "drama", "drezati", "duda", "dvorezen", "embalaža", "faks", "farsa", "glasno", "informiranje", "interier", "intima", "intimno", "investirati", "ironično", "istovetiti", "izvožen", "jagoda", "jeklar", "jezik", "karbon", "kitara", "kodrast", "molče", "mučiti", "novinarski", "obala", "občevati", "okrasiti", "pajčevina", "panoga", "prevajanje", "prevajati", "previti", "prihraniti", "priloga", "prisluškovati", "sopara"]


def main(args):
    """Split rows of CSV chunk files into one output file per good lemma.

    Reads every file in ``args.input`` (chunk files named ``<name>.<index>``,
    processed in numeric index order), finds the columns whose header cell
    contains ``"_Lemma"``, and appends each data row whose lemma column
    exactly matches a lemma from ``good_lemmas`` to ``output/<lemma>``.

    The ``output`` directory must already exist in the current working
    directory.
    """
    filepaths = [os.path.join(args.input, fn) for fn in os.listdir(args.input)]
    # Chunk files carry a numeric suffix; process them in numeric order.
    filepaths = sorted(filepaths, key=lambda x: int(x.split('.')[-1]))
    n_lemmas = len(good_lemmas)
    n_files = len(filepaths) - 1

    files_to_write = [open("output/{}".format(lemma), 'w') for lemma in good_lemmas]
    try:
        for fidx, filename in enumerate(filepaths):
            with open(filename, 'r') as fp:
                logging.info("loading next...")
                header = fp.readline()
                # Columns whose header cell contains "_Lemma" hold lemma values.
                lemma_rows = [idx for idx, cell in enumerate(header.split(",")) if "_Lemma" in cell]
                file_lines = fp.read().split("\n")

                for lidx, good_lemma in enumerate(good_lemmas):
                    # Plain progress message; the original "\r" + padding-space
                    # trick does nothing useful through the logging module.
                    logging.info("{}.{} / {}.{}".format(fidx, lidx, n_files, n_lemmas))

                    for line in file_lines:
                        # Cheap substring pre-filter before the exact column check.
                        if good_lemma not in line:
                            continue

                        line_split = line.split(',')
                        for lemma_idx in lemma_rows:
                            if line_split[lemma_idx] == good_lemma:
                                print(line, file=files_to_write[lidx])
                                break
    finally:
        # Close the per-lemma outputs even if a chunk fails mid-way.
        for out_fp in files_to_write:
            out_fp.close()


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Extract structures from a parsed corpus.')
    parser.add_argument('input',
                        help='Path to folder with files')
    args = parser.parse_args()
    main(args)
|
|
||||||
|
|
|
@ -1,248 +0,0 @@
|
||||||
# Shared tagset constants used when translating between JOS/MULTEXT-East
# msd codes, spelled-out feature names, and UD categories.

# Feature values that may appear in word-form restrictions.
POSSIBLE_WORD_FORM_FEATURE_VALUES = {
    "singular",
    "dual",
    "plural",
    "nominative",
    "genitive",
    "dative",
    "accusative",
    "locative",
    "instrumental",
    "infinitive",
    "supine",
    "participle",
    "present",
    "future",
    "conditional",
    "imperative",
    'masculine',
    'feminine',
    'neuter',
}

# Per category letter (N=noun, V=verb, A=adjective): maps a numeric slot key
# and the character found there to the feature value it encodes; '-' maps to
# the default value.  NOTE(review): the numeric keys presumably index into
# the msd string -- confirm against the code that consumes this table.
CODES_TRANSLATION = {
    "N": {
        2: {
            '-': 'masculine',
            'm': 'masculine',
            'f': 'feminine',
            'n': 'neuter',
        },
        3: {
            "-": "singular",
            "s": "singular",
            "d": "dual",
            "p": "plural",
        },
        4: {
            "-": "nominative",
            "n": "nominative",
            "g": "genitive",
            "d": "dative",
            "a": "accusative",
            "l": "locative",
            "i": "instrumental",
        },
    },
    "V": {
        1: {
            "-": "main",
            "m": "main",
            "a": "auxiliary",
        },
        3: {
            "-": "infinitive",
            "n": "infinitive",
            "u": "supine",
            "p": "participle",
            "r": "present",
            "f": "future",
            "c": "conditional",
            "m": "imperative",
        },
        4: {
            "-": "first",
            "1": "first",
            "2": "second",
            "3": "third",
        },
        5: {
            "-": "singular",
            "s": "singular",
            "d": "dual",
            "p": "plural",
        },
        6: {
            '-': 'masculine',
            'm': 'masculine',
            'f': 'feminine',
            'n': 'neuter',
        },
        8: {
            "-": "no",
            "n": "no",
            "y": "yes",
        },
    },
    "A": {
        1: {
            "-": "general",
            "g": "general",
            "s": "possessive",
        },
        2: {
            "-": "positive",
            "p": "positive",
            "c": "comparative",
            "s": "superlative",
        },
        3: {
            '-': 'masculine',
            'm': 'masculine',
            'f': 'feminine',
            'n': 'neuter',
        },
        4: {
            "-": "singular",
            "s": "singular",
            "d": "dual",
            "p": "plural",
        },
        5: {
            "-": "nominative",
            "n": "nominative",
            "g": "genitive",
            "d": "dative",
            "a": "accusative",
            "l": "locative",
            "i": "instrumental",
        },
    }
}

# Universal Dependencies POS tags.
CODES_UD = {
    "ADJ",
    "ADP",
    "PUNCT",
    "ADV",
    "AUX",
    "SYM",
    "INTJ",
    "CCONJ",
    "X",
    "NOUN",
    "DET",
    "PROPN",
    "NUM",
    "VERB",
    "PART",
    "PRON",
    "SCONJ"
}

# Spelled-out category / feature-value names -> single-character msd codes.
# Category names map to upper-case letters; feature values to lower-case
# letters or digits.
CODES = {
    "Noun": "N",
    "Verb": "V",
    "Adjective": "A",
    "Adverb": "R",
    "Pronoun": "P",
    "Numeral": "M",
    "Preposition": "S",
    "Conjunction": "C",
    "Particle": "Q",
    "Interjection": "I",
    "Abbreviation": "Y",
    "Residual": "X",
    "Punctuation": "Z",

    'common': 'c',
    'proper': 'p',
    'masculine': 'm',
    'feminine': 'f',
    'neuter': 'n',
    "singular": "s",
    "dual": "d",
    "plural": "p",
    "nominative": "n",
    "genitive": "g",
    "dative": "d",
    "accusative": "a",
    "locative": "l",
    "instrumental": "i",
    "no": "n",
    "yes": "y",
    "main": "m",
    "auxiliary": "a",
    "perfective": "e",
    "progressive": "p",
    "biaspectual": "b",
    "infinitive": "n",
    "supine": "u",
    "participle": "p",
    "present": "r",
    "future": "f",
    "conditional": "c",
    "imperative": "m",
    "first": "1",
    "second": "2",
    "third": "3",
    "general": "g",
    "possessive": "s",
    "positive": "p",
    "comparative": "c",
    "superlative": "s",
    "personal": "p",
    "demonstrative": "d",
    "relative": "r",
    "reflexive": "x",
    "interrogative": "q",
    "indefinite": "i",
    "negative": "z",
    "bound": "b",
    "digit": "d",
    "roman": "r",
    "letter": "l",
    "cardinal": "c",
    "ordinal": "o",
    "pronominal": "p",
    "special": "s",
    "coordinating": "c",
    "subordinating": "s",
    "foreign": "f",
    "typo": "t",
    "program": "p",
    "web": "w",
    "emo": "e",
    "hashtag": "h",
    "at": "a"
}

# Per category letter: the ordered feature slots that follow the category
# character in an msd string.
TAGSET = {
    "N": ['type', 'gender', 'number', 'case', 'animate'],
    "V": ['type', 'aspect', 'vform', 'person', 'number', 'gender', 'negative'],
    "A": ['type', 'degree', 'gender', 'number', 'case', 'definiteness'],
    "R": ['type', 'degree'],
    "P": ['type', 'person', 'gender', 'number', 'case', 'owner_number', 'owned_gender', 'clitic'],
    "M": ['form', 'type', 'gender', 'number', 'case', 'definiteness'],
    "S": ['case'],
    "C": ['type'],
    "Q": [],
    "I": [],
    "Y": [],
    "X": ['type']
}

# Dependency relations considered by the PPB logic.
# NOTE(review): exact meaning of "PPB" is not visible in this file --
# confirm against the extraction rules that consume this list.
PPB_DEPRELS = [
    "advmod",
    "amod",
    "compound",
    "conj",
    "fixed",
    "flat",
    "iobj",
    "nmod",
    "nsubj",
    "nummod",
    "obj",
    "obl"
]
|
|
|
@ -1,11 +0,0 @@
|
||||||
|
|
||||||
class CollocationSentenceMapper:
    """Writes a TSV file mapping collocation ids to sentence ids."""

    def __init__(self, output_dir):
        """Open the output file and write the TSV header row.

        ``output_dir`` is actually a file path, not a directory -- the name
        is kept for backward compatibility with existing callers.
        """
        self.output = open(output_dir, "w")
        # Plain string: the original used an f-string with no placeholders.
        self.output.write('Collocation_id\tSentence_id\n')

    def close(self):
        """Flush and close the underlying file."""
        self.output.close()

    def add_map(self, collocation_id, sentence_id):
        """Append one collocation-id / sentence-id pair as a TSV row."""
        self.output.write(f'{collocation_id}\t{sentence_id}\n')
|
|
|
@ -1,421 +0,0 @@
|
||||||
import os
|
|
||||||
from xml.etree import ElementTree
|
|
||||||
import logging
|
|
||||||
import re
|
|
||||||
import sys
|
|
||||||
import gzip
|
|
||||||
import pathlib
|
|
||||||
from io import StringIO
|
|
||||||
|
|
||||||
from luscenje_struktur.progress_bar import progress
|
|
||||||
from luscenje_struktur.word import Word
|
|
||||||
|
|
||||||
|
|
||||||
def is_root_id(id_):
    """Return True iff the id has exactly three dot-separated components."""
    parts = id_.split('.')
    return len(parts) == 3
|
|
||||||
|
|
||||||
|
|
||||||
def load_files(args, database, w_collection=None, input_corpus=None):
    """Yield loaded content for every input file not yet processed.

    Each file is parsed according to its extension: ``.xml`` (TEI),
    ``.gz`` (gzipped CSV), ``.conllu``, anything else as plain CSV.
    In the default mode one item per file is yielded (the file's parsed
    content); when ``input_corpus`` is given, items are per-sentence
    ``(sent_id, sentence, attrs)`` tuples from the valency loaders.
    Processed filenames are recorded in the ``Files`` table of ``database``
    so a rerun skips them.
    """
    filenames = input_corpus if input_corpus is not None else args.input
    skip_id_check = args.skip_id_check
    do_msd_translate = not args.no_msd_translate

    # A single directory argument means "all non-.zstd files inside it".
    if len(filenames) == 1 and os.path.isdir(filenames[0]):
        filenames = [os.path.join(filenames[0], file) for file in os.listdir(filenames[0]) if file[-5:] != '.zstd']

    if len(filenames) > 1:
        filenames = [filename for filename in filenames if filename[-5:] != '.zstd']
        # Chunk files carry a numeric suffix; process them in numeric order.
        filenames = sorted(filenames, key=lambda x: int(x.split('.')[-1]))

    # NOTE(review): called unconditionally on every run -- presumably init()
    # tolerates an already-existing table; confirm in the database wrapper.
    database.init("CREATE TABLE Files ( filename varchar(2048) )")

    for idx, fname in enumerate(filenames):
        logging.info("FILE " + fname + "{}/{}".format(idx, len(filenames)))
        extension = pathlib.Path(fname).suffix

        # check if file with the same name already loaded...
        loaded = database.execute("SELECT * FROM Files WHERE filename=?", (fname,)).fetchone()
        if loaded is not None:
            logging.info("ALREADY LOADED")
            continue

        if extension == ".xml":
            et = load_xml(fname)
            if input_corpus is None:
                yield file_sentence_generator(et, args)
            else:
                sentence_generator = file_sentence_generator_valency(et, skip_id_check, do_msd_translate, args.pc_tag, w_collection)
                for sent_id, sentence, othr_attributes in sentence_generator:
                    yield sent_id, sentence, othr_attributes
        elif extension == ".gz":
            if input_corpus is None:
                yield load_csv(fname, True)
            else:
                sentences = load_csv_valency(fname, True, w_collection)
                for sentence in sentences:
                    yield sentence
        elif extension == ".conllu":
            if input_corpus is None:
                yield load_conllu(fname)
            else:
                raise Exception('conllu with input_corpus is not supported!')
        else:
            if input_corpus is None:
                yield load_csv(fname, False)
            else:
                sentences = load_csv_valency(fname, False, w_collection)
                for sentence in sentences:
                    yield sentence

        # Only mark the file done after its content has been consumed.
        database.execute("INSERT INTO Files (filename) VALUES (?)", (fname,))
        database.commit()
|
|
||||||
|
|
||||||
|
|
||||||
def lines_gz(filename):
    """Yield UTF-8 decoded lines from a gzip-compressed file, with progress."""
    with gzip.open(filename, 'r') as handle:
        for raw_line in progress(handle, 'load-gz'):
            yield raw_line.decode('utf8')
|
|
||||||
|
|
||||||
|
|
||||||
def lines_csv(filename):
    """Yield lines from a plain-text CSV file, with progress reporting."""
    with open(filename, 'r') as handle:
        yield from progress(handle, 'load-csv')
|
|
||||||
|
|
||||||
|
|
||||||
def load_conllu(filename):
    """Parse a CoNLL-U file into a flat list of linked Word objects.

    For every sentence a fake root word (id ``'0'``) is added, each token
    becomes a ``Word``, and head/deprel links are attached between them.
    Sentences that fail to parse are logged and skipped.
    """
    # Local import keeps `conllu` an optional dependency: only needed
    # when a .conllu file is actually loaded.
    import conllu
    result = []
    bad_sentence = False

    words = {}
    links = []

    def sentence_end(bad_sentence, sent_id):
        # Resolve collected links into Word objects and flush the sentence.
        if bad_sentence:
            return

        for lfrom, ldest, ana in links:
            if lfrom not in words or ldest not in words:
                logging.warning("Bad link in sentence: " + sent_id)
                continue
            words[lfrom].add_link(ana, words[ldest])
        result.extend(words.values())

    with open(filename, 'r') as f:
        data = f.read()

        conlls = conllu.parse_incr(StringIO(data))
        # build dep parse
        for sent in conlls:
            try:
                # adding fake word
                words['0'] = Word('', '', '0', '', False, True)
                for word in sent:
                    # Tuple ids are multiword-token ranges; skip them.
                    if type(word['id']) == tuple:
                        continue
                    full_id = "{}.{}".format(sent.metadata['sent_id'], str(word['id']))
                    words[str(word['id'])] = Word(word['lemma'], word['upos'], full_id, word['form'], False)
                    links.append((str(word['head']), str(word['id']), word['deprel']))
                sentence_end(False, sent.metadata['sent_id'])
                links = []
                words = {}
            # NOTE(review): bare except silently swallows everything
            # (including KeyboardInterrupt); deliberate best-effort kept as-is.
            except:
                links = []
                words = {}
                logging.error(f"Error while reading file (unknown) in sentence {sent.metadata['sent_id']}. Check if required data is available!")

    return result
|
|
||||||
|
|
||||||
|
|
||||||
def load_csv(filename, compressed):
    """Parse a (possibly gzipped) tab/comma CSV corpus file into Word objects.

    Rows are ``sid, wid, text, msd, lemma, link_src, link_type``.  A row with
    ``wid == "1"`` starts a new sentence; a fake root word (id ``'0'``) is
    added before each sentence is flushed.
    """
    result = []
    bad_sentence = False

    words = {}
    links = []

    def sentence_end(bad_sentence):
        # Resolve collected links and flush the current sentence's words.
        if bad_sentence:
            return

        for lfrom, ldest, ana in links:
            if lfrom not in words or ldest not in words:
                logging.warning("Bad link in sentence: " + line_split[0])
                continue
            words[lfrom].add_link(ana, words[ldest])
        result.extend(words.values())

    line_gen = lines_gz if compressed else lines_csv
    for line in line_gen(filename):
        line_str = line.strip()
        # Normalise separators: commas become tabs, and a literal comma token
        # (now an empty field between tabs) is restored as ",".
        line_fixed = line_str.replace(',', '\t').replace('\t\t\t', '\t,\t')
        line_split = line_fixed.split("\t")

        if line_split[1] == "1" and len(words) > 0:
            # adding fake word
            words['0'] = Word('', '', '0', '', False, True)
            sentence_end(bad_sentence)
            bad_sentence = False
            links = []
            words = {}

        try:
            sid, wid, text, msd, lemma, link_src, link_type = line_split
        except ValueError:
            bad_sentence = True
        # NOTE(review): on a malformed row execution falls through here and
        # reuses sid/wid/... from the previous row (NameError if the very
        # first row is malformed) -- flagging, not fixing, in this pass.
        full_id = "{}.{}".format(sid, wid)

        words[wid] = Word(lemma, msd, full_id, text, True)
        # if link_src != '0':
        links.append((link_src, wid, link_type))

    # adding fake word
    words['0'] = Word('', '', '0', '', False, True)
    sentence_end(bad_sentence)
    return result
|
|
||||||
|
|
||||||
|
|
||||||
def load_csv_valency(filename, compressed, w_collection):
    """Parse a CSV corpus file and join sentences with mongo attributes.

    Like ``load_csv`` but keyed per sentence: returns a list of
    ``(sentence_id, words, attrs)`` tuples for sentences found in
    ``w_collection``, sorted by the numeric parts of the sentence id.
    """
    # TODO skip sentences that are not in sentences of interest!!!
    result = {}
    bad_sentence = False

    words = {}
    links = []
    # Running index assigned to word-like tokens within a sentence.
    idi = 0

    def sentence_end(bad_sentence, sid):
        # Resolve collected links and store the sentence under its id.
        if bad_sentence:
            return

        for lfrom, ldest, ana in links:
            if lfrom not in words or ldest not in words:
                logging.warning("Bad link in sentence: " + line_split[0])
                continue
            words[lfrom].add_link(ana, words[ldest])
        result[sid] = list(words.values())

    line_gen = lines_gz if compressed else lines_csv
    for line in line_gen(filename):
        line_str = line.strip()
        # Restore a literal comma token (empty field between tabs) as ",".
        line_fixed = line_str.replace('\t\t\t', '\t,\t')
        line_split = line_fixed.split("\t")

        if line_split[1] == "1" and len(words) > 0:
            sentence_end(bad_sentence, sid)
            bad_sentence = False
            links = []
            words = {}
            idi = 0

        try:
            sid, wid, text, msd, lemma, link_src, link_type = line_split
        except ValueError:
            bad_sentence = True
        # NOTE(review): as in load_csv, a malformed row falls through and
        # reuses values from the previous row -- flagged, not fixed here.
        full_id = "{}.{}".format(sid, wid)

        words[wid] = Word(lemma, msd, full_id, text, True)
        # Only word-like tokens (not single non-alphanumeric characters)
        # get a running idi index.
        if not (len(text[0]) == 1 and re.match('^[\w]+$', text[0]) is None):
            words[wid].idi = str(idi)
            idi += 1

        if link_src != '0':
            links.append((link_src, wid, link_type))

    sentence_end(bad_sentence, sid)

    sentence_ids = list(result.keys())
    cur = w_collection.find({'_id': {'$in': sentence_ids}})
    cur = [c for c in cur]
    unsorted_result = [(c['_id'], result[c['_id']], {k: v for k, v in c.items() if k != '_id'}) for c in cur]
    # Sort by (document, paragraph, sentence) numeric components of the id.
    return sorted(unsorted_result, key=lambda x: (x[0].split('.')[0], int(x[0].split('.')[1]), int(x[0].split('.')[2])))
|
|
||||||
|
|
||||||
def load_xml(filename):
    """Read an XML file and parse it with namespaces stripped.

    The default ``xmlns`` declaration and ``xml:`` attribute prefixes are
    removed so element and attribute lookups can use plain names.
    """
    with open(filename, 'r') as source:
        raw = source.read()

    stripped = re.sub(' xmlns="[^"]+"', '', raw, count=1)
    stripped = stripped.replace(' xml:', ' ')
    return ElementTree.XML(stripped)
|
|
||||||
|
|
||||||
|
|
||||||
def file_sentence_generator(et, args):
    """Extract all words (with dependency links) from a parsed TEI tree.

    Iterates paragraphs and sentences of ``et``, builds a ``Word`` per
    ``<w>``/punctuation element (tracking inter-token "glue" whitespace when
    punctuation handling is enabled), attaches JOS syntax links, and returns
    the flat list of all words in the file.
    """
    skip_id_check = args.skip_id_check
    do_msd_translate = not args.no_msd_translate
    pc_tag = args.pc_tag
    use_punctuations = not args.ignore_punctuations
    # (removed: unused locals `previous_pc` and the throwaway
    #  `a = list(words.values())` that duplicated the return value)

    words = {}
    paragraphs = list(et.iter('p'))
    for paragraph in progress(paragraphs, "load-text"):
        previous_glue = ''
        sentences = list(paragraph.iter('s'))
        for sentence in sentences:
            # create fake root word
            words[sentence.get('id')] = Word.fake_root_word(sentence.get('id'))
            last_word_id = None

            if args.new_tei:
                # New TEI: glue is encoded via the join="right" attribute.
                for w in sentence.iter():
                    if w.tag == 'w':
                        words[w.get('id')] = Word.from_xml(w, do_msd_translate)
                        if use_punctuations:
                            previous_glue = '' if 'join' in w.attrib and w.get('join') == 'right' else ' '
                    elif w.tag == pc_tag:
                        words[w.get('id')] = Word.pc_word(w, do_msd_translate)
                        if use_punctuations:
                            words[w.get('id')].previous_glue = previous_glue
                            words[w.get('id')].glue = '' if 'join' in w.attrib and w.get('join') == 'right' else ' '
                            previous_glue = '' if 'join' in w.attrib and w.get('join') == 'right' else ' '
            else:
                # Old TEI: glue comes from explicit <c> whitespace elements.
                for w in sentence.iter():
                    if w.tag == 'w':
                        words[w.get('id')] = Word.from_xml(w, do_msd_translate)
                        if use_punctuations:
                            previous_glue = ''
                            last_word_id = None
                    elif w.tag == pc_tag:
                        words[w.get('id')] = Word.pc_word(w, do_msd_translate)
                        if use_punctuations:
                            last_word_id = w.get('id')
                            words[w.get('id')].previous_glue = previous_glue
                            previous_glue = ''
                    elif use_punctuations and w.tag == 'c':
                        # always save previous glue
                        previous_glue = w.text
                        if last_word_id:
                            words[last_word_id].glue += w.text

            for l in sentence.iter("link"):
                if 'dep' in l.keys():
                    ana = l.get('afun')
                    lfrom = l.get('from')
                    dest = l.get('dep')
                else:
                    ana = l.get('ana')
                    if ana[:8] != 'jos-syn:':  # dont bother...
                        continue
                    ana = ana[8:]
                    lfrom, dest = l.get('target').replace('#', '').split()

                if lfrom in words:
                    if not skip_id_check and is_root_id(lfrom):
                        logging.error("Id {} is not fine, you might want to try with tag --skip-id-check".format(lfrom))
                        sys.exit(1)

                    if dest in words:
                        next_word = words[dest]
                        words[lfrom].add_link(ana, next_word)
                    else:
                        logging.error("Unknown id: {}".format(dest))
                        sys.exit(1)

                else:
                    # strange errors, just skip...
                    pass

    return list(words.values())
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def file_sentence_generator_valency(et, skip_id_check, do_msd_translate, pc_tag, w_collection):
    """Yield per-sentence words (with links) for sentences found in mongo.

    Only sentences whose id exists in ``w_collection`` are processed.
    Yields ``(sentence_id, words, attrs)`` where ``attrs`` are the mongo
    document's fields minus ``_id``.
    """
    words = {}
    sentences = list(et.iter('s'))
    sentence_ids = [s.attrib['id'] for s in sentences]
    cur = w_collection.find({'_id': {'$in': sentence_ids}})
    sentences_of_interest = {c['_id']: {k: v for k, v in c.items() if k != '_id'} for c in cur}

    for sentence in progress(sentences, "load-text"):
        if sentence.attrib['id'] not in sentences_of_interest:
            continue
        # Running index assigned to word tokens within the sentence.
        idi = 0
        last_word_id = None
        for w in sentence.iter():
            if w.tag == 'w':
                last_word_id = w.get('id')
                words[last_word_id] = Word.from_xml(w, do_msd_translate)
                words[last_word_id].idi = str(idi)
                idi += 1
            elif w.tag == pc_tag:
                last_word_id = w.get('id')
                words[last_word_id] = Word.pc_word(w, do_msd_translate)
            elif w.tag == 'c':
                # <c> holds inter-token whitespace; append it as glue.
                if last_word_id:
                    words[last_word_id].glue += w.text

        for l in sentence.iter("link"):
            if 'dep' in l.keys():
                ana = l.get('afun')
                lfrom = l.get('from')
                dest = l.get('dep')
            else:
                ana = l.get('ana')
                if ana[:8] != 'jos-syn:':  # dont bother...
                    continue
                ana = ana[8:]
                lfrom, dest = l.get('target').replace('#', '').split()

            if lfrom in words:
                if not skip_id_check and is_root_id(lfrom):
                    logging.error("NOO: {}".format(lfrom))
                    sys.exit(1)

                if dest in words:
                    next_word = words[dest]
                    words[lfrom].add_link(ana, next_word)
                else:
                    logging.error("Unknown id: {}".format(dest))
                    sys.exit(1)

            else:
                # strange errors, just skip...
                pass
        yield sentence.attrib['id'], list(words.values()), sentences_of_interest[sentence.attrib['id']]
        words = {}
|
||||||
def file_sentence_glue_generator(files, pc_tag, w_collection):
    """Yield per-sentence token/glue triples for sentences found in mongo.

    For each file, yields ``(sentence_id, [[text, position, glue], ...])``
    for every sentence whose id exists in ``w_collection``.  Positions are
    1-based within the sentence; ``glue`` collects trailing ``<c>`` text.
    """
    for fname in files:
        et = load_xml(fname)

        words = {}
        sentences = list(et.iter('s'))

        sentence_ids = [s.attrib['id'] for s in sentences]
        cur = w_collection.find({'_id': {'$in': sentence_ids}})
        sentences_of_interest = {c['_id']: {k: v for k, v in c.items() if k != '_id'} for c in cur}

        for sentence in progress(sentences, "load-text"):
            if sentence.attrib['id'] not in sentences_of_interest:
                continue
            w_id = 1
            last_word_id = None
            sentence_id = None
            for w in sentence.iter():
                if w.tag == 'w':
                    last_word_id = w_id
                    words[last_word_id] = [w.text, last_word_id, '']
                    w_id += 1
                elif w.tag == pc_tag:
                    last_word_id = w_id
                    words[last_word_id] = [w.text, last_word_id, '']
                    w_id += 1
                elif w.tag == 'c':
                    # Trailing whitespace belongs to the previous token.
                    if last_word_id:
                        words[last_word_id][2] += w.text
                elif w.tag == 's':
                    # sentence.iter() yields the <s> element itself first.
                    sentence_id = w.attrib['id']

            yield (sentence_id, list(words.values()))
            words = {}
|
|
@ -1,47 +0,0 @@
|
||||||
|
|
||||||
class Postprocessor:
    """Post-processes extracted collocations.

    Normalises the Slovene proclitic prepositions s/z and k/h inside a
    collocation according to the following word, and optionally checks
    that matched components appear in a fixed order.
    """

    def __init__(self, fix_one_letter_words=True, fixed_restriction_order=False):
        self.fix_one_letter_words = fix_one_letter_words
        self.fixed_restriction_order = fixed_restriction_order

    @staticmethod
    def fix_sz(next_word):
        """Choose 's' before c č f h k p s š t, otherwise 'z'."""
        return 's' if next_word[0] in ['c', 'č', 'f', 'h', 'k', 'p', 's', 'š', 't'] else 'z'

    @staticmethod
    def fix_kh(next_word):
        """Choose 'h' before g or k, otherwise 'k'."""
        return 'h' if next_word[0] in ['g', 'k'] else 'k'

    def process(self, match, collocation_id):
        """Fix s/z and k/h entries in-place, then freeze entries to tuples.

        ``collocation_id`` is ``[head, [col_id, word], ...]``; inner entries
        are inspected with one-entry lookahead.  Returns the (mutated)
        ``match`` and a new collocation id with tuple entries.
        """
        if len(collocation_id) > 2:
            # enumerate from 1 so `pos` indexes directly into collocation_id
            for pos, (col_id, word) in enumerate(collocation_id[1:-1], start=1):
                if word in ('s', 'z'):
                    fixed = self.fix_sz(collocation_id[pos + 1][1])
                elif word in ('k', 'h'):
                    fixed = self.fix_kh(collocation_id[pos + 1][1])
                else:
                    continue
                collocation_id[pos][1] = fixed
                match[col_id].text = fixed
        frozen = [collocation_id[0]]
        frozen.extend(tuple(entry) for entry in collocation_id[1:])
        return match, frozen

    def is_fixed_restriction_order(self, match):
        """Check component order; always True unless the option is enabled.

        With ``fixed_restriction_order`` set, components sorted by their
        ``int_id`` must have non-decreasing numeric keys ('#' is ignored).
        """
        if not self.fixed_restriction_order:
            return True

        ordered = sorted(match.items(), key=lambda item: item[1].int_id)
        previous = -1
        for key, _ in ordered:
            if key == '#':
                continue
            current = int(key)
            if previous > current:
                return False
            previous = current

        return True
|
|
|
@ -1,288 +0,0 @@
|
||||||
import re
|
|
||||||
from enum import Enum
|
|
||||||
|
|
||||||
from luscenje_struktur.codes_tagset import CODES, TAGSET, CODES_UD
|
|
||||||
|
|
||||||
|
|
||||||
class RestrictionType(Enum):
    """Kinds of restriction nodes accepted in structure definitions."""
    Morphology = 0      # JOS/MULTEXT msd-based restriction
    Lexis = 1           # restriction on the lemma itself
    MatchAll = 2        # matches any token
    Space = 3
    MorphologyUD = 4    # Universal Dependencies POS-based restriction
|
||||||
|
|
||||||
def determine_ppb_ud(rgxs):
    """Rank a UD POS restriction (lower value = stronger core candidate)."""
    if len(rgxs) != 1:
        return 0
    tag = rgxs[0]
    if tag in ("ADJ", "NOUN", "ADV"):
        return 0
    return {"AUX": 3, "VERB": 2}.get(tag, 4)
|
|
||||||
|
|
||||||
|
|
||||||
def determine_ppb(rgxs):
    """Return the content-word weight (0-4) for a JOS MSD restriction list.

    Nouns/adjectives/adverbs weigh 0; verbs weigh 1-3 depending on the
    second MSD position (auxiliary 'a' -> 3, main 'm' -> 1, otherwise 2);
    everything else weighs 4.  Zero or multiple categories default to 0.
    """
    if len(rgxs) != 1:
        return 0
    tag = rgxs[0]
    if tag[0] in ("A", "N", "R"):
        return 0
    if tag[0] != "V":
        return 4
    # verb: inspect the second MSD slot (may itself be a character class)
    if len(tag) == 1:
        return 2
    if 'a' in tag[1]:
        return 3
    if 'm' in tag[1]:
        return 1
    return 2
|
|
||||||
|
|
||||||
class MorphologyRegex:
    """Per-character matcher for JOS MSD tags built from a <restriction> tag.

    Each restriction feature becomes a character class at the MSD position
    given by TAGSET; a POS restriction may list several '|'-separated
    categories, producing one regex list per category.  Calling the instance
    with an MSD string returns True when any category's pattern matches.
    """

    def __init__(self, restriction):
        # restriction: iterable of XML feature elements; each must carry
        # exactly one attribute (plus an optional filter="negative").
        # self.min_msd_length = 1

        # attribute name -> (value, positive-match?)
        restr_dict = {}
        for feature in restriction:
            feature_dict = dict(feature.items())

            match_type = True
            if "filter" in feature_dict:
                # only negative filters are defined; they invert the char class
                assert feature_dict['filter'] == "negative"
                match_type = False
                del feature_dict['filter']

            assert len(feature_dict) == 1
            key, value = next(iter(feature_dict.items()))
            restr_dict[key] = (value, match_type)

        # a POS category is mandatory — it selects the TAGSET row
        assert 'POS' in restr_dict

        # handle multiple word types
        if '|' in restr_dict['POS'][0]:
            categories = restr_dict['POS'][0].split('|')
        else:
            categories = [restr_dict['POS'][0]]

        self.rgxs = []             # one per category: [category char, per-slot classes...]
        self.re_objects = []       # compiled form of each rgx, per character
        self.min_msd_lengths = []  # shortest MSD that can still satisfy the pattern

        del restr_dict['POS']

        for category in categories:
            min_msd_length = 1
            category = category.capitalize()
            cat_code = CODES[category]
            # position 0 is the category letter; 10 wildcard attribute slots
            rgx = [cat_code] + ['.'] * 10

            for attribute, (value, typ) in restr_dict.items():
                # attributes not defined for this category are ignored
                if attribute.lower() not in TAGSET[cat_code]:
                    continue
                index = TAGSET[cat_code].index(attribute.lower())
                assert index >= 0

                # '|'-alternatives collapse into one character class
                if '|' in value:
                    match = "".join(CODES[val] for val in value.split('|'))
                else:
                    match = CODES[value]

                # negative filters become a negated class [^...]
                match = "[{}{}]".format("" if typ else "^", match)
                rgx[index + 1] = match

                # a positive constraint at slot i requires the MSD to reach it
                if typ:
                    min_msd_length = max(index + 1, min_msd_length)

            # strip rgx: drop trailing wildcard slots
            for i in reversed(range(len(rgx))):
                if rgx[i] == '.':
                    rgx = rgx[:-1]
                else:
                    break

            self.re_objects.append([re.compile(r) for r in rgx])
            self.rgxs.append(rgx)
            self.min_msd_lengths.append(min_msd_length)

    def __call__(self, text):
        """Return True when the MSD string *text* matches any category pattern.

        Matching is character-by-character; a text shorter than the pattern
        matches as long as it reaches the category's min_msd_length.
        """
        for i, re_object in enumerate(self.re_objects):
            if len(text) < self.min_msd_lengths[i]:
                continue
            match = True

            for c, r in zip(text, re_object):
                if not r.match(c):
                    match = False
                    break
            if match:
                return True
        return False
|
|
||||||
|
|
||||||
|
|
||||||
class MorphologyUDRegex:
    """Morphology matcher for the UD (Universal Dependencies) tagset.

    Unlike MorphologyRegex, only the POS category is compared; feature-level
    filtering is not implemented for UD (the JOS-style feature machinery was
    removed — see version control history).
    """

    def __init__(self, restriction):
        # attribute name -> (value, positive-match?); negative filters are
        # not supported for UD, so match_type is always True
        restr_dict = {}
        for feature in restriction:
            feature_dict = dict(feature.items())

            match_type = True

            assert len(feature_dict) == 1
            key, value = next(iter(feature_dict.items()))
            restr_dict[key] = (value, match_type)

        assert 'POS' in restr_dict

        # handle multiple word types ('|'-separated categories)
        if '|' in restr_dict['POS'][0]:
            categories = restr_dict['POS'][0].split('|')
        else:
            categories = [restr_dict['POS'][0]]

        self.rgxs = []             # UD POS strings, one per category
        self.re_objects = []       # kept for interface parity; never filled
        self.min_msd_lengths = []  # kept for interface parity; always 1

        del restr_dict['POS']

        for category in categories:
            min_msd_length = 1
            category = category.upper()
            assert category in CODES_UD
            cat_code = category
            rgx = category

            self.rgxs.append(rgx)
            self.min_msd_lengths.append(min_msd_length)

    def __call__(self, text):
        # NOTE(review): only a single category is supported here even though
        # __init__ accepts '|'-alternatives — a multi-category UD restriction
        # would trip this assert; confirm whether that input ever occurs.
        assert len(self.rgxs) == 1
        return self.rgxs[0] == text
|
|
||||||
|
|
||||||
|
|
||||||
class LexisRegex:
    """Lemma matcher: accepts any lemma listed in the restriction's
    '|'-separated ``lemma`` attribute."""

    def __init__(self, restriction):
        merged = {}
        for feature in restriction:
            merged.update(feature.items())

        assert "lemma" in merged
        self.match_list = merged['lemma'].split('|')

    def __call__(self, text):
        """True when *text* is one of the allowed lemmas."""
        return text in self.match_list
|
|
||||||
|
|
||||||
|
|
||||||
class SpaceRegex:
    """Whitespace-context matcher driven by the restriction's ``contact``
    attribute ('both', 'left', 'right', 'neither', '|'-combinable).

    A word's ``previous_glue``/``glue`` are empty strings exactly when a
    space separates it from its neighbour on that side.
    """

    def __init__(self, restriction):
        merged = {}
        for feature in restriction:
            merged.update(feature.items())

        assert "contact" in merged
        self.space = merged['contact'].split('|')
        for option in self.space:
            if option not in ['both', 'right', 'left', 'neither']:
                raise Exception('Value of space restriction is not supported (it may be both, left, right or neither).')

    def __call__(self, word):
        """True when the word's spacing satisfies any requested option."""
        space_before = word.previous_glue == ''
        space_after = word.glue == ''
        # which side(s) of the word carry a space for each option
        outcomes = {
            'neither': not space_before and not space_after,
            'left': space_before and not space_after,
            'right': not space_before and space_after,
            'both': space_before and space_after,
        }
        return any(outcomes[option] for option in self.space)
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class Restriction:
    """One matching condition read from a <restriction> XML tag.

    Dispatches to a concrete matcher (morphology / lexis / space) based on
    the tag's ``type`` attribute; a missing tag means "match everything".
    """

    def __init__(self, restriction_tag, system_type='JOS'):
        # weight as a full-meaning word ("polnopomenska beseda", 0-4)
        self.ppb = 4

        if restriction_tag is None:
            # no restriction at all -> every word matches
            self.type = RestrictionType.MatchAll
            self.matcher = None
            self.present = None
            return

        kind = restriction_tag.get('type')
        if kind == "morphology":
            if system_type == 'JOS':
                self.type = RestrictionType.Morphology
                self.matcher = MorphologyRegex(list(restriction_tag))
                self.ppb = determine_ppb(self.matcher.rgxs)
            # UD system is handled based on deprel
            elif system_type == 'UD':
                self.type = RestrictionType.MorphologyUD
                self.matcher = MorphologyUDRegex(list(restriction_tag))
        elif kind == "lexis":
            self.type = RestrictionType.Lexis
            self.matcher = LexisRegex(list(restriction_tag))
        elif kind == "space":
            self.type = RestrictionType.Space
            self.matcher = SpaceRegex(list(restriction_tag))
        else:
            raise NotImplementedError()

    def match(self, word):
        """True when *word* satisfies this restriction."""
        if self.type == RestrictionType.MatchAll:
            return True
        if self.type in (RestrictionType.Morphology, RestrictionType.MorphologyUD):
            candidate = word.msd
        elif self.type == RestrictionType.Lexis:
            candidate = word.lemma
        elif self.type == RestrictionType.Space:
            candidate = word
        else:
            raise RuntimeError("Unreachable!")

        return self.matcher(candidate)
|
|
||||||
|
|
|
@ -1,24 +0,0 @@
|
||||||
from luscenje_struktur.restriction import Restriction
|
|
||||||
|
|
||||||
class RestrictionGroup:
    """Conjunction ('and') or disjunction ('or') of Restriction objects."""

    def __init__(self, restrictions_tag, system_type, group_type='and'):
        self.restrictions = [Restriction(el, system_type) for el in restrictions_tag]
        self.group_type = group_type

    def __iter__(self):
        return iter(self.restrictions)

    def match(self, word):
        """Combine member matches according to the group type."""
        if self.group_type == 'or':
            return any(restr.match(word) for restr in self.restrictions)
        if self.group_type == 'and':
            return all(restr.match(word) for restr in self.restrictions)
        raise Exception("Unsupported group_type - it may only be 'and' or 'or'")
|
|
|
@ -1,228 +0,0 @@
|
||||||
import gc
|
|
||||||
|
|
||||||
from luscenje_struktur.codes_tagset import TAGSET, CODES, CODES_TRANSLATION, POSSIBLE_WORD_FORM_FEATURE_VALUES
|
|
||||||
|
|
||||||
|
|
||||||
class SloleksDatabase:
    """Read-only access to the Sloleks lexicon in PostgreSQL.

    ORM model classes are reflected from the live schema inside __init__
    (hence the ``global`` statement — other code refers to them by name).
    With ``load_sloleks`` enabled, the tables needed for word-form lookup
    are pre-loaded into ``self.connected_lemmas`` so per-word queries are
    answered from memory instead of SQL.
    """

    def __init__(self, db, load_sloleks):
        # db: colon-separated connection info 'user:password:database:host'
        # load_sloleks: pre-load word forms into memory (init_load_sloleks)
        from psycopg2cffi import compat
        compat.register()

        from sqlalchemy.ext.declarative import declarative_base
        from sqlalchemy.orm import Session
        from sqlalchemy import create_engine

        # reflected classes are published as module globals on purpose
        global Lexeme, LexemeFeature, SyntacticStructure, StructureComponent, Feature, LexicalUnitLexeme, LexicalUnit, LexicalUnitType, Category, Sense, Measure, LexicalUnitMeasure, Corpus, Definition, WordForm, WordFormFeature, FormRepresentation, FormEncoding
        [db_user, db_password, db_database, db_host] = db.split(':')

        engine = create_engine('postgresql://' + db_user + ':' + db_password + '@' + db_host + '/' + db_database,
                               pool_recycle=14400)
        Base = declarative_base()
        Base.metadata.reflect(engine)

        # one thin ORM wrapper per reflected table
        class Lexeme(Base):
            __table__ = Base.metadata.tables['jedro_lexeme']

        class LexemeFeature(Base):
            __table__ = Base.metadata.tables['jedro_lexeme_feature']

        class SyntacticStructure(Base):
            __table__ = Base.metadata.tables['jedro_syntacticstructure']

        class StructureComponent(Base):
            __table__ = Base.metadata.tables['jedro_structurecomponent']

        class Feature(Base):
            __table__ = Base.metadata.tables['jedro_feature']

        class LexicalUnitLexeme(Base):
            __table__ = Base.metadata.tables['jedro_lexicalunit_lexeme']

        class LexicalUnit(Base):
            __table__ = Base.metadata.tables['jedro_lexicalunit']

        class LexicalUnitType(Base):
            __table__ = Base.metadata.tables['jedro_lexicalunittype']

        class Category(Base):
            __table__ = Base.metadata.tables['jedro_category']

        class Sense(Base):
            __table__ = Base.metadata.tables['jedro_sense']

        class Measure(Base):
            __table__ = Base.metadata.tables['jedro_measure']

        class LexicalUnitMeasure(Base):
            __table__ = Base.metadata.tables['jedro_lexicalunitmeasure']

        class Corpus(Base):
            __table__ = Base.metadata.tables['jedro_corpus']

        class Definition(Base):
            __table__ = Base.metadata.tables['jedro_definition']

        class WordForm(Base):
            __table__ = Base.metadata.tables['jedro_wordform']

        class WordFormFeature(Base):
            __table__ = Base.metadata.tables['jedro_wordform_feature']

        class FormRepresentation(Base):
            __table__ = Base.metadata.tables['jedro_formrepresentation']

        class FormEncoding(Base):
            __table__ = Base.metadata.tables['jedro_formencoding']

        self.session = Session(engine)

        self.load_sloleks = load_sloleks
        if self.load_sloleks:
            self.init_load_sloleks()

    def init_load_sloleks(self):
        """Pre-load the lexicon into ``self.connected_lemmas``.

        Builds lemma -> [(word-form feature set, surface form), ...] by
        joining the preloaded tables in memory, then frees the intermediate
        dictionaries.  Memory-heavy; only used when --load-sloleks is on.
        """
        query_word_form_features = self.session.query(WordFormFeature.word_form_id, WordFormFeature.value)
        word_form_features = query_word_form_features.all()
        query_form_representations = self.session.query(FormRepresentation.word_form_id)
        form_representations = query_form_representations.all()
        query_form_encoding = self.session.query(FormEncoding.form_representation_id, FormEncoding.text)
        form_encodings = query_form_encoding.all()
        query_word_forms = self.session.query(WordForm.id, WordForm.lexeme_id)
        word_forms = query_word_forms.all()
        query_lexemes = self.session.query(Lexeme.id, Lexeme.lemma)
        lexemes = query_lexemes.all()

        # lemma string -> lexeme ids (homonyms share a lemma)
        self.lemmas = {}
        for lexeme in lexemes:
            if lexeme.lemma not in self.lemmas:
                self.lemmas[lexeme.lemma] = []
            self.lemmas[lexeme.lemma].append(lexeme.id)

        # word form id -> set of feature values (only those we ever match on)
        self.word_form_features = {}
        for word_form_feature in word_form_features:
            if word_form_feature.value not in POSSIBLE_WORD_FORM_FEATURE_VALUES:
                continue
            if word_form_feature.word_form_id not in self.word_form_features:
                self.word_form_features[word_form_feature.word_form_id] = set()
            self.word_form_features[word_form_feature.word_form_id].add(word_form_feature.value)

        form_encodings_dict = {form_encoding.form_representation_id: form_encoding.text for form_encoding
                               in form_encodings}

        # NOTE(review): this indexes form_encodings_dict by word_form_id,
        # i.e. it assumes form-representation ids coincide with word-form
        # ids — confirm against the schema.
        self.form_representations = {form_representation.word_form_id: form_encodings_dict[form_representation.word_form_id] for form_representation
                                     in form_representations}

        # lexeme id -> its word form ids
        self.word_forms = {}
        for word_form in word_forms:
            if word_form.lexeme_id not in self.word_forms:
                self.word_forms[word_form.lexeme_id] = []
            self.word_forms[word_form.lexeme_id].append(word_form.id)

        # final join: lemma -> [(feature set, surface form), ...]
        self.connected_lemmas = {}
        for lemma, lemma_ids in self.lemmas.items():
            for lemma_id in lemma_ids:
                if lemma_id in self.word_forms:
                    for word_form_id in self.word_forms[lemma_id]:
                        if word_form_id in self.word_form_features and word_form_id in self.form_representations:
                            if lemma not in self.connected_lemmas:
                                self.connected_lemmas[lemma] = []
                            self.connected_lemmas[lemma].append((self.word_form_features[word_form_id], self.form_representations[word_form_id]))

        # drop the intermediates; only connected_lemmas is needed afterwards
        del self.lemmas, self.word_form_features, self.form_representations, self.word_forms
        gc.collect()

    def close(self):
        """Release the database session."""
        self.session.close()

    def decypher_msd(self, msd):
        """Translate an MSD tag into the list of Sloleks feature values
        (e.g. number, case) needed to look up the matching word form.

        Returns an empty list for categories other than N/V/A.
        """
        t = msd[0]
        decypher = []
        # IF ADDING OR CHANGING ATTRIBUTES HERE ALSO FIX POSSIBLE_WORD_FORM_FEATURE_VALUES
        if t == 'N':
            number = CODES_TRANSLATION[t][3][msd[3]]
            case = CODES_TRANSLATION[t][4][msd[4]]
            decypher = [number, case]
        elif t == 'V':
            vform = CODES_TRANSLATION[t][3][msd[3]]
            number = CODES_TRANSLATION[t][5][msd[5]]
            # person is fixed to third — presumably adequate for the
            # collocation structures processed here; verify against callers
            person = 'third'
            decypher = [vform, number, person]
        elif t == 'A':
            gender = CODES_TRANSLATION[t][3][msd[3]]
            number = CODES_TRANSLATION[t][4][msd[4]]
            case = CODES_TRANSLATION[t][5][msd[5]]
            decypher = [gender, number, case]

        return decypher

    def get_word_form(self, lemma, msd, data, align_msd=False):
        """Find the surface form of *lemma* matching *msd*.

        The MSD may first be overridden by data['msd'] and aligned to
        another component's MSD via data['agreement'].  Returns
        (msd string, lemma, form) or (None, None, None) when nothing
        matches.  Uses the in-memory preload when available, otherwise
        queries the database.
        """
        # modify msd as required
        from sqlalchemy.orm import aliased
        msd = list(msd)
        if 'msd' in data:
            for key, value in data['msd'].items():
                t = msd[0]
                v = TAGSET[t].index(key.lower())
                # pad short MSDs with '-' so the slot exists
                if v + 1 >= len(msd):
                    msd = msd + ['-' for _ in range(v - len(msd) + 2)]
                msd[v + 1] = CODES[value]

        if align_msd and 'agreement' in data:
            # copy agreement attributes from the aligned component's MSD
            align_msd = list(align_msd)
            t_align_msd = align_msd[0]
            t = msd[0]

            for att in data['agreement']:
                v_align_msd = TAGSET[t_align_msd].index(att.lower())
                v = TAGSET[t].index(att.lower())
                # fix for verbs with short msds
                if v + 1 >= len(msd):
                    msd = msd + ['-' for _ in range(v - len(msd) + 2)]

                msd[v + 1] = align_msd[v_align_msd + 1]

        decypher_msd = self.decypher_msd(msd)

        if not decypher_msd:
            return None, None, None

        if self.load_sloleks and lemma in self.connected_lemmas:
            # scan preloaded candidates for one carrying all required features
            for (word_form_features, form_representations) in self.connected_lemmas[lemma]:
                fits = True
                for d_m in decypher_msd:
                    if d_m not in word_form_features:
                        fits = False
                        break
                if fits:
                    break
            # NOTE(review): when no candidate fits, the LAST candidate is
            # still returned here — confirm this fallback is intentional.
            return ''.join(msd), lemma, form_representations
        else:
            # one aliased join per required feature value
            wfs = [aliased(WordFormFeature) for _ in decypher_msd]
            query_preposition = self.session.query(FormEncoding.text) \
                .join(FormRepresentation, FormRepresentation.id == FormEncoding.form_representation_id) \
                .join(WordForm, WordForm.id == FormRepresentation.word_form_id) \
                .join(Lexeme, Lexeme.id == WordForm.lexeme_id)

            for wf in wfs:
                query_preposition = query_preposition.join(wf, wf.word_form_id == WordForm.id)

            query_preposition = query_preposition.filter(Lexeme.lemma == lemma)

            for wf, msd_el in zip(wfs, decypher_msd):
                query_preposition = query_preposition.filter(wf.value == msd_el)

            pattern_translation_hws = query_preposition.limit(1).all()
            if len(pattern_translation_hws) > 0:
                return ''.join(msd), lemma, pattern_translation_hws[0][0]
        return None, None, None
|
|
|
@ -1 +0,0 @@
|
||||||
pypy3 wani.py data/Kolokacije_strukture_JOS-32-representation_3D_08_1.xml data/input --out data/output --sloleks_db '<sloleks db data>' --collocation_sentence_map_dest data/collocation-sentence-mapper --db /mnt/tmp/mysql-wani --multiple-output --load-sloleks
|
|
|
@ -1,240 +0,0 @@
|
||||||
|
|
||||||
|
|
||||||
import argparse
|
|
||||||
import csv
|
|
||||||
import logging
|
|
||||||
import os
|
|
||||||
import sys
|
|
||||||
import time
|
|
||||||
from math import log2
|
|
||||||
|
|
||||||
CORE_RESTRICTIONS = ['s', 'p', 'r', 'gg']
|
|
||||||
ALL_RESTRICTIONS = CORE_RESTRICTIONS + ['vp', 'vd', 'd']
|
|
||||||
LEMMA_COLUMNS = ['C1_Lemma', 'C2_Lemma', 'C3_Lemma', 'C4_Lemma', 'C5_Lemma']
|
|
||||||
|
|
||||||
|
|
||||||
def load_word_order(word_order_file):
    """Load per-structure word-order info for deltaP/logDice recalculation.

    Each '|'-separated line carries the component POS tags in column 3 and
    the structure id in column 7.  Returns a dict mapping structure id to
    ``[core_rest, all_rest]``, where each is a sorted list of 1-based
    component positions (as strings) whose POS tag starts with a core /
    any known restriction prefix.

    Raises AssertionError when a structure does not have exactly two core
    components.
    """
    with open(word_order_file, 'r') as f:
        lines = {}
        for line in f:
            l = line.split('|')
            # BUG FIX: the original tested `l[6] not in [e[0] for e in lines]`,
            # which iterates the dict's KEYS and compares against their first
            # character only; deduplicate against the keys themselves.
            if l[6] not in lines and l[6] != '' and l[6] != 'NSSS':
                pos_tags = l[2].split('-')
                core_rest = sorted([str(pt_i + 1) for cr in CORE_RESTRICTIONS for pt_i, pt in enumerate(pos_tags) if pt[:len(cr)] == cr])
                assert len(core_rest) == 2, 'Core restrictions are incorrect!'
                all_rest = sorted([str(pt_i + 1) for cr in ALL_RESTRICTIONS for pt_i, pt in enumerate(pos_tags) if pt[:len(cr)] == cr])
                lines[l[6]] = [core_rest, all_rest]
        return lines
|
|
||||||
|
|
||||||
|
|
||||||
def add_word(stats, pos, word, freq):
    """Accumulate *freq* for *word* in the per-position counters.

    Empty words (missing components) are ignored; *freq* may be a string.
    """
    if not word:
        return

    bucket = stats['words'][pos]
    bucket[word] = bucket.get(word, 0) + int(freq)
|
|
||||||
|
|
||||||
|
|
||||||
def get_new_stats(f):
    """Read a comma-separated results file and tally per-lemma frequencies.

    Returns (rows, stats): ``rows`` are the split input lines including the
    header; ``stats['words']`` maps each component position '1'..'5' to a
    {lemma: frequency} dict, plus 'total' holding the summed frequency.
    Column positions are taken from the header via 'Frequency' and
    LEMMA_COLUMNS.
    """
    rows = []
    stats = {'words': {str(i): {} for i in range(1, 6)}}
    stats['words']['total'] = 0

    positions = {}
    for raw in f.readlines():
        fields = raw.split(',')
        rows.append(fields)
        if not positions:
            # first row is the header: locate the columns we need
            positions['freq'] = fields.index('Frequency')
            for col_i, col_name in enumerate(LEMMA_COLUMNS, start=1):
                positions[str(col_i)] = fields.index(col_name)
            continue
        freq = fields[positions['freq']]
        for pos in map(str, range(1, 6)):
            add_word(stats, pos, fields[positions[pos]], freq)
        stats['words']['total'] += int(freq)

    return rows, stats
|
|
||||||
|
|
||||||
|
|
||||||
def logDice_new(stats, positions, line, rest):
    """Structure-level logDice: 14 + log2(2*f(xy) / sum of component freqs).

    *rest* lists the component positions whose structure-internal
    frequencies form the denominator.
    """
    component_sum = sum(int(stats['words'][r][line[positions[r]]]) for r in rest)
    pair_freq = int(line[positions['freq']])
    return 14 + log2(2 * pair_freq / component_sum)
|
|
||||||
|
|
||||||
|
|
||||||
def deltaP_new(stats, positions, line, rest, delta21=True):
    """Structure-level deltaP: P(y|x) - P(y|not x).

    *rest* supplies the two component positions; ``delta21`` selects which
    of the two acts as the conditioning word x.
    """
    fi = [int(stats['words'][r][line[positions[r]]]) for r in rest]
    fx, fy = (fi[0], fi[1]) if delta21 else (fi[1], fi[0])
    freq = int(line[positions['freq']])
    total = int(stats['words']['total'])
    return (freq / fx) - ((fy - freq) / (total - fx))
|
|
||||||
|
|
||||||
|
|
||||||
def write_new_stats(wf, original_text, stats, file_name, word_order):
    """Rewrite one results file with recalculated association scores.

    Extends the header and each data row with per-component structure
    frequencies and structure-level logDice/deltaP columns, writing the
    result to *wf*.  The structure id is taken from the file-name suffix
    and used to look up component positions in *word_order*.

    NOTE(review): all column arithmetic (the ``6 + i + i * 5`` /
    ``1 + i * 5`` offsets, the insertions after LogDice_all and Delta_p21)
    assumes the exact column layout produced upstream — confirm before
    changing either side.
    """
    structure_id = file_name.split('.')[-1]
    core_rest, all_rest = word_order[structure_id]

    first_line = True
    positions = {}
    for line in original_text:
        # strip the trailing newline kept inside the last field
        line[-1] = line[-1][:-1]
        # handle header file
        if first_line:
            # new score columns appended at the end
            line += ['structure_frequency', 'logDice_core', 'logDice_all',
                     'weighted_logDice_frequency', 'deltaP12_structure',
                     'deltaP21_structure', 'deltaP_structure']

            # insert one structure-frequency column after each component block
            for i in range(5):
                new_pos = 6 + i + i * 5
                line = line[:new_pos] + ['C' + str(i + 1) + '_lemma_structure_frequency'] + line[new_pos:]

            # locate columns AFTER the inserts so indices match data rows
            positions['freq'] = line.index('Frequency')
            for lci, lc in enumerate(LEMMA_COLUMNS):
                positions[str(lci + 1)] = line.index(lc)
            positions['delta12'] = line.index('Delta_p12')
            positions['delta21'] = line.index('Delta_p21')
            positions['logDice_core'] = line.index('LogDice_core')
            positions['logDice_all'] = line.index('LogDice_all')
            # rename corpus-level columns to distinguish them from the
            # structure-level scores added here
            line[positions['logDice_core']] = 'logDice_core_corpus'
            line[positions['logDice_all']] = 'logDice_all_corpus'
            first_line = False
            line = line[:positions['logDice_all'] + 1] + ['weighted_logDice_frequency_corpus'] + line[positions['logDice_all'] + 1:]
            line = line[:positions['delta21'] + 1] + ['deltaP'] + line[positions['delta21'] + 1:]
            # TODO INSERT 'deltaP', and weightedlogDice_frequency and , 'weighted_logDice_frequency_corpus'
            wf.write(','.join(line) + '\n')
            continue

        # per-component structure frequencies ('0' for absent components)
        lemma_struct_freq = []
        for i in range(5):
            new_pos = 1 + i * 5
            freq = str(stats['words'][str(i + 1)][line[new_pos]]) if line[new_pos] != '' else '0'
            lemma_struct_freq.append(freq)

        # insert them at the same offsets used in the header
        for i in range(5):
            new_pos = 6 + i + i * 5
            line = line[:new_pos] + [lemma_struct_freq[i]] + line[new_pos:]

        # add structure_frequency
        structure_frequency = int(stats['words']['total'])
        line.append("{:.5f}".format(structure_frequency))
        # add logDice_core_new
        logDice_core_new = logDice_new(stats, positions, line, core_rest)
        line.append("{:.5f}".format(logDice_core_new))
        # add logDice_all_new
        logDice_all_new = logDice_new(stats, positions, line, all_rest)
        line.append("{:.5f}".format(logDice_all_new))
        # corpus-level weighted score is inserted later next to its column
        weighted_logDice_frequency_corpus = 0.3 * int(line[positions['freq']]) + 0.7 * float(
            line[positions['logDice_core']])
        weighted_logDice_frequency = 0.3 * int(line[positions['freq']]) + 0.7 * logDice_core_new
        line.append("{:.5f}".format(weighted_logDice_frequency))
        # add deltaP12_structure
        deltaP12_structure = deltaP_new(stats, positions, line, core_rest, delta21=False)
        line.append("{:.5f}".format(deltaP12_structure))
        # add deltaP21_structure
        deltaP21_structure = deltaP_new(stats, positions, line, core_rest, delta21=True)
        line.append("{:.5f}".format(deltaP21_structure))

        deltaP12 = float(line[positions['delta12']])
        deltaP21 = float(line[positions['delta21']])

        # corpus-level deltaP asymmetry, inserted next to Delta_p21 below
        deltaP = abs(deltaP12 - deltaP21)

        deltaP_structure = abs(deltaP12_structure - deltaP21_structure)
        line.append("{:.5f}".format(deltaP_structure))

        line = line[:positions['logDice_all'] + 1] + ["{:.5f}".format(weighted_logDice_frequency_corpus)] + line[positions[
            'logDice_all'] + 1:]
        line = line[:positions['delta21'] + 1] + ["{:.5f}".format(deltaP)] + line[positions['delta21'] + 1:]

        # TODO ADD OTHER COLUMNS AS IN #823 task
        wf.write(','.join(line) + '\n')
|
|
||||||
|
|
||||||
|
|
||||||
def main(args):
    """Recalculate structure-level statistics for every file in args.input.

    Phase 1 (unless --ignore_recalculation): rewrite each input file into
    args.output with recalculated scores, optionally filtered by
    --frequency_limit and sorted by descending frequency.
    Phase 2 (--format_output): trim/reorder columns of the recalculated
    files into args.formatted_output per redmine #1808.
    """
    if not args.ignore_recalculation:
        word_order = load_word_order(args.word_order_file)
        for file_name in os.listdir(args.input):
            read_file_path = os.path.join(args.input, file_name)
            write_file_path = os.path.join(args.output, file_name)
            with open(read_file_path, 'r') as rf, open(write_file_path, 'w') as wf:
                original_text, stats = get_new_stats(rf)
                freq_pos = original_text[0].index('Frequency')
                if args.frequency_limit > 1:
                    # BUG FIX: the threshold was hard-coded to 10 although the
                    # CLI exposes --frequency_limit; honor the configured value.
                    original_text = [original_text[0]] + [
                        l for l in original_text[1:] if int(l[freq_pos]) >= args.frequency_limit]
                if args.sorted:
                    if len(original_text) > 1:
                        original_text = [original_text[0]] + sorted(original_text[1:], key=lambda x: -1 * int(x[freq_pos]))
                    else:
                        original_text = [original_text[0]]
                write_new_stats(wf, original_text, stats, file_name, word_order)

    if args.format_output:
        for file_name in os.listdir(args.output):
            read_file_path = os.path.join(args.output, file_name)
            write_file_path = os.path.join(args.formatted_output, file_name)
            with open(read_file_path, 'r', encoding="utf-8") as rf, open(write_file_path, 'w') as wf:
                first_line = True
                lines = []
                formatted_output = []
                for line in rf:
                    line = line[:-1].split(',')
                    # keep all but the last 14 columns, plus two score columns
                    # pulled back in from positions -6 and -8
                    if first_line:
                        formatted_output.append(line[:-14] + [line[-6], line[-8]])
                        first_line = False
                        continue
                    lines.append(line[:-14] + [line[-6], line[-8]])

                # NOTE(review): this 10 is the #1808 output cutoff and is
                # distinct from --frequency_limit — confirm whether it
                # should also be configurable.
                lines = [line for line in lines if int(line[-3]) >= 10]
                lines = sorted(lines, key=lambda x: (-int(x[-3]), x[-5]))
                formatted_output += lines
                for line in formatted_output:
                    wf.write(','.join(line) + '\n')
                # NOTE(review): only the first file is formatted because of
                # this break — confirm it is intentional.
                break
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
    # CLI entry point: parse arguments, run the recalculation, log timing.
    parser = argparse.ArgumentParser(
        description='Extract structures from a parsed corpus.')
    parser.add_argument('input',
                        help='Path to folder that contains all input files.')
    # BUG FIX: the help texts below were copy-pasted from --word_order_file /
    # input; describe what each option actually does.
    parser.add_argument('output',
                        help='Path to folder where recalculated files are written.')
    parser.add_argument('--word_order_file', type=str, help='File that contains word order for DeltaP calculations.')
    parser.add_argument('--frequency_limit', type=int, default=1, help='Minimal structure frequency kept in the output.')
    parser.add_argument('--sorted', action='store_true', help='Sort output rows by descending frequency.')
    parser.add_argument('--format_output', action='store_true', help='Format and cut data as specified in #1808 on redmine.')
    parser.add_argument('--ignore_recalculation', action='store_true', help='Ignore recalculation.')
    parser.add_argument('--formatted_output', default=None, help='Destination of final results.')

    args = parser.parse_args()
    logging.basicConfig(stream=sys.stderr)

    start = time.time()
    main(args)
    logging.info("TIME: {}".format(time.time() - start))
|
|
10
setup.py
10
setup.py
|
@ -1,10 +0,0 @@
|
||||||
from setuptools import setup, find_packages

# Packaging metadata for the collocation-extraction library
# (luscenje_struktur); installs every discovered package.
setup(name='luscenje_struktur_loc',
      version='0.0.1',
      description=u"Parser for collocability",
      author=u"CJVT",
      author_email='fake@mail.com',
      license='MIT',
      packages=find_packages(),
      )
|
|
90
src/codes_tagset.py
Normal file
90
src/codes_tagset.py
Normal file
|
@ -0,0 +1,90 @@
|
||||||
|
CODES = {
|
||||||
|
"Noun": "N",
|
||||||
|
"Verb": "V",
|
||||||
|
"Adjective": "A",
|
||||||
|
"Adverb": "R",
|
||||||
|
"Pronoun": "P",
|
||||||
|
"Numeral": "M",
|
||||||
|
"Preposition": "S",
|
||||||
|
"Conjunction": "C",
|
||||||
|
"Particle": "Q",
|
||||||
|
"Interjection": "I",
|
||||||
|
"Abbreviation": "Y",
|
||||||
|
"Residual": "X",
|
||||||
|
|
||||||
|
'common': 'c',
|
||||||
|
'proper': 'p',
|
||||||
|
'masculine': 'm',
|
||||||
|
'feminine': 'f',
|
||||||
|
'neuter': 'n',
|
||||||
|
"singular": "s",
|
||||||
|
"dual": "d",
|
||||||
|
"plural": "p",
|
||||||
|
"nominative": "n",
|
||||||
|
"genitive": "g",
|
||||||
|
"dative": "d",
|
||||||
|
"accusative": "a",
|
||||||
|
"locative": "l",
|
||||||
|
"instrumental": "i",
|
||||||
|
"no": "n",
|
||||||
|
"yes": "y",
|
||||||
|
"main": "m",
|
||||||
|
"auxiliary": "a",
|
||||||
|
"perfective": "e",
|
||||||
|
"progressive": "p",
|
||||||
|
"biaspectual": "b",
|
||||||
|
"infinitive": "n",
|
||||||
|
"supine": "u",
|
||||||
|
"participle": "p",
|
||||||
|
"present": "r",
|
||||||
|
"future": "f",
|
||||||
|
"conditional": "c",
|
||||||
|
"imperative": "m",
|
||||||
|
"first": "1",
|
||||||
|
"second": "2",
|
||||||
|
"third": "3",
|
||||||
|
"general": "g",
|
||||||
|
"possessive": "s",
|
||||||
|
"positive": "p",
|
||||||
|
"comparative": "c",
|
||||||
|
"superlative": "s",
|
||||||
|
"personal": "p",
|
||||||
|
"demonstrative": "d",
|
||||||
|
"relative": "r",
|
||||||
|
"reflexive": "x",
|
||||||
|
"interrogative": "q",
|
||||||
|
"indefinite": "i",
|
||||||
|
"negative": "z",
|
||||||
|
"bound": "b",
|
||||||
|
"digit": "d",
|
||||||
|
"roman": "r",
|
||||||
|
"letter": "l",
|
||||||
|
"cardinal": "c",
|
||||||
|
"ordinal": "o",
|
||||||
|
"pronominal": "p",
|
||||||
|
"special": "s",
|
||||||
|
"coordinating": "c",
|
||||||
|
"subordinating": "s",
|
||||||
|
"foreign": "f",
|
||||||
|
"typo": "t",
|
||||||
|
"program": "p",
|
||||||
|
"web": "w",
|
||||||
|
"emo": "e",
|
||||||
|
"hashtag": "h",
|
||||||
|
"at": "a"
|
||||||
|
}
|
||||||
|
|
||||||
|
TAGSET = {
|
||||||
|
"N": ['type', 'gender', 'number', 'case', 'animate'],
|
||||||
|
"V": ['type', 'aspect', 'vform', 'person', 'number', 'gender', 'negative'],
|
||||||
|
"A": ['type', 'degree', 'gender', 'number', 'case', 'definiteness'],
|
||||||
|
"R": ['type', 'degree'],
|
||||||
|
"P": ['type', 'person', 'gender', 'number', 'case', 'owner_number', 'owned_gender', 'clitic'],
|
||||||
|
"M": ['form', 'type', 'gender', 'number', 'case', 'definiteness'],
|
||||||
|
"S": ['case'],
|
||||||
|
"C": ['type'],
|
||||||
|
"Q": [],
|
||||||
|
"I": [],
|
||||||
|
"Y": [],
|
||||||
|
"X": ['type']
|
||||||
|
}
|
|
@ -1,10 +1,9 @@
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
import logging
|
import logging
|
||||||
|
|
||||||
# from luscenje_struktur.restriction import Restriction
|
from restriction import Restriction
|
||||||
from luscenje_struktur.order import Order
|
from order import Order
|
||||||
from luscenje_struktur.representation_assigner import RepresentationAssigner
|
from representation_assigner import RepresentationAssigner
|
||||||
from luscenje_struktur.restriction_group import RestrictionGroup
|
|
||||||
|
|
||||||
|
|
||||||
class ComponentStatus(Enum):
|
class ComponentStatus(Enum):
|
||||||
|
@ -20,9 +19,9 @@ class ComponentType(Enum):
|
||||||
|
|
||||||
|
|
||||||
class Component:
|
class Component:
|
||||||
def __init__(self, info, system_type):
|
def __init__(self, info):
|
||||||
idx = info['cid']
|
idx = info['cid']
|
||||||
name = info['label'] if 'label' in info else None
|
name = info['name'] if 'name' in info else None
|
||||||
typ = ComponentType.Core if info['type'] == "core" else ComponentType.Other
|
typ = ComponentType.Core if info['type'] == "core" else ComponentType.Other
|
||||||
|
|
||||||
if 'status' not in info:
|
if 'status' not in info:
|
||||||
|
@ -39,7 +38,7 @@ class Component:
|
||||||
self.status = status
|
self.status = status
|
||||||
self.name = name
|
self.name = name
|
||||||
self.idx = idx
|
self.idx = idx
|
||||||
self.restrictions = RestrictionGroup([None], system_type) if 'restriction' in info else []
|
self.restrictions = []
|
||||||
self.next_element = []
|
self.next_element = []
|
||||||
self.representation = []
|
self.representation = []
|
||||||
self.selection = {}
|
self.selection = {}
|
||||||
|
@ -50,17 +49,15 @@ class Component:
|
||||||
def add_next(self, next_component, link_label, order):
|
def add_next(self, next_component, link_label, order):
|
||||||
self.next_element.append((next_component, link_label, Order.new(order)))
|
self.next_element.append((next_component, link_label, Order.new(order)))
|
||||||
|
|
||||||
def set_restriction(self, restrictions_tags, system_type):
|
def set_restriction(self, restrictions_tag):
|
||||||
if not restrictions_tags:
|
if restrictions_tag is None:
|
||||||
self.restrictions = RestrictionGroup([None], system_type)
|
self.restrictions = [Restriction(None)]
|
||||||
|
|
||||||
# if first element is of type restriction all following are as well
|
elif restrictions_tag.tag == "restriction":
|
||||||
elif restrictions_tags[0].tag == "restriction":
|
self.restrictions = [Restriction(restrictions_tag)]
|
||||||
self.restrictions = RestrictionGroup(restrictions_tags, system_type)
|
|
||||||
|
|
||||||
# combinations of 'and' and 'or' restrictions are currently not implemented
|
elif restrictions_tag.tag == "restriction_or":
|
||||||
elif restrictions_tags[0].tag == "restriction_or":
|
self.restrictions = [Restriction(el) for el in restrictions_tag]
|
||||||
self.restrictions = RestrictionGroup(restrictions_tags[0], system_type, group_type='or')
|
|
||||||
|
|
||||||
else:
|
else:
|
||||||
raise RuntimeError("Unreachable")
|
raise RuntimeError("Unreachable")
|
||||||
|
@ -72,19 +69,19 @@ class Component:
|
||||||
crend.add_feature(feature.attrib)
|
crend.add_feature(feature.attrib)
|
||||||
self.representation.append(crend)
|
self.representation.append(crend)
|
||||||
|
|
||||||
def find_next(self, deps, comps, restrs, reprs, system_type):
|
def find_next(self, deps, comps, restrs, reprs):
|
||||||
to_ret = []
|
to_ret = []
|
||||||
for d in deps:
|
for d in deps:
|
||||||
if d[0] == self.idx:
|
if d[0] == self.idx:
|
||||||
_, idx, dep_label, order = d
|
_, idx, dep_label, order = d
|
||||||
|
|
||||||
next_component = Component(comps[idx], system_type)
|
next_component = Component(comps[idx])
|
||||||
next_component.set_restriction(restrs[idx], system_type)
|
next_component.set_restriction(restrs[idx])
|
||||||
next_component.set_representation(reprs[idx])
|
next_component.set_representation(reprs[idx])
|
||||||
to_ret.append(next_component)
|
to_ret.append(next_component)
|
||||||
|
|
||||||
self.add_next(next_component, dep_label, order)
|
self.add_next(next_component, dep_label, order)
|
||||||
others = next_component.find_next(deps, comps, restrs, reprs, system_type)
|
others = next_component.find_next(deps, comps, restrs, reprs)
|
||||||
to_ret.extend(others)
|
to_ret.extend(others)
|
||||||
|
|
||||||
return to_ret
|
return to_ret
|
||||||
|
@ -107,28 +104,37 @@ class Component:
|
||||||
if len(cmatch) == 0:
|
if len(cmatch) == 0:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# create new to_ret, to which extend all results
|
# if more than one match found for particular component
|
||||||
new_to_ret = []
|
elif len(cmatch) > 1:
|
||||||
for tr in to_ret:
|
# if more than one match in multiple components, NOPE!
|
||||||
# make sure that one word is not used twice in same to_ret
|
if len(to_ret) > 1:
|
||||||
new_to_ret.extend([{**dict(tr), **m} for m in cmatch if all([m_v not in dict(tr).values() for m_v in m.values()])])
|
logging.warning("Strange multiple match: {}".format(
|
||||||
if len(new_to_ret) == 0:
|
str([w.id for w in cmatch[0].values()])))
|
||||||
return None
|
|
||||||
to_ret = new_to_ret
|
for tr in to_ret:
|
||||||
del new_to_ret
|
tr.update(cmatch[0])
|
||||||
|
continue
|
||||||
|
|
||||||
|
# yeah, so we have found more than one match, =>
|
||||||
|
# more than one element in to_ret
|
||||||
|
to_ret = [{**dict(to_ret[0]), **m} for m in cmatch]
|
||||||
|
|
||||||
|
else:
|
||||||
|
for tr in to_ret:
|
||||||
|
tr.update(cmatch[0])
|
||||||
|
|
||||||
return to_ret
|
return to_ret
|
||||||
|
|
||||||
def _match_self(self, word):
|
def _match_self(self, word):
|
||||||
# matching
|
# matching
|
||||||
if self.restrictions.match(word):
|
for restr in self.restrictions:
|
||||||
return {self.idx: word}
|
if restr.match(word): # match either
|
||||||
|
return {self.idx: word}
|
||||||
|
|
||||||
def _match_next(self, word):
|
def _match_next(self, word):
|
||||||
# matches for every component in links from this component
|
# matches for every component in links from this component
|
||||||
to_ret = []
|
to_ret = []
|
||||||
|
|
||||||
|
|
||||||
# need to get all links that match
|
# need to get all links that match
|
||||||
for next, link, order in self.next_element:
|
for next, link, order in self.next_element:
|
||||||
next_links = word.get_links(link)
|
next_links = word.get_links(link)
|
|
@ -1,8 +1,7 @@
|
||||||
from math import log2
|
from math import log2
|
||||||
import re
|
import re
|
||||||
import logging
|
|
||||||
|
|
||||||
from luscenje_struktur.component import ComponentType
|
from component import ComponentType
|
||||||
|
|
||||||
|
|
||||||
class Formatter:
|
class Formatter:
|
||||||
|
@ -35,35 +34,31 @@ class Formatter:
|
||||||
|
|
||||||
class OutNoStatFormatter(Formatter):
|
class OutNoStatFormatter(Formatter):
|
||||||
def additional_init(self):
|
def additional_init(self):
|
||||||
self.representation = {}
|
self.representation = ""
|
||||||
|
|
||||||
def header_repeat(self):
|
def header_repeat(self):
|
||||||
return ["Lemma", "Representative_form", "RF_msd", "RF_scenario"]
|
return ["Lemma", "Representative_form", "RF_msd", "RF_scenario"]
|
||||||
|
|
||||||
def header_right(self):
|
def header_right(self):
|
||||||
return ["Joint_representative_form_fixed", "Joint_representative_form_variable", "Frequency"]
|
return ["Joint_representative_form", "Frequency"]
|
||||||
|
|
||||||
def content_repeat(self, words, representations, idx, _sidx):
|
def content_repeat(self, words, representations, idx, _sidx):
|
||||||
word = words[idx]
|
word = words[idx]
|
||||||
if idx not in representations:
|
if idx not in representations:
|
||||||
return [word.lemma, "", ""]
|
return [word.lemma, "", ""]
|
||||||
|
|
||||||
rep_text, rep_msd = representations[idx]
|
rep = representations[idx]
|
||||||
if rep_text is None:
|
if rep is None:
|
||||||
self.representation[idx] = word.lemma
|
self.representation += " " + word.lemma
|
||||||
return [word.lemma, word.lemma, "", "lemma_fallback"]
|
return [word.lemma, word.lemma, "", "lemma_fallback"]
|
||||||
else:
|
else:
|
||||||
self.representation[idx] = rep_text
|
self.representation += " " + rep
|
||||||
return [word.lemma, rep_text, rep_msd, "ok"]
|
return [word.lemma, rep, word.msd, "ok"]
|
||||||
|
|
||||||
def content_right(self, freq, variable_word_order=None):
|
def content_right(self, freq):
|
||||||
fixed_word_order = sorted(self.representation.keys())
|
rep = re.sub(' +', ' ', self.representation.strip())
|
||||||
if variable_word_order is None:
|
result = [rep, str(freq)]
|
||||||
variable_word_order = fixed_word_order
|
self.representation = ""
|
||||||
rep_fixed_word_order = ' '.join([self.representation[o] for o in fixed_word_order if o in self.representation])
|
|
||||||
rep_variable_word_order = ' '.join([self.representation[o] for o in variable_word_order if o in self.representation])
|
|
||||||
result = [rep_fixed_word_order, rep_variable_word_order, str(freq)]
|
|
||||||
self.representation = {}
|
|
||||||
return result
|
return result
|
||||||
|
|
||||||
def group(self):
|
def group(self):
|
||||||
|
@ -83,7 +78,7 @@ class AllFormatter(Formatter):
|
||||||
word = words[idx]
|
word = words[idx]
|
||||||
return [word.id, word.text, word.lemma, word.msd]
|
return [word.id, word.text, word.lemma, word.msd]
|
||||||
|
|
||||||
def content_right(self, _freq, variable_word_order=None):
|
def content_right(self, _freq):
|
||||||
return []
|
return []
|
||||||
|
|
||||||
def group(self):
|
def group(self):
|
||||||
|
@ -152,27 +147,7 @@ class StatsFormatter(Formatter):
|
||||||
|
|
||||||
word = words[idx]
|
word = words[idx]
|
||||||
key = (sidx, idx, word.lemma)
|
key = (sidx, idx, word.lemma)
|
||||||
# try to fix missing dispersions
|
distribution = self.colocation_ids.dispersions[key]
|
||||||
if key not in self.colocation_ids.dispersions:
|
|
||||||
if word.lemma == 'k':
|
|
||||||
new_key = (sidx, idx, 'h')
|
|
||||||
elif word.lemma == 'h':
|
|
||||||
new_key = (sidx, idx, 'k')
|
|
||||||
elif word.lemma == 's':
|
|
||||||
new_key = (sidx, idx, 'z')
|
|
||||||
elif word.lemma == 'z':
|
|
||||||
new_key = (sidx, idx, 's')
|
|
||||||
else:
|
|
||||||
new_key = (sidx, idx, '')
|
|
||||||
if new_key in self.colocation_ids.dispersions:
|
|
||||||
key = new_key
|
|
||||||
logging.info('Dispersions fixed.')
|
|
||||||
else:
|
|
||||||
logging.info('Dispersions not fixed.')
|
|
||||||
if key in self.colocation_ids.dispersions:
|
|
||||||
distribution = self.colocation_ids.dispersions[key]
|
|
||||||
else:
|
|
||||||
distribution = 1
|
|
||||||
return [self.stat_str(distribution)]
|
return [self.stat_str(distribution)]
|
||||||
|
|
||||||
def content_right(self, freq):
|
def content_right(self, freq):
|
||||||
|
@ -206,13 +181,13 @@ class OutFormatter(Formatter):
|
||||||
def header_right(self):
|
def header_right(self):
|
||||||
return self.f1.header_right() + self.f2.header_right()
|
return self.f1.header_right() + self.f2.header_right()
|
||||||
|
|
||||||
def content_repeat(self, words, representations, idx, sidx, variable_word_order=None):
|
def content_repeat(self, words, representations, idx, sidx):
|
||||||
cr1 = self.f1.content_repeat(words, representations, idx, sidx)
|
cr1 = self.f1.content_repeat(words, representations, idx, sidx)
|
||||||
cr2 = self.f2.content_repeat(words, representations, idx, sidx)
|
cr2 = self.f2.content_repeat(words, representations, idx, sidx)
|
||||||
return cr1 + cr2
|
return cr1 + cr2
|
||||||
|
|
||||||
def content_right(self, freq, variable_word_order=None):
|
def content_right(self, freq):
|
||||||
return self.f1.content_right(freq, variable_word_order) + self.f2.content_right(freq)
|
return self.f1.content_right(freq) + self.f2.content_right(freq)
|
||||||
|
|
||||||
def group(self):
|
def group(self):
|
||||||
return self.f1.group() and self.f2.group()
|
return self.f1.group() and self.f2.group()
|
|
@ -1,4 +1,4 @@
|
||||||
from luscenje_struktur.restriction import MorphologyRegex
|
from restriction import MorphologyRegex
|
||||||
|
|
||||||
|
|
||||||
def get_lemma_features(et):
|
def get_lemma_features(et):
|
||||||
|
@ -8,7 +8,7 @@ def get_lemma_features(et):
|
||||||
|
|
||||||
result = {}
|
result = {}
|
||||||
for pos in lf.iter('POS'):
|
for pos in lf.iter('POS'):
|
||||||
rgx_list = MorphologyRegex(pos).rgxs[0]
|
rgx_list = MorphologyRegex(pos).rgx
|
||||||
rgx_str = ""
|
rgx_str = ""
|
||||||
for position in rgx_list:
|
for position in rgx_list:
|
||||||
if position == ".":
|
if position == ".":
|
148
src/loader.py
Normal file
148
src/loader.py
Normal file
|
@ -0,0 +1,148 @@
|
||||||
|
from xml.etree import ElementTree
|
||||||
|
import logging
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
import gzip
|
||||||
|
import pathlib
|
||||||
|
|
||||||
|
from progress_bar import progress
|
||||||
|
from word import Word
|
||||||
|
|
||||||
|
|
||||||
|
def is_root_id(id_):
|
||||||
|
return len(id_.split('.')) == 3
|
||||||
|
|
||||||
|
|
||||||
|
def load_files(args, database):
|
||||||
|
filenames = args.input
|
||||||
|
skip_id_check = args.skip_id_check
|
||||||
|
do_msd_translate = not args.no_msd_translate
|
||||||
|
|
||||||
|
database.init("CREATE TABLE Files ( filename varchar(2048) )")
|
||||||
|
|
||||||
|
for idx, fname in enumerate(filenames):
|
||||||
|
print("FILE ", fname, "{}/{}".format(idx, len(filenames)))
|
||||||
|
extension = pathlib.Path(fname).suffix
|
||||||
|
|
||||||
|
# check if file with the same name already loaded...
|
||||||
|
loaded = database.execute("SELECT * FROM Files WHERE filename=?", (fname, )).fetchone()
|
||||||
|
if loaded is not None:
|
||||||
|
print("ALREADY LOADED")
|
||||||
|
continue
|
||||||
|
|
||||||
|
if extension == ".xml":
|
||||||
|
et = load_xml(fname)
|
||||||
|
yield file_sentence_generator(et, skip_id_check, do_msd_translate, args.pc_tag)
|
||||||
|
elif extension == ".gz":
|
||||||
|
yield load_csv(fname, True)
|
||||||
|
else:
|
||||||
|
yield load_csv(fname, False)
|
||||||
|
# else:
|
||||||
|
# raise NotImplementedError("Unknown file extension: {}".format(extension))
|
||||||
|
|
||||||
|
database.execute("INSERT INTO Files (filename) VALUES (?)", (fname, ))
|
||||||
|
database.commit()
|
||||||
|
|
||||||
|
|
||||||
|
def lines_gz(filename):
|
||||||
|
with gzip.open(filename, 'r') as fp:
|
||||||
|
for line in progress(fp, 'load-gz'):
|
||||||
|
yield line.decode('utf8')
|
||||||
|
|
||||||
|
|
||||||
|
def lines_csv(filename):
|
||||||
|
with open(filename, 'r') as fp:
|
||||||
|
for line in progress(fp, 'load-csv'):
|
||||||
|
yield line
|
||||||
|
|
||||||
|
|
||||||
|
def load_csv(filename, compressed):
|
||||||
|
result = []
|
||||||
|
bad_sentence = False
|
||||||
|
|
||||||
|
words = {}
|
||||||
|
links = []
|
||||||
|
|
||||||
|
def sentence_end(bad_sentence):
|
||||||
|
if bad_sentence:
|
||||||
|
return
|
||||||
|
|
||||||
|
for lfrom, ldest, ana in links:
|
||||||
|
if lfrom not in words or ldest not in words:
|
||||||
|
logging.warning("Bad link in sentence: " + line_split[0])
|
||||||
|
continue
|
||||||
|
words[lfrom].add_link(ana, words[ldest])
|
||||||
|
result.extend(words.values())
|
||||||
|
|
||||||
|
line_gen = lines_gz if compressed else lines_csv
|
||||||
|
for line in line_gen(filename):
|
||||||
|
line_str = line.strip()
|
||||||
|
line_fixed = line_str.replace(',', '\t').replace('\t\t\t', '\t,\t')
|
||||||
|
line_split = line_fixed.split("\t")
|
||||||
|
|
||||||
|
if line_split[1] == "1" and len(words) > 0:
|
||||||
|
sentence_end(bad_sentence)
|
||||||
|
bad_sentence = False
|
||||||
|
links = []
|
||||||
|
words = {}
|
||||||
|
|
||||||
|
try:
|
||||||
|
sid, wid, text, msd, lemma, link_src, link_type = line_split
|
||||||
|
except ValueError:
|
||||||
|
bad_sentence = True
|
||||||
|
full_id = "{}.{}".format(sid, wid)
|
||||||
|
|
||||||
|
words[wid] = Word(lemma, msd, full_id, text, True)
|
||||||
|
if link_src != '0':
|
||||||
|
links.append((link_src, wid, link_type))
|
||||||
|
|
||||||
|
sentence_end(bad_sentence)
|
||||||
|
return result
|
||||||
|
|
||||||
|
def load_xml(filename):
|
||||||
|
with open(filename, 'r') as fp:
|
||||||
|
content = fp.read()
|
||||||
|
|
||||||
|
xmlstring = re.sub(' xmlns="[^"]+"', '', content, count=1)
|
||||||
|
xmlstring = xmlstring.replace(' xml:', ' ')
|
||||||
|
return ElementTree.XML(xmlstring)
|
||||||
|
|
||||||
|
|
||||||
|
def file_sentence_generator(et, skip_id_check, do_msd_translate, pc_tag):
|
||||||
|
words = {}
|
||||||
|
sentences = list(et.iter('s'))
|
||||||
|
for sentence in progress(sentences, "load-text"):
|
||||||
|
for w in sentence.iter("w"):
|
||||||
|
words[w.get('id')] = Word.from_xml(w, do_msd_translate)
|
||||||
|
for pc in sentence.iter(pc_tag):
|
||||||
|
words[pc.get('id')] = Word.pc_word(pc, do_msd_translate)
|
||||||
|
|
||||||
|
for l in sentence.iter("link"):
|
||||||
|
if 'dep' in l.keys():
|
||||||
|
ana = l.get('afun')
|
||||||
|
lfrom = l.get('from')
|
||||||
|
dest = l.get('dep')
|
||||||
|
else:
|
||||||
|
ana = l.get('ana')
|
||||||
|
if ana[:4] != 'syn:': # dont bother...
|
||||||
|
continue
|
||||||
|
ana = ana[4:]
|
||||||
|
lfrom, dest = l.get('target').replace('#', '').split()
|
||||||
|
|
||||||
|
if lfrom in words:
|
||||||
|
if not skip_id_check and is_root_id(lfrom):
|
||||||
|
logging.error("NOO: {}".format(lfrom))
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
if dest in words:
|
||||||
|
next_word = words[dest]
|
||||||
|
words[lfrom].add_link(ana, next_word)
|
||||||
|
else:
|
||||||
|
logging.error("Unknown id: {}".format(dest))
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
else:
|
||||||
|
# strange errors, just skip...
|
||||||
|
pass
|
||||||
|
|
||||||
|
return list(words.values())
|
|
@ -1,4 +1,4 @@
|
||||||
from luscenje_struktur.word import Word
|
from word import Word
|
||||||
|
|
||||||
class StructureMatch:
|
class StructureMatch:
|
||||||
def __init__(self, match_id, structure):
|
def __init__(self, match_id, structure):
|
||||||
|
@ -28,8 +28,8 @@ class StructureMatch:
|
||||||
|
|
||||||
result.matches[-1][str(component_id)] = Word(word_lemma, word_msd, word_id, word_text, False)
|
result.matches[-1][str(component_id)] = Word(word_lemma, word_msd, word_id, word_text, False)
|
||||||
|
|
||||||
for component_id, text, msd in db.execute("SELECT component_id, text, msd FROM Representations WHERE colocation_id=?", (colocation_id,)):
|
for component_id, text in db.execute("SELECT component_id, text FROM Representations WHERE colocation_id=?", (colocation_id,)):
|
||||||
result.representations[str(component_id)] = (text, msd)
|
result.representations[str(component_id)] = text
|
||||||
|
|
||||||
return result
|
return result
|
||||||
|
|
|
@ -1,12 +1,9 @@
|
||||||
import gc
|
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
from ast import literal_eval
|
from ast import literal_eval
|
||||||
from time import time
|
|
||||||
import logging
|
|
||||||
|
|
||||||
from luscenje_struktur.match import StructureMatch
|
from match import StructureMatch
|
||||||
from luscenje_struktur.representation_assigner import RepresentationAssigner
|
from representation_assigner import RepresentationAssigner
|
||||||
from luscenje_struktur.progress_bar import progress
|
from progress_bar import progress
|
||||||
|
|
||||||
class MatchStore:
|
class MatchStore:
|
||||||
def __init__(self, args, db):
|
def __init__(self, args, db):
|
||||||
|
@ -38,7 +35,6 @@ class MatchStore:
|
||||||
colocation_id INTEGER,
|
colocation_id INTEGER,
|
||||||
component_id INTEGER,
|
component_id INTEGER,
|
||||||
text varchar(32),
|
text varchar(32),
|
||||||
msd varchar(32),
|
|
||||||
FOREIGN KEY(colocation_id) REFERENCES Colocations(colocation_id))
|
FOREIGN KEY(colocation_id) REFERENCES Colocations(colocation_id))
|
||||||
""")
|
""")
|
||||||
self.db.init("""CREATE TABLE Dispersions (
|
self.db.init("""CREATE TABLE Dispersions (
|
||||||
|
@ -95,17 +91,10 @@ class MatchStore:
|
||||||
(structure.id,)):
|
(structure.id,)):
|
||||||
yield StructureMatch.from_db(self.db, cid[0], structure)
|
yield StructureMatch.from_db(self.db, cid[0], structure)
|
||||||
|
|
||||||
def add_inserts(self, inserts):
|
def set_representations(self, word_renderer, structures):
|
||||||
for match in inserts:
|
|
||||||
for component_id, (text, msd) in match.representations.items():
|
|
||||||
self.db.execute("""
|
|
||||||
INSERT INTO Representations (colocation_id, component_id, text, msd)
|
|
||||||
VALUES (?,?,?,?)""", (match.match_id, component_id, text, msd))
|
|
||||||
|
|
||||||
def set_representations(self, word_renderer, structures, sloleks_db=None):
|
|
||||||
step_name = 'representation'
|
step_name = 'representation'
|
||||||
if self.db.is_step_done(step_name):
|
if self.db.is_step_done(step_name):
|
||||||
logging.info("Representation step already done, skipping")
|
print("Representation step already done, skipping")
|
||||||
return
|
return
|
||||||
|
|
||||||
num_inserts = 1000
|
num_inserts = 1000
|
||||||
|
@ -113,20 +102,20 @@ class MatchStore:
|
||||||
|
|
||||||
structures_dict = {s.id: s for s in structures}
|
structures_dict = {s.id: s for s in structures}
|
||||||
num_representations = int(self.db.execute("SELECT Count(*) FROM Colocations").fetchone()[0])
|
num_representations = int(self.db.execute("SELECT Count(*) FROM Colocations").fetchone()[0])
|
||||||
start_time = time()
|
|
||||||
for cid, sid in progress(self.db.execute("SELECT colocation_id, structure_id FROM Colocations"), "representations", total=num_representations):
|
for cid, sid in progress(self.db.execute("SELECT colocation_id, structure_id FROM Colocations"), "representations", total=num_representations):
|
||||||
structure = structures_dict[sid]
|
structure = structures_dict[sid]
|
||||||
match = StructureMatch.from_db(self.db, cid, structure)
|
match = StructureMatch.from_db(self.db, cid, structure)
|
||||||
RepresentationAssigner.set_representations(match, word_renderer, sloleks_db=sloleks_db)
|
RepresentationAssigner.set_representations(match, word_renderer)
|
||||||
|
|
||||||
inserts.append(match)
|
inserts.append(match)
|
||||||
if len(inserts) > num_inserts:
|
if len(inserts) > num_inserts:
|
||||||
self.add_inserts(inserts)
|
for match in inserts:
|
||||||
|
for component_id, text in match.representations.items():
|
||||||
|
self.db.execute("""
|
||||||
|
INSERT INTO Representations (colocation_id, component_id, text)
|
||||||
|
VALUES (?,?,?)""", (match.match_id, component_id, text))
|
||||||
inserts = []
|
inserts = []
|
||||||
if time() - start_time > 5:
|
|
||||||
start_time = time()
|
|
||||||
gc.collect()
|
|
||||||
self.add_inserts(inserts)
|
|
||||||
self.db.step_is_done(step_name)
|
self.db.step_is_done(step_name)
|
||||||
|
|
||||||
def has_colocation_id_enough_frequency(self, colocation_id):
|
def has_colocation_id_enough_frequency(self, colocation_id):
|
||||||
|
@ -149,7 +138,7 @@ class MatchStore:
|
||||||
dispersions[(str(structure_id), component_id, lemma)] += 1
|
dispersions[(str(structure_id), component_id, lemma)] += 1
|
||||||
|
|
||||||
self.dispersions = dict(dispersions)
|
self.dispersions = dict(dispersions)
|
||||||
logging.info("Storing dispersions...")
|
print("Storing dispersions...")
|
||||||
self.store_dispersions()
|
self.store_dispersions()
|
||||||
|
|
||||||
self.db.step_is_done(step_name)
|
self.db.step_is_done(step_name)
|
|
@ -1911,4 +1911,4 @@ MSD_TRANSLATE = {
|
||||||
"Ne": "Ne",
|
"Ne": "Ne",
|
||||||
"Nh": "Nh",
|
"Nh": "Nh",
|
||||||
"Na": "Na",
|
"Na": "Na",
|
||||||
"U": "Z"}
|
"U": "N"}
|
|
@ -1,5 +1,4 @@
|
||||||
import time
|
import time
|
||||||
import logging
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
@ -22,10 +21,10 @@ class Progress:
|
||||||
for n, el in enumerate(iterable):
|
for n, el in enumerate(iterable):
|
||||||
now = time.time()
|
now = time.time()
|
||||||
if now - last_report > REPORT_ON:
|
if now - last_report > REPORT_ON:
|
||||||
logging.info("\r{}: {}/{}".format(description, n, total), end="")
|
print("\r{}: {}/{}".format(description, n, total), end="")
|
||||||
last_report = now
|
last_report = now
|
||||||
yield el
|
yield el
|
||||||
logging.info(" -> {}".format(time.time() - start_time))
|
print(" -> {}".format(time.time() - start_time))
|
||||||
else:
|
else:
|
||||||
yield from tqdm(iterable, desc=description, total=total)
|
yield from tqdm(iterable, desc=description, total=total)
|
||||||
|
|
|
@ -1,11 +1,8 @@
|
||||||
import logging
|
import logging
|
||||||
|
|
||||||
from collections import Counter
|
from collections import Counter
|
||||||
from luscenje_struktur.codes_tagset import TAGSET, CODES
|
from codes_tagset import TAGSET, CODES
|
||||||
from luscenje_struktur.word import WordMsdOnly
|
from word import WordMsdOnly
|
||||||
|
|
||||||
from luscenje_struktur.word import WordDummy
|
|
||||||
|
|
||||||
|
|
||||||
class ComponentRepresentation:
|
class ComponentRepresentation:
|
||||||
def __init__(self, data, word_renderer):
|
def __init__(self, data, word_renderer):
|
||||||
|
@ -14,7 +11,6 @@ class ComponentRepresentation:
|
||||||
|
|
||||||
self.words = []
|
self.words = []
|
||||||
self.rendition_text = None
|
self.rendition_text = None
|
||||||
self.rendition_msd = None
|
|
||||||
self.agreement = []
|
self.agreement = []
|
||||||
|
|
||||||
def get_agreement(self):
|
def get_agreement(self):
|
||||||
|
@ -23,37 +19,31 @@ class ComponentRepresentation:
|
||||||
def add_word(self, word):
|
def add_word(self, word):
|
||||||
self.words.append(word)
|
self.words.append(word)
|
||||||
|
|
||||||
def render(self, sloleks_db=None):
|
def render(self):
|
||||||
if self.rendition_text is None:
|
if self.rendition_text is None:
|
||||||
self.rendition_text, self.rendition_msd = self._render(sloleks_db=sloleks_db)
|
self.rendition_text = self._render()
|
||||||
|
|
||||||
def _render(self, sloleks_db=None):
|
def _render(self):
|
||||||
raise NotImplementedError("Not implemented for class: {}".format(type(self)))
|
raise NotImplementedError("Not implemented for class: {}".format(type(self)))
|
||||||
|
|
||||||
class LemmaCR(ComponentRepresentation):
|
class LemmaCR(ComponentRepresentation):
|
||||||
def _render(self, sloleks_db=None):
|
def _render(self):
|
||||||
# TODO FIX THIS TO LEMMA MSD
|
return self.words[0].lemma if len(self.words) > 0 else None
|
||||||
if len(self.words) > 0:
|
|
||||||
return self.words[0].lemma, self.words[0].msd
|
|
||||||
else:
|
|
||||||
return None, None
|
|
||||||
|
|
||||||
class LexisCR(ComponentRepresentation):
|
class LexisCR(ComponentRepresentation):
|
||||||
def _render(self, sloleks_db=None):
|
def _render(self):
|
||||||
return self.data['lexis'], 'Q'
|
return self.data['lexis']
|
||||||
|
|
||||||
class WordFormAllCR(ComponentRepresentation):
|
class WordFormAllCR(ComponentRepresentation):
|
||||||
def _render(self, sloleks_db=None):
|
def _render(self):
|
||||||
if len(self.words) == 0:
|
if len(self.words) == 0:
|
||||||
return None, None
|
return None
|
||||||
else:
|
else:
|
||||||
forms = [w.text.lower() for w in self.words]
|
forms = [w.text.lower() for w in self.words]
|
||||||
msds = [w.msd for w in self.words]
|
return "/".join(set(forms))
|
||||||
|
|
||||||
return "/".join(set(forms)), "/".join(set(msds))
|
|
||||||
|
|
||||||
class WordFormAnyCR(ComponentRepresentation):
|
class WordFormAnyCR(ComponentRepresentation):
|
||||||
def _render(self, sloleks_db=None):
|
def _render(self):
|
||||||
text_forms = {}
|
text_forms = {}
|
||||||
msd_lemma_txt_triplets = Counter([(w.msd, w.lemma, w.text) for w in self.words])
|
msd_lemma_txt_triplets = Counter([(w.msd, w.lemma, w.text) for w in self.words])
|
||||||
for (msd, lemma, text), _n in reversed(msd_lemma_txt_triplets.most_common()):
|
for (msd, lemma, text), _n in reversed(msd_lemma_txt_triplets.most_common()):
|
||||||
|
@ -70,38 +60,20 @@ class WordFormAnyCR(ComponentRepresentation):
|
||||||
# check if agreements match
|
# check if agreements match
|
||||||
agreements_matched = [agr.match(word_msd) for agr in self.agreement]
|
agreements_matched = [agr.match(word_msd) for agr in self.agreement]
|
||||||
|
|
||||||
# in case all agreements do not match try to get data from sloleks and change properly
|
|
||||||
if sloleks_db is not None and not all(agreements_matched):
|
|
||||||
for i, agr in enumerate(self.agreement):
|
|
||||||
if not agr.match(word_msd):
|
|
||||||
msd, lemma, text = sloleks_db.get_word_form(agr.lemma, agr.msd(), agr.data, align_msd=word_msd)
|
|
||||||
if msd is not None:
|
|
||||||
agr.msds[0] = msd
|
|
||||||
agr.words.append(WordDummy(msd, lemma, text))
|
|
||||||
# when we find element in sloleks automatically add it (no need for second checks, since msd
|
|
||||||
# is tailored to pass tests by default)
|
|
||||||
agr.rendition_candidate = text
|
|
||||||
agr.rendition_msd_candidate = msd
|
|
||||||
agreements_matched[i] = True
|
|
||||||
else:
|
|
||||||
break
|
|
||||||
|
|
||||||
|
|
||||||
# if we are at the last "backup word", then confirm matches
|
# if we are at the last "backup word", then confirm matches
|
||||||
# that worked for this one and return
|
# that worked for this one and return
|
||||||
if word_lemma is None:
|
if word_lemma is None:
|
||||||
for agr, matched in zip(self.agreement, agreements_matched):
|
for agr, matched in zip(self.agreement, agreements_matched):
|
||||||
if matched:
|
if matched:
|
||||||
agr.confirm_match()
|
agr.confirm_match()
|
||||||
return None, None
|
return None
|
||||||
|
|
||||||
# if all agreements match, we win!
|
# if all agreements match, we win!
|
||||||
if all(agreements_matched):
|
if all(agreements_matched):
|
||||||
for agr in self.agreement:
|
for agr in self.agreement:
|
||||||
agr.confirm_match()
|
agr.confirm_match()
|
||||||
|
|
||||||
return text_forms[(word_msd, word_lemma)], word_msd
|
return text_forms[(word_msd, word_lemma)]
|
||||||
return None, None
|
|
||||||
|
|
||||||
|
|
||||||
class WordFormMsdCR(WordFormAnyCR):
|
class WordFormMsdCR(WordFormAnyCR):
|
||||||
|
@ -121,8 +93,6 @@ class WordFormMsdCR(WordFormAnyCR):
|
||||||
for key, value in selectors.items():
|
for key, value in selectors.items():
|
||||||
t = word_msd[0]
|
t = word_msd[0]
|
||||||
v = TAGSET[t].index(key.lower())
|
v = TAGSET[t].index(key.lower())
|
||||||
if v + 1 >= len(word_msd):
|
|
||||||
return False
|
|
||||||
f1 = word_msd[v + 1]
|
f1 = word_msd[v + 1]
|
||||||
f2 = CODES[value]
|
f2 = CODES[value]
|
||||||
|
|
||||||
|
@ -139,19 +109,16 @@ class WordFormMsdCR(WordFormAnyCR):
|
||||||
if self.check_msd(word.msd):
|
if self.check_msd(word.msd):
|
||||||
super().add_word(word)
|
super().add_word(word)
|
||||||
|
|
||||||
def _render(self, sloleks_db=None):
|
def _render(self):
|
||||||
if len(self.words) == 0 and sloleks_db is not None:
|
|
||||||
msd, lemma, text = sloleks_db.get_word_form(self.lemma, self.msd(), self.data)
|
|
||||||
if msd is not None:
|
|
||||||
self.words.append(WordDummy(msd, lemma, text))
|
|
||||||
self.words.append(WordMsdOnly(self._common_msd()))
|
self.words.append(WordMsdOnly(self._common_msd()))
|
||||||
return super()._render(sloleks_db)
|
return super()._render()
|
||||||
|
|
||||||
def _common_msd(self):
|
def _common_msd(self):
|
||||||
msds = sorted(self.msds, key=len)
|
msds = sorted(self.msds, key=len)
|
||||||
common_msd = ["-" if not all(msds[j][idx] == msds[0][idx] for j in range(1, len(self.msds)))
|
common_msd = ["-" if not all(msds[j][idx] == msds[0][idx] for j in range(1, len(self.msds)))
|
||||||
else msds[0][idx] for idx in range(len(msds[0]))]
|
else msds[0][idx] for idx in range(len(msds[0]))]
|
||||||
common_msd = "".join(common_msd)
|
common_msd = "".join(common_msd)
|
||||||
|
iommon_msd = "".join(common_msd)
|
||||||
return self.word_renderer.common_lemma_msd(self.lemma, common_msd)
|
return self.word_renderer.common_lemma_msd(self.lemma, common_msd)
|
||||||
|
|
||||||
|
|
||||||
|
@ -159,7 +126,6 @@ class WordFormAgreementCR(WordFormMsdCR):
|
||||||
def __init__(self, data, word_renderer):
|
def __init__(self, data, word_renderer):
|
||||||
super().__init__(data, word_renderer)
|
super().__init__(data, word_renderer)
|
||||||
self.rendition_candidate = None
|
self.rendition_candidate = None
|
||||||
self.rendition_msd_candidate = None
|
|
||||||
|
|
||||||
def get_agreement(self):
|
def get_agreement(self):
|
||||||
return self.data['other']
|
return self.data['other']
|
||||||
|
@ -175,14 +141,12 @@ class WordFormAgreementCR(WordFormMsdCR):
|
||||||
if WordFormAgreementCR.check_agreement(word_msd, candidate_msd, self.data['agreement']):
|
if WordFormAgreementCR.check_agreement(word_msd, candidate_msd, self.data['agreement']):
|
||||||
if self.check_msd(candidate_msd):
|
if self.check_msd(candidate_msd):
|
||||||
self.rendition_candidate = candidate_text
|
self.rendition_candidate = candidate_text
|
||||||
self.rendition_msd_candidate = candidate_msd
|
|
||||||
return True
|
return True
|
||||||
|
|
||||||
return False
|
return False
|
||||||
|
|
||||||
def confirm_match(self):
|
def confirm_match(self):
|
||||||
self.rendition_text = self.rendition_candidate
|
self.rendition_text = self.rendition_candidate
|
||||||
self.rendition_msd = self.rendition_msd_candidate
|
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def check_agreement(msd1, msd2, agreements):
|
def check_agreement(msd1, msd2, agreements):
|
||||||
|
@ -218,5 +182,5 @@ class WordFormAgreementCR(WordFormMsdCR):
|
||||||
|
|
||||||
return True
|
return True
|
||||||
|
|
||||||
def render(self, sloleks_db=None):
|
def render(self):
|
||||||
pass
|
pass
|
|
@ -1,4 +1,4 @@
|
||||||
from luscenje_struktur.representation import ComponentRepresentation, LemmaCR, LexisCR, WordFormAgreementCR, WordFormAnyCR, WordFormMsdCR, WordFormAllCR
|
from representation import ComponentRepresentation, LemmaCR, LexisCR, WordFormAgreementCR, WordFormAnyCR, WordFormMsdCR, WordFormAllCR
|
||||||
|
|
||||||
class RepresentationAssigner:
|
class RepresentationAssigner:
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
|
@ -27,10 +27,11 @@ class RepresentationAssigner:
|
||||||
elif feature['selection'] == "all":
|
elif feature['selection'] == "all":
|
||||||
self.representation_factory = WordFormAllCR
|
self.representation_factory = WordFormAllCR
|
||||||
elif feature['selection'] == 'agreement':
|
elif feature['selection'] == 'agreement':
|
||||||
|
assert feature['head'][:4] == 'cid_'
|
||||||
assert feature['msd'] is not None
|
assert feature['msd'] is not None
|
||||||
self.representation_factory = WordFormAgreementCR
|
self.representation_factory = WordFormAgreementCR
|
||||||
self.more['agreement'] = feature['msd'].split('+')
|
self.more['agreement'] = feature['msd'].split('+')
|
||||||
self.more['other'] = feature['head_cid']
|
self.more['other'] = feature['head'][4:]
|
||||||
else:
|
else:
|
||||||
raise NotImplementedError("Representation selection: {}".format(feature))
|
raise NotImplementedError("Representation selection: {}".format(feature))
|
||||||
|
|
||||||
|
@ -38,7 +39,7 @@ class RepresentationAssigner:
|
||||||
return self.representation_factory(self.more, word_renderer)
|
return self.representation_factory(self.more, word_renderer)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def set_representations(match, word_renderer, sloleks_db=None):
|
def set_representations(match, word_renderer):
|
||||||
representations = {}
|
representations = {}
|
||||||
for c in match.structure.components:
|
for c in match.structure.components:
|
||||||
representations[c.idx] = []
|
representations[c.idx] = []
|
||||||
|
@ -69,14 +70,13 @@ class RepresentationAssigner:
|
||||||
|
|
||||||
for cid, reps in representations.items():
|
for cid, reps in representations.items():
|
||||||
for rep in reps:
|
for rep in reps:
|
||||||
rep.render(sloleks_db=sloleks_db)
|
rep.render()
|
||||||
|
|
||||||
for cid, reps in representations.items():
|
for cid, reps in representations.items():
|
||||||
reps_text = [rep.rendition_text for rep in reps]
|
reps = [rep.rendition_text for rep in reps]
|
||||||
reps_msd = [rep.rendition_msd for rep in reps]
|
if reps == []:
|
||||||
if reps_text == []:
|
|
||||||
pass
|
pass
|
||||||
elif all(r is None for r in reps_text):
|
elif all(r is None for r in reps):
|
||||||
match.representations[cid] = (None, None)
|
match.representations[cid] = None
|
||||||
else:
|
else:
|
||||||
match.representations[cid] = (" ".join(("" if r is None else r) for r in reps_text), " ".join(("" if r is None else r) for r in reps_msd))
|
match.representations[cid] = " ".join(("" if r is None else r) for r in reps)
|
133
src/restriction.py
Normal file
133
src/restriction.py
Normal file
|
@ -0,0 +1,133 @@
|
||||||
|
import re
|
||||||
|
from enum import Enum
|
||||||
|
|
||||||
|
from codes_tagset import CODES, TAGSET
|
||||||
|
|
||||||
|
|
||||||
|
class RestrictionType(Enum):
|
||||||
|
Morphology = 0
|
||||||
|
Lexis = 1
|
||||||
|
MatchAll = 2
|
||||||
|
|
||||||
|
|
||||||
|
def determine_ppb(rgx):
|
||||||
|
if rgx[0] in ("A", "N", "R"):
|
||||||
|
return 0
|
||||||
|
elif rgx[0] == "V":
|
||||||
|
if len(rgx) == 1:
|
||||||
|
return 2
|
||||||
|
elif 'a' in rgx[1]:
|
||||||
|
return 3
|
||||||
|
elif 'm' in rgx[1]:
|
||||||
|
return 1
|
||||||
|
else:
|
||||||
|
return 2
|
||||||
|
else:
|
||||||
|
return 4
|
||||||
|
|
||||||
|
class MorphologyRegex:
|
||||||
|
def __init__(self, restriction):
|
||||||
|
self.min_msd_length = 1
|
||||||
|
|
||||||
|
restr_dict = {}
|
||||||
|
for feature in restriction:
|
||||||
|
feature_dict = dict(feature.items())
|
||||||
|
|
||||||
|
match_type = True
|
||||||
|
if "filter" in feature_dict:
|
||||||
|
assert feature_dict['filter'] == "negative"
|
||||||
|
match_type = False
|
||||||
|
del feature_dict['filter']
|
||||||
|
|
||||||
|
assert len(feature_dict) == 1
|
||||||
|
key, value = next(iter(feature_dict.items()))
|
||||||
|
restr_dict[key] = (value, match_type)
|
||||||
|
|
||||||
|
assert 'POS' in restr_dict
|
||||||
|
category = restr_dict['POS'][0].capitalize()
|
||||||
|
cat_code = CODES[category]
|
||||||
|
rgx = [cat_code] + ['.'] * 10
|
||||||
|
|
||||||
|
del restr_dict['POS']
|
||||||
|
|
||||||
|
for attribute, (value, typ) in restr_dict.items():
|
||||||
|
index = TAGSET[cat_code].index(attribute.lower())
|
||||||
|
assert index >= 0
|
||||||
|
|
||||||
|
if '|' in value:
|
||||||
|
match = "".join(CODES[val] for val in value.split('|'))
|
||||||
|
else:
|
||||||
|
match = CODES[value]
|
||||||
|
|
||||||
|
match = "[{}{}]".format("" if typ else "^", match)
|
||||||
|
rgx[index + 1] = match
|
||||||
|
|
||||||
|
if typ:
|
||||||
|
self.min_msd_length = max(index + 1, self.min_msd_length)
|
||||||
|
|
||||||
|
# strip rgx
|
||||||
|
for i in reversed(range(len(rgx))):
|
||||||
|
if rgx[i] == '.':
|
||||||
|
rgx = rgx[:-1]
|
||||||
|
else:
|
||||||
|
break
|
||||||
|
|
||||||
|
self.re_objects = [re.compile(r) for r in rgx]
|
||||||
|
self.rgx = rgx
|
||||||
|
|
||||||
|
def __call__(self, text):
|
||||||
|
if len(text) <= self.min_msd_length:
|
||||||
|
return False
|
||||||
|
|
||||||
|
for c, r in zip(text, self.re_objects):
|
||||||
|
if not r.match(c):
|
||||||
|
return False
|
||||||
|
return True
|
||||||
|
|
||||||
|
|
||||||
|
class LexisRegex:
|
||||||
|
def __init__(self, restriction):
|
||||||
|
restr_dict = {}
|
||||||
|
for feature in restriction:
|
||||||
|
restr_dict.update(feature.items())
|
||||||
|
|
||||||
|
assert "lemma" in restr_dict
|
||||||
|
self.match_list = restr_dict['lemma'].split('|')
|
||||||
|
|
||||||
|
def __call__(self, text):
|
||||||
|
return text in self.match_list
|
||||||
|
|
||||||
|
class Restriction:
|
||||||
|
def __init__(self, restriction_tag):
|
||||||
|
self.ppb = 4 # polnopomenska beseda (0-4)
|
||||||
|
|
||||||
|
if restriction_tag is None:
|
||||||
|
self.type = RestrictionType.MatchAll
|
||||||
|
self.matcher = None
|
||||||
|
self.present = None
|
||||||
|
return
|
||||||
|
|
||||||
|
restriction_type = restriction_tag.get('type')
|
||||||
|
if restriction_type == "morphology":
|
||||||
|
self.type = RestrictionType.Morphology
|
||||||
|
self.matcher = MorphologyRegex(list(restriction_tag))
|
||||||
|
self.ppb = determine_ppb(self.matcher.rgx)
|
||||||
|
|
||||||
|
elif restriction_type == "lexis":
|
||||||
|
self.type = RestrictionType.Lexis
|
||||||
|
self.matcher = LexisRegex(list(restriction_tag))
|
||||||
|
else:
|
||||||
|
raise NotImplementedError()
|
||||||
|
|
||||||
|
def match(self, word):
|
||||||
|
if self.type == RestrictionType.Morphology:
|
||||||
|
match_to = word.msd
|
||||||
|
elif self.type == RestrictionType.Lexis:
|
||||||
|
match_to = word.lemma
|
||||||
|
elif self.type == RestrictionType.MatchAll:
|
||||||
|
return True
|
||||||
|
else:
|
||||||
|
raise RuntimeError("Unreachable!")
|
||||||
|
|
||||||
|
return self.matcher(match_to)
|
||||||
|
|
|
@ -2,31 +2,25 @@ from xml.etree import ElementTree
|
||||||
import logging
|
import logging
|
||||||
import pickle
|
import pickle
|
||||||
|
|
||||||
from luscenje_struktur.codes_tagset import PPB_DEPRELS
|
from component import Component, ComponentType
|
||||||
from luscenje_struktur.component import Component, ComponentType
|
from lemma_features import get_lemma_features
|
||||||
from luscenje_struktur.lemma_features import get_lemma_features
|
|
||||||
|
|
||||||
class SyntacticStructure:
|
class SyntacticStructure:
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.id = None
|
self.id = None
|
||||||
# self.lbs = None
|
self.lbs = None
|
||||||
self.components = []
|
self.components = []
|
||||||
self.fake_root_included = False
|
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def from_xml(xml, no_stats):
|
def from_xml(xml):
|
||||||
st = SyntacticStructure()
|
st = SyntacticStructure()
|
||||||
st.id = xml.get('id')
|
st.id = xml.get('id')
|
||||||
if st.id is None:
|
st.lbs = xml.get('LBS')
|
||||||
st.id = xml.get('tempId')
|
|
||||||
# st.lbs = xml.get('LBS')
|
|
||||||
|
|
||||||
assert len(list(xml)) == 1
|
assert len(list(xml)) == 1
|
||||||
system = next(iter(xml))
|
system = next(iter(xml))
|
||||||
|
|
||||||
assert system.get('type') == 'JOS' or system.get('type') == 'UD'
|
assert system.get('type') == 'JOS'
|
||||||
system_type = system.get('type')
|
|
||||||
|
|
||||||
components, dependencies, definitions = list(system)
|
components, dependencies, definitions = list(system)
|
||||||
|
|
||||||
deps = [(dep.get('from'), dep.get('to'), dep.get('label'), dep.get('order'))
|
deps = [(dep.get('from'), dep.get('to'), dep.get('label'), dep.get('order'))
|
||||||
|
@ -37,50 +31,25 @@ class SyntacticStructure:
|
||||||
|
|
||||||
for comp in definitions:
|
for comp in definitions:
|
||||||
n = comp.get('cid')
|
n = comp.get('cid')
|
||||||
restrs[n] = []
|
restrs[n] = None
|
||||||
forms[n] = []
|
forms[n] = []
|
||||||
|
|
||||||
for el in comp:
|
for el in comp:
|
||||||
if el.tag.startswith("restriction"):
|
if el.tag.startswith("restriction"):
|
||||||
restrs[n].append(el)
|
assert restrs[n] is None
|
||||||
|
restrs[n] = el
|
||||||
elif el.tag.startswith("representation"):
|
elif el.tag.startswith("representation"):
|
||||||
st.add_representation(n, el, forms)
|
st.add_representation(n, el, forms)
|
||||||
else:
|
else:
|
||||||
raise NotImplementedError("Unknown definition: {} in structure {}"
|
raise NotImplementedError("Unknown definition: {} in structure {}"
|
||||||
.format(el.tag, st.id))
|
.format(el.tag, st.id))
|
||||||
|
|
||||||
fake_root_component = Component({'cid': '#', 'type': 'other', 'restriction': None}, system_type)
|
fake_root_component = Component({'cid': '#', 'type': 'other'})
|
||||||
fake_root_component_children = fake_root_component.find_next(deps, comps, restrs, forms, system_type)
|
st.components = fake_root_component.find_next(deps, comps, restrs, forms)
|
||||||
# all dep with value modra point to artificial root - fake_root_component
|
|
||||||
if any([dep[2] == 'modra' for dep in deps]):
|
|
||||||
st.fake_root_included = True
|
|
||||||
st.components = [fake_root_component] + fake_root_component_children
|
|
||||||
else:
|
|
||||||
st.components = fake_root_component_children
|
|
||||||
|
|
||||||
if not no_stats:
|
st.determine_core2w()
|
||||||
if system_type == 'JOS':
|
|
||||||
st.determine_core2w()
|
|
||||||
elif system_type == 'UD':
|
|
||||||
st.determine_core2w_ud()
|
|
||||||
return st
|
return st
|
||||||
|
|
||||||
def determine_core2w_ud(self):
|
|
||||||
deprels = {}
|
|
||||||
for c in self.components:
|
|
||||||
for next_el in c.next_element:
|
|
||||||
deprels[next_el[0]] = next_el[1]
|
|
||||||
ppb_components_num = 0
|
|
||||||
for c in self.components:
|
|
||||||
if c.type != ComponentType.Core:
|
|
||||||
continue
|
|
||||||
if c in deprels and deprels[c] not in PPB_DEPRELS:
|
|
||||||
continue
|
|
||||||
ppb_components_num += 1
|
|
||||||
c.type = ComponentType.Core2w
|
|
||||||
|
|
||||||
assert ppb_components_num == 2, RuntimeError("Cannot determine 2 'jedrna polnopomenska beseda' for", self.id)
|
|
||||||
|
|
||||||
def determine_core2w(self):
|
def determine_core2w(self):
|
||||||
ppb_components = []
|
ppb_components = []
|
||||||
for c in self.components:
|
for c in self.components:
|
||||||
|
@ -129,7 +98,6 @@ class SyntacticStructure:
|
||||||
|
|
||||||
def build_structures(args):
|
def build_structures(args):
|
||||||
filename = args.structures
|
filename = args.structures
|
||||||
no_stats = args.out is None and args.stats is None
|
|
||||||
|
|
||||||
max_num_components = -1
|
max_num_components = -1
|
||||||
with open(filename, 'r') as fp:
|
with open(filename, 'r') as fp:
|
||||||
|
@ -137,15 +105,12 @@ def build_structures(args):
|
||||||
|
|
||||||
structures = []
|
structures = []
|
||||||
for structure in et.iter('syntactic_structure'):
|
for structure in et.iter('syntactic_structure'):
|
||||||
if structure.attrib['type'] != 'collocation':
|
to_append = SyntacticStructure.from_xml(structure)
|
||||||
continue
|
|
||||||
to_append = SyntacticStructure.from_xml(structure, no_stats)
|
|
||||||
if to_append is None:
|
if to_append is None:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
structures.append(to_append)
|
structures.append(to_append)
|
||||||
to_append_len = len(to_append.components) if not to_append.fake_root_included else len(to_append.components) - 1
|
max_num_components = max(max_num_components, len(to_append.components))
|
||||||
max_num_components = max(max_num_components, to_append_len)
|
|
||||||
|
|
||||||
lemma_features = get_lemma_features(et)
|
lemma_features = get_lemma_features(et)
|
||||||
return structures, lemma_features, max_num_components
|
return structures, lemma_features, max_num_components
|
|
@ -1,5 +1,4 @@
|
||||||
from datetime import timedelta, datetime
|
from datetime import timedelta, datetime
|
||||||
import logging
|
|
||||||
|
|
||||||
class TimeInfo:
|
class TimeInfo:
|
||||||
def __init__(self, to_go):
|
def __init__(self, to_go):
|
||||||
|
@ -15,5 +14,5 @@ class TimeInfo:
|
||||||
seconds = sum(self.times) / len(self.times)
|
seconds = sum(self.times) / len(self.times)
|
||||||
td = timedelta(seconds = int(seconds * self.to_go))
|
td = timedelta(seconds = int(seconds * self.to_go))
|
||||||
ft = datetime.now() + td
|
ft = datetime.now() + td
|
||||||
logging.info("Going to finish in {}".format(ft.strftime("%d/%m @ %H:%M")))
|
print("Going to finish in {}".format(ft.strftime("%d/%m @ %H:%M")))
|
||||||
|
|
|
@ -10,32 +10,26 @@ import subprocess
|
||||||
import concurrent.futures
|
import concurrent.futures
|
||||||
import tempfile
|
import tempfile
|
||||||
|
|
||||||
from luscenje_struktur.progress_bar import progress
|
from progress_bar import progress
|
||||||
from luscenje_struktur.sloleks_db import SloleksDatabase
|
from word import Word
|
||||||
from luscenje_struktur.word import Word
|
from syntactic_structure import build_structures
|
||||||
from luscenje_struktur.syntactic_structure import build_structures
|
from match_store import MatchStore
|
||||||
from luscenje_struktur.match_store import MatchStore
|
from word_stats import WordStats
|
||||||
from luscenje_struktur.word_stats import WordStats
|
from writer import Writer
|
||||||
from luscenje_struktur.writer import Writer
|
from loader import load_files
|
||||||
from luscenje_struktur.loader import load_files
|
from database import Database
|
||||||
from luscenje_struktur.database import Database
|
from time_info import TimeInfo
|
||||||
from luscenje_struktur.time_info import TimeInfo
|
|
||||||
|
|
||||||
from luscenje_struktur.postprocessor import Postprocessor
|
|
||||||
|
|
||||||
|
|
||||||
def match_file(words, structures, postprocessor):
|
def match_file(words, structures):
|
||||||
matches = {s: [] for s in structures}
|
matches = {s: [] for s in structures}
|
||||||
|
|
||||||
for s in progress(structures, "matching"):
|
for s in progress(structures, "matching"):
|
||||||
for w in words:
|
for w in words:
|
||||||
mhere = s.match(w)
|
mhere = s.match(w)
|
||||||
for match in mhere:
|
for match in mhere:
|
||||||
if not postprocessor.is_fixed_restriction_order(match):
|
colocation_id = [(idx, w.lemma) for idx, w in match.items()]
|
||||||
continue
|
|
||||||
colocation_id = [[idx, w.lemma] for idx, w in match.items()]
|
|
||||||
colocation_id = [s.id] + list(sorted(colocation_id, key=lambda x: x[0]))
|
colocation_id = [s.id] + list(sorted(colocation_id, key=lambda x: x[0]))
|
||||||
match, collocation_id = postprocessor.process(match, colocation_id)
|
|
||||||
colocation_id = tuple(colocation_id)
|
colocation_id = tuple(colocation_id)
|
||||||
|
|
||||||
matches[s].append((match, colocation_id))
|
matches[s].append((match, colocation_id))
|
||||||
|
@ -50,7 +44,6 @@ def main(args):
|
||||||
database = Database(args)
|
database = Database(args)
|
||||||
match_store = MatchStore(args, database)
|
match_store = MatchStore(args, database)
|
||||||
word_stats = WordStats(lemma_msds, database)
|
word_stats = WordStats(lemma_msds, database)
|
||||||
postprocessor = Postprocessor(fixed_restriction_order=args.fixed_restriction_order)
|
|
||||||
|
|
||||||
for words in load_files(args, database):
|
for words in load_files(args, database):
|
||||||
if words is None:
|
if words is None:
|
||||||
|
@ -58,8 +51,7 @@ def main(args):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
matches = match_file(words, structures, postprocessor)
|
matches = match_file(words, structures)
|
||||||
|
|
||||||
match_store.add_matches(matches)
|
match_store.add_matches(matches)
|
||||||
word_stats.add_words(words)
|
word_stats.add_words(words)
|
||||||
database.commit()
|
database.commit()
|
||||||
|
@ -82,13 +74,7 @@ def main(args):
|
||||||
|
|
||||||
# figure out representations!
|
# figure out representations!
|
||||||
if args.out or args.out_no_stat:
|
if args.out or args.out_no_stat:
|
||||||
if args.sloleks_db is not None:
|
match_store.set_representations(word_stats, structures)
|
||||||
sloleks_db = SloleksDatabase(args.sloleks_db, args.load_sloleks)
|
|
||||||
else:
|
|
||||||
sloleks_db = None
|
|
||||||
match_store.set_representations(word_stats, structures, sloleks_db=sloleks_db)
|
|
||||||
if args.sloleks_db is not None:
|
|
||||||
sloleks_db.close()
|
|
||||||
|
|
||||||
Writer.make_output_writer(args, max_num_components, match_store, word_stats).write_out(
|
Writer.make_output_writer(args, max_num_components, match_store, word_stats).write_out(
|
||||||
structures, match_store)
|
structures, match_store)
|
||||||
|
@ -99,8 +85,6 @@ def main(args):
|
||||||
Writer.make_stats_writer(args, max_num_components, match_store, word_stats).write_out(
|
Writer.make_stats_writer(args, max_num_components, match_store, word_stats).write_out(
|
||||||
structures, match_store)
|
structures, match_store)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
parser = argparse.ArgumentParser(
|
parser = argparse.ArgumentParser(
|
||||||
description='Extract structures from a parsed corpus.')
|
description='Extract structures from a parsed corpus.')
|
||||||
|
@ -108,7 +92,6 @@ if __name__ == '__main__':
|
||||||
help='Structures definitions in xml file')
|
help='Structures definitions in xml file')
|
||||||
parser.add_argument('input',
|
parser.add_argument('input',
|
||||||
help='input file in (gz or xml currently). If none, then just database is loaded', nargs='*')
|
help='input file in (gz or xml currently). If none, then just database is loaded', nargs='*')
|
||||||
parser.add_argument('--sloleks_db', type=str, default=None, help='Sloleks database credentials')
|
|
||||||
parser.add_argument('--out',
|
parser.add_argument('--out',
|
||||||
help='Classic output file')
|
help='Classic output file')
|
||||||
parser.add_argument('--out-no-stat',
|
parser.add_argument('--out-no-stat',
|
||||||
|
@ -117,7 +100,7 @@ if __name__ == '__main__':
|
||||||
help='Additional output file, writes more data')
|
help='Additional output file, writes more data')
|
||||||
parser.add_argument('--stats',
|
parser.add_argument('--stats',
|
||||||
help='Output file for statistics')
|
help='Output file for statistics')
|
||||||
#
|
|
||||||
parser.add_argument('--no-msd-translate',
|
parser.add_argument('--no-msd-translate',
|
||||||
help='MSDs are translated from slovene to english by default',
|
help='MSDs are translated from slovene to english by default',
|
||||||
action='store_true')
|
action='store_true')
|
||||||
|
@ -135,10 +118,6 @@ if __name__ == '__main__':
|
||||||
help='Generate one output for each syntactic structure',
|
help='Generate one output for each syntactic structure',
|
||||||
action='store_true')
|
action='store_true')
|
||||||
|
|
||||||
parser.add_argument('--load-sloleks',
|
|
||||||
help='Tells weather sloleks is loaded into memory at the beginning of processing or not. Should be in',
|
|
||||||
action='store_true')
|
|
||||||
|
|
||||||
parser.add_argument('--sort-by',
|
parser.add_argument('--sort-by',
|
||||||
help="Sort by a this column (index)", type=int, default=-1)
|
help="Sort by a this column (index)", type=int, default=-1)
|
||||||
parser.add_argument('--sort-reversed',
|
parser.add_argument('--sort-reversed',
|
||||||
|
@ -146,23 +125,12 @@ if __name__ == '__main__':
|
||||||
|
|
||||||
parser.add_argument('--db',
|
parser.add_argument('--db',
|
||||||
help="Database file to use (instead of memory)", default=None)
|
help="Database file to use (instead of memory)", default=None)
|
||||||
parser.add_argument('--collocation_sentence_map_dest',
|
|
||||||
help="Destination to folder where collocation-sentence mapper (mappers in case of multiple-output).", default=None)
|
|
||||||
parser.add_argument('--new-db',
|
parser.add_argument('--new-db',
|
||||||
help="Writes over database file, if there exists one", action='store_true')
|
help="Writes over database file, if there exists one", action='store_true')
|
||||||
|
|
||||||
parser.add_argument('--pc-tag',
|
parser.add_argument('--pc-tag',
|
||||||
help='Tag for separators, usually pc or c', default="pc")
|
help='Tag for separators, usually pc or c', default="pc")
|
||||||
parser.add_argument('--separator',
|
|
||||||
help='Separator in output file', default="\t")
|
|
||||||
parser.add_argument('--ignore-punctuations',
|
|
||||||
help="Sort in reversed ored", action='store_true')
|
|
||||||
parser.add_argument('--fixed-restriction-order',
|
|
||||||
help='If used, words have to be in the same order as components.',
|
|
||||||
action='store_true')
|
|
||||||
parser.add_argument('--new-tei',
|
|
||||||
help='Attribute to be used, when using new version of tei. (default=False)',
|
|
||||||
action='store_true')
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
logging.basicConfig(stream=sys.stderr, level=args.verbose.upper())
|
logging.basicConfig(stream=sys.stderr, level=args.verbose.upper())
|
||||||
|
|
|
@ -1,14 +1,7 @@
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
import logging
|
import logging
|
||||||
|
|
||||||
from luscenje_struktur.msd_translate import MSD_TRANSLATE
|
from msd_translate import MSD_TRANSLATE
|
||||||
|
|
||||||
|
|
||||||
class WordCompressed:
|
|
||||||
def __init__(self, text, collocation, dependency_tree):
|
|
||||||
self.text = text
|
|
||||||
self.collocation = collocation
|
|
||||||
self.dependency_tree = dependency_tree
|
|
||||||
|
|
||||||
|
|
||||||
class WordMsdOnly:
|
class WordMsdOnly:
|
||||||
|
@ -21,26 +14,12 @@ class WordMsdOnly:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
class WordDummy:
|
|
||||||
def __init__(self, msd, lemma, text):
|
|
||||||
self.msd = msd
|
|
||||||
self.lemma = lemma
|
|
||||||
self.text = text
|
|
||||||
|
|
||||||
def most_frequent_text(self, word_renderer):
|
|
||||||
return word_renderer.render(self.lemma, self.msd)
|
|
||||||
|
|
||||||
|
|
||||||
class Word:
|
class Word:
|
||||||
def __init__(self, lemma, msd, wid, text, do_msd_translate, fake_word=False, previous_punctuation=None):
|
def __init__(self, lemma, msd, wid, text, do_msd_translate):
|
||||||
self.lemma = lemma
|
self.lemma = lemma
|
||||||
self.msd = MSD_TRANSLATE[msd] if do_msd_translate else msd
|
self.msd = MSD_TRANSLATE[msd] if do_msd_translate else msd
|
||||||
self.id = wid
|
self.id = wid
|
||||||
self.idi = None
|
|
||||||
self.text = text
|
self.text = text
|
||||||
self.glue = ''
|
|
||||||
self.previous_glue = '' if previous_punctuation is None else previous_punctuation
|
|
||||||
self.fake_word = fake_word
|
|
||||||
|
|
||||||
self.links = defaultdict(list)
|
self.links = defaultdict(list)
|
||||||
|
|
||||||
|
@ -62,10 +41,10 @@ class Word:
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def get_msd(comp):
|
def get_msd(comp):
|
||||||
d = dict(comp.items())
|
d = dict(comp.items())
|
||||||
if 'ana' in d:
|
if 'msd' in d:
|
||||||
return d['ana'][4:]
|
|
||||||
elif 'msd' in d:
|
|
||||||
return d['msd']
|
return d['msd']
|
||||||
|
elif 'ana' in d:
|
||||||
|
return d['ana'][4:]
|
||||||
else:
|
else:
|
||||||
logging.error(d)
|
logging.error(d)
|
||||||
raise NotImplementedError("MSD?")
|
raise NotImplementedError("MSD?")
|
||||||
|
@ -76,11 +55,6 @@ class Word:
|
||||||
pc.set('msd', "N" if do_msd_translate else "U")
|
pc.set('msd', "N" if do_msd_translate else "U")
|
||||||
return Word.from_xml(pc, do_msd_translate)
|
return Word.from_xml(pc, do_msd_translate)
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def fake_root_word(sentence_id):
|
|
||||||
wid = sentence_id
|
|
||||||
return Word('', '', wid, '', False, True)
|
|
||||||
|
|
||||||
def add_link(self, link, to):
|
def add_link(self, link, to):
|
||||||
self.links[link].append(to)
|
self.links[link].append(to)
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
from collections import defaultdict, Counter
|
from collections import defaultdict, Counter
|
||||||
|
|
||||||
from luscenje_struktur.progress_bar import progress
|
from progress_bar import progress
|
||||||
import logging
|
|
||||||
|
|
||||||
class WordStats:
|
class WordStats:
|
||||||
def __init__(self, lemma_features, db):
|
def __init__(self, lemma_features, db):
|
||||||
|
@ -25,8 +25,6 @@ class WordStats:
|
||||||
|
|
||||||
def add_words(self, words):
|
def add_words(self, words):
|
||||||
for w in progress(words, "adding-words"):
|
for w in progress(words, "adding-words"):
|
||||||
if w.fake_word:
|
|
||||||
continue
|
|
||||||
params = {'lemma': w.lemma, 'msd': w.msd, 'text': w.text}
|
params = {'lemma': w.lemma, 'msd': w.msd, 'text': w.text}
|
||||||
res = self.db.execute("""UPDATE UniqWords SET frequency=frequency + 1
|
res = self.db.execute("""UPDATE UniqWords SET frequency=frequency + 1
|
||||||
WHERE lemma=:lemma AND msd=:msd AND text=:text""", params)
|
WHERE lemma=:lemma AND msd=:msd AND text=:text""", params)
|
||||||
|
@ -46,7 +44,7 @@ class WordStats:
|
||||||
def generate_renders(self):
|
def generate_renders(self):
|
||||||
step_name = 'generate_renders'
|
step_name = 'generate_renders'
|
||||||
if self.db.is_step_done(step_name):
|
if self.db.is_step_done(step_name):
|
||||||
logging.info("Skipping GenerateRenders, already complete")
|
print("Skipping GenerateRenders, already complete")
|
||||||
return
|
return
|
||||||
|
|
||||||
lemmas = [lemma for (lemma, ) in self.db.execute("SELECT DISTINCT lemma FROM UniqWords")]
|
lemmas = [lemma for (lemma, ) in self.db.execute("SELECT DISTINCT lemma FROM UniqWords")]
|
|
@ -1,12 +1,7 @@
|
||||||
import logging
|
import logging
|
||||||
import os
|
from progress_bar import progress
|
||||||
|
|
||||||
from luscenje_struktur.progress_bar import progress
|
|
||||||
|
|
||||||
from luscenje_struktur.formatter import OutFormatter, OutNoStatFormatter, AllFormatter, StatsFormatter
|
|
||||||
|
|
||||||
from luscenje_struktur.collocation_sentence_mapper import CollocationSentenceMapper
|
|
||||||
|
|
||||||
|
from formatter import OutFormatter, OutNoStatFormatter, AllFormatter, StatsFormatter
|
||||||
|
|
||||||
class Writer:
|
class Writer:
|
||||||
@staticmethod
|
@staticmethod
|
||||||
|
@ -16,25 +11,23 @@ class Writer:
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def make_output_writer(args, num_components, colocation_ids, word_renderer):
|
def make_output_writer(args, num_components, colocation_ids, word_renderer):
|
||||||
params = Writer.other_params(args)
|
params = Writer.other_params(args)
|
||||||
return Writer(args.out, num_components, OutFormatter(colocation_ids, word_renderer), args.collocation_sentence_map_dest, params, args.separator)
|
return Writer(args.out, num_components, OutFormatter(colocation_ids, word_renderer), params)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def make_output_no_stat_writer(args, num_components, colocation_ids, word_renderer):
|
def make_output_no_stat_writer(args, num_components, colocation_ids, word_renderer):
|
||||||
params = Writer.other_params(args)
|
params = Writer.other_params(args)
|
||||||
return Writer(args.out_no_stat, num_components, OutNoStatFormatter(colocation_ids, word_renderer), args.collocation_sentence_map_dest, params, args.separator)
|
return Writer(args.out_no_stat, num_components, OutNoStatFormatter(colocation_ids, word_renderer), params)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def make_all_writer(args, num_components, colocation_ids, word_renderer):
|
def make_all_writer(args, num_components, colocation_ids, word_renderer):
|
||||||
return Writer(args.all, num_components, AllFormatter(colocation_ids, word_renderer), args.collocation_sentence_map_dest, None, args.separator)
|
return Writer(args.all, num_components, AllFormatter(colocation_ids, word_renderer), None)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def make_stats_writer(args, num_components, colocation_ids, word_renderer):
|
def make_stats_writer(args, num_components, colocation_ids, word_renderer):
|
||||||
params = Writer.other_params(args)
|
params = Writer.other_params(args)
|
||||||
return Writer(args.stats, num_components, StatsFormatter(colocation_ids, word_renderer), args.collocation_sentence_map_dest, params, args.separator)
|
return Writer(args.stats, num_components, StatsFormatter(colocation_ids, word_renderer), params)
|
||||||
|
|
||||||
def __init__(self, file_out, num_components, formatter, collocation_sentence_map_dest, params, separator):
|
def __init__(self, file_out, num_components, formatter, params):
|
||||||
# TODO FIX THIS
|
|
||||||
self.collocation_sentence_map_dest = collocation_sentence_map_dest
|
|
||||||
if params is None:
|
if params is None:
|
||||||
self.multiple_output = False
|
self.multiple_output = False
|
||||||
self.sort_by = -1
|
self.sort_by = -1
|
||||||
|
@ -49,7 +42,6 @@ class Writer:
|
||||||
self.num_components = num_components
|
self.num_components = num_components
|
||||||
self.output_file = file_out
|
self.output_file = file_out
|
||||||
self.formatter = formatter
|
self.formatter = formatter
|
||||||
self.separator = separator
|
|
||||||
|
|
||||||
def header(self):
|
def header(self):
|
||||||
repeating_cols = self.formatter.header_repeat()
|
repeating_cols = self.formatter.header_repeat()
|
||||||
|
@ -79,37 +71,27 @@ class Writer:
|
||||||
return sorted(rows, key=key, reverse=self.sort_order)
|
return sorted(rows, key=key, reverse=self.sort_order)
|
||||||
|
|
||||||
def write_header(self, file_handler):
|
def write_header(self, file_handler):
|
||||||
file_handler.write(self.separator.join(self.header()) + "\n")
|
file_handler.write(", ".join(self.header()) + "\n")
|
||||||
|
|
||||||
def write_out_worker(self, file_handler, structure, colocation_ids, col_sent_map):
|
def write_out_worker(self, file_handler, structure, colocation_ids):
|
||||||
rows = []
|
rows = []
|
||||||
components = structure.components
|
components = structure.components
|
||||||
|
|
||||||
for match in progress(colocation_ids.get_matches_for(structure), "Writing matches: {}".format(structure.id)):
|
for match in progress(colocation_ids.get_matches_for(structure), "Writing matches: {}".format(structure.id)):
|
||||||
if len(match) < self.min_frequency:
|
if len(match) < self.min_frequency:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
self.formatter.new_match(match)
|
self.formatter.new_match(match)
|
||||||
|
|
||||||
variable_word_order = self.find_variable_word_order(match.matches)
|
|
||||||
|
|
||||||
if col_sent_map is not None:
|
|
||||||
# TODO find better way to get sentence_id
|
|
||||||
for words in match.matches:
|
|
||||||
col_sent_map.add_map(match.match_id, '.'.join(words['1'].id.split('.')[:-1]))
|
|
||||||
|
|
||||||
for words in match.matches:
|
for words in match.matches:
|
||||||
to_write = []
|
to_write = []
|
||||||
|
|
||||||
idx = 1
|
for idx, _comp in enumerate(components):
|
||||||
for _comp in components:
|
idx = str(idx + 1)
|
||||||
if _comp.idx == '#':
|
if idx not in words:
|
||||||
continue
|
|
||||||
idx_s = str(idx)
|
|
||||||
idx += 1
|
|
||||||
if idx_s not in words:
|
|
||||||
to_write.extend([""] * self.formatter.length())
|
to_write.extend([""] * self.formatter.length())
|
||||||
else:
|
else:
|
||||||
to_write.extend(self.formatter.content_repeat(words, match.representations, idx_s, structure.id))
|
to_write.extend(self.formatter.content_repeat(words, match.representations, idx, structure.id))
|
||||||
|
|
||||||
# make them equal size
|
# make them equal size
|
||||||
to_write.extend([""] * (self.num_components * self.formatter.length() - len(to_write)))
|
to_write.extend([""] * (self.num_components * self.formatter.length() - len(to_write)))
|
||||||
|
@ -118,7 +100,7 @@ class Writer:
|
||||||
to_write = [structure.id] + to_write + [match.match_id]
|
to_write = [structure.id] + to_write + [match.match_id]
|
||||||
|
|
||||||
# header_right
|
# header_right
|
||||||
to_write.extend(self.formatter.content_right(len(match), variable_word_order))
|
to_write.extend(self.formatter.content_right(len(match)))
|
||||||
rows.append(to_write)
|
rows.append(to_write)
|
||||||
|
|
||||||
if self.formatter.group():
|
if self.formatter.group():
|
||||||
|
@ -126,7 +108,7 @@ class Writer:
|
||||||
|
|
||||||
if rows != []:
|
if rows != []:
|
||||||
rows = self.sorted_rows(rows)
|
rows = self.sorted_rows(rows)
|
||||||
file_handler.write("\n".join([self.separator.join(row) for row in rows]) + "\n")
|
file_handler.write("\n".join([", ".join(row) for row in rows]) + "\n")
|
||||||
file_handler.flush()
|
file_handler.flush()
|
||||||
|
|
||||||
def write_out(self, structures, colocation_ids):
|
def write_out(self, structures, colocation_ids):
|
||||||
|
@ -145,29 +127,17 @@ class Writer:
|
||||||
if not self.multiple_output:
|
if not self.multiple_output:
|
||||||
fp = fp_open()
|
fp = fp_open()
|
||||||
self.write_header(fp)
|
self.write_header(fp)
|
||||||
col_sent_map = CollocationSentenceMapper(os.path.join(self.collocation_sentence_map_dest, 'mapper.txt')) \
|
|
||||||
if self.collocation_sentence_map_dest is not None else None
|
|
||||||
|
|
||||||
for s in progress(structures, "writing:{}".format(self.formatter)):
|
for s in progress(structures, "writing:{}".format(self.formatter)):
|
||||||
if self.multiple_output:
|
if self.multiple_output:
|
||||||
fp = fp_open(s.id)
|
fp = fp_open(s.id)
|
||||||
self.write_header(fp)
|
self.write_header(fp)
|
||||||
col_sent_map = CollocationSentenceMapper(os.path.join(self.collocation_sentence_map_dest, f'{s.id}_mapper.txt')) \
|
|
||||||
if self.collocation_sentence_map_dest is not None else None
|
|
||||||
|
|
||||||
self.formatter.set_structure(s)
|
self.formatter.set_structure(s)
|
||||||
self.write_out_worker(fp, s, colocation_ids, col_sent_map)
|
self.write_out_worker(fp, s, colocation_ids)
|
||||||
|
|
||||||
if self.multiple_output:
|
if self.multiple_output:
|
||||||
fp_close(fp)
|
fp_close(fp)
|
||||||
|
|
||||||
if not self.multiple_output:
|
if not self.multiple_output:
|
||||||
fp_close(fp)
|
fp_close(fp)
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def find_variable_word_order(matches):
|
|
||||||
orders = {}
|
|
||||||
for words in matches:
|
|
||||||
order = tuple([tup[0] for tup in sorted(words.items(), key=lambda x: x[1].int_id)])
|
|
||||||
orders[order] = orders.get(order, 0) + 1
|
|
||||||
return max(orders, key=orders.get)
|
|
133
src/writerpy
Normal file
133
src/writerpy
Normal file
|
@ -0,0 +1,133 @@
|
||||||
|
class Writer:
|
||||||
|
@staticmethod
|
||||||
|
def other_params(args):
|
||||||
|
return (args.multiple_output, int(args.sort_by), args.sort_reversed)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def make_output_writer(args, colocation_ids, word_renderer):
|
||||||
|
params = Writer.other_params(args)
|
||||||
|
return Writer(args.out, OutFormatter(colocation_ids, word_renderer), params)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def make_output_no_stat_writer(args, colocation_ids, word_renderer):
|
||||||
|
params = Writer.other_params(args)
|
||||||
|
return Writer(args.out_no_stat, OutNoStatFormatter(colocation_ids, word_renderer), params)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def make_all_writer(args, colocation_ids, word_renderer):
|
||||||
|
return Writer(args.all, AllFormatter(colocation_ids, word_renderer), None)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def make_stats_writer(args, colocation_ids, word_renderer):
|
||||||
|
params = Writer.other_params(args)
|
||||||
|
return Writer(args.stats, StatsFormatter(colocation_ids, word_renderer), params)
|
||||||
|
|
||||||
|
def __init__(self, file_out, formatter, params):
|
||||||
|
if params is None:
|
||||||
|
self.multiple_output = False
|
||||||
|
self.sort_by = -1
|
||||||
|
self.sort_order = None
|
||||||
|
else:
|
||||||
|
self.multiple_output = params[0]
|
||||||
|
self.sort_by = params[1]
|
||||||
|
self.sort_order = params[2]
|
||||||
|
|
||||||
|
self.output_file = file_out
|
||||||
|
self.formatter = formatter
|
||||||
|
|
||||||
|
def header(self):
|
||||||
|
repeating_cols = self.formatter.header_repeat()
|
||||||
|
cols = ["C{}_{}".format(i + 1, thd) for i in range(MAX_NUM_COMPONENTS)
|
||||||
|
for thd in repeating_cols]
|
||||||
|
|
||||||
|
cols = ["Structure_ID"] + cols + ["Colocation_ID"]
|
||||||
|
cols += self.formatter.header_right()
|
||||||
|
return cols
|
||||||
|
|
||||||
|
def sorted_rows(self, rows):
|
||||||
|
if self.sort_by < 0 or len(rows) < 2:
|
||||||
|
return rows
|
||||||
|
|
||||||
|
if len(rows[0]) <= self.sort_by:
|
||||||
|
logging.warning("Cannot sort by column #{}: Not enough columns!".format(len(rows[0])))
|
||||||
|
return rows
|
||||||
|
|
||||||
|
try:
|
||||||
|
int(rows[0][self.sort_by])
|
||||||
|
def key(row):
|
||||||
|
return int(row[self.sort_by])
|
||||||
|
except ValueError:
|
||||||
|
def key(row):
|
||||||
|
return row[self.sort_by].lower()
|
||||||
|
|
||||||
|
return sorted(rows, key=key, reverse=self.sort_order)
|
||||||
|
|
||||||
|
def write_header(self, file_handler):
|
||||||
|
file_handler.write(", ".join(self.header()) + "\n")
|
||||||
|
|
||||||
|
def write_out_worker(self, file_handler, structure, colocation_ids):
|
||||||
|
rows = []
|
||||||
|
components = structure.components
|
||||||
|
|
||||||
|
for match in colocation_ids.get_matches_for(structure):
|
||||||
|
self.formatter.new_match(match)
|
||||||
|
|
||||||
|
for words in match.matches:
|
||||||
|
to_write = []
|
||||||
|
|
||||||
|
for idx, _comp in enumerate(components):
|
||||||
|
idx = str(idx + 1)
|
||||||
|
if idx not in words:
|
||||||
|
to_write.extend([""] * self.formatter.length())
|
||||||
|
else:
|
||||||
|
to_write.extend(self.formatter.content_repeat(words, match.representations, idx, structure.id))
|
||||||
|
|
||||||
|
# make them equal size
|
||||||
|
to_write.extend([""] * (MAX_NUM_COMPONENTS * self.formatter.length() - len(to_write)))
|
||||||
|
|
||||||
|
# structure_id and colocation_id
|
||||||
|
to_write = [structure.id] + to_write + [match.match_id]
|
||||||
|
|
||||||
|
# header_right
|
||||||
|
to_write.extend(self.formatter.content_right(len(match)))
|
||||||
|
rows.append(to_write)
|
||||||
|
|
||||||
|
if self.formatter.group():
|
||||||
|
break
|
||||||
|
|
||||||
|
if rows != []:
|
||||||
|
rows = self.sorted_rows(rows)
|
||||||
|
file_handler.write("\n".join([", ".join(row) for row in rows]) + "\n")
|
||||||
|
file_handler.flush()
|
||||||
|
|
||||||
|
def write_out(self, structures, colocation_ids):
|
||||||
|
if self.output_file is None:
|
||||||
|
return
|
||||||
|
|
||||||
|
def fp_close(fp_):
|
||||||
|
if fp_ != sys.stdout:
|
||||||
|
fp_.close()
|
||||||
|
|
||||||
|
def fp_open(snum=None):
|
||||||
|
if snum is None:
|
||||||
|
return open(self.output_file, "w")
|
||||||
|
else:
|
||||||
|
return open("{}.{}".format(self.output_file, snum), "w")
|
||||||
|
|
||||||
|
if not self.multiple_output:
|
||||||
|
fp = fp_open()
|
||||||
|
self.write_header(fp)
|
||||||
|
|
||||||
|
for s in structures:
|
||||||
|
if self.multiple_output:
|
||||||
|
fp = fp_open(s.id)
|
||||||
|
self.write_header(fp)
|
||||||
|
|
||||||
|
self.formatter.set_structure(s)
|
||||||
|
self.write_out_worker(fp, s, colocation_ids)
|
||||||
|
|
||||||
|
if self.multiple_output:
|
||||||
|
fp_close(fp)
|
||||||
|
|
||||||
|
if not self.multiple_output:
|
||||||
|
fp_close(fp)
|
Loading…
Reference in New Issue
Block a user