#!/usr/bin/python3
# imports from luscenje_struktur
from luscenje_struktur.progress_bar import progress
from luscenje_struktur.word import Word, WordCompressed
from luscenje_struktur.syntactic_structure import build_structures
from luscenje_struktur.match_store import MatchStore
from luscenje_struktur.word_stats import WordStats
from luscenje_struktur.writer import Writer
from luscenje_struktur.loader import load_files, file_sentence_glue_generator
from luscenje_struktur.database import Database
from luscenje_struktur.time_info import TimeInfo
from luscenje_struktur.msd_translate import MSD_TRANSLATE

# make database-service
import gc
import re
import string
from collections import OrderedDict
import sys
from tqdm import tqdm
import pymongo
# import tqdm as tqdm

# sys.path.insert(1, '/home/luka/Development/srl/cjvt_valency/src/pkg/valency')
# sys.path.insert(1, '/home/luka/Development/srl/cjvt_valency/src/pkg/cjvt-corpusparser')
from valency.Frame import frames_from_db_entry_headword
from valency.reduce_functions import reduce_functions

import argparse
import os
import shutil
import lxml.etree as lxml
import codecs
import logging
import pickle
import time
from io import StringIO
from lxml import etree
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import Session, aliased
from sqlalchemy import create_engine
from sqlalchemy import func
from pymongo import MongoClient, UpdateOne, InsertOne

# examples_num = sys.maxsize
# corpus = 'ssj'
translations = {
    'ACT': 'KDO/KAJ',
    'PAT': 'KOGA/KAJ',
    'RESLT': 'REZULTAT',
    'REC': 'KOMU/ČEMU',
    'TIME': 'KDAJ',
    'MANN': 'KAKO',
    'LOC': 'KJE',
    'MEANS': 'S ČIM',
    'GOAL': 'ČEMU',
    'REG': 'GLEDE NA KOGA/KAJ',
    'DUR': 'KOLIKO ČASA',
    'CAUSE': 'ZAKAJ',
    'COND': 'POD KATERIM POGOJEM',
    'ORIG': 'IZVOR',
    'FREQ': 'KOLIKOKRAT',
    'SOURCE': 'OD KOD',
    'AIM': 'S KAKŠNIM NAMENOM',
    'QUANT': 'ŠTEVILO',
    'EVENT': 'NA DOGODKU',
    'CONTR': 'KLJUB ČEMU',
    'ACMP': 'S KOM/ČIM',
    'RESTR': 'Z OMEJITVIJO',
    'MWPRED': '',
    'MODAL': '',
    'PHRAS': ''
}

CATEGORY_MAP = {
    'noun': 'samostalnik',
    'verb': 'glagol',
    'adjective': 'pridevnik',
    'adverb': 'prislov',
    'pronoun': 'zaimek',
    'numeral': 'števnik',
    'preposition': 'predlog',
    'conjunction': 'veznik',
    'particle': 'členek',
    'interjection': 'medmet',
    'abbreviation': 'okrajšava',
    'residual': 'neuvrščeno'
}

ASPECT_MAP = {
    'perfective': 'dovršni',
    'progressive': 'nedovršni',
    'biaspectual': 'dvovidski'
}

CASE_MAP = {
    'n': 'nominative',
    'g': 'genitive',
    'd': 'dative',
    'a': 'accusative',
    'l': 'locative',
    'i': 'instrumental'
}

Lexeme = None
LexemeFeature = None
SyntacticStructure = None
StructureComponent = None
Feature = None
LexicalUnitLexeme = None
LexicalUnit = None
LexicalUnitType = None
Category = None
Sense = None
Measure = None
LexicalUnitMeasure = None
Corpus = None
Definition = None
WordForm = None
WordFormFeature = None
FormRepresentation = None

# corpus = 'gigafida'
from pathlib import Path
import json
def hws_generator(collection, headword_text, RF, mongo):
    cur = collection.find({"headwords": headword_text})
    # print('tu2!')
    frames = []
    for ent in cur:
        frames += frames_from_db_entry_headword(ent, headword_text)  # pre-process this step for prod TODO
    cur.close()
    # if headword_text == 'brati':
    #     print('here')
    # if headword_text == 'prevajati':
    #     print('here')
    ret_frames = RF(frames, mongo.db.sensemap)
    # print('tu4!')
    for frame in ret_frames:
        frame_json = frame.to_json()
        yield frame_json
def get_sentences_of_interest(headword_category, collection, w_collection, RF, mongo, pbar, status_collection, corpus_type):
    sentences_of_interest = {}
    # all_sentences = set()
    sorted(headword_category, key=lambda x: x[0])
    # num_sentences in RAM at once
    sentences_num_limit = 15000
    sentences_in_ram = 0
    # part = 0
    # start_time = time.time()
    # first_sentence = True
    # section_included = False
    # last_processed_hw = 'pomeniti'
    # last_processed_hw = 'iti'
    # last_processed_hw = 'aktivirati'
    # last_processed_hw = 'aktivirati'
    status_collection_update_list = []
    # already_processed = False
    for headword_id, (headword_text, category_text) in enumerate(headword_category):
        # check whether element has been processed
        if status_collection.count_documents({'corpus_type': corpus_type, 'headword_text': headword_text, 'part': 'p1'}):
            pbar.update(1)
            continue
        # print(headword_text)
        # if already_processed:
        #     if headword_text != last_processed_hw:
        #         continue
        #     else:
        #         already_processed = False
        # for headword_text, category_text in headword_category[15:20]:
        # headword_text = 'zadovoljen'
        # category_text = 'adjective'
        headword_patterns_ids = {}
        # print('tu1!')
        cur = collection.find({"headwords": headword_text})
        # print('tu2!')
        frames = []
        for ent in cur:
            frames += frames_from_db_entry_headword(ent, headword_text)  # pre-process this step for prod TODO
        cur.close()
        # if headword_text == 'brati':
        #     print('here')
        # if headword_text == 'prevajati':
        #     print('here')
        ret_frames = RF(frames, mongo.db.sensemap)
        json_ret = {"frames": []}
        # print('tu4!')
        for frame in ret_frames:
            frame_json = frame.to_json()
            json_ret["frames"].append(frame_json)
        # print('tu5!')

        # get xml values
        for hws in json_ret.values():
            for hw in hws:
                # print(hw['hw'])
                # if hw['hw'] == 'pomeniti':
                #     print('aaa')
                # generate valency pattern key
                valency_pattern_key = []
                functors = {}
                if len(hw['tids']) != 1:
                    raise Exception('Multiple TIDS')
                for slot in hw['slots']:
                    valency_pattern_key.append(slot['functor'])
                    for tid in slot['tids']:
                        if tid not in functors:
                            functors[tid] = {}
                        functors[tid] = slot['functor']
                valency_pattern_key = tuple(sorted(valency_pattern_key))
                if valency_pattern_key not in headword_patterns_ids:
                    headword_patterns_ids[valency_pattern_key] = []
                for sentence in hw['sentences']:
                    # all_sentences.add(sentence[0][0])
                    # if len(headword_patterns_ids[valency_pattern_key]) < examples_num:
                    # if section_included:
                    #     if not sentences_in_ram > sentences_num_limit:
                    #         sentences_in_ram += 1
                    #         continue
                    #     else:
                    #         first_sentence = True
                    sentence_id = sentence[0][0].rsplit('.', 1)[0]
                    # print(sentence_id)
                    if sentence_id not in sentences_of_interest:
                        sentences_of_interest[sentence_id] = {}
                    idi = 0
                    parent_idi = -1
                    # print('t1')
                    for idx, word in sentence:
                        if idx == hw['tids'][0]:
                            parent_idi = idi
                        if word['word']:
                            idi += 1
                    # print('t2')
                    if parent_idi == -1:
                        raise Exception('No parent found!')
                    idi = 0
                    # if len(sentence) > 500:
                    #     print(len(sentence))
                    for idx, word in sentence:
                        if idx in functors:
                            # sentences_of_interest[sentence_id][(word['lemma'], MSD_TRANSLATE[word['msd']])] = functors[idx]
                            # sentences_of_interest[sentence_id][(word['lemma'], MSD_TRANSLATE[word['msd']])] = (functors[idx], idi)
                            # sentences_of_interest[sentence_id][idi] = (functors[idx], (word['lemma'], MSD_TRANSLATE[word['msd']]))
                            sentences_of_interest[sentence_id][str(idi)] = (str(parent_idi), functors[idx])
                        if word['word']:
                            # if sentence_id == 'ssj37.216.892':
                            #     print(idi)
                            #     print(word['text'])
                            idi += 1
                    # print('t3')
                    headword_patterns_ids[valency_pattern_key].append(sentence_id)

        # check if this is first sentence
        # if first_sentence:
        #     one_element = next(iter(sentences_of_interest.items()))
        #     section_included = w_collection.count_documents({'_id': one_element[0],
        #         list(one_element[1].keys())[0]: list(one_element[1].values())[0]}) == 1
        #     first_sentence = False
        if sentences_in_ram >= sentences_num_limit:
            # print('print1:')
            # print(time.time() - start_time)
            start_time = time.time()
            # print('Part %d finalized')
            # print('Sentences in ram:')
            # print(sentences_in_ram)
            sentences_in_ram = 0
            # [InsertOne({'y': 1}), DeleteOne({'x': 1}),
            # ... ReplaceOne({'w': 1}, {'z': 1}, upsert=True)]
            # requests = [{'_id': k, 'connections': v} for k, v in sentences_of_interest.items()]
            # if 'GF0010453.1116.1' in sentences_of_interest:
            #     print('here')
            if len(status_collection_update_list) > 0:
                status_collection.bulk_write(status_collection_update_list)
            requests = [UpdateOne({'_id': k}, {'$set': v}, upsert=True) for k, v in sentences_of_interest.items()]
            # print('print2:')
            # print(time.time() - start_time)
            # start_time = time.time()
            result = w_collection.bulk_write(requests)
            # print('print3:')
            # print(time.time() - start_time)
            # start_time = time.time()
            del status_collection_update_list
            del requests
            del sentences_of_interest
            gc.collect()
            # print('print4:')
            # print(time.time() - start_time)
            # start_time = time.time()
            # print(part)
            # print('HEADWORD')
            # print(headword_text)
            # pbar.update(1)
            # part += 1
            #
            # w_collection.bulk_write(
            #     array.map((val) = >
            #     ({
            #         updateOne: {
            #             filter: {_id: val, uniqueid: 1001, atype: 1, ftype: 6},
            #             update: {
            #                 $set: {epoch: 1548484978658, actionbyuserid: 110, title: 'Good Morning To All'}},
            #             upsert: true
            #         }
            #     })
            #     )
            # })
            # sentences_of_interest = {{'_id': k, 'connections': v} for k, v in sentences_of_interest.items()}
            # w_collection.update_many({'_id': {'$exists': False}}, sentences_of_interest, upsert=True)
            # try:
            #     w_collection.insert_many(sentences_of_interest, ordered=False)
            # except pymongo.errors.BulkWriteError as e:
            #     print(e.details['writeErrors'])
            status_collection_update_list = []
            sentences_of_interest = {}
            # first_sentence = True
        sentences_in_ram += 1
        pbar.update(1)
        status_collection_update_list.append(InsertOne({'corpus_type': corpus_type, 'headword_text': headword_text, 'part': 'p1'}))

    # if 'GF0010453.1116.1' in sentences_of_interest:
    #     a = sentences_of_interest['GF0010453.1116.1']
    #     print('here')
    if len(status_collection_update_list) > 0:
        status_collection.bulk_write(status_collection_update_list)
    requests = [UpdateOne({'_id': k}, {'$set': v}, upsert=True) for k, v in sentences_of_interest.items()]
    if len(requests) > 0:
        result = w_collection.bulk_write(requests)
    # sentences_of_interest = [{'_id': k, 'connections': v} for k, v in sentences_of_interest.items()]
    # try:
    #     w_collection.insert_many(sentences_of_interest, ordered=False)
    # except pymongo.errors.BulkWriteError as e:
    #     print(e.details['writeErrors'])
    # sentences_of_interest = {}
    # # else:
    # #     print('aaa')
    # return sentences_of_interest

def create_sentence_output ( sentence , headword_id , corpus ) :
glue_outside = False
headword_id = str ( headword_id )
parent_node = etree . Element ( ' corpusExample ' )
parent_node . set ( ' corpusName ' , corpus )
# parent_node.text = 'AAA'
# parent_node.prefix = 'BBB'
# parent_node.tail = 'CCC'
cur_node = parent_node
# formatted_sentence = ''
first_in_tag = True
first_outside_tag = False
in_dependency_tree = False
# TODO use whole sentence!
# for idi, word in enumerate(sentence):
# def idi_word_generator(sentence):
# idi = 0
# for word in sentence:
# if len(word.text) == 1 and re.match('^[\w]+$', word.text) is None:
# continue
# yield idi, word
# idi += 1
idi = 0
attach_to = None
p_cur_node = None
p_attach_to = None
p_glue_attach_to = None
previous_word = None
# if sentence[0][0][0] == 'Tako':
# print('here')
# for idi, word in idi_word_generator(sentence):
for word_id in range ( len ( sentence ) ) :
# is_ending_tree = False
# SRL container output
word = sentence [ word_id ]
# sentence output
if in_dependency_tree :
if headword_id not in word [ 2 ] or in_dependency_tree != word [ 2 ] [ headword_id ] :
attach_to = cur_node
# is_ending_tree = True
p_glue_attach_to = cur_node
cur_node = parent_node
if not first_in_tag :
# formatted_sentence += '\n'
first_in_tag = True
# formatted_sentence += '</tree>'
in_dependency_tree = False
first_outside_tag = True
if headword_id in word [ 2 ] and not in_dependency_tree :
dep_tree = lxml . SubElement ( cur_node , ' tree ' )
dep_tree . set ( ' role ' , word [ 2 ] [ headword_id ] )
cur_node = dep_tree
if not first_in_tag :
# formatted_sentence += '\n'
first_in_tag = True
# formatted_sentence += '<tree role="{}">'.format(word[2][headword_id])
in_dependency_tree = word [ 2 ] [ headword_id ]
attach_to = None
if p_glue_attach_to is not None :
glue_outside = True
if headword_id == str ( idi ) and not ( len ( word [ 0 ] [ 0 ] ) == 1 and re . match ( ' ^[ \ w]+$ ' , word [ 0 ] [ 0 ] ) is None ) :
# if headword_id == idi:
comp = lxml . SubElement ( cur_node , ' comp ' )
comp . set ( ' role ' , ' headword ' )
if not first_outside_tag :
if p_attach_to is None :
if p_cur_node is not None :
p_cur_node . text + = previous_word [ 0 ] [ 1 ]
else :
p_attach_to . tail + = previous_word [ 0 ] [ 1 ]
elif p_glue_attach_to is not None :
if p_glue_attach_to . tail is None :
p_glue_attach_to . tail = previous_word [ 0 ] [ 1 ]
else :
p_glue_attach_to . tail + = previous_word [ 0 ] [ 1 ]
# elif p_attach_to is not None:
# if p_attach_to.tail is None:
# p_attach_to.tail = previous_word[0][1]
# else:
# p_attach_to.tail += previous_word[0][1]
word_text = word [ 0 ] [ 0 ]
comp . text = word_text
attach_to = comp
if not first_in_tag :
# formatted_sentence += '\n'
first_in_tag = True
first_outside_tag = True
p_cur_node = cur_node
p_glue_attach_to = comp
p_attach_to = attach_to
previous_word = word
# formatted_sentence += '<comp structure_id="headword">{}</comp>'.format(word[0][0])
idi + = 1
continue
if word [ 1 ] and in_dependency_tree :
col_id = - 1
for i , col in enumerate ( word [ 1 ] ) :
if headword_id in col [ 3 ] :
col_id = i
break
if col_id != - 1 :
comp = lxml . SubElement ( cur_node , ' comp ' )
comp . set ( ' structure_id ' , word [ 1 ] [ col_id ] [ 0 ] )
comp . set ( ' num ' , word [ 1 ] [ col_id ] [ 1 ] )
if not first_outside_tag :
if p_attach_to is None :
if p_cur_node is not None :
p_cur_node . text + = previous_word [ 0 ] [ 1 ]
else :
p_attach_to . tail + = previous_word [ 0 ] [ 1 ]
elif p_glue_attach_to is not None :
if p_glue_attach_to . tail is None :
p_glue_attach_to . tail = previous_word [ 0 ] [ 1 ]
else :
p_glue_attach_to . tail + = previous_word [ 0 ] [ 1 ]
# elif p_attach_to is not None:
# if p_attach_to.tail is None:
# p_attach_to.tail = previous_word[0][1]
# else:
# p_attach_to.tail += previous_word[0][1]
word_text = word [ 0 ] [ 0 ]
comp . text = word_text
attach_to = comp
if not first_in_tag :
# formatted_sentence += '\n'
first_in_tag = True
first_outside_tag = True
# Assuming one collocation per word
# formatted_sentence += '<comp structure_id="{}" num="{}">{}</comp>'.format(word[1][0][0], word[1][0][1], word[0][0])
p_cur_node = cur_node
p_glue_attach_to = comp
p_attach_to = attach_to
previous_word = word
idi + = 1
continue
# collocation
# if not first_in_new_row:
# # formatted_sentence += ' '
# word_text = ' ' + word[0][0]
# else:
# word_text = word[0][0]
# if first_in_tag and previous_word:
# word_text = previous_word[0][1] + word[0][0]
# else:
# word_text = word[0][0]
# word_text += word[0][1]
# word_text = word[0][0] + word[0][1]
if not first_outside_tag :
if p_attach_to is None :
if p_cur_node is not None :
p_cur_node . text + = previous_word [ 0 ] [ 1 ]
else :
p_attach_to . tail + = previous_word [ 0 ] [ 1 ]
word_text = word [ 0 ] [ 0 ]
else :
word_text = ' '
if p_attach_to is None :
if p_cur_node is not None :
word_text + = previous_word [ 0 ] [ 1 ]
else :
word_text + = previous_word [ 0 ] [ 1 ]
if glue_outside :
p_glue_attach_to . tail = previous_word [ 0 ] [ 1 ]
word_text = word [ 0 ] [ 0 ]
else :
word_text + = word [ 0 ] [ 0 ]
if attach_to is None :
if cur_node . text is None :
cur_node . text = word_text
else :
cur_node . text + = word_text
else :
if attach_to . tail is None :
attach_to . tail = word_text
else :
attach_to . tail + = word_text
# attach_to.tail +=word[0][0]
# formatted_sentence += word[0][0]
first_in_tag = False
first_outside_tag = False
p_cur_node = cur_node
p_attach_to = attach_to
previous_word = word
p_glue_attach_to = None
if len ( word [ 0 ] [ 0 ] ) == 1 and re . match ( ' ^[ \ w]+$ ' , word [ 0 ] [ 0 ] ) is None :
continue
idi + = 1
return parent_node
def get_SRLcontainer_data(sentence, word_of_interest_id, summary):
    for word in sentence:
        if word_of_interest_id in word[2]:
            for col in word[1]:
                if word_of_interest_id in col[3]:
                    if word[2][word_of_interest_id] not in summary:
                        summary[word[2][word_of_interest_id]] = {}
                    if col[0] not in summary[word[2][word_of_interest_id]]:
                        summary[word[2][word_of_interest_id]][col[0]] = {}
                    # word_of_interest_included = word_of_interest_id in col[3]
                    if col[1] not in summary[word[2][word_of_interest_id]][col[0]]:
                        summary[word[2][word_of_interest_id]][col[0]][col[1]] = set()
                    if col[2][0] == 'S':
                        summary[word[2][word_of_interest_id]][col[0]][col[1]].add((word[0][0], col[2][1], word[3]))
    return summary


def valid_valency_pattern(valency_pattern_key):
    occurences = set()
    for v_p in valency_pattern_key:
        if v_p in occurences:
            return False
        occurences.add(v_p)
    return True
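
# Illustration only (not used by the pipeline): a pattern counts as valid when no
# semantic role repeats, e.g. valid_valency_pattern(('ACT', 'PAT')) -> True,
# valid_valency_pattern(('ACT', 'ACT', 'PAT')) -> False.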
def obtain_xml_data(collection, w_a_collection, headword_text, RF, mongo, patterns, pattern_id_max, valency_pattern_id_collection, corpus, examples_num, headword_patterns_ssj):
    cur = collection.find({"headwords": headword_text})
    frames = []
    for ent in cur:
        frames += frames_from_db_entry_headword(ent, headword_text)
    cur.close()
    ret_frames = RF(frames, mongo.db.sensemap)
    json_ret = {"frames": []}
    for frame in ret_frames:
        frame_json = frame.to_json()
        json_ret["frames"].append(frame_json)

    # get xml values
    headword_patterns = {}
    new_patterns = {}
    for hws in json_ret.values():
        for hw in hws:
            # generate valency pattern key
            valency_pattern_key = []
            for slot in hw['slots']:
                valency_pattern_key.append(slot['functor'])
            # sort valency_pattern_key by order provided in translations
            valency_pattern_key_new = []
            for key in translations:
                if key in valency_pattern_key:
                    valency_pattern_key_new.append(key)
            valency_pattern_key = tuple(valency_pattern_key_new)
            if valency_pattern_key not in headword_patterns:
                headword_patterns[valency_pattern_key] = {}
                headword_patterns[valency_pattern_key]['sentence_examples'] = []
                headword_patterns[valency_pattern_key]['sentence_num'] = 0
                headword_patterns[valency_pattern_key]['sr_data'] = {}
            if valency_pattern_key not in patterns and valency_pattern_key not in new_patterns:
                new_patterns[valency_pattern_key] = pattern_id_max
                patterns[valency_pattern_key] = pattern_id_max
                pattern_id_max += 1
            headword_patterns[valency_pattern_key]['id'] = patterns[valency_pattern_key]
            sr_data = headword_patterns[valency_pattern_key]['sr_data']
            tids = set(hw['tids'])
            if valency_pattern_key in headword_patterns_ssj:
                ssj_len = len(headword_patterns_ssj[valency_pattern_key]['sentence_examples'])
            else:
                ssj_len = 0
            for sentence in hw['sentences']:
                # sentences_of_interest.append(sentence[0])
                # get sentence example
                # sentence_example = []
                sent_id = sentence[0][0].rsplit('.', 1)[0]
                try:
                    db_sentence = next(iter(w_a_collection.find({'_id': sent_id})))['words']
                except StopIteration:
                    continue
                # if valency_pattern_key == ('ACT', 'PAT'):
                #     print('am')
                # idi = 0
                idi = 0
                hw_idi = -1
                for word_id, word in sentence:
                    if word_id in tids:
                        hw_idi = idi
                    if word['word']:
                        idi += 1
                if hw_idi == -1:
                    raise Exception('No such headword idi!')
                # for idi, word in idi_word_generator(sentence):
                #     print('here')
                # for word_id, word_dict in sentence:
                #     # TODO Modify sentence!
                #     # if formatted_sentences[sent_id]
                #     sentence_example.append(word_dict['text'])
                #     if word_dict['word']:
                #         idi += 1
                # if sent_id == 'ssj134.880.3375':
                #     print('here')
                # if sent_id == 'ssj38.227.917':
                #     print('here')
                # if sent_id == 'GF0004627.1913.1':
                #     print('here')
                # print(sent_id)
                # print([a for a in w_a_collection.find()])
                # if valency_pattern_key == ('ACT', 'PAT'):
                #     print('here')
                sr_data = get_SRLcontainer_data(db_sentence, str(hw_idi), sr_data)
                examples_included_num = 0
                # sr_data = get_SRLcontainer_data(formatted_sentences[sent_id], hw_idi, sr_data)
                if len(headword_patterns[valency_pattern_key]['sentence_examples']) + ssj_len < examples_num and valid_valency_pattern(valency_pattern_key):
                    examples_included_num += 1
                    sentence_example = create_sentence_output(db_sentence, hw_idi, corpus)
                    # sentence_example = create_sentence_output(formatted_sentences[sent_id], hw_idi)
                    # sentence_example = ''.join(sentence_example)
                    headword_patterns[valency_pattern_key]['sentence_examples'].append(sentence_example)
                headword_patterns[valency_pattern_key]['sentence_num'] += 1
                headword_patterns[valency_pattern_key]['sr_data'] = sr_data

    # add patterns to db
    new_patterns_query = [InsertOne({'_id': v, 'semantic_roles': list(k)}) for k, v in new_patterns.items()]
    if len(new_patterns_query) > 0:
        result = valency_pattern_id_collection.bulk_write(new_patterns_query)

    # calculate statistics
    semantic_role_stats = {}
    sentence_tot = 0
    pattern_tot = len(headword_patterns)
    for key, val in headword_patterns.items():
        sentence_num = val['sentence_num']
        for sr in key:
            if sr in semantic_role_stats:
                semantic_role_stats[sr]['valency_pattern_num'] += 1
                semantic_role_stats[sr]['valency_sentence_num'] += sentence_num
            else:
                semantic_role_stats[sr] = {}
                semantic_role_stats[sr]['valency_pattern_num'] = 1
                semantic_role_stats[sr]['valency_sentence_num'] = sentence_num
        sentence_tot += sentence_num
    return headword_patterns, semantic_role_stats, sentence_tot, pattern_tot, pattern_id_max

def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo, session, w_a_collection_ssj, w_a_collection_gigafida, valency_pattern_id_collection, corpus_name, pattern_examples_limit, ignore_gigafida, pbar):
query_general = session . query ( Lexeme . id , LexicalUnitLexeme . id , LexicalUnit . id , LexicalUnitMeasure . value ,
Lexeme . dummy , LexicalUnitType . name ) \
. join ( Category , Category . id == Lexeme . category_id ) \
. join ( LexicalUnitLexeme , LexicalUnitLexeme . lexeme_id == Lexeme . id ) \
. join ( LexicalUnit , LexicalUnit . id == LexicalUnitLexeme . lexical_unit_id ) \
. join ( LexicalUnitType , LexicalUnitType . id == LexicalUnit . type_id ) \
. join ( LexicalUnitMeasure , LexicalUnitMeasure . lexical_unit_id == LexicalUnit . id ) \
. join ( Measure , Measure . id == LexicalUnitMeasure . measure_id ) \
. join ( Corpus , Corpus . id == LexicalUnitMeasure . corpus_id ) \
. filter ( LexicalUnitType . name == ' single_lexeme_unit ' ) \
. filter ( Measure . name == ' frequency ' ) \
. filter ( Corpus . name == ' gigafida ' ) \
. filter ( Corpus . version == ' 2.0 ' )
# valency_pattern_id_collection.find()
# used to not repeat search queries for prepositions
preposition_list = { }
for headword_text , category_text in headword_category :
# with lxml.xmlfile('data/output.xml', encoding='utf-8') as xf:
# a = [a for a in valency_pattern_id_collection.find()]
patterns = { tuple ( v_p [ ' semantic_roles ' ] ) : v_p [ ' _id ' ] for v_p in [ a for a in valency_pattern_id_collection . find ( ) ] }
# patterns = {}
pattern_id_max = len ( patterns ) + 1
# pattern_examples_limit = 4
# get data
headword_patterns_ssj , semantic_role_stats_ssj , sentence_tot_ssj , pattern_tot_ssj , pattern_id_max = obtain_xml_data ( collection_ssj , w_a_collection_ssj ,
headword_text , RF , mongo , patterns , pattern_id_max , valency_pattern_id_collection , ' ssj500k 2.2 ' , pattern_examples_limit ,
{ } )
if not ignore_gigafida :
headword_patterns_gf , semantic_role_stats_gf , sentence_tot_gf , pattern_tot_gf , pattern_id_max = obtain_xml_data ( collection_gigafida ,
w_a_collection_gigafida ,
headword_text , RF ,
mongo , patterns ,
pattern_id_max , valency_pattern_id_collection , ' Gigafida 2.0 ' , pattern_examples_limit , headword_patterns_ssj )
wf1 = aliased ( WordFormFeature )
wf2 = aliased ( WordFormFeature )
wf3 = aliased ( WordFormFeature )
query_preposition = session . query ( FormRepresentation . form ) \
. join ( WordForm , WordForm . id == FormRepresentation . word_form_id ) \
. join ( Lexeme , Lexeme . id == WordForm . lexeme_id ) \
. join ( wf1 , wf1 . word_form_id == WordForm . id ) \
. join ( wf2 , wf2 . word_form_id == WordForm . id ) \
. join ( wf3 , wf3 . word_form_id == WordForm . id ) \
. filter ( Lexeme . lemma == headword_text ) \
. filter ( wf1 . value == ' singular ' ) \
. filter ( wf2 . value == ' third ' ) \
. filter ( wf3 . value == ' present ' )
pattern_translation_hws = query_preposition . all ( )
pattern_translation_3_sin = headword_text
if len ( pattern_translation_hws ) == 1 :
pattern_translation_3_sin = pattern_translation_hws [ 0 ] . form
qname = etree . QName ( " http://www.w3.org/2001/XMLSchema-instance " , " noNamespaceSchemaLocation " )
dictionary = lxml . Element ( ' dictionary ' , { qname : ' valency_lexicon.xsd ' } )
if headword_text [ - 1 ] == ' _ ' :
headword_text_query = headword_text [ : - 1 ]
else :
headword_text_query = headword_text
query = query_general . filter ( Category . name == category_text ) \
. filter ( Lexeme . lemma == headword_text_query ) \
. group_by ( Lexeme . id , LexicalUnitLexeme . id , LexicalUnit . id , LexicalUnitMeasure . value ,
LexicalUnitType . name )
# res = query.one_or_none()
query_res = query . all ( )
# query2 = session.query(Lexeme.id) \
# .join(Category, Category.id == Lexeme.category_id) \
# .join(LexicalUnitLexeme, LexicalUnitLexeme.lexeme_id == Lexeme.id) \
# .join(LexicalUnit, LexicalUnit.id == LexicalUnitLexeme.lexical_unit_id) \
# .join(LexicalUnitType, LexicalUnitType.id == LexicalUnit.type_id) \
# .join(LexicalUnitMeasure, LexicalUnitMeasure.lexical_unit_id == LexicalUnit.id) \
# .join(Measure, Measure.id == LexicalUnitMeasure.measure_id) \
# .join(Corpus, Corpus.id == LexicalUnitMeasure.corpus_id) \
# .join(LexemeFeature, LexemeFeature.lexeme_id == Lexeme.id) \
# .join(Feature, Feature.id == LexemeFeature.feature_id) \
# .filter(LexicalUnitType.name == 'single_lexeme_unit') \
# .filter(Measure.name == 'frequency') \
# .filter(Category.name == 'preposition') \
# .filter(Lexeme.lemma == 'za') \
# .filter(Feature.name == 'case') \
# .filter(LexemeFeature.value == 'instrumental') \
# .group_by(Lexeme.id)
# query2 = session.query(Lexeme.id) \
# .join(Category, Category.id == Lexeme.category_id) \
# .join(LexemeFeature, LexemeFeature.lexeme_id == Lexeme.id) \
# .join(Feature, Feature.id == LexemeFeature.feature_id) \
# .filter(Lexeme.lemma == 'za') \
# .filter(Feature.name == 'case') \
# .filter(LexemeFeature.value == 'instrumental') \
# .group_by(Lexeme.id)
#
# a = query2.all()
if len ( query_res ) == 1 :
( lexeme_id , lexical_unit_lexeme_id , lexical_unit_id , frequency , _ , lexical_unit_type_name ) = \
query_res [ 0 ]
elif len ( query_res ) > 1 :
# all lexical_unit_ids equal or at least one dummy
final_lexical_unit_id = 0
final_lexical_unit_lexeme_id = 0
for r in query_res :
( lexeme_id , lexical_unit_lexeme_id , lexical_unit_id , frequency , dummy ,
lexical_unit_type_name ) = r
if dummy :
final_lexical_unit_id = lexical_unit_id
final_lexical_unit_lexeme_id = lexical_unit_lexeme_id
break
lexical_unit_id = final_lexical_unit_id
lexical_unit_lexeme_id = final_lexical_unit_lexeme_id
else :
frequency = 0
lexeme_id = 0
lexical_unit_id = 0
lexical_unit_lexeme_id = 0
lexical_unit_type_name = ' '
sense_ids = session . query ( Sense . id , Sense . dummy ) . filter ( Sense . lexical_unit_id == lexical_unit_id ) . all ( )
features = session . query ( LexemeFeature . value ) . join ( Feature , Feature . id == LexemeFeature . feature_id ) \
. filter ( LexemeFeature . lexeme_id == lexeme_id ) \
. filter ( Feature . name == ' aspect ' ) . all ( )
entry = lxml . SubElement ( dictionary , ' entry ' )
head = lxml . SubElement ( entry , ' head ' )
headword = lxml . SubElement ( head , ' headword ' )
lemma = lxml . SubElement ( headword , ' lemma ' )
lemma . text = headword_text
lexical_unit = lxml . SubElement ( head , ' lexicalUnit ' )
lexical_unit . set ( ' id ' , str ( lexical_unit_id ) )
lexical_unit_type_name = ' single ' if lexical_unit_type_name == ' single_lexeme_unit ' else lexical_unit_type_name
lexical_unit . set ( ' type ' , lexical_unit_type_name )
lexeme = lxml . SubElement ( lexical_unit , ' lexeme ' )
lexeme . set ( ' lexical_unit_lexeme_id ' , str ( lexical_unit_lexeme_id ) )
lexeme . text = headword_text
grammar = lxml . SubElement ( head , ' grammar ' )
category = lxml . SubElement ( grammar , ' category ' )
if args . language == ' sl ' :
category . text = CATEGORY_MAP [ category_text ] if category_text in CATEGORY_MAP else ' '
else :
category . text = category_text
grammarFeature = lxml . SubElement ( grammar , ' grammarFeature ' )
if args . language == ' sl ' :
grammarFeature . set ( ' name ' , ' vid ' )
grammarFeature . text = ASPECT_MAP [ features [ 0 ] . value ] if len ( features ) > 0 and features [
0 ] . value in ASPECT_MAP else ' '
else :
grammarFeature . set ( ' name ' , ' aspect ' )
grammarFeature . text = features [ 0 ] . value if len ( features ) > 0 else ' '
measureList = lxml . SubElement ( head , ' measureList ' )
measure = lxml . SubElement ( measureList , ' measure ' )
measure . set ( ' type ' , ' frequency ' )
# TODO Modify this!
measure . set ( ' source ' , ' Gigafida 2.0 ' )
# measure.set('source', 'ssj500k')
measure . text = str ( int ( frequency ) )
body = lxml . SubElement ( entry , ' body ' )
statisticsContainerList = lxml . SubElement ( body , ' statisticsContainerList ' )
# combine semantic_role_stats
semantic_role_stats = { }
for semanticRole_val , semanticRole_stats in semantic_role_stats_ssj . items ( ) :
semantic_role_stats [ semanticRole_val ] = { }
semantic_role_stats [ semanticRole_val ] [ ' ssj ' ] = semanticRole_stats
if not ignore_gigafida :
for semanticRole_val , semanticRole_stats in semantic_role_stats_gf . items ( ) :
if semanticRole_val not in semantic_role_stats :
semantic_role_stats [ semanticRole_val ] = { }
semantic_role_stats [ semanticRole_val ] [ ' gf ' ] = semanticRole_stats
for semanticRole_val , semanticRole_stats in semantic_role_stats . items ( ) :
statisticsContainer = lxml . SubElement ( statisticsContainerList , ' statisticsContainer ' )
semanticRole = lxml . SubElement ( statisticsContainer , ' semanticRole ' )
semanticRole . text = semanticRole_val
measureList = lxml . SubElement ( statisticsContainer , ' measureList ' )
if ' ssj ' in semanticRole_stats :
measure_pattern_ssj = lxml . SubElement ( measureList , ' measure ' )
measure_pattern_ssj . set ( ' type ' , ' valency_pattern_ratio ' )
measure_pattern_ssj . set ( ' source ' , ' ssj500k 2.2 ' )
measure_pattern_ssj . text = ' %.4f ' % (
semantic_role_stats [ semanticRole_val ] [ ' ssj ' ] [ ' valency_pattern_num ' ] / pattern_tot_ssj )
measure_sentence_ssj = lxml . SubElement ( measureList , ' measure ' )
measure_sentence_ssj . set ( ' type ' , ' valency_sentence_ratio ' )
measure_sentence_ssj . set ( ' source ' , ' ssj500k 2.2 ' )
if sentence_tot_ssj == 0 :
measure_sentence_ssj . text = ' %.4f ' % ( 0.0 )
# print(headword_text)
# print(semanticRole_val)
# print(semantic_role_stats[semanticRole_val]['gf']['valency_sentence_num'])
else :
measure_sentence_ssj . text = ' %.4f ' % (
semantic_role_stats [ semanticRole_val ] [ ' ssj ' ] [ ' valency_sentence_num ' ] / sentence_tot_ssj )
# measure_sentence_ssj.text = '%.2f' % (
# semantic_role_stats[semanticRole_val]['ssj']['valency_sentence_num'] / sentence_tot_ssj)
if ' gf ' in semanticRole_stats and not ignore_gigafida :
measure_pattern_gf = lxml . SubElement ( measureList , ' measure ' )
measure_pattern_gf . set ( ' type ' , ' valency_pattern_ratio ' )
measure_pattern_gf . set ( ' source ' , ' Gigafida 2.0 ' )
measure_pattern_gf . text = ' %.4f ' % (
semantic_role_stats [ semanticRole_val ] [ ' gf ' ] [ ' valency_pattern_num ' ] / pattern_tot_gf )
measure_sentence_gf = lxml . SubElement ( measureList , ' measure ' )
measure_sentence_gf . set ( ' type ' , ' valency_sentence_ratio ' )
measure_sentence_gf . set ( ' source ' , ' Gigafida 2.0 ' )
if sentence_tot_gf == 0 :
measure_sentence_gf . text = ' %.4f ' % ( 0.0 )
# print(headword_text)
# print(semanticRole_val)
# print(semantic_role_stats[semanticRole_val]['gf']['valency_sentence_num'])
else :
measure_sentence_gf . text = ' %.4f ' % (
semantic_role_stats [ semanticRole_val ] [ ' gf ' ] [ ' valency_sentence_num ' ] / sentence_tot_gf )
senseList = lxml . SubElement ( body , ' senseList ' )
for sense_id in sense_ids :
if len ( sense_ids ) > 1 and sense_id . dummy :
continue
sense = lxml . SubElement ( senseList , ' sense ' )
if not sense_id . dummy :
sense . set ( ' id ' , str ( sense_id . id ) )
definitionList = lxml . SubElement ( sense , ' definitionList ' )
definition_texts = session . query ( Definition . description ) . filter (
Definition . sense_id == sense_id . id ) . all ( )
for definition_text in definition_texts :
definition = lxml . SubElement ( definitionList , ' definition ' )
definition . text = definition_text [ 0 ]
syntactic_structures = session . query ( SyntacticStructure . id , SyntacticStructure . name ,
StructureComponent . id , StructureComponent . name ) . join (
LexicalUnit , LexicalUnit . syntactic_structure_id == SyntacticStructure . id ) \
. join ( StructureComponent , StructureComponent . syntactic_structure_id == SyntacticStructure . id ) \
. filter ( LexicalUnit . id == sense_id . id )
# .join(LexicalUnitLexeme, LexicalUnitLexeme.structure_component_id == StructureComponent.id) \
# syntactic_structures2 = session.query(SyntacticStructure.id, SyntacticStructure.name).join(SyntacticStructure, SyntacticStructure.id == LexicalUnit.syntactic_structure_id) \
# .filter(SyntacticStructure.id == sense_id)
syntactic_structuresr = syntactic_structures . all ( )
# syntactic_structures2r = syntactic_structures2.all()
valencyPatternList = lxml . SubElement ( sense , ' valencyPatternList ' )
valencyPatternList . set ( ' system ' , ' JOS ' )
# combine semantic_role_stats ##################################
headword_patterns = { }
for headword_patterns_val , headword_patterns_stats in headword_patterns_ssj . items ( ) :
headword_patterns [ headword_patterns_val ] = { }
headword_patterns [ headword_patterns_val ] [ ' ssj ' ] = headword_patterns_stats
if not ignore_gigafida :
for headword_patterns_val , headword_patterns_stats in headword_patterns_gf . items ( ) :
if headword_patterns_val not in headword_patterns :
headword_patterns [ headword_patterns_val ] = { }
headword_patterns [ headword_patterns_val ] [ ' gf ' ] = headword_patterns_stats
#################################################################
for headword_pattern , headword_pattern_dict in headword_patterns . items ( ) :
valencyPattern = lxml . SubElement ( valencyPatternList , ' valencyPattern ' )
valencyPattern . set ( ' id ' , str ( patterns [ headword_pattern ] ) )
measureList_sense = lxml . SubElement ( valencyPattern , ' measureList ' )
if ' ssj ' in headword_pattern_dict :
measure_sense = lxml . SubElement ( measureList_sense , ' measure ' )
measure_sense . set ( ' type ' , ' frequency_all ' )
measure_sense . set ( ' source ' , ' ssj500k 2.2 ' )
measure_sense . text = str ( headword_pattern_dict [ ' ssj ' ] [ ' sentence_num ' ] )
if not ignore_gigafida and ' gf ' in headword_pattern_dict and headword_pattern_dict [ ' gf ' ] [ ' sentence_num ' ] :
measure_sense = lxml . SubElement ( measureList_sense , ' measure ' )
measure_sense . set ( ' type ' , ' frequency_all ' )
measure_sense . set ( ' source ' , ' Gigafida 2.0 ' )
measure_sense . text = str ( headword_pattern_dict [ ' gf ' ] [ ' sentence_num ' ] )
semanticRoleContainerList = lxml . SubElement ( valencyPattern , ' semanticRoleContainerList ' )
# patternId = lxml.SubElement(semanticRoles, 'patternId')
# patternId.text = str(patterns[headword_pattern])
if ' ACT ' in headword_pattern :
patternTranslationText = ' KDO/KAJ ' + pattern_translation_3_sin
else :
patternTranslationText = headword_text
for semantic_role in headword_pattern :
if semantic_role != ' ACT ' :
# additional rules
# if semantic_role == 'RESLT':
# pass
# else:
# patternTranslationText += ' ' + translations[semantic_role]
patternTranslationText + = ' ' + translations [ semantic_role ]
semanticRoleContainer = lxml . SubElement ( semanticRoleContainerList , ' semanticRoleContainer ' )
semanticRole = lxml . SubElement ( semanticRoleContainer , ' semanticRole ' )
semanticRole . text = semantic_role
syntactic_structure_dict = { }
if ' ssj ' in headword_pattern_dict and semantic_role in headword_pattern_dict [ ' ssj ' ] [ ' sr_data ' ] :
for syn_struct_id , syn_struct_dict in headword_pattern_dict [ ' ssj ' ] [ ' sr_data ' ] [ semantic_role ] . items ( ) :
if syn_struct_id not in syntactic_structure_dict :
syntactic_structure_dict [ syn_struct_id ] = { }
for com_num , com_set in syn_struct_dict . items ( ) :
if com_num not in syntactic_structure_dict [ syn_struct_id ] :
syntactic_structure_dict [ syn_struct_id ] [ com_num ] = set ( )
for lex in com_set :
syntactic_structure_dict [ syn_struct_id ] [ com_num ] . add ( lex )
if ' gf ' in headword_pattern_dict and semantic_role in headword_pattern_dict [ ' gf ' ] [ ' sr_data ' ] :
for syn_struct_id , syn_struct_dict in headword_pattern_dict [ ' gf ' ] [ ' sr_data ' ] [ semantic_role ] . items ( ) :
if syn_struct_id not in syntactic_structure_dict :
syntactic_structure_dict [ syn_struct_id ] = { }
for com_num , com_set in syn_struct_dict . items ( ) :
if com_num not in syntactic_structure_dict [ syn_struct_id ] :
syntactic_structure_dict [ syn_struct_id ] [ com_num ] = set ( )
for lex in com_set :
syntactic_structure_dict [ syn_struct_id ] [ com_num ] . add ( lex )
if len ( syntactic_structure_dict ) > 0 :
syntacticStructureList = lxml . SubElement ( semanticRoleContainer , ' syntacticStructureList ' )
# iterate over syntactic structures and write them
for syn_struct_id , component_dict in syntactic_structure_dict . items ( ) :
syntacticStructure = lxml . SubElement ( syntacticStructureList , ' syntacticStructure ' )
syntacticStructure . set ( ' id ' , syn_struct_id )
for comp_id , lexemes in component_dict . items ( ) :
for l in lexemes :
component = lxml . SubElement ( syntacticStructure , ' component ' )
component . set ( ' num ' , comp_id )
lexem = lxml . SubElement ( component , ' lexeme ' )
if l in preposition_list :
prep_id = preposition_list [ l ]
else :
query_preposition = session . query ( Lexeme . id ) \
. join ( Category , Category . id == Lexeme . category_id ) \
. join ( LexemeFeature , LexemeFeature . lexeme_id == Lexeme . id ) \
. join ( Feature , Feature . id == LexemeFeature . feature_id ) \
. filter ( Lexeme . lemma == l [ 2 ] ) \
. filter ( Feature . name == ' case ' ) \
. filter ( LexemeFeature . value == CASE_MAP [ l [ 1 ] ] ) \
. group_by ( Lexeme . id )
preposition_ids = query_preposition . all ( )
if len ( preposition_ids ) != 1 :
prep_id = ' '
else :
prep_id = str ( preposition_ids [ 0 ] [ 0 ] )
preposition_list [ l ] = prep_id
lexem . set ( ' sloleks ' , prep_id )
lexem . text = l [ 2 ]
patternRepresentation = lxml . SubElement ( valencyPattern , ' patternRepresentation ' )
patternRepresentation . text = patternTranslationText
exampleContainerList = lxml . SubElement ( valencyPattern , ' exampleContainerList ' )
if ' ssj ' in headword_pattern_dict :
for sentence_example in headword_pattern_dict [ ' ssj ' ] [ ' sentence_examples ' ] :
exampleContainer = lxml . SubElement ( exampleContainerList , ' exampleContainer ' )
# corpusExample = lxml.SubElement(exampleContainer, 'corpusExample')
exampleContainer . append ( sentence_example )
if ' gf ' in headword_pattern_dict :
for sentence_example in headword_pattern_dict [ ' gf ' ] [ ' sentence_examples ' ] :
exampleContainer = lxml . SubElement ( exampleContainerList , ' exampleContainer ' )
# corpusExample = lxml.SubElement(exampleContainer, 'corpusExample')
exampleContainer . append ( sentence_example )
with lxml . xmlfile ( os . path . join ( args . outdir , ' VS10_ ' + headword_text + ' _ ' + corpus_name + ' .xml ' ) ,
encoding = ' utf-8 ' ) as xf :
xf . write ( dictionary , pretty_print = True )
pbar . update ( 1 )
def init_db(db):
    global Lexeme, LexemeFeature, SyntacticStructure, StructureComponent, Feature, LexicalUnitLexeme, LexicalUnit, LexicalUnitType, Category, Sense, Measure, LexicalUnitMeasure, Corpus, Definition, WordForm, WordFormFeature, FormRepresentation
    [db_user, db_password, db_database, db_host] = db.split(':')
    Base = declarative_base()
    engine = create_engine('postgresql://' + db_user + ':' + db_password + '@' + db_host + '/' + db_database,
                           pool_recycle=14400)
    Base.metadata.reflect(engine)

    class Lexeme(Base):
        __table__ = Base.metadata.tables['jedro_lexeme']

    class LexemeFeature(Base):
        __table__ = Base.metadata.tables['jedro_lexeme_feature']

    class SyntacticStructure(Base):
        __table__ = Base.metadata.tables['jedro_syntacticstructure']

    class StructureComponent(Base):
        __table__ = Base.metadata.tables['jedro_structurecomponent']

    class Feature(Base):
        __table__ = Base.metadata.tables['jedro_feature']

    class LexicalUnitLexeme(Base):
        __table__ = Base.metadata.tables['jedro_lexicalunit_lexeme']

    class LexicalUnit(Base):
        __table__ = Base.metadata.tables['jedro_lexicalunit']

    class LexicalUnitType(Base):
        __table__ = Base.metadata.tables['jedro_lexicalunittype']

    class Category(Base):
        __table__ = Base.metadata.tables['jedro_category']

    class Sense(Base):
        __table__ = Base.metadata.tables['jedro_sense']

    class Measure(Base):
        __table__ = Base.metadata.tables['jedro_measure']

    class LexicalUnitMeasure(Base):
        __table__ = Base.metadata.tables['jedro_lexicalunitmeasure']

    class Corpus(Base):
        __table__ = Base.metadata.tables['jedro_corpus']

    class Definition(Base):
        __table__ = Base.metadata.tables['jedro_definition']

    class WordForm(Base):
        __table__ = Base.metadata.tables['jedro_wordform']

    class WordFormFeature(Base):
        __table__ = Base.metadata.tables['jedro_wordform_feature']

    class FormRepresentation(Base):
        __table__ = Base.metadata.tables['jedro_formrepresentation']

    return engine
def match_file(words, structures):
    matches = []
    for s in structures:
        for w in words:
            mhere = s.match(w)
            for match in mhere:
                # save only those with verbs in them
                if not [True for m in match.values() if m.msd[0] == 'V']:
                    continue
                colocation_id = [(idx, w.lemma) for idx, w in match.items()]
                colocation_id = [s.id] + list(sorted(colocation_id, key=lambda x: x[0]))
                colocation_id = tuple(colocation_id)
                matches.append([match, colocation_id])
    return matches
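
# Shape note, inferred from the code above (for readability only): each entry in
# `matches` is [match, colocation_id], where colocation_id looks roughly like
# (structure_id, (component_idx, lemma), (component_idx, lemma), ...),
# i.e. the structure id followed by the matched components sorted by component index.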
possible_jos_links = {'dol', 'del', 'prir', 'vez', 'skup', 'ena', 'dve', 'tri', 'štiri', 'modra'}


def find_word_sons(word, deppar_dict, word_id, role):
    for k, v in word.links.items():
        for w in v:
            # if k in possible_jos_links and w.id == 'ssj1.1.1.t21':
            #     print('here')
            if k in possible_jos_links:
                if w.id not in deppar_dict:
                    deppar_dict[w.id] = {}
                deppar_dict[w.id][word_id] = role
                find_word_sons(w, deppar_dict, word_id, role)
            # elif k in possible_jos_links:
            #     raise Exception('One word in multiple dependency parsetrees')
# for ignoring punctuations
def idi_word_generator(sentence):
    idi = 0
    for word in sentence:
        if len(word.text) == 1 and re.match(r'^[\w]+$', word.text) is None:
            continue
        yield idi, word
        idi += 1
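
# Intended behaviour, illustrative only: for the tokens ['Pes', ',', 'laja', '.'] the
# generator yields the words 'Pes' and 'laja' with indices 0 and 1; single-character
# non-word tokens such as ',' and '.' are skipped and do not advance the running index idi.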
def extract_sentences ( w_collection , w_a_collection , args , input_corpus , input_corpus_orig ) :
structures , _ , max_num_components = build_structures ( args )
timeinfo = TimeInfo ( len ( input_corpus ) )
database = Database ( args )
formatted_sentences = { }
start_time = time . time ( )
sentences_num_limit = 10000
sentences_in_ram = 0
sentence_glue_numbers = None
is_gf = input_corpus_orig is not None
if is_gf :
glue_words_gen = file_sentence_glue_generator ( input_corpus_orig , args . pc_tag , w_collection )
for sent_id , sentence , othr_sentence_attributes in load_files ( args , database , w_collection , input_corpus ) :
if is_gf :
# create tuple for comparison with sentence_flue_words
sent_id_numbers = tuple ( [ int ( sid ) for sid in sent_id [ 2 : ] . split ( ' . ' ) ] )
if sentence_glue_numbers is not None and sentence_glue_numbers > sent_id_numbers :
logging . warning (
f " Skipping sentence in annotated sentence id (sent_id)! Annotated sent_id = { sent_id } , original sent_id = { sentence_glue [ 0 ] } " )
continue
sentence_glue = next ( glue_words_gen )
sentence_glue_numbers = tuple ( [ int ( sid ) for sid in sentence_glue [ 0 ] [ 2 : ] . split ( ' . ' ) ] )
while sentence_glue_numbers < sent_id_numbers :
logging . warning (
f " Skipping sentence in original sentence id (sentence_glue)! Annotated sent_id = { sent_id } , original sent_id = { sentence_glue [ 0 ] } " )
sentence_glue = next ( glue_words_gen )
if sent_id != sentence_glue [ 0 ] :
raise Exception ( f " Annotated gigafida and original gigafida not in sync (annotated sent_id = { sent_id } , original sent_id = { sentence_glue [ 0 ] } " )
if len ( sentence_glue [ 1 ] ) != len ( sentence ) :
logging . warning ( f " Skipping sentence! Annotated gigafida and original gigafida size is not the same (annotated: { len ( sentence ) } , original: { len ( sentence_glue [ 1 ] ) } " )
continue
for w , w_glue in zip ( sentence , sentence_glue [ 1 ] ) :
w . glue = w_glue [ 2 ]
if sentence is None :
timeinfo . add_measurement ( - 1 )
continue
# start_time = time.time()
# print(time.time() - start_time)
matches = match_file ( sentence , structures )
# if sent_id == 'ssj134.880.3375':
# print('here')
# print(time.time() - start_time)
# match_store.add_matches(matches)
# word_stats.add_words(words)
# database.commit()
# find unimportant collocations
# extract_possible_headwords = set(v[0] for v in othr_sentence_attributes.values())
for match in matches :
match_idis = [ ]
for key , word in match [ 0 ] . items ( ) :
match_idis . append ( word . idi )
match . append ( match_idis )
collocations = { }
for match in matches :
for key , word in match [ 0 ] . items ( ) :
# if word.id == ''
if word . id not in collocations :
collocations [ word . id ] = [ ]
collocations [ word . id ] . append ( ( match [ 1 ] [ 0 ] , key , word . msd [ : 2 ] , match [ 2 ] ) )
# print(time.time() - start_time)
formatted_sentence = [ ]
deppar_dict = { }
# idi = 0
# create output and form dependency parsetree sons
for idi , word in idi_word_generator ( sentence ) :
# if word.text == 'Mumel':
# print('here')
# if word.text == 'Poleg':
# print('here')
# if word.text == 'Luka':
# print('here')
idi = str ( idi )
# a = sent_id in sentences_of_interest
# b = (word.lemma, word.msd) in sentences_of_interest[sent_id]
# if word.msd == 'X':
# continue
# if len(word.text) == 1 and word.text in string.punctuation + '':
# a = re.match('^[\w]+$', word.text) is not None
# if len(word.text) == 1 and re.match('^[\w]+$', word.text) is None:
# continue
# if sent_id in sentences_of_interest and (word.lemma, word.msd) in sentences_of_interest[sent_id]:
# if sent_id in sentences_of_interest and idi in sentences_of_interest[sent_id]:
# cur_count = w_collection.count_documents({'_id': sent_id})
# if w_collection.count_documents({'_id': sent_id}) > 0:
sentence_of_interest = othr_sentence_attributes
# is_count = cur.count() > 0
if idi in othr_sentence_attributes :
if word . id not in deppar_dict :
deppar_dict [ word . id ] = { }
deppar_dict [ word . id ] [ sentence_of_interest [ idi ] [ 0 ] ] = sentence_of_interest [ idi ] [ 1 ]
# deppar_dict[word.id] = {idi: sentences_of_interest[sent_id][idi]}
# if idi != sentences_of_interest[sent_id][(word.lemma, word.msd)][1]:
# if (word.lemma, word.msd) != sentences_of_interest[sent_id][idi][1]:
# print((word.lemma, word.msd))
# print(sentences_of_interest[sent_id][idi][1])
# if sentences_of_interest[sent_id][(word.lemma, word.msd)][1] > idi:
# print('HERE')
find_word_sons ( word , deppar_dict , sentence_of_interest [ idi ] [ 0 ] , sentence_of_interest [ idi ] [ 1 ] )
# idi += 1
# print(time.time() - start_time)
for word in sentence :
if word . id in collocations :
col = collocations [ word . id ]
else :
col = [ ]
if word . id in deppar_dict :
dp = deppar_dict [ word . id ]
else :
dp = { }
formatted_sentence . append ( ( ( word . text , word . glue ) , col , dp , word . lemma ) )
# create_sentence_output(formatted_sentence, 4)
formatted_sentences [ sent_id ] = formatted_sentence
if sentences_in_ram > = sentences_num_limit :
sentences_in_ram = 0
requests = [ UpdateOne ( { ' _id ' : k } , { ' $set ' : { ' words ' : v } } , upsert = True ) for k , v in formatted_sentences . items ( ) ]
result = w_a_collection . bulk_write ( requests )
formatted_sentences = { }
sentences_in_ram + = 1
# print(time.time() - start_time)
requests = [ UpdateOne ( { ' _id ' : k } , { ' $set ' : { ' words ' : v } } , upsert = True ) for k , v in formatted_sentences . items ( ) ]
result = w_a_collection . bulk_write ( requests )
# force a bit of garbage collection
# del sentence
# del sent_id
# del matches
# gc.collect()
print ( time . time ( ) - start_time )
# return formatted_sentences
# # timeinfo.add_measurement(time.time() - start_time)
# # timeinfo.info()
# # if no output files, just exit
# if all([x == None for x in [args.out, args.out_no_stat, args.all, args.stats]]):
# return
#
# # get word renders for lemma/msd
# word_stats.generate_renders()
# match_store.determine_colocation_dispersions()
#
# # figure out representations!
# if args.out or args.out_no_stat:
# match_store.set_representations(word_stats, structures)
#
# Writer.make_output_writer(args, max_num_components, match_store, word_stats).write_out(
# structures, match_store)
# Writer.make_output_no_stat_writer(args, max_num_components, match_store, word_stats).write_out(
# structures, match_store)
# Writer.make_all_writer(args, max_num_components, match_store, word_stats).write_out(
# structures, match_store)
# Writer.make_stats_writer(args, max_num_components, match_store, word_stats).write_out(
# structures, match_store)
def get_headword_category(collection):
    """
    :return: List of tuples with all headwords in mongodb and their categories.
    """
    headwords = sorted(collection.distinct("headwords")[1:])
    if args.headwords:
        with open(args.headwords, 'w') as f:
            for item in headwords:
                f.write("%s\n" % item)
    headword_category = [(headword, 'verb') if headword[-1] != '_' else (headword, 'adjective') for headword in
                         headwords]
    return headword_category
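
# Note, inferred from the list comprehension above: headwords ending in '_' are treated
# as adjectives, everything else as verbs, e.g. 'brati' -> ('brati', 'verb') and
# 'zadovoljen_' -> ('zadovoljen_', 'adjective').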
def main(args):
    # with Path('data/wordlist.json').open("r") as fp:
    #     sskj_wordlist = json.load(fp)
    # # wordlist = set(sskj_wordlist['wordlist'])
    # wordlist = set(sskj_wordlist['wordlist'])
    print('beginning chunk')
    start_time = time.time()

    # user:user:valdb:127.0.0.1
    [db_user, db_password, db_database, db_host] = args.mongo_db.split(':')
    mongo = MongoClient(username=db_user, password=db_password, authSource=db_database)
    db = mongo.valdb
    collection_ssj = db['ssj']
    collection_gigafida = db['gigafida']
    db2 = mongo.extvaldb
    # write collection
    w_collection_ssj = db2['ssj']
    w_collection_gigafida = db2['gigafida']
    w_a_collection_ssj = db2['ssj' + '_all']
    w_a_collection_gigafida = db2['gigafida' + '_all']
    status_collection = db2['status']
    valency_pattern_id_collection = db2['valency_pattern_ids']

    RF = reduce_functions["reduce_0"]["f"]

    # get all headwords from database
    # headword_category = get_headword_category(collection_ssj)
    with open(args.headwords, 'r') as read:
        headword_category = [(line[:-1], 'verb') for line in read.readlines()]

    assert args.language == 'en' or args.language == 'sl'
    shutil.rmtree(args.outdir, True)
    os.mkdir(args.outdir)
    engine = init_db(args.sloleks_db)
    # input_file = codecs.open(args.infile, 'r')
    # # input_file = []
    # next(input_file)
    # category_map = {'samostalnik':'noun', 'glagol':'verb', 'pridevnik':'adjective', 'prislov':'adverb', 'števnik':'numeral', 'zaimek':'pronoun', 'medmet':'interjection', 'veznik':'conjunction'}
    session = Session(engine)

    # cur = collection.find({})
    #
    # a = []
    # cur_len = 0
    # # num_empty_sent = 0
    # for ent in cur:
    #     cur_len += 1
    #     # s = frames_from_db_entry(ent)
    #     # if not s:
    #     #     num_empty_sent += 1
    #     a += frames_from_db_entry(ent)
    print(time.time() - start_time)
    # print(num_empty_sent)

    print('get_sentences_of_interest')
    start_time = time.time()
    # sentences_of_interest = get_sentences_of_interest(headword_category, collection, w_collection, RF, mongo)
    # sentences_of_interest_stored = args.p1_processed
    if not args.p1_processed:
        with tqdm(total=len(headword_category)) as pbar:
            get_sentences_of_interest(headword_category, collection_ssj, w_collection_ssj, RF, mongo, pbar, status_collection, 'ssj')
        if not args.ignore_gigafida:
            with tqdm(total=len(headword_category)) as pbar:
                get_sentences_of_interest(headword_category, collection_gigafida, w_collection_gigafida, RF, mongo, pbar, status_collection, 'gigafida')
    # sentences_of_interest = OrderedDict(sorted(sentences_of_interest.items()))
    print(time.time() - start_time)

    # num_sentences = 0
    # for el in all_sentences:
    #     if el not in sentences_of_interest:
    #         num_sentences += 1
    #
    # print(num_sentences)
    # print(len(all_sentences))
    print('extract_sentences')
    start_time = time.time()
    # formatted_sentences_stored = args.p2_processed
    if not args.p2_processed:
        gf_anno_paths = list(os.walk(args.input_gigafida_annotated))
        gf_anno_paths = [os.path.join(p_t[0], f_n) for p_t in gf_anno_paths for f_n in p_t[2]]
        gf_orig_paths = list(os.walk(args.input_gigafida_original))
        gf_orig_paths = sorted([os.path.join(p_t[0], f_n) for p_t in gf_orig_paths for f_n in p_t[2] if f_n[:2] == 'GF'])
        extract_sentences(w_collection_ssj, w_a_collection_ssj, args, args.input_sloleks, None)
        if not args.ignore_gigafida:
            extract_sentences(w_collection_gigafida, w_a_collection_gigafida, args, gf_anno_paths, gf_orig_paths)
    print(time.time() - start_time)

    print('write_xml')
    start_time = time.time()
    # print('aa ' + 3)
    with tqdm(total=len(headword_category)) as pbar:
        write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo, session, w_a_collection_ssj, w_a_collection_gigafida, valency_pattern_id_collection, args.corpus_name, args.pattern_examples_limit, args.ignore_gigafida, pbar)
    print(time.time() - start_time)

    # input_file.close()
    session.close()

if __name__ == '__main__':
    arg_parser = argparse.ArgumentParser(description='Export and validate collocation data from DDD database.')
    arg_parser.add_argument('--sloleks_db', type=str, help='Database credentials')
    arg_parser.add_argument('--mongo_db', type=str, help='Database credentials')
    arg_parser.add_argument('--schema', type=str, help='XML schema')
    arg_parser.add_argument('--infile', type=str, help='Input file')
    arg_parser.add_argument('--outdir', type=str, help='Output directory')
    arg_parser.add_argument('--headwords', type=str, default=None, help='Path to file where headwords will be saved.')
    arg_parser.add_argument('--language', type=str, help='Language of certain attributes')
    arg_parser.add_argument('--corpus_name', type=str, help='Name of corpus to be written in outputs.')
    arg_parser.add_argument('--pattern_examples_limit', type=int, default=10, help='Max number of examples.')
    arg_parser.add_argument('--ignore_gigafida', action='store_true', help='If set, ignore gigafida in output.')
    arg_parser.add_argument('--p1_processed',
                            help='Skip first part (obtaining sentences of interest) when they are already in DB.',
                            action='store_true')
    arg_parser.add_argument('--p2_processed',
                            help='Skip second part (obtaining formatted sentences) when they are already in DB.',
                            action='store_true')
    arg_parser.add_argument('--structures',
                            help='Structures definitions in xml file')
    arg_parser.add_argument('--input_sloleks',
                            help='input file (gz or xml currently). If none, then just database is loaded', nargs='*')
    arg_parser.add_argument('--input_gigafida_annotated',
                            help='input file (gz or xml currently). If none, then just database is loaded')
    arg_parser.add_argument('--input_gigafida_original',
                            help='input file (gz or xml currently). If none, then just database is loaded')
    arg_parser.add_argument('--out',
                            help='Classic output file')
    arg_parser.add_argument('--out-no-stat',
                            help='Output file, but without statistical columns')
    arg_parser.add_argument('--all',
                            help='Additional output file, writes more data')
    arg_parser.add_argument('--stats',
                            help='Output file for statistics')
    arg_parser.add_argument('--no-msd-translate',
                            help='MSDs are translated from slovene to english by default',
                            action='store_true')
    arg_parser.add_argument('--skip-id-check',
                            help='Skips checks for ids of <w> and <pc>, if they are in correct format',
                            action='store_true')
    arg_parser.add_argument('--min_freq', help='Minimal frequency in output',
                            type=int, default=0, const=1, nargs='?')
    arg_parser.add_argument('--verbose', help='Enable verbose output to stderr',
                            choices=["warning", "info", "debug"], default="info",
                            const="info", nargs='?')
    arg_parser.add_argument('--count-files',
                            help="Count files: more verbose output", action='store_true')
    arg_parser.add_argument('--multiple-output',
                            help='Generate one output for each syntactic structure',
                            action='store_true')
    arg_parser.add_argument('--sort-by',
                            help="Sort by this column (index)", type=int, default=-1)
    arg_parser.add_argument('--sort-reversed',
                            help="Sort in reversed order", action='store_true')
    arg_parser.add_argument('--db',
                            help="Database file to use (instead of memory)", default=None)
    arg_parser.add_argument('--new-db',
                            help="Writes over database file, if there exists one", action='store_true')
    arg_parser.add_argument('--pc-tag',
                            help='Tag for separators, usually pc or c', default="pc")

    args = arg_parser.parse_args()
    logging.basicConfig(stream=sys.stderr, level=args.verbose.upper())
    start = time.time()
    main(args)
    logging.info("TIME: {}".format(time.time() - start))
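
# Example invocation (illustrative only; the script name, paths and credentials below are
# placeholders, and the credential format user:password:database:host mirrors how
# --mongo_db and --sloleks_db are split above):
# python3 create_xml.py \
#     --mongo_db user:user:valdb:127.0.0.1 \
#     --sloleks_db user:password:sloleks_db:localhost \
#     --headwords data/headwords.txt \
#     --structures data/structures.xml \
#     --input_sloleks data/ssj500k.xml \
#     --outdir data/output \
#     --language sl \
#     --corpus_name ssj500k \
#     --pattern_examples_limit 10 \
#     --ignore_gigafida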