import argparse
import json
import logging
import os
import pickle
import shutil
import time
from xml.etree import ElementTree
from conllu import TokenList
import conllu
import classla
import copy
from lxml import etree
from src.create_tei import construct_sentence_from_list, \
    construct_paragraph_from_list, TeiDocument, build_tei_etrees, build_links, build_complete_tei, convert_bibl

logging.basicConfig(level=logging.INFO)

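# High-level flow (see process_file below):
#   1. tokenize()  - merge the svala JSON files with the Šolar TEI (and, where raw texts exist,
#                    with the CLASSLA/Obeliks tokenization) into per-sentence token lists plus
#                    source/target alignment edges; results are cached with pickle
#   2. annotate()  - run CLASSLA over the pretokenized CoNLL-U sentences; also cached with pickle
#   3. write_tei() - build TEI documents for source and target and write the links between them
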
def add_source(svala_i, source_i, sentence_string_id_split, source, el):
    source_id = "s" + svala_i
    source_token_id = f'{sentence_string_id_split[0]}s.{".".join(sentence_string_id_split[1:])}.{source_i}'
    token_tag = 'w' if el.tag.startswith('w') else 'pc'
    source.append({'token': el.text, 'tag': token_tag, 'ana': el.attrib['ana'], 'id': source_token_id,
                   'space_after': False, 'svala_id': source_id})

def add_target(svala_i, target_i, sentence_string_id_split, target, el):
    target_id = "t" + svala_i
    target_token_id = f'{sentence_string_id_split[0]}t.{".".join(sentence_string_id_split[1:])}.{target_i}'
    token_tag = 'w' if el.tag.startswith('w') else 'pc'
    target.append({'token': el.text, 'tag': token_tag, 'ana': el.attrib['ana'], 'id': target_token_id,
                   'space_after': False, 'svala_id': target_id})

def add_edges(source_id, target_id, svala_data, edges, source_token_id, target_token_id):
    edge_id = "e-" + source_id + "-" + target_id
    labels = svala_data['edges'][edge_id]['labels']
    edges.append({'source_ids': [source_token_id], 'target_ids': [target_token_id], 'labels': labels})

def create_edges_list(target_ids, links_ids_mapper):
    target_edges = []
    target_edges_set = []
    for target_sentence in target_ids:
        target_sentence_edges = []
        for target_id in target_sentence:
            target_sentence_edges.extend(links_ids_mapper[target_id])
        target_edges.append(target_sentence_edges)
        target_edges_set.append(set(target_sentence_edges))

    return target_edges, target_edges_set

SKIP_IDS = ['solar2284s.1.1.1']

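# create_edges orders the svala edges of one paragraph sentence by sentence (walking the
# source and target sentences in parallel) and groups them into per-sentence lists of
# {'source_ids', 'target_ids', 'labels'} dicts that are later written out as TEI links.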
def create_edges(svala_data, source_par, target_par):
    if source_par and source_par[0]:
        if source_par[0][0]['id'] in SKIP_IDS:
            return []
        # print(source_par[0][0]['id'])
        # if source_par[0][0]['id'] == 'solar2150s.4.14.1':
        #     print('pause!')
    # if target_par and target_par[0]:
    #     print(target_par[0][0]['id'])
    #     if target_par[0][0]['id'] == 'solar2150t.4.1.1':
    #         print('pause!')
    source_mapper = {el['svala_id']: el['id'] for source in source_par for el in source}
    target_mapper = {el['svala_id']: el['id'] for target in target_par for el in target}

    source_ids = [[el['svala_id'] for el in source] for source in source_par]
    target_ids = [[el['svala_id'] for el in target] for target in target_par]

    source_sentence_ids = [set([el['svala_id'] for el in source]) for source in source_par]
    target_sentence_ids = [set([el['svala_id'] for el in target]) for target in target_par]

    # create links to ids mapper
    links_ids_mapper = {}
    edges_of_one_type = set()

    # delete empty edge
    if 'e-' in svala_data['edges']:
        del svala_data['edges']['e-']

    for k, v in svala_data['edges'].items():
        has_source = False
        has_target = False
        for el in v['ids']:
            # create edges of one type
            if el[0] == 's':
                has_source = True
            if el[0] == 't':
                has_target = True

            # create links_ids_mapper
            if el not in links_ids_mapper:
                links_ids_mapper[el] = []
            links_ids_mapper[el].append(k)
        if not has_source or not has_target or (len(svala_data['source']) == 1 and svala_data['source'][0]['text'] == ' ') \
                or (len(svala_data['target']) == 1 and svala_data['target'][0]['text'] == ' '):
            edges_of_one_type.add(k)

    # delete edge with space
    save_deleted_edges = {}
    if len(svala_data['source']) == 1 and svala_data['source'][0]['text'] == ' ':
        for edg in links_ids_mapper[svala_data['source'][0]['id']]:
            save_deleted_edges[edg] = svala_data['edges'][edg]
            del svala_data['edges'][edg]
        del links_ids_mapper[svala_data['source'][0]['id']]
    if len(svala_data['target']) == 1 and svala_data['target'][0]['text'] == ' ':
        for edg in links_ids_mapper[svala_data['target'][0]['id']]:
            save_deleted_edges[edg] = svala_data['edges'][edg]
            del svala_data['edges'][edg]
        del links_ids_mapper[svala_data['target'][0]['id']]

    # create edge order
    edges_order = []
    edges_processed = set()
    active_target_sentence_i = 0

    # create target edges
    target_edges, target_edges_set = create_edges_list(target_ids, links_ids_mapper)
    source_edges, source_edges_set = create_edges_list(source_ids, links_ids_mapper)

    last_target_edge = ''
    for active_source_sentence_i, active_source_sentence in enumerate(source_edges):
        for source_edge in active_source_sentence:
            # print(source_edge)
            # if 'e-s7-t8' == source_edge:
            #     print('aaa')
            if source_edge in edges_of_one_type:
                if source_edge not in edges_processed:
                    edges_order.append(source_edge)
                    edges_processed.add(source_edge)
            elif target_edges_set and source_edge in target_edges_set[active_target_sentence_i]:
                # if 'e-s119-t119' == source_edge:
                #     print('aaa')
                if source_edge not in edges_processed:
                    edges_order.append(source_edge)
                    edges_processed.add(source_edge)
                last_target_edge = source_edge
            # when source is connected to two targets
            elif source_edge not in target_edges_set[active_target_sentence_i]:
                # add missing edges from target
                while source_edge not in target_edges_set[active_target_sentence_i]:
                    for target_edge in target_edges[active_target_sentence_i]:
                        if target_edge in edges_of_one_type:
                            if target_edge not in edges_processed:
                                edges_order.append(target_edge)
                                edges_processed.add(target_edge)
                            last_target_edge = target_edge
                    active_target_sentence_i += 1
                if source_edge in target_edges_set[active_target_sentence_i]:
                    if source_edge not in edges_processed:
                        edges_order.append(source_edge)
                        edges_processed.add(source_edge)
                else:
                    # sanity check: the while loop above only exits once the edge is found
                    raise Exception('Impossible!!!')

        if not target_edges_set or not target_edges_set[0] or active_target_sentence_i >= len(target_edges):
            continue

        if len(target_edges[active_target_sentence_i]) == 0:
            active_target_sentence_i += 1
            continue

        if last_target_edge == target_edges[active_target_sentence_i][-1] or (len(target_edges[active_target_sentence_i]) > 1 and last_target_edge == target_edges[active_target_sentence_i][-2] and (target_edges[active_target_sentence_i][-1] in edges_of_one_type or (target_edges[active_target_sentence_i][-1] not in edges_of_one_type and target_edges[active_target_sentence_i][-1] in source_edges_set[active_source_sentence_i]))):
            for target_edge in target_edges[active_target_sentence_i]:
                if target_edge in edges_of_one_type:
                    if target_edge not in edges_processed:
                        edges_order.append(target_edge)
                        edges_processed.add(target_edge)
                    last_target_edge = target_edge
            active_target_sentence_i += 1
            continue

        target_edge_in_next_source_edge_sentence = False
        for target_edge in target_edges[active_target_sentence_i]:
            if active_source_sentence_i + 1 < len(source_edges_set) and target_edge in source_edges_set[active_source_sentence_i + 1]:
                target_edge_in_next_source_edge_sentence = True
                break

        if target_edge_in_next_source_edge_sentence:
            pass
        elif not target_edge_in_next_source_edge_sentence:
            target_edge_in_next_source_edge_sentence = False
            while not target_edge_in_next_source_edge_sentence:
                # if active_target_sentence_i >= len(target_edges_set):
                #     break
                for target_edge in target_edges[active_target_sentence_i]:
                    if target_edge in edges_of_one_type:
                        if target_edge not in edges_processed:
                            edges_order.append(target_edge)
                            edges_processed.add(target_edge)
                        last_target_edge = target_edge

                # if there is no next source sentence
                if active_source_sentence_i + 1 >= len(source_edges_set):
                    target_edge_in_next_source_edge_sentence = True

                # if last_target_edge only in target stop regularly
                if last_target_edge == target_edges[active_target_sentence_i][-1]:
                    target_edge_in_next_source_edge_sentence = True

                # test if target_edge in next source
                for target_edge in target_edges[active_target_sentence_i]:
                    if active_source_sentence_i + 1 < len(source_edges_set) and target_edge in source_edges_set[active_source_sentence_i + 1]:
                        target_edge_in_next_source_edge_sentence = True
                        break
                active_target_sentence_i += 1

    if not source_edges:
        for active_target_sentence in target_edges:
            for target_edge in active_target_sentence:
                if target_edge not in edges_processed:
                    edges_order.append(target_edge)
                    edges_processed.add(target_edge)

    # # DEBUG stuff
    # for edge_order in edges_order:
    #     if edges_order.count(edge_order) > 1:
    #         # if edge_order not in a:
    #         print(f'ERROR {edge_order}')
    #
    # for edge_order in edges_order:
    #     if edge_order not in svala_data['edges']:
    #         print(f'ERROR {edge_order}')
    #
    # for key in svala_data['edges'].keys():
    #     if key not in edges_order:
    #         print(f'ERROR {key}')
    #
    # a = len(svala_data['edges'])
    # b = len(edges_order)

    if len(svala_data['edges']) != len(edges_order):
        for k, v in save_deleted_edges.items():
            svala_data['edges'][k] = v

    assert len(svala_data['edges']) == len(edges_order)

    sentence_edges = []
    source_sent_id = 0
    target_sent_id = 0
    # actually add edges
    edges = []
    for edge_id in edges_order:
        labels = svala_data['edges'][edge_id]['labels']
        source_ids = [source_mapper[el] for el in svala_data['edges'][edge_id]['ids'] if el in source_mapper]
        target_ids = [target_mapper[el] for el in svala_data['edges'][edge_id]['ids'] if el in target_mapper]
        ids = svala_data['edges'][edge_id]['ids']

        source_ok = [el[0] == 't' or el in source_sentence_ids[source_sent_id] for el in ids] if source_sentence_ids else []
        source_ok_all = all(source_ok)

        if not source_ok_all:
            source_sent_id += 1

        target_ok = [el[0] == 's' or el in target_sentence_ids[target_sent_id] for el in ids] if target_sentence_ids else []
        target_ok_all = all(target_ok)

        if not target_ok_all:
            target_sent_id += 1

        if not source_ok_all or not target_ok_all:
            sentence_edges.append(edges)
            edges = []

        edges.append({'source_ids': source_ids, 'target_ids': target_ids, 'labels': labels})

    if edges:
        sentence_edges.append(edges)

    return sentence_edges

def add_token(svala_i, source_i, target_i, el, source, target, edges, svala_data, sentence_string_id):
    source_id = "s" + svala_i
    target_id = "t" + svala_i
    edge_id = "e-" + source_id + "-" + target_id
    labels = svala_data['edges'][edge_id]['labels']
    sentence_string_id_split = sentence_string_id.split('.')
    source_token_id = f'{sentence_string_id_split[0]}s.{".".join(sentence_string_id_split[1:])}.{source_i}'
    target_token_id = f'{sentence_string_id_split[0]}t.{".".join(sentence_string_id_split[1:])}.{target_i}'
    token_tag = 'w' if el.tag.startswith('w') else 'pc'
    lemma = el.attrib['lemma'] if token_tag == 'w' else el.text
    source.append({'token': el.text, 'tag': token_tag, 'ana': el.attrib['ana'], 'lemma': lemma, 'id': source_token_id,
                   'space_after': False, 'svala_id': source_id})
    target.append({'token': el.text, 'tag': token_tag, 'ana': el.attrib['ana'], 'lemma': lemma, 'id': target_token_id,
                   'space_after': False, 'svala_id': target_id})
    edges.append({'source_ids': [source_token_id], 'target_ids': [target_token_id], 'labels': labels})

def add_error_token(el, out_list, sentence_string_id, out_list_i, out_list_ids, is_source, s_t_id):
    sentence_string_id_split = sentence_string_id.split('.')
    source_token_id = f'{sentence_string_id_split[0]}s.{".".join(sentence_string_id_split[1:])}.{out_list_i}' if is_source \
        else f'{sentence_string_id_split[0]}t.{".".join(sentence_string_id_split[1:])}.{out_list_i}'
    token_tag = 'w' if el.tag.startswith('w') else 'pc'
    lemma = el.attrib['lemma'] if token_tag == 'w' else el.text
    out_list.append({'token': el.text, 'tag': token_tag, 'ana': el.attrib['ana'], 'lemma': lemma, 'id': source_token_id,
                     'space_after': False, 'svala_id': s_t_id})
    out_list_ids.append(source_token_id)

def add_error_token_source_target_only(el, out_list, sentence_string_id, out_list_i, is_source, s_t_id):
    sentence_string_id_split = sentence_string_id.split('.')
    source_token_id = f'{sentence_string_id_split[0]}s.{".".join(sentence_string_id_split[1:])}.{out_list_i}' if is_source \
        else f'{sentence_string_id_split[0]}t.{".".join(sentence_string_id_split[1:])}.{out_list_i}'
    token_tag = 'w' if el.tag.startswith('w') else 'pc'
    out_list.append({'token': el.text, 'tag': token_tag, 'ana': el.attrib['ana'], 'id': source_token_id,
                     'space_after': False, 'svala_id': s_t_id})

def add_errors1_0_1(svala_i, source_i, target_i, error, source, target, svala_data, sentence_string_id, edges=None):
    source_edge_ids = []
    target_edge_ids = []
    source_ids = []
    target_ids = []

    # solar5.7
    for el in error:
        if el.tag.startswith('w') or el.tag.startswith('pc'):
            ind = str(svala_i)
            source_id = "s" + ind
            source_edge_ids.append(source_id)
            add_error_token(el, source, sentence_string_id, source_i, source_ids, True, source_id)
            source_i += 1
            svala_i += 1
        elif el.tag.startswith('c') and len(source) > 0:
            source[-1]['space_after'] = True
        elif el.tag.startswith('p'):
            for p_el in el:
                if p_el.tag.startswith('w') or p_el.tag.startswith('pc'):
                    ind = str(svala_i)
                    target_id = "t" + ind
                    target_edge_ids.append(target_id)
                    add_error_token(p_el, target, sentence_string_id, target_i, target_ids, False, target_id)
                    target_i += 1
                    svala_i += 1
                elif p_el.tag.startswith('c') and len(target) > 0:
                    target[-1]['space_after'] = True
        elif el.tag.startswith('u2'):
            for el_l2 in el:
                if el_l2.tag.startswith('w') or el_l2.tag.startswith('pc'):
                    ind = str(svala_i)
                    source_id = "s" + ind
                    source_edge_ids.append(source_id)
                    add_error_token(el_l2, source, sentence_string_id, source_i, source_ids, True, source_id)
                    source_i += 1
                    svala_i += 1
                elif el_l2.tag.startswith('c') and len(source) > 0:
                    source[-1]['space_after'] = True
                elif el_l2.tag.startswith('u3'):
                    for el_l3 in el_l2:
                        if el_l3.tag.startswith('w') or el_l3.tag.startswith('pc'):
                            ind = str(svala_i)
                            source_id = "s" + ind
                            source_edge_ids.append(source_id)
                            add_error_token(el_l3, source, sentence_string_id, source_i, source_ids, True, source_id)
                            source_i += 1
                            svala_i += 1
                        elif el_l3.tag.startswith('c') and len(source) > 0:
                            source[-1]['space_after'] = True
                        elif el_l3.tag.startswith('u4'):
                            for el_l4 in el_l3:
                                if el_l4.tag.startswith('w') or el_l4.tag.startswith('pc'):
                                    ind = str(svala_i)
                                    source_id = "s" + ind
                                    source_edge_ids.append(source_id)
                                    add_error_token(el_l4, source, sentence_string_id, source_i, source_ids, True, source_id)
                                    source_i += 1
                                    svala_i += 1
                                elif el_l4.tag.startswith('c') and len(source) > 0:
                                    source[-1]['space_after'] = True
                                elif el_l4.tag.startswith('u5'):
                                    for el_l5 in el_l4:
                                        if el_l5.tag.startswith('w') or el_l5.tag.startswith('pc'):
                                            ind = str(svala_i)
                                            source_id = "s" + ind
                                            source_edge_ids.append(source_id)
                                            add_error_token(el_l5, source, sentence_string_id, source_i, source_ids, True, source_id)
                                            source_i += 1
                                            svala_i += 1
                                        elif el_l5.tag.startswith('c') and len(source) > 0:
                                            source[-1]['space_after'] = True

            for p_el in el:
                if p_el.tag.startswith('w') or p_el.tag.startswith('pc'):
                    ind = str(svala_i)
                    target_id = "t" + ind
                    target_edge_ids.append(target_id)
                    add_error_token(p_el, target, sentence_string_id, target_i, target_ids, False, target_id)
                    target_i += 1
                    svala_i += 1
                elif p_el.tag.startswith('c') and len(target) > 0:
                    target[-1]['space_after'] = True

    if edges is not None:
        edge_ids = sorted(source_edge_ids) + sorted(target_edge_ids)
        edge_id = "e-" + "-".join(edge_ids)
        edges.append({'source_ids': source_ids, 'target_ids': target_ids, 'labels': svala_data['edges'][edge_id]['labels']})

    return svala_i, source_i, target_i

def add_errors(svala_i, source_i, target_i, error, source, target, svala_data, sentence_string_id, edges=None):
    source_edge_ids = []
    target_edge_ids = []
    source_ids = []
    target_ids = []

    # solar5.7
    for el in error:
        if el.tag.startswith('w') or el.tag.startswith('pc'):
            ind = str(svala_i)
            source_id = "s" + ind
            source_edge_ids.append(source_id)
            add_error_token(el, source, sentence_string_id, source_i, source_ids, True, source_id)
            source_i += 1
            svala_i += 1
        elif el.tag.startswith('c') and len(source) > 0:
            source[-1]['space_after'] = True
        elif el.tag.startswith('p'):
            for p_el in el:
                if p_el.tag.startswith('w') or p_el.tag.startswith('pc'):
                    ind = str(svala_i)
                    target_id = "t" + ind
                    target_edge_ids.append(target_id)
                    add_error_token(p_el, target, sentence_string_id, target_i, target_ids, False, target_id)
                    target_i += 1
                    svala_i += 1
                elif p_el.tag.startswith('c') and len(target) > 0:
                    target[-1]['space_after'] = True
        elif el.tag.startswith('u2'):
            for el_l2 in el:
                if el_l2.tag.startswith('w') or el_l2.tag.startswith('pc'):
                    ind = str(svala_i)
                    source_id = "s" + ind
                    source_edge_ids.append(source_id)
                    add_error_token(el_l2, source, sentence_string_id, source_i, source_ids, True, source_id)
                    source_i += 1
                    svala_i += 1
                elif el_l2.tag.startswith('c') and len(source) > 0:
                    source[-1]['space_after'] = True
                elif el_l2.tag.startswith('u3'):
                    for el_l3 in el_l2:
                        if el_l3.tag.startswith('w') or el_l3.tag.startswith('pc'):
                            ind = str(svala_i)
                            source_id = "s" + ind
                            source_edge_ids.append(source_id)
                            add_error_token(el_l3, source, sentence_string_id, source_i, source_ids, True, source_id)
                            source_i += 1
                            svala_i += 1
                        elif el_l3.tag.startswith('c') and len(source) > 0:
                            source[-1]['space_after'] = True
                        elif el_l3.tag.startswith('u4'):
                            for el_l4 in el_l3:
                                if el_l4.tag.startswith('w') or el_l4.tag.startswith('pc'):
                                    ind = str(svala_i)
                                    source_id = "s" + ind
                                    source_edge_ids.append(source_id)
                                    add_error_token(el_l4, source, sentence_string_id, source_i, source_ids, True, source_id)
                                    source_i += 1
                                    svala_i += 1
                                elif el_l4.tag.startswith('c') and len(source) > 0:
                                    source[-1]['space_after'] = True
                                elif el_l4.tag.startswith('u5'):
                                    for el_l5 in el_l4:
                                        if el_l5.tag.startswith('w') or el_l5.tag.startswith('pc'):
                                            ind = str(svala_i)
                                            source_id = "s" + ind
                                            source_edge_ids.append(source_id)
                                            add_error_token(el_l5, source, sentence_string_id, source_i, source_ids, True, source_id)
                                            source_i += 1
                                            svala_i += 1
                                        elif el_l5.tag.startswith('c') and len(source) > 0:
                                            source[-1]['space_after'] = True

    if edges is not None:
        edge_ids = sorted(source_edge_ids) + sorted(target_edge_ids)
        edge_id = "e-" + "-".join(edge_ids)
        edges.append({'source_ids': source_ids, 'target_ids': target_ids, 'labels': svala_data['edges'][edge_id]['labels']})

    return svala_i, source_i, target_i

def add_errors_source_target_only(svala_i, source_i, target_i, error, source, target, svala_data, sentence_string_id):
    # solar5.7
    for el in error:
        if el.tag.startswith('w') or el.tag.startswith('pc'):
            ind = str(svala_i)
            source_id = "s" + ind
            add_error_token_source_target_only(el, source, sentence_string_id, source_i, True, source_id)
            source_i += 1
            svala_i += 1
        elif el.tag.startswith('c') and len(source) > 0:
            source[-1]['space_after'] = True
        elif el.tag.startswith('p'):
            for p_el in el:
                if p_el.tag.startswith('w') or p_el.tag.startswith('pc'):
                    ind = str(svala_i)
                    target_id = "t" + ind
                    add_error_token_source_target_only(p_el, target, sentence_string_id, target_i, False, target_id)
                    target_i += 1
                    svala_i += 1
                elif p_el.tag.startswith('c') and len(target) > 0:
                    target[-1]['space_after'] = True
        elif el.tag.startswith('u2'):
            for el_l2 in el:
                if el_l2.tag.startswith('w') or el_l2.tag.startswith('pc'):
                    ind = str(svala_i)
                    source_id = "s" + ind
                    add_error_token_source_target_only(el_l2, source, sentence_string_id, source_i, True, source_id)
                    source_i += 1
                    svala_i += 1
                elif el_l2.tag.startswith('c') and len(source) > 0:
                    source[-1]['space_after'] = True
                elif el_l2.tag.startswith('u3'):
                    for el_l3 in el_l2:
                        if el_l3.tag.startswith('w') or el_l3.tag.startswith('pc'):
                            ind = str(svala_i)
                            source_id = "s" + ind
                            add_error_token_source_target_only(el_l3, source, sentence_string_id, source_i, True, source_id)
                            source_i += 1
                            svala_i += 1
                        elif el_l3.tag.startswith('c') and len(source) > 0:
                            source[-1]['space_after'] = True
                        elif el_l3.tag.startswith('u4'):
                            for el_l4 in el_l3:
                                if el_l4.tag.startswith('w') or el_l4.tag.startswith('pc'):
                                    ind = str(svala_i)
                                    source_id = "s" + ind
                                    add_error_token_source_target_only(el_l4, source, sentence_string_id, source_i, True, source_id)
                                    source_i += 1
                                    svala_i += 1
                                elif el_l4.tag.startswith('c') and len(source) > 0:
                                    source[-1]['space_after'] = True
                                elif el_l4.tag.startswith('u5'):
                                    for el_l5 in el_l4:
                                        if el_l5.tag.startswith('w') or el_l5.tag.startswith('pc'):
                                            ind = str(svala_i)
                                            source_id = "s" + ind
                                            add_error_token_source_target_only(el_l5, source, sentence_string_id, source_i, True, source_id)
                                            source_i += 1
                                            svala_i += 1
                                        elif el_l5.tag.startswith('c') and len(source) > 0:
                                            source[-1]['space_after'] = True

            for p_el in el:
                if p_el.tag.startswith('w') or p_el.tag.startswith('pc'):
                    ind = str(svala_i)
                    target_id = "t" + ind
                    add_error_token_source_target_only(p_el, target, sentence_string_id, target_i, False, target_id)
                    target_i += 1
                    svala_i += 1
                elif p_el.tag.startswith('c') and len(target) > 0:
                    target[-1]['space_after'] = True

    return svala_i, source_i, target_i

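# create_conllu serializes a list of token dicts into a CoNLL-U string with only FORM and
# SpaceAfter=No filled in; lemmas, POS tags and syntax are added later by the CLASSLA pipeline.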
def create_conllu(interest_list, sentence_string_id):
    conllu_result = TokenList([{"id": token_i + 1, "form": token['token'], "lemma": None, "upos": None, "xpos": None,
                                "feats": None, "head": None, "deprel": None, "deps": None, "misc": "SpaceAfter=No"}
                               if not token['space_after']
                               else {"id": token_i + 1, "form": token['token'], "lemma": None, "upos": None,
                                     "xpos": None, "feats": None, "head": None, "deprel": None, "deps": None,
                                     "misc": None}
                               for token_i, token in enumerate(interest_list)])
    # Delete last SpaceAfter
    misc = conllu_result[len(conllu_result) - 1]['misc'] if len(conllu_result) > 0 else None
    if misc is not None:
        misc_split = misc.split('|')
        if misc == 'SpaceAfter=No':
            conllu_result[len(conllu_result) - 1]['misc'] = None
        elif 'SpaceAfter=No' in misc_split:
            conllu_result[len(conllu_result) - 1]['misc'] = '|'.join([el for el in misc_split if el != 'SpaceAfter=No'])
    conllu_result.metadata = {"sent_id": sentence_string_id}

    return conllu_result.serialize()

def process_solar2_paragraph(sentences, paragraph, svala_i, svala_data, add_errors_func):
    par_source = []
    par_target = []
    source_conllus = []
    target_conllus = []
    for sentence_id, sentence in enumerate(sentences):
        source = []
        target = []
        edges = []

        sentence_id += 1
        source_i = 1
        target_i = 1
        sentence_string_id = paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'] + f'.{sentence_id}'
        for el in sentence:
            if el.tag.startswith('w'):
                add_token(str(svala_i), source_i, target_i, el, source, target, edges, svala_data, sentence_string_id)
                svala_i += 1
                source_i += 1
                target_i += 1
            elif el.tag.startswith('pc'):
                add_token(str(svala_i), source_i, target_i, el, source, target, edges, svala_data, sentence_string_id)
                svala_i += 1
                source_i += 1
                target_i += 1
            elif el.tag.startswith('u'):
                svala_i, source_i, target_i = add_errors_func(svala_i, source_i, target_i, el, source, target,
                                                              svala_data, sentence_string_id)
            elif el.tag.startswith('c'):
                if len(source) > 0:
                    source[-1]['space_after'] = True
                if len(target) > 0:
                    target[-1]['space_after'] = True

        par_source.append(source)
        par_target.append(target)

        source_conllu = ''
        if len(source) > 0:
            source_conllu = create_conllu(source, sentence_string_id)
        target_conllu = ''
        if len(target) > 0:
            target_conllu = create_conllu(target, sentence_string_id)

        source_conllus.append(source_conllu)
        target_conllus.append(target_conllu)

    sentence_edges = create_edges(svala_data, par_source, par_target)

    return sentence_edges, source_conllus, target_conllus

def read_raw_text(path):
    with open(path, 'r') as rf:
        return rf.read()

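# HAND_FIXES maps a svala token to the exact sequence of tokens that the CLASSLA/Obeliks
# tokenizer produces for it (e.g. 'npr.' -> ['npr', '.']), so map_svala_tokenized can keep the
# two tokenizations aligned; unseen '§§§' patterns are added to it on the fly.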
HAND_FIXES = {'§§§pisala': ['§', '§', '§', 'pisala'], '§§§poldne': ['§', '§', '§', 'poldne'], '§§§o': ['§', '§', '§', 'o'],
              '§§§mimi': ['§', '§', '§', 'mimi'], '§§§nil': ['§', '§', '§', 'nil'], '§§§ela': ['§', '§', '§', 'ela'],
              'sam§§§': ['sam', '§', '§', '§'], 'globač§§§': ['globač', '§', '§', '§'], 'sin.': ['sin', '.'],
              '§§§oveduje': ['§', '§', '§', 'oveduje'], 'na§§§': ['na', '§', '§', '§'],
              '§§§ka§§§': ['§', '§', '§', 'ka', '§', '§', '§'], '§§§e§§§': ['§', '§', '§', 'e', '§', '§', '§'],
              '§§§': ['§', '§', '§'], 'ljubezni.': ['ljubezni', '.'], '12.': ['12', '.'], '16.': ['16', '.'],
              'st.': ['st', '.'], 'S.': ['S', '.'], 'pr.': ['pr', '.'], 'n.': ['n', '.'], '19:30': ['19', ':', '30'],
              '9.': ['9', '.'], '6:35': ['6', ':', '35'], 'itd.': ['itd', '.'], 'Sv.': ['Sv', '.'], 'npr.': ['npr', '.'],
              'sv.': ['sv', '.'], '12:00': ['12', ':', '00'], "sram'vali": ['sram', "'", 'vali'], '18:00': ['18', ':', '00'],
              'J.': ['J', '.'], '5:45': ['5', ':', '45'], '17.': ['17', '.'], '9.00h': ['9', '.', '00h'], 'H.': ['H', '.'],
              '1.': ['1', '.'], '6.': ['6', '.'], '7:10': ['7', ':', '10'], 'g.': ['g', '.'], 'Oz.': ['Oz', '.'],
              '20:00': ['20', ':', '00'], '17.4.2010': ['17.', '4.', '2010'], 'ga.': ['ga', '.'], 'prof.': ['prof', '.'],
              '6:45': ['6', ':', '45'], '19.': ['19', '.'], '3.': ['3', '.'], 'tj.': ['tj', '.'], 'Prof.': ['Prof', '.'],
              '8.': ['8', '.'], '9:18': ['9', ':', '18'], 'ipd.': ['ipd', '.'], '7.': ['7', '.'], 'št.': ['št', '.'],
              'oz.': ['oz', '.'], 'R.': ['R', '.'], '13:30': ['13', ':', '30'], '5.': ['5', '.']}

def map_svala_tokenized(svala_data_part, tokenized_paragraph):
    paragraph_res = []
    svala_data_i = 0
    weird_sign_count = 0
    for sentence in tokenized_paragraph:
        sentence_res = []
        sentence_id = 0
        for tok in sentence:
            tag = 'pc' if 'xpos' in tok and tok['xpos'] == 'Z' else 'w'
            if 'misc' in tok:
                assert tok['misc'] == 'SpaceAfter=No'
            space_after = 'misc' not in tok
            if svala_data_part[svala_data_i]['text'].strip() != tok['text']:
                key = svala_data_part[svala_data_i]['text'].strip()
                if key not in HAND_FIXES:
                    print(f'key: {key}; tok[text]: {tok["text"]}')
                    if key.startswith('§§§') and key.endswith('§§§'):
                        HAND_FIXES[key] = ['§', '§', '§', key[3:-3], '§', '§', '§']
                    elif key.startswith('§§§'):
                        HAND_FIXES[key] = ['§', '§', '§', key[3:]]
                    elif key.endswith('§§§'):
                        HAND_FIXES[key] = [key[:-3], '§', '§', '§']
                    else:
                        raise Exception('Word mismatch!')

                if tok['text'] == HAND_FIXES[key][weird_sign_count]:
                    weird_sign_count += 1
                    if weird_sign_count < len(HAND_FIXES[key]):
                        continue
                    else:
                        tok['text'] = key
                        weird_sign_count = 0
                else:
                    print(f'key: {key}; tok[text]: {tok["text"]}')
                    raise Exception('Word mismatch!')

            sentence_id += 1
            sentence_res.append({'token': tok['text'], 'tag': tag, 'id': sentence_id, 'space_after': space_after,
                                 'svala_id': svala_data_part[svala_data_i]['id']})
            svala_data_i += 1
        paragraph_res.append(sentence_res)
    return paragraph_res

def map_svala_solar2(svala_data_part, solar2_paragraph):
    svala_data_i = 0
    for sentence in solar2_paragraph:
        sentence_id = 0
        for tok in sentence:
            # if svala_data_part[svala_data_i]['text'].strip() != tok['token']:
            #     if tok['text'] == '§' and svala_data_part[svala_data_i]['token'].strip() == '§§§':
            #         wierd_sign_count += 1
            #         if wierd_sign_count < 3:
            #             continue
            #         else:
            #             tok['text'] = '§§§'
            #             wierd_sign_count = 0
            #     else:
            #         raise 'Word mismatch!'
            assert svala_data_part[svala_data_i]['text'].strip() == tok['token']
            sentence_id += 1
            tok['svala_id'] = svala_data_part[svala_data_i]['id']
            svala_data_i += 1

def update_ids(pretag, in_list):
    for el in in_list:
        el['id'] = f'{pretag}.{el["id"]}'

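# process_obeliks_paragraph builds per-sentence source/target token lists for one paragraph.
# Where raw source/target texts exist they are retokenized with CLASSLA and mapped back to
# svala ids; otherwise the tokens are taken directly from the Šolar TEI elements of the paragraph.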
def process_obeliks_paragraph(sentences, paragraph, svala_i, svala_data, add_errors_func, source_raw_text, target_raw_text, nlp_tokenize):
    if source_raw_text is not None:
        text = read_raw_text(source_raw_text)
        raw_text, source_tokenized, metadocument = nlp_tokenize.processors['tokenize']._tokenizer.tokenize(text) if text else ([], [], [])
        source_res = map_svala_tokenized(svala_data['source'], source_tokenized)

    if target_raw_text is not None:
        text = read_raw_text(target_raw_text)
        raw_text, target_tokenized, metadocument = nlp_tokenize.processors['tokenize']._tokenizer.tokenize(text) if text else ([], [], [])
        target_res = map_svala_tokenized(svala_data['target'], target_tokenized)

    par_source = []
    par_target = []

    sentences_len = len(sentences)
    source_conllus = []
    target_conllus = []
    if source_raw_text is not None:
        sentences_len = max(sentences_len, len(source_res))
    if target_raw_text is not None:
        sentences_len = max(sentences_len, len(target_res))

    for sentence_id in range(sentences_len):
        source = []
        target = []

        sentence_id += 1
        source_i = 1
        target_i = 1
        sentence_string_id = paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'] + f'.{sentence_id}'
        sentence_string_id_split = sentence_string_id.split('.')

        if sentence_id - 1 < len(sentences):
            sentence = sentences[sentence_id - 1]
            for el in sentence:
                if el.tag.startswith('w'):
                    if source_raw_text is None:
                        add_source(str(svala_i), source_i, sentence_string_id_split, source, el)
                    if target_raw_text is None:
                        add_target(str(svala_i), target_i, sentence_string_id_split, target, el)
                    svala_i += 1
                    source_i += 1
                    target_i += 1
                elif el.tag.startswith('pc'):
                    if source_raw_text is None:
                        add_source(str(svala_i), source_i, sentence_string_id_split, source, el)
                    if target_raw_text is None:
                        add_target(str(svala_i), target_i, sentence_string_id_split, target, el)
                    svala_i += 1
                    source_i += 1
                    target_i += 1
                elif el.tag.startswith('u'):
                    if source_raw_text is None or target_raw_text is None:
                        svala_i, source_i, target_i = add_errors_source_target_only(svala_i, source_i, target_i, el, source, target, svala_data, sentence_string_id)
                    else:
                        svala_i, source_i, target_i = add_errors_func(svala_i, source_i, target_i, el, source, target,
                                                                      svala_data, sentence_string_id)
                elif el.tag.startswith('c'):
                    if len(source) > 0:
                        source[-1]['space_after'] = True
                    if len(target) > 0:
                        target[-1]['space_after'] = True

        if source_raw_text is not None and sentence_id - 1 < len(source_res):
            source = source_res[sentence_id - 1]
            update_ids(f'{sentence_string_id_split[0]}s.{".".join(sentence_string_id_split[1:])}', source)
            par_source.append(source)

        source_conllu = ''
        if len(source) > 0:
            source_conllu = create_conllu(source, sentence_string_id)

        if target_raw_text is not None and sentence_id - 1 < len(target_res):
            target = target_res[sentence_id - 1]
            update_ids(f'{sentence_string_id_split[0]}t.{".".join(sentence_string_id_split[1:])}', target)
            par_target.append(target)

        if source_raw_text is None:
            par_source.append(source)

        if target_raw_text is None:
            par_target.append(target)

        target_conllu = ''
        if len(target) > 0:
            target_conllu = create_conllu(target, sentence_string_id)

        if source_raw_text is None or len(source_conllus) < len(par_source):
            source_conllus.append(source_conllu)

        if target_raw_text is None or len(target_conllus) < len(par_target):
            target_conllus.append(target_conllu)

    # reannotate svala_ids
    if source_raw_text is None:
        map_svala_solar2(svala_data['source'], par_source)
    if target_raw_text is None:
        map_svala_solar2(svala_data['target'], par_target)

    sentence_edges = create_edges(svala_data, par_source, par_target)

    return sentence_edges, source_conllus, target_conllus

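# tokenize walks every <div> (document) and <p> (paragraph) of the Šolar TEI file, pairs each
# paragraph with its svala JSON file (preferring the corrected 1.0.1 files when present) and
# produces tokenized source/target divs plus sentence alignment edges, cached as a pickle.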
def tokenize(args):
    if os.path.exists(args.tokenization_interprocessing) and not args.overwrite_tokenization:
        print('READING AND MERGING...')
        with open(args.tokenization_interprocessing, 'rb') as rp:
            tokenized_source_divs, tokenized_target_divs, document_edges = pickle.load(rp)
            return tokenized_source_divs, tokenized_target_divs, document_edges

    print('TOKENIZING...')
    with open(args.solar_file, 'r') as fp:
        logging.info(args.solar_file)
        et = ElementTree.XML(fp.read())

    nlp_tokenize = classla.Pipeline('sl', processors='tokenize', pos_lemma_pretag=True)

    # filename_encountered = False
    i = 0
    folders_count = 5484
    tokenized_source_divs = []
    tokenized_target_divs = []
    document_edges = []
    for div in et.iter('div'):
        bibl = div.find('bibl')
        file_name = bibl.get('n')
        file_name = file_name.replace('/', '_')
        print(f'{i * 100 / folders_count}% : {file_name}')
        i += 1
        # if file_name == 'S20-PI-slo-2-SG-D-2016_2017-30479-12.txt':
        # if file_name == 'KUS-PI-slo-5-CE-E-2009-30137':
        # # # if i*100/folders_count > 40:
        #     filename_encountered = True
        # # # # if i*100/folders_count > 41:
        # # # #     filename_encountered = False
        # if not filename_encountered:
        #     continue

        svala_path = os.path.join(args.svala_folder, file_name)
        corrected_svala_path = os.path.join(args.corrected_svala_folder, file_name)
        raw_texts_path = os.path.join(args.svala_generated_text_folder, file_name)

        svala_list = [[fname[:-13], fname] if 'problem' in fname else [fname[:-5], fname] for fname in os.listdir(svala_path)] if os.path.isdir(svala_path) else []
        svala_dict = {e[0]: e[1] for e in svala_list}

        if os.path.exists(corrected_svala_path):
            corrected_svala_list = [[fname[:-13], fname] if 'problem' in fname else [fname[:-5], fname] for fname in os.listdir(corrected_svala_path)]
            corrected_svala_dict = {e[0]: e[1] for e in corrected_svala_list}
            svala_dict.update(corrected_svala_dict)

        assert len(svala_dict) != 0

        tokenized_source_paragraphs = []
        tokenized_target_paragraphs = []
        paragraph_edges = []

        paragraphs = div.findall('p')
        for paragraph in paragraphs:
            sentences = paragraph.findall('s')
            svala_i = 1

            # read json
            # if paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'] == 'solar5.7':
            #     print('here')
            svala_file = os.path.join(svala_path, svala_dict[paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id']])
            corrected_svala_file = os.path.join(corrected_svala_path, svala_dict[paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id']])
            add_errors_func = add_errors if not os.path.exists(corrected_svala_file) else add_errors1_0_1
            jf = open(svala_file) if not os.path.exists(corrected_svala_file) else open(corrected_svala_file)
            svala_data = json.load(jf)
            jf.close()

            source_filename = svala_dict[paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id']][:-5] + '_source.json'
            target_filename = svala_dict[paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id']][:-5] + '_target.json'

            source_raw_text = os.path.join(raw_texts_path, source_filename) if os.path.exists(os.path.join(raw_texts_path, source_filename)) else None
            target_raw_text = os.path.join(raw_texts_path, target_filename) if os.path.exists(os.path.join(raw_texts_path, target_filename)) else None

            if not (source_raw_text or target_raw_text):
                sentence_edges, tokenized_source_sentences, tokenized_target_sentences = process_solar2_paragraph(sentences, paragraph, svala_i, svala_data, add_errors_func)
            else:
                sentence_edges, tokenized_source_sentences, tokenized_target_sentences = process_obeliks_paragraph(sentences, paragraph, svala_i,
                                                                                                                   svala_data, add_errors_func, source_raw_text, target_raw_text, nlp_tokenize)

            tokenized_source_paragraphs.append(tokenized_source_sentences)
            tokenized_target_paragraphs.append(tokenized_target_sentences)
            paragraph_edges.append(sentence_edges)

        tokenized_source_divs.append(tokenized_source_paragraphs)
        tokenized_target_divs.append(tokenized_target_paragraphs)
        document_edges.append(paragraph_edges)

    with open(args.tokenization_interprocessing, 'wb') as wp:
        pickle.dump((tokenized_source_divs, tokenized_target_divs, document_edges), wp)

    return tokenized_source_divs, tokenized_target_divs, document_edges

def annotate(tokenized_source_divs, tokenized_target_divs, args):
    if os.path.exists(args.annotation_interprocessing) and not args.overwrite_annotation:
        print('READING...')
        with open(args.annotation_interprocessing, 'rb') as rp:
            annotated_source_divs, annotated_target_divs = pickle.load(rp)
            return annotated_source_divs, annotated_target_divs

    nlp = classla.Pipeline('sl', pos_use_lexicon=True, pos_lemma_pretag=False, tokenize_pretokenized="conllu",
                           type='standard_jos')

    annotated_source_divs = []
    complete_source_conllu = ''
    print('ANNOTATING SOURCE...')
    for i, div in enumerate(tokenized_source_divs):
        print(f'{i * 100 / len(tokenized_source_divs)}')
        annotated_source_pars = []
        for par in div:
            annotated_source_sens = []
            for sen in par:
                source_conllu_annotated = nlp(sen).to_conll() if sen else ''
                annotated_source_sens.append(source_conllu_annotated)
                complete_source_conllu += source_conllu_annotated
            annotated_source_pars.append(annotated_source_sens)
        annotated_source_divs.append(annotated_source_pars)

    annotated_target_divs = []
    complete_target_conllu = ''
    print('ANNOTATING TARGET...')
    for i, div in enumerate(tokenized_target_divs):
        print(f'{i * 100 / len(tokenized_target_divs)}')
        annotated_target_pars = []
        for par in div:
            annotated_target_sens = []
            for sen in par:
                target_conllu_annotated = nlp(sen).to_conll() if sen else ''
                annotated_target_sens.append(target_conllu_annotated)
                complete_target_conllu += target_conllu_annotated
            annotated_target_pars.append(annotated_target_sens)
        annotated_target_divs.append(annotated_target_pars)

    with open(os.path.join(args.results_folder, "source.conllu"), 'w') as sf:
        sf.write(complete_source_conllu)

    with open(os.path.join(args.results_folder, "target.conllu"), 'w') as sf:
        sf.write(complete_target_conllu)

    with open(args.annotation_interprocessing, 'wb') as wp:
        pickle.dump((annotated_source_divs, annotated_target_divs), wp)

    return annotated_source_divs, annotated_target_divs

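# write_tei re-reads the Šolar TEI file to preserve document order, combines it with the
# annotated CoNLL-U sentences and writes source.xml, target.xml, links.xml/links.json and the
# combined complete.xml into the results folder.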
def write_tei(annotated_source_divs, annotated_target_divs, document_edges, args):
    print('BUILDING LINKS...')
    etree_links = build_links(document_edges)
    with open(os.path.join(args.results_folder, "links.xml"), 'w') as tf:
        tf.write(etree.tostring(etree_links, pretty_print=True, encoding='utf-8').decode())
    with open(os.path.join(args.results_folder, "links.json"), 'w') as jf:
        json.dump(document_edges, jf, ensure_ascii=False, indent=' ')

    print('WRITING TEI...')
    etree_source_documents = []
    etree_target_documents = []
    etree_source_divs = []
    etree_target_divs = []

    with open(args.solar_file, 'r') as fp:
        logging.info(args.solar_file)
        et = ElementTree.XML(fp.read())

    # filename_encountered = False
    i = 0
    folders_count = 5484
    div_i = 0
    for div in et.iter('div'):
        bibl = div.find('bibl')
        file_name = bibl.get('n')
        file_name = file_name.replace('/', '_')
        print(f'{i * 100 / folders_count}% : {file_name}')
        i += 1
        # if i * 100 / folders_count > 50:
        #     filename_encountered = True
        # # if file_name == 'KUS-G-slo-4-GO-E-2009-10071':
        # #     filename_encountered = True
        # if i * 100 / folders_count > 51:
        #     filename_encountered = False
        #
        # if file_name == 'KUS-G-slo-1-LJ-E-2009_2010-10540':
        #     # div_i -= 1
        #     continue
        #
        # if file_name == 'KUS-SI-slo-2-NM-E-2009_2010-20362' or file_name == 'KUS-OS-slo-9-SG-R-2009_2010-40129' or file_name == 'KUS-OS-slo-7-SG-R-2009_2010-40173':
        #     # div_i -= 1
        #     continue
        #
        # if not filename_encountered:
        #     div_i += 1
        #
        #     continue

        etree_source_paragraphs = []
        etree_target_paragraphs = []
        # paragraph_edges = []

        paragraphs = div.findall('p')
        par_i = 0
        for paragraph in paragraphs:
            etree_source_sentences = []
            etree_target_sentences = []

            for sentence_id, source_conllu_annotated in enumerate(annotated_source_divs[div_i][par_i]):
                if len(source_conllu_annotated) > 0:
                    source_conllu_parsed = conllu.parse(source_conllu_annotated)[0]
                    etree_source_sentences.append(construct_sentence_from_list(str(sentence_id + 1), source_conllu_parsed, True))

            for sentence_id, target_conllu_annotated in enumerate(annotated_target_divs[div_i][par_i]):
                if len(target_conllu_annotated) > 0:
                    target_conllu_parsed = conllu.parse(target_conllu_annotated)[0]
                    etree_target_sentences.append(construct_sentence_from_list(str(sentence_id + 1), target_conllu_parsed, False))

            etree_source_paragraphs.append(construct_paragraph_from_list(paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'].split('.')[0], paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'].split('.')[1], etree_source_sentences, True))
            etree_target_paragraphs.append(construct_paragraph_from_list(paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'].split('.')[0], paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'].split('.')[1], etree_target_sentences, False))
            par_i += 1

        etree_bibl = convert_bibl(bibl)

        etree_source_divs.append((etree_source_paragraphs, copy.deepcopy(etree_bibl), paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'].split('.')[0] + 's'))
        etree_target_divs.append((etree_target_paragraphs, copy.deepcopy(etree_bibl), paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'].split('.')[0] + 't'))

        div_i += 1

    print('APPENDING DOCUMENT...')
    etree_source_documents.append(
        TeiDocument(paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'].split('.')[0] + 's',
                    etree_source_divs, etree_target_divs))
    etree_target_documents.append(
        TeiDocument(paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'].split('.')[0] + 't',
                    etree_target_divs, etree_source_divs))

    print('BUILDING TEI DOCUMENTS...')
    etree_source = build_tei_etrees(etree_source_documents)
    etree_target = build_tei_etrees(etree_target_documents)

    print('Writing all but complete')
    with open(os.path.join(args.results_folder, "source.xml"), 'w') as sf:
        sf.write(etree.tostring(etree_source[0], pretty_print=True, encoding='utf-8').decode())
    with open(os.path.join(args.results_folder, "target.xml"), 'w') as tf:
        tf.write(etree.tostring(etree_target[0], pretty_print=True, encoding='utf-8').decode())

    print('COMPLETE TREE CREATION...')
    complete_etree = build_complete_tei(copy.deepcopy(etree_source), copy.deepcopy(etree_target), etree_links)
    # complete_etree = build_complete_tei(etree_source, etree_target, etree_links)

    print('WRITING COMPLETE TREE')
    with open(os.path.join(args.results_folder, "complete.xml"), 'w') as tf:
        tf.write(etree.tostring(complete_etree, pretty_print=True, encoding='utf-8').decode())

def process_file(args):
    if os.path.exists(args.results_folder):
        shutil.rmtree(args.results_folder)
    os.mkdir(args.results_folder)

    # READ AND MERGE svala tokenization, solar2 tokenization and obeliks tokenization
    tokenized_source_divs, tokenized_target_divs, document_edges = tokenize(args)

    # ANNOTATE WITH CLASSLA
    annotated_source_divs, annotated_target_divs = annotate(tokenized_source_divs, tokenized_target_divs, args)

    # GENERATE TEI AND WRITE OUTPUT
    write_tei(annotated_source_divs, annotated_target_divs, document_edges, args)


def main(args):
    process_file(args)

if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Merge svala annotations with the Šolar TEI corpus, annotate the text with CLASSLA and write TEI with linked source and target tokens.')
    parser.add_argument('--solar_file', default='data/Solar2.0/solar2.xml',
                        help='Path to the Šolar 2.0 TEI XML file.')
    parser.add_argument('--svala_folder', default='data/solar.svala',
                        help='Folder with svala JSON files, one subfolder per document.')
    parser.add_argument('--corrected_svala_folder', default='data/solar.svala.fixed.1.0.1_2',
                        help='Folder with manually corrected svala JSON files that take precedence over --svala_folder.')
    parser.add_argument('--results_folder', default='data/results/solar3.0',
                        help='Output folder for the generated TEI, CoNLL-U and links files.')
    parser.add_argument('--svala_generated_text_folder', default='data/svala_generated_text.formatted',
                        help='Folder with raw source/target texts generated from svala, used for retokenization.')
    parser.add_argument('--tokenization_interprocessing', default='data/processing.tokenization',
                        help='Pickle file used to cache tokenization results between runs.')
    parser.add_argument('--overwrite_tokenization', action='store_true',
                        help='Redo tokenization even if the cached pickle exists.')
    parser.add_argument('--annotation_interprocessing', default='data/processing.annotation',
                        help='Pickle file used to cache CLASSLA annotation results between runs.')
    parser.add_argument('--overwrite_annotation', action='store_true',
                        help='Redo annotation even if the cached pickle exists.')
    args = parser.parse_args()

    start = time.time()
    main(args)
    logging.info("TIME: {}".format(time.time() - start))