Compare commits

No commits in common. "ssj500k" and "master" have entirely different histories.

23 changed files with 72 additions and 1764 deletions

.gitignore

@@ -6,6 +6,3 @@ nohup.out
data/kres_out/*
data/kres_example/
venv/
.idea/
data/

@@ -6,9 +6,8 @@ json_files: # srl_tagged_files
cd tools; python3 gen_json.py
srl_tagged_files: # tsv_files
# # cd tools/srl-20131216; ./scripts/parse_srl_only_mod.sh; cd -
# cd tools/srl-20131216; ./scripts/parse_srl_only_mod.sh; cd -
cd tools/srl-20131216; ./tag_all.sh
# cd tools/srl-20131216; ./tag_ssj500k2.3.sh
tsv_files: # tools/fillpred_model/model.pickle
cd tools; python3 parse_all.py

@@ -1,11 +1,3 @@
# Instructions
For mining ssj500k, <b>check out the ssj500k branch</b>.
For the running order, see the Makefile. Generally it works like this:
- tools/parse_all.py - creates the mate file required to run the Java-based srl.jar
- tools/srl-20131216/tag_all.sh - tags ssj500k
- tools/gen_json.py - mines SRL annotations into JSON
- tools/gen_tei.py - mines SRL annotations into TEI
# cjvt-srl-tagging
We'll be using mate-tools to perform SRL on Kres.

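For reference, a minimal sketch of the running order described in the README diff above, mirroring the Makefile targets shown earlier (tsv_files, srl_tagged_files, json_files). The scripts and working directories come from this diff; everything else is illustrative:

```python
# Minimal sketch of the pipeline order from the README/Makefile (illustrative only).
import subprocess

steps = [
    ("tools", ["python3", "parse_all.py"]),     # TEI/JOS input -> mate-tools TSV
    ("tools/srl-20131216", ["./tag_all.sh"]),   # mate-tools SRL tagging
    ("tools", ["python3", "gen_json.py"]),      # tagged TSV -> JSON SRL relations
    ("tools", ["python3", "gen_tei.py"]),       # JSON SRL relations -> TEI
]

for cwd, cmd in steps:
    subprocess.run(cmd, cwd=cwd, check=True)  # abort on the first failing step
```
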
@@ -15,6 +15,6 @@ run:
-v /etc/group:/etc/group \
-v $(shell pwd)/../../:/cjvt-srl-tagging \
-w /cjvt-srl-tagging \
-v /home/luka/Development/srl/data:/kres_mount:ro \
-v /home/kristjan/kres_mount:/kres_mount:ro \
python-java \
/bin/bash

@@ -1,19 +0,0 @@
import os
# INPATH = Path(config["tools"]["giga_srl"])
# infiles = list(enumerate([x for x in iter(sorted(INPATH.iterdir())) if x.is_file()]))
SOURCEPATH = '/media/luka/Portable Disk/Datasets/gigafida_jos/gf2_files'
from shutil import copyfile
INPATH = '/media/luka/Portable Disk/Datasets/gigafida_jos/2_srl'
OUTPATH = '/home/luka/Development/srl/cjvt-srl-tagging/data/giga_out/1_tsv'
for i in range(100000):
# print(os.path.join(INPATH, 'giga.%07d.tsv' % i))
# if not os.path.exists(os.path.join(INPATH, 'giga.%07d.tsv' % i)):
# print('giga.%07d.tsv' % i)
if not os.path.exists(os.path.join(INPATH, 'giga%07d.srl.tsv' % i)):
copyfile(os.path.join(SOURCEPATH, 'giga.%07d.tsv' % i), os.path.join(OUTPATH, 'giga%07d.tsv' % i))
print('giga%07d.srl.tsv' % i)
if i % 1000 == 0:
print(i)

@@ -1,192 +0,0 @@
import pickle
from parser.parser import Parser
import os
from os.path import join, dirname
from pathlib import Path
import re
import sys
import cProfile
import configparser
import logging
from multiprocessing import Pool
SSJ500K_2_1 = 27829 # number of sentences
par = Parser()
# path to data
config = configparser.ConfigParser()
config.read("tools.cfg")
analysis = ''
if 'kres_orig' in config["tools"]:
analysis = 'kres'
INDIR = Path(config["tools"]["kres_orig"])
OUTDIR = Path(config["tools"]["kres_tsv"])
elif 'giga_orig' in config["tools"]:
# analysis = 'gigafida'
analysis = 'giga'
INDIR_GIGA = Path(config["tools"]["giga_orig"])
INDIR_GIGA_ORIG = Path(config["tools"]["giga"])
INDIR_JOS = Path(config["tools"]["giga_jos"])
OUTDIR = Path(config["tools"]["giga_tsv"])
GIGA_PARTS = int(config["tools"]["giga_parts"])
INTERNAL_DATA = config["tools"]["internal_data"]
CPU_CORES = int(config["tools"]["cpu_cores"])
LOGFILE = Path(config["tools"]["logfile"]).absolute()
LOGFILE.touch(exist_ok=True)
LOGFILE.resolve()
logging.basicConfig(filename=str(LOGFILE), level=logging.INFO)
origfiles = []
for subdir, dirs, files in os.walk(INDIR_GIGA_ORIG):
for file in files:
origfiles.append(Path(os.path.join(subdir, file)))
origfiles=list(enumerate(sorted(origfiles)))
def giga_orig_sentence_generator():
with open(INDIR_GIGA, 'r') as gof:
previous_new_line = False
sentence_words = []
for l_gof in gof:
if l_gof == '\n':
yield ' '.join(sentence_words)
sentence_words = []
else:
sentence_words.append(l_gof.split('\t')[0])
# yield l_gof
sentence_generator = giga_orig_sentence_generator()
sentence_ids = []
for origfile in origfiles:
split_file_sentences = par.parse_tei(origfile[1])
for k, v in split_file_sentences.items():
one_file_sentence = next(sentence_generator)
if one_file_sentence == v['text']:
sentence_ids.append(v['sid'])
else:
print('----------------')
print('ERROR')
print(v['sid'])
print(one_file_sentence)
print(v['text'])
print(origfile[0])
# count sentences in orig (if not counted before)
# os.remove(os.path.join(INTERNAL_DATA, 'orig_counted_sentences.pkl'))
if os.path.exists(os.path.join(INTERNAL_DATA, 'sentence_ids_list.pkl')):
os.remove(os.path.join(INTERNAL_DATA, 'sentence_ids_list.pkl'))
with open(os.path.join(INTERNAL_DATA, 'sentence_ids_list.pkl'), 'wb') as output:
pickle.dump(sentence_ids, output)
# def giga_orig_generator():
# with open(INDIR_GIGA, 'r') as gof:
# previous_new_line = False
# for l_gof in gof:
# if l_gof == '\n':
# if previous_new_line:
# continue
# previous_new_line = True
# elif previous_new_line:
# previous_new_line = False
# yield l_gof
# import time
# def handle_giga_file(ran):
# """
# File that splits big text file into more minor files. Only split on empty lines.
# """
# # with open(INDIR_GIGA, 'r') as gof:
# # with open(INDIR_JOS, 'r') as gjf:
# # for i, (l_gof, l_gjf) in enumerate(zip(gof, gjf)):
# # pass
# # num_lines = i + 1
# # print(num_lines)
# num_lines = 1393184026
# # 1393184026
# # 1393184033
# # return
# num_lines_per_part = num_lines / GIGA_PARTS
# curr_part = 0
# gof_generator = giga_orig_generator()
#
# diff_files = set()
# # with open(INDIR_GIGA, 'r') as gof:
# with open(INDIR_GIGA_OLD, 'r') as gjf:
# # sentence = {}
# # sentence['tokens'] = []
# # sentence['links'] = {}
# # if os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % ran[0])):
# # os.remove(os.path.join(OUTDIR, 'giga%07d.tsv' % ran[0]))
#
# # wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % ran[0]), 'a')
#
# # for i, (l_gof, l_gjf) in enumerate(zip(gof, gjf)):
# for i, l_gjf in enumerate(gjf):
# l_gof = next(gof_generator)
# if curr_part < ran[0]:
# if i > num_lines_per_part * (curr_part + 1) and l_gof == '\n' :
# if curr_part < ran[0]:
# print(curr_part)
# curr_part += 1
# continue
# else:
# continue
#
# l_gof_split = l_gof.split('\t')
# l_gjf_split = l_gjf.split('\t')
#
# # if punctuation
# if l_gof != '\n':
# if l_gof_split != l_gjf_split:
# print(curr_part)
# diff_files.add(curr_part)
# l_gof = next(gof_generator)
#
#
# # if l_gof == '\n':
# else:
# # wf.flush()
# # if i > num_lines_per_part * (curr_part + 1) and l_gof == '\n':
# if i > num_lines_per_part * (curr_part + 1):
# curr_part += 1
# # if wf doesn't exist (first one)
# # wf.close()
# if curr_part >= ran[1]:
# break
# # if os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part)):
# # os.remove(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part))
#
# # wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part), 'a')
#
# curr_part += 1
# return diff_files
# # wf.close()
#
# with Pool(CPU_CORES) as p:
# final_range = [0, 100000]
# # final_range = [0, 150]
# # size_per_proc = (final_range[1] - final_range[0]) / CPU_CORES
# # # splits = [int(final_range[0] + size_per_proc) for i in range(CPU_CORES)]
# # ranges = []
# # ps = None
# # for i in range(CPU_CORES):
# # s = int(final_range[0] + size_per_proc * i)
# # ns = int(final_range[0] + size_per_proc * (i + 1))
# # ranges.append([s, ns])
# # # ranges = [[0, 1]]
# # res = p.map(handle_giga_file, ranges)
#
# res = handle_giga_file(final_range)
# res = sorted(list(res))
# if os.path.exists(os.path.join(INTERNAL_DATA, 'diffs_updated_gigafida.pkl')):
# os.remove(os.path.join(INTERNAL_DATA, 'diffs_updated_gigafida.pkl'))
# with open(os.path.join(INTERNAL_DATA, 'diffs_updated_gigafida.pkl'), 'wb') as pkl_file:
# pickle.dump(res, pkl_file)
# # with open(os.path.join(INTERNAL_DATA, 'diffs_updated_gigafida.pkl'), 'rb') as pkl_file:
# # mydict2 = pickle.load(pkl_file)
# print('test')

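A toy illustration of the sentence reconstruction performed by giga_orig_sentence_generator above: the first tab-separated field of each line is collected and a sentence is yielded at every blank line. The three-token input below is hypothetical:

```python
import io

def sentences(lines):
    # join the first tab-separated field of each line; yield on blank lines
    words = []
    for line in lines:
        if line == "\n":
            yield " ".join(words)
            words = []
        else:
            words.append(line.split("\t")[0])

conll = io.StringIO("To\tX\nje\tX\nstavek\tX\n\n")  # hypothetical 3-token sentence
print(list(sentences(conll)))  # ['To je stavek']
```
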
@@ -1,114 +0,0 @@
from pathlib import Path
from parser.parser import Parser
import configparser
import json
import sys
import logging
from multiprocessing import Pool
# parse config
config = configparser.ConfigParser()
config.read("tools.cfg")
# ORIGPATH = Path(config["tools"]["kres_orig"])
INPATH = Path(config["tools"]["giga_srl"])
OUTPATH = Path(config["tools"]["kres_json"])
DEBUG = config["tools"]["debug"] == "True"
CPU_CORES = int(config["tools"]["cpu_cores"])
LOGFILE = Path(config["tools"]["logfile"]).absolute()
LOGFILE.touch(exist_ok=True)
LOGFILE.resolve()
logging.basicConfig(filename=str(LOGFILE), level=logging.INFO)
def get_origfile(filename):
for origfile in ORIGPATH.iterdir():
if filename.name.split('.')[0] == origfile.name.split('.')[0]:
return origfile
raise FileNotFoundError
def extract_sentences(line_reader):
acc = []
# last char in line is \n, remove it
for line in [x.decode("utf-8")[:-1].split('\t') for x in line_reader]:
if len(line) == 1: # empty line
tmp = acc
acc = []
yield tmp
else:
acc.append(line)
def to_sentence(sentence_arr):
return " ".join([token[1] for token in sentence_arr])
def match_sentence_id(sentence, orig_dict):
for k, e in orig_dict.items():
orig_sentence = " ".join(token[2] for token in e["tokens"])
if sentence == orig_sentence:
return k
raise KeyError
def get_dep_rel(token):
logging.debug(token)
for i, field in enumerate(token[14:]):
if field != "_":
return {
"arg": field,
"from": i, # i-th predicate in sentence
"dep": token[0],
}
return None
def handle_file(infile_tpl):
i = infile_tpl[0]
infile = infile_tpl[1]
outfile = (OUTPATH / infile.name).with_suffix(".json")
origfile = get_origfile(infile)
orig_dict = par.parse_tei(origfile)
with infile.open("rb") as fp:
outdata = {}
for sentence_arr in extract_sentences(fp.readlines()):
# tsv dropped sentence ids, match the ID, using original data
sid = match_sentence_id(to_sentence(sentence_arr), orig_dict)
outdata[sid] = []
# find all predicate indices in the sentence
predicates = []
for token in sentence_arr:
if token[12] == "Y":
predicates += [token[0]] # idx
deprel = get_dep_rel(token)
if deprel is not None:
outdata[sid].append(deprel)
# deprel["from"] points to n-th predicate
# replace with predicate's token index
for deprel in outdata[sid]:
deprel["from"] = predicates[deprel["from"]]
if DEBUG:
print(to_sentence(sentence_arr))
print(outdata[sid])
print(sid)
print()
print()
with outfile.open("w") as fp:
json.dump(outdata, fp)
logging.info("SRL relations written to: {}".format(outfile))
# main
par = Parser()
OUTPATH.mkdir(exist_ok=True)
infiles = list(enumerate([x for x in INPATH.iterdir() if x.is_file()]))
logging.info("Generating JSON SRL files from {} tsv files.".format(len(infiles)))
with Pool(CPU_CORES) as p:
p.map(handle_file, infiles)
logging.info("Finished generating .json files.")

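To make the JSON generation above concrete: column 12 of each token row flags a predicate ("Y"), any non-"_" value in the APRED columns (index 14 onward) names an argument, and each deprel's "from" (the n-th predicate in the sentence) is rewritten to that predicate's token id. A toy run with hypothetical two-token rows:

```python
def get_dep_rel(token):
    # same logic as in the diff: first non-"_" APRED column wins
    for i, field in enumerate(token[14:]):
        if field != "_":
            return {"arg": field, "from": i, "dep": token[0]}
    return None

# hypothetical sentence: token 2 is the only predicate, token 1 is its ACT argument
sentence = [
    ["1", "Janez"] + ["_"] * 10 + ["_", "_", "ACT"],  # APRED1 = ACT
    ["2", "tece"]  + ["_"] * 10 + ["Y", "_", "_"],    # column 12 = Y -> predicate
]

predicates, out = [], []
for token in sentence:
    if token[12] == "Y":
        predicates.append(token[0])
    deprel = get_dep_rel(token)
    if deprel is not None:
        out.append(deprel)
for deprel in out:
    deprel["from"] = predicates[deprel["from"]]  # 0th predicate -> token "2"
print(out)  # [{'arg': 'ACT', 'from': '2', 'dep': '1'}]
```
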
@@ -1,8 +1,3 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import pickle
from pathlib import Path
from parser.parser import Parser
import configparser
@@ -13,11 +8,10 @@ from multiprocessing import Pool
# parse config
config = configparser.ConfigParser()
config.read("tools.cfg.ssj500k2.3")
ORIGPATH = Path(config["tools"]["ssj500k_orig_folder"])
INPATH = Path(config["tools"]["ssj500k_srl"])
OUTPATH = Path(config["tools"]["ssj500k_json"])
INTERNAL_DATA = Path(config["tools"]["internal_data"])
config.read("tools.cfg")
ORIGPATH = Path(config["tools"]["kres_orig"])
INPATH = Path(config["tools"]["kres_srl"])
OUTPATH = Path(config["tools"]["kres_json"])
DEBUG = config["tools"]["debug"] == "True"
CPU_CORES = int(config["tools"]["cpu_cores"])
@@ -54,13 +48,6 @@ def match_sentence_id(sentence, orig_dict):
return k
raise KeyError
def match_sentence_id_giga(sentence, orig_dict):
for k, e in orig_dict.items():
# orig_sentence = " ".join(token[2] for token in e["tokens"])
if sentence == e["text"]:
return k
raise KeyError
def get_dep_rel(token):
logging.debug(token)
for i, field in enumerate(token[14:]):
@@ -72,7 +59,7 @@ def get_dep_rel(token):
}
return None
def handle_file_old(infile_tpl):
def handle_file(infile_tpl):
i = infile_tpl[0]
infile = infile_tpl[1]
outfile = (OUTPATH / infile.name).with_suffix(".json")
@@ -114,275 +101,14 @@ def handle_file_old(infile_tpl):
logging.info("SRL relations written to: {}".format(outfile))
def handle_file(whole_input):
# sentence_id = whole_input[0][3]
# orig_infile = whole_input[0][1]
sentence_id = whole_input[3]
orig_infile = whole_input[1]
# origfile = origfiles[0][1]
# infile_tpl = infile_tpl[0]
# i = infile_tpl[0]
# infile = infile_tpl[1]
outfile = (OUTPATH / orig_infile.name).with_suffix(".json")
if outfile.exists():
return
# origfile = get_origfile()
orig_dict = par.parse_tei(orig_infile)
outdata = {}
gen = srl_multiple_files_sentences_generator(sentence_id)
# gen = srl_multiple_files_sentences_generator(whole_input[1])
mismatch_sentences = 0
for sentence_i, (orig_id, orig_val) in enumerate(orig_dict.items()):
if orig_id == 'GF0014802.2685.7':
print('PAUSE')
# look at neighbouring sentences if they are correct
sentence, sentence_arr = next(gen)
# orig_sentence = " ".join(token[2] for token in e["tokens"])
assert sentence.replace(' ', '') == orig_val['text']
# if i != 10 and i != 0:
# print('OK!')
sid = orig_id
outdata[sid] = []
# find all predicate indices in the sentence
predicates = []
for token in sentence_arr:
if token[12] == "Y":
predicates += [token[0]] # idx
deprel = get_dep_rel(token)
if deprel is not None:
outdata[sid].append(deprel)
# deprel["from"] points to n-th predicate
# replace with predicate's token index
for deprel in outdata[sid]:
deprel["from"] = predicates[deprel["from"]]
if DEBUG:
print(to_sentence(sentence_arr))
print(outdata[sid])
print(sid)
print()
print()
if mismatch_sentences > 0:
if mismatch_sentences / len(orig_dict.items()) < 0.1:
print('Slight mismatch - %d' % sentence_id)
print(whole_input)
print('ABS mitigated %d' % mismatch_sentences)
print('------------------------------------------------')
else:
print('ERRRRRRRRRRRRRRRROOOOOOORRRRRRRRRRR')
print('Big mismatch - %d' % sentence_id)
print(whole_input)
print('ABS mitigated errors:')
print(mismatch_sentences)
print('------------------------------------------------')
with outfile.open("w") as fp:
json.dump(outdata, fp)
logging.info("SRL relations written to: {}".format(outfile))
def count_orig_file_sentences(filename):
if os.path.exists(os.path.join(INTERNAL_DATA, 'orig_chunks', filename[1].name)):
return
print(filename[0])
orig_dict = par.parse_tei(filename[1])
# return filename[0], filename[1], len(orig_dict)
with open(os.path.join(INTERNAL_DATA, 'orig_chunks', filename[1].name), 'wb') as output:
pickle.dump((filename[0], filename[1], len(orig_dict)), output)
def count_srl_file_sentences(filename):
if os.path.exists(os.path.join(INTERNAL_DATA, 'srl_chunks', filename[1].name)):
return
print(filename[0])
num_sentences = 0
with filename[1].open("r") as fp:
for line in fp:
if line == '\n':
num_sentences += 1
# return filename[0], filename[1], num_sentences
with open(os.path.join(INTERNAL_DATA, 'srl_chunks', filename[1].name), 'wb') as output:
pickle.dump((filename[0], filename[1], num_sentences), output)
def srl_sentences_generator(infile, curr_index, sen_start_index):
with infile.open("rb") as fp:
outdata = {}
for sentence_arr in extract_sentences(fp.readlines()):
if curr_index < sen_start_index:
curr_index += 1
else:
yield to_sentence(sentence_arr), sentence_arr
yield None
def srl_multiple_files_sentences_generator(sentence_id): # srl_files):
sentence_id = max(0, sentence_id - 10)
for i, srl_file in enumerate(srl_file_sizes):
if sentence_id >= srl_file[3] and sentence_id < srl_file[3] + srl_file[2]:
srl_files = srl_file_sizes[i:]
break
for file_info in srl_files:
# srl_gen = srl_sentences_generator(file_info[1], file_info[3], file_info[4])
srl_gen = srl_sentences_generator(file_info[1], file_info[3], sentence_id)
el = next(srl_gen)
while el is not None:
yield el
el = next(srl_gen)
yield None
# main
par = Parser()
OUTPATH.mkdir(exist_ok=True)
infiles = list(enumerate([x for x in iter(sorted(INPATH.iterdir())) if x.is_file()]))
infiles = list(enumerate([x for x in INPATH.iterdir() if x.is_file()]))
logging.info("Generating JSON SRL files from {} tsv files.".format(len(infiles)))
origfiles = []
for subdir, dirs, files in os.walk(ORIGPATH):
for file in files:
origfiles.append(Path(os.path.join(subdir, file)))
origfiles=list(enumerate(sorted(origfiles)))
##### REMOVE ############
# origfiles = origfiles[:3]
# count sentences in orig (if not counted before)
# os.remove(os.path.join(INTERNAL_DATA, 'orig_counted_sentences.pkl'))
if not os.path.exists(os.path.join(INTERNAL_DATA, 'orig_counted_sentences.pkl')):
# srl_file_sizes = {}
if not os.path.exists(os.path.join(INTERNAL_DATA, 'orig_chunks')):
os.makedirs(os.path.join(INTERNAL_DATA, 'orig_chunks'))
# with Pool(CPU_CORES) as p:
# # p.map(handle_file, infiles)
# p.map(count_orig_file_sentences, origfiles)
for i in range(len(origfiles)):
count_orig_file_sentences(origfiles[i])
orig_file_sizes = []
for x in iter(sorted(Path(os.path.join(INTERNAL_DATA, 'orig_chunks')).iterdir())):
print(x.name)
if x.is_file():
with x.open('rb') as pkl_small_file:
orig_file_sizes.append(pickle.load(pkl_small_file))
# orig_file_sizes = list(enumerate([x for x in iter(sorted(INPATH.iterdir())) if x.is_file()]))
print("Sorting orig files")
orig_file_sizes = sorted(orig_file_sizes)
total_size = 0
orig_file_sizes_final = []
print("Calculating orig files size")
for n, pa, si in orig_file_sizes:
orig_file_sizes_final.append((n, pa, si, total_size))
total_size += si
orig_file_sizes = orig_file_sizes_final
print("Saving orig files size")
with open(os.path.join(INTERNAL_DATA, 'orig_counted_sentences.pkl'), 'wb') as output:
pickle.dump(orig_file_sizes, output)
print("Orig files saved")
else:
with open(os.path.join(INTERNAL_DATA, 'orig_counted_sentences.pkl'), 'rb') as pkl_file:
orig_file_sizes = pickle.load(pkl_file)
# count sentences in srl (if not counted before)
# os.remove(os.path.join(INTERNAL_DATA, 'srl_counted_sentences.pkl'))
if not os.path.exists(os.path.join(INTERNAL_DATA, 'srl_counted_sentences.pkl')):
# srl_file_sizes = {}
if not os.path.exists(os.path.join(INTERNAL_DATA, 'srl_chunks')):
os.makedirs(os.path.join(INTERNAL_DATA, 'srl_chunks'))
# with Pool(CPU_CORES) as p:
# # p.map(handle_file, infiles)
# p.map(count_srl_file_sentences, infiles)
for i in range(len(infiles)):
count_srl_file_sentences(infiles[i])
srl_file_sizes = []
for x in iter(sorted(Path(os.path.join(INTERNAL_DATA, 'srl_chunks')).iterdir())):
print(x.name)
if x.is_file():
with x.open('rb') as pkl_small_file:
srl_file_sizes.append(pickle.load(pkl_small_file))
print("Sorting srl files")
srl_file_sizes = sorted(srl_file_sizes)
total_size = 0
srl_file_sizes_final = []
print("Calculating srl files size")
for n, pa, si in srl_file_sizes:
srl_file_sizes_final.append((n, pa, si, total_size))
total_size += si
srl_file_sizes = srl_file_sizes_final
print("Saving srl files size")
with open(os.path.join(INTERNAL_DATA, 'srl_counted_sentences.pkl'), 'wb') as output:
pickle.dump(srl_file_sizes, output)
print("Srl files saved")
else:
with open(os.path.join(INTERNAL_DATA, 'srl_counted_sentences.pkl'), 'rb') as pkl_file:
srl_file_sizes = pickle.load(pkl_file)
# print(len(orig_file_sizes))
# print('asd' + 2)
# inputs = []
# srl_i = 0
# srl_file = srl_file_sizes[srl_i]
# for orig_i, orig_path, orig_size, orig_first_sent_i in orig_file_sizes:
# interesting_srl_files = []
# # beginning of srl chunk in range of orig chunk or ending of srl chunk in range of orig chunk
# # while srl_file[3] >= orig_first_sent_i and srl_file[3] < orig_first_sent_i + orig_size or \
# # srl_file[3] + srl_file[2] - 1 >= orig_first_sent_i and srl_file[3] + srl_file[2] - 1 < orig_first_sent_i + orig_size:
# while srl_file[3] < orig_first_sent_i + orig_size and srl_file[3] + srl_file[2] > orig_first_sent_i:
# # if beginning of file is in
# if srl_file[3] > orig_first_sent_i:
# interesting_srl_files.append((srl_file[0], srl_file[1], srl_file[2], srl_file[3], srl_file[3]))
# # print('if %d' % srl_file[3])
# else:
# interesting_srl_files.append((srl_file[0], srl_file[1], srl_file[2], srl_file[3], orig_first_sent_i))
# # print('else %d' % orig_first_sent_i)
#
# if orig_first_sent_i + orig_size >= srl_file[3] + srl_file[2]:
# srl_i += 1
# if srl_i < len(srl_file_sizes):
# srl_file = srl_file_sizes[srl_i]
# else:
# break
# # print(srl_i)
# # print('a ' + 2)
# else:
# break
#
# inputs.append([[orig_i, orig_path, orig_size, orig_first_sent_i], interesting_srl_files])
# print(inputs[-1])
# srl_gen = srl_sentences_generator(srl_file_sizes[0][1], 0, 533)
# a = next(srl_gen)
# b = next(srl_gen)
# c = next(srl_gen)
print('beginning processing')
with Pool(CPU_CORES) as p:
# p.map(handle_file, inputs)
p.map(handle_file, orig_file_sizes)
# for of in orig_file_sizes:
# handle_file(of)
p.map(handle_file, infiles)
logging.info("Finished generating .json files.")

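The sentence-counting code above builds, for every orig/SRL chunk, a record of the form (index, path, sentence_count, first_global_sentence_index), so a global sentence index can be mapped back to the chunk file that contains it. A toy sketch of that bookkeeping with hypothetical chunk sizes:

```python
# hypothetical chunk files and their sentence counts
chunks = [("a.tsv", 3), ("b.tsv", 5), ("c.tsv", 2)]

total = 0
indexed = []
for i, (path, size) in enumerate(chunks):
    indexed.append((i, path, size, total))  # total = first global sentence index
    total += size

def find_chunk(sentence_id):
    # locate the chunk whose [first, first + size) range contains sentence_id
    for i, path, size, first in indexed:
        if first <= sentence_id < first + size:
            return path
    raise IndexError(sentence_id)

print(find_chunk(4))  # 'b.tsv' (global sentences 3..7 live in b.tsv)
```
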
@@ -1,294 +0,0 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import pickle
from pathlib import Path
from parser.parser import Parser
import configparser
import json
import sys
import logging
from multiprocessing import Pool
# parse config
config = configparser.ConfigParser()
config.read("tools.cfg")
ORIGPATH = Path(config["tools"]["giga"])
INPATH = Path(config["tools"]["giga_srl_errors"])
OUTPATH = Path(config["tools"]["giga_json"])
INTERNAL_DATA = Path(config["tools"]["internal_data"])
DEBUG = config["tools"]["debug"] == "True"
CPU_CORES = int(config["tools"]["cpu_cores"])
LOGFILE = Path(config["tools"]["logfile"]).absolute()
LOGFILE.touch(exist_ok=True)
LOGFILE.resolve()
logging.basicConfig(filename=str(LOGFILE), level=logging.INFO)
error_sentences = [line.rstrip('\n') for line in open(os.path.join(INTERNAL_DATA, 'sentences_with_less_than_token.txt'))]
def get_origfile(filename):
for origfile in ORIGPATH.iterdir():
if filename.name.split('.')[0] == origfile.name.split('.')[0]:
return origfile
raise FileNotFoundError
def extract_sentences(line_reader):
acc = []
# last char in line is \n, remove it
for line in [x.decode("utf-8")[:-1].split('\t') for x in line_reader]:
if len(line) == 1: # empty line
tmp = acc
acc = []
yield tmp
else:
acc.append(line)
def to_sentence(sentence_arr):
return " ".join([token[1] for token in sentence_arr])
def match_sentence_id(sentence, orig_dict):
for k, e in orig_dict.items():
orig_sentence = " ".join(token[2] for token in e["tokens"])
if sentence == orig_sentence:
return k
raise KeyError
def match_sentence_id_giga(sentence, orig_dict):
for k, e in orig_dict.items():
# orig_sentence = " ".join(token[2] for token in e["tokens"])
if sentence == e["text"]:
return k
raise KeyError
def get_dep_rel(token):
logging.debug(token)
for i, field in enumerate(token[14:]):
if field != "_":
return {
"arg": field,
"from": i, # i-th predicate in sentence
"dep": token[0],
}
return None
def handle_file_old(infile_tpl):
i = infile_tpl[0]
infile = infile_tpl[1]
outfile = (OUTPATH / infile.name).with_suffix(".json")
origfile = get_origfile(infile)
orig_dict = par.parse_tei(origfile)
with infile.open("rb") as fp:
outdata = {}
for sentence_arr in extract_sentences(fp.readlines()):
# tsv dropped sentence ids, match the ID, using original data
sid = match_sentence_id(to_sentence(sentence_arr), orig_dict)
outdata[sid] = []
# find all predicate indices in the sentence
predicates = []
for token in sentence_arr:
if token[12] == "Y":
predicates += [token[0]] # idx
deprel = get_dep_rel(token)
if deprel is not None:
outdata[sid].append(deprel)
# deprel["from"] points to n-th predicate
# replace with predicate's token index
for deprel in outdata[sid]:
deprel["from"] = predicates[deprel["from"]]
if DEBUG:
print(to_sentence(sentence_arr))
print(outdata[sid])
print(sid)
print()
print()
with outfile.open("w") as fp:
json.dump(outdata, fp)
logging.info("SRL relations written to: {}".format(outfile))
def fix_json(srl_gen, error_sentence, orig_json_data):
# sentence_id = whole_input[0][3]
# orig_infile = whole_input[0][1]
# sentence_id = whole_input[3]
# orig_infile = whole_input[1]
# origfile = origfiles[0][1]
# infile_tpl = infile_tpl[0]
# i = infile_tpl[0]
# infile = infile_tpl[1]
# outfile = (OUTPATH / orig_infile.name).with_suffix(".json")
# if outfile.exists():
# return
# origfile = get_origfile()
# orig_dict = par.parse_tei(orig_infile)
# outdata = {}
# gen = srl_multiple_files_sentences_generator(sentence_id)
# gen = srl_multiple_files_sentences_generator(whole_input[1])
# mismatch_sentences = 0
# look at neighbouring sentences if they are correct
sentence, sentence_arr = next(srl_gen)
# orig_sentence = " ".join(token[2] for token in e["tokens"])
sid = error_sentence
# a = orig_json_data[sid]
if orig_json_data[sid] != []:
# print('POSSIBLE ERROR:')
# print(orig_json_data[sid])
orig_json_data[sid] = []
# find all predicate indices in the sentence
predicates = []
for token in sentence_arr:
if token[12] == "Y":
predicates += [token[0]] # idx
deprel = get_dep_rel(token)
if deprel is not None:
orig_json_data[sid].append(deprel)
# deprel["from"] points to n-th predicate
# replace with predicate's token index
for deprel in orig_json_data[sid]:
deprel["from"] = predicates[deprel["from"]]
if DEBUG:
print(to_sentence(sentence_arr))
print(orig_json_data[sid])
print(sid)
print()
print()
# a = orig_json_data[sid]
return orig_json_data
def count_orig_file_sentences(filename):
if os.path.exists(os.path.join(INTERNAL_DATA, 'orig_chunks', filename[1].name)):
return
print(filename[0])
orig_dict = par.parse_tei(filename[1])
# return filename[0], filename[1], len(orig_dict)
with open(os.path.join(INTERNAL_DATA, 'orig_chunks', filename[1].name), 'wb') as output:
pickle.dump((filename[0], filename[1], len(orig_dict)), output)
def count_srl_file_sentences(filename):
if os.path.exists(os.path.join(INTERNAL_DATA, 'srl_chunks', filename[1].name)):
return
print(filename[0])
num_sentences = 0
with filename[1].open("r") as fp:
for line in fp:
if line == '\n':
num_sentences += 1
# return filename[0], filename[1], num_sentences
with open(os.path.join(INTERNAL_DATA, 'srl_chunks', filename[1].name), 'wb') as output:
pickle.dump((filename[0], filename[1], num_sentences), output)
def srl_error_fix_generator(infile):
with infile.open("rb") as fp:
for sentence_arr in extract_sentences(fp.readlines()):
yield to_sentence(sentence_arr), sentence_arr
yield None
def srl_sentences_generator(infile, curr_index, sen_start_index):
with infile.open("rb") as fp:
outdata = {}
for sentence_arr in extract_sentences(fp.readlines()):
if curr_index < sen_start_index:
curr_index += 1
else:
yield to_sentence(sentence_arr), sentence_arr
yield None
def srl_multiple_files_sentences_generator(sentence_id): # srl_files):
sentence_id = max(0, sentence_id - 10)
for i, srl_file in enumerate(srl_file_sizes):
if sentence_id >= srl_file[3] and sentence_id < srl_file[3] + srl_file[2]:
srl_files = srl_file_sizes[i:]
break
for file_info in srl_files:
# srl_gen = srl_sentences_generator(file_info[1], file_info[3], file_info[4])
srl_gen = srl_sentences_generator(file_info[1], file_info[3], sentence_id)
el = next(srl_gen)
while el is not None:
yield el
el = next(srl_gen)
yield None
error_sentences_grouped = []
group = False
prev_name = ''
# group sentences by their files
for name in error_sentences:
if name[:9] == prev_name:
group.append(name)
else:
prev_name = name[:9]
if group:
error_sentences_grouped.append(group)
group = [name]
error_sentences_grouped.append(group)
srl_gen = srl_error_fix_generator(INPATH)
# find errors in json files:
# with open(os.path.join(INTERNAL_DATA, 'sentence_ids_list.pkl'), 'rb') as output:
# sentence_ids = pickle.load(output)
#
#
#
# origfiles = []
# for subdir, dirs, files in os.walk(OUTPATH):
# for file in files:
# origfiles.append(Path(os.path.join(subdir, file)))
# origfiles=sorted(origfiles)
#
#
#
# for sent in origfiles:
# # for sent in sentence_ids:
# # outfile = Path(OUTPATH, sent[:9] + '-dedup.json')
# outfile = sent
#
# try:
# with outfile.open() as json_file:
# json.load(json_file)
# pass
# except:
# print(outfile.name)
#
#
# raise Exception('test')
# iterate over all erroneous sentences and fix them
for errors_in_file in error_sentences_grouped:
outfile = Path(OUTPATH, errors_in_file[0][:9] + '-dedup.json')
with outfile.open() as json_file:
print(outfile.name)
orig_json_data = json.load(json_file)
for error_sentence in errors_in_file:
orig_json_data = fix_json(srl_gen, error_sentence, orig_json_data)
with outfile.open('w') as json_file:
json.dump(orig_json_data, json_file)
logging.info("SRL relations written to: {}".format(outfile))

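The grouping loop above batches error sentence ids by their 9-character file prefix (e.g. 'GF0014802.2685.7' -> 'GF0014802') so each JSON file is opened and rewritten only once. A toy run with hypothetical ids:

```python
error_sentences = ["GF0010453.1.1", "GF0010453.2.7", "GF0014802.5.3"]  # hypothetical ids

groups, group, prev = [], [], ""
for name in error_sentences:
    if name[:9] == prev:
        group.append(name)           # same file prefix: extend the current group
    else:
        prev = name[:9]
        if group:
            groups.append(group)     # close the previous group
        group = [name]
groups.append(group)                 # flush the last group
print(groups)  # [['GF0010453.1.1', 'GF0010453.2.7'], ['GF0014802.5.3']]
```
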
@@ -1,47 +0,0 @@
# parse config
import configparser
import json
import logging
import os
from pathlib import Path
from tools.parser.parser import Parser
config = configparser.ConfigParser()
config.read("tools.cfg.ssj500k2.3")
ORIGPATH = Path(config["tools"]["ssj500k_orig_folder"])
JSONPATH = Path(config["tools"]["ssj500k_json"] + '/ssj500k-sl.body.json')
OUTPATH = Path(config["tools"]["ssj500k_tei"])
INTERNAL_DATA = Path(config["tools"]["internal_data"])
DEBUG = config["tools"]["debug"] == "True"
CPU_CORES = int(config["tools"]["cpu_cores"])
LOGFILE = Path(config["tools"]["logfile"]).absolute()
LOGFILE.touch(exist_ok=True)
LOGFILE.resolve()
logging.basicConfig(filename=str(LOGFILE), level=logging.INFO)
par = Parser()
OUTPATH.mkdir(exist_ok=True)
jsondata = []
with open(JSONPATH, 'r') as jf:
jsondata = json.load(jf)
logging.info("Generating TEI with annotated SRL.")
def handle_file(file, jsondata):
teifile = (ORIGPATH / file)
resfile = (OUTPATH / file)
orig_dict = par.parse_tei(teifile)
# origfile = get_origfile()
orig_dict = par.minimize_tei(teifile, jsondata)
origfiles = []
for subdir, dirs, files in os.walk(ORIGPATH):
for file in files:
handle_file(file, jsondata)

@@ -1,5 +1,3 @@
import pickle
from parser.parser import Parser
import os
from os.path import join, dirname
@@ -16,31 +14,9 @@ par = Parser()
# path to data
config = configparser.ConfigParser()
# config.read("tools.cfg")
config.read("tools.cfg.ssj500k2.3")
analysis = ''
if 'kres_orig' in config["tools"]:
analysis = 'kres'
INDIR = Path(config["tools"]["kres_orig"])
OUTDIR = Path(config["tools"]["kres_tsv"])
elif 'giga_orig' in config["tools"]:
# analysis = 'gigafida'
analysis = 'giga'
INDIR_GIGA_ORIG = Path(config["tools"]["giga"])
INDIR_GIGA = Path(config["tools"]["giga_orig"])
INDIR_JOS = Path(config["tools"]["giga_jos"])
OUTDIR = Path(config["tools"]["giga_tsv"])
GIGA_PARTS = int(config["tools"]["giga_parts"])
INTERNAL_DATA = config["tools"]["internal_data"]
elif 'ssj500k_orig' in config["tools"]:
# analysis = 'gigafida'
analysis = 'ssj500k'
INDIR_SSJ500K_ORIG = Path(config["tools"]["ssj500k"])
INDIR_SSJ500K = Path(config["tools"]["ssj500k_orig"])
INDIR_JOS = Path(config["tools"]["ssj500k_jos"])
OUTDIR = Path(config["tools"]["ssj500k_tsv"])
INTERNAL_DATA = config["tools"]["internal_data"]
config.read("tools.cfg")
INDIR = Path(config["tools"]["kres_orig"])
OUTDIR = Path(config["tools"]["kres_tsv"])
CPU_CORES = int(config["tools"]["cpu_cores"])
LOGFILE = Path(config["tools"]["logfile"]).absolute()
@@ -58,365 +34,41 @@ print("end parsing ssj")
"""
# kres_file = "../data/kres_example/F0019343.xml.parsed.xml"
# OUTDIR.mkdir(exist_ok=True)
OUTDIR.mkdir(exist_ok=True)
if analysis == 'kres':
infiles = list(enumerate([x for x in INDIR.iterdir() if x.is_file()]))
logging.info("Parsing kres: {} files.".format(len(infiles)))
infiles = list(enumerate([x for x in INDIR.iterdir() if x.is_file()]))
logging.info("Parsing kres: {} files.".format(len(infiles)))
def handle_ssj500k_file():
kres_file = INDIR_SSJ500K_ORIG
outfile = OUTDIR
def handle_file(infile):
i = infile[0]
kres_file = infile[1]
outfile = (OUTDIR / kres_file.name).with_suffix(".tsv")
if outfile.is_file():
logging.info("Skipping existing file: {}.".format(str(kres_file)))
return True
# try:
res_dict = par.parse_tei(kres_file)
kres_out_str = ""
for _, sentence in res_dict.items():
kres_out_str += par.to_conll_2009_SRL(sentence)
# except Exception as exc:
# logging.info("Failed processing file: {}".format(str(kres_file)))
# logging.error(exc)
# return False
try:
res_dict = par.parse_tei(kres_file)
kres_out_str = ""
for _, sentence in res_dict.items():
kres_out_str += par.to_conll_2009_SRL(sentence)
except Exception as exc:
logging.info("Failed processing file: {}".format(str(kres_file)))
logging.error(exc)
return False
with outfile.open("wb+") as fp:
fp.write(kres_out_str.encode("utf-8"))
# logging.info("Processed file ({}/{}): {}".format(i+1, len(infiles), str(kres_file)))
logging.info("Processed file ({}/{}): {}".format(i+1, len(infiles), str(kres_file)))
return True
return False
def ssj500k_orig_generator():
with open(INDIR_SSJ500K, 'r') as gof:
previous_new_line = False
for l_gof in gof:
if l_gof == '\n':
if previous_new_line:
continue
previous_new_line = True
elif previous_new_line:
previous_new_line = False
yield l_gof
def handle_gigafida_file():
"""
Splits the big text file into smaller files, splitting only on empty lines.
"""
# with open(INDIR_GIGA, 'r') as gof:
# with open(INDIR_JOS, 'r') as gjf:
# for i, (l_gof, l_gjf) in enumerate(zip(gof, gjf)):
# pass
# num_lines = i + 1
# print(num_lines)
gof_generator = giga_orig_generator()
# with open(INDIR_GIGA, 'r') as gof:
with open(INDIR_JOS, 'r') as gjf:
sentence = {}
sentence['tokens'] = []
sentence['links'] = {}
if os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % 0)):
ignore_lines = True
wf = False
else:
wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part), 'a')
ignore_lines = False
# for i, (l_gof, l_gjf) in enumerate(zip(gof, gjf)):
for i, l_gjf in enumerate(gjf):
l_gof = next(gof_generator)
if ignore_lines:
if i > num_lines_per_part * curr_part and l_gof == '\n':
if not os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % (curr_part + 2))):
ignore_lines = False
# delete last file (probably not whole)
os.remove(os.path.join(OUTDIR, 'giga%07d.tsv' % (curr_part + 1)))
if ignore_lines:
print(curr_part)
curr_part += 1
continue
else:
continue
l_gof_split = l_gof.split('\t')
l_gjf_split = l_gjf.split('\t')
# if punctuation
if l_gof != '\n':
if l_gof_split[1][-1] == 'u':
# print(l_gjf_split)
sentence['tokens'].append(('c', l_gjf_split[0], l_gjf_split[1]))
else:
sentence['tokens'].append(('w', l_gjf_split[0], l_gof_split[0], l_gof_split[1][:-2], l_gof_split[3][:-1]))
sentence['links'][l_gjf_split[0]] = (l_gjf_split[7], l_gjf_split[0], l_gjf_split[6])
# if l_gof == '\n':
else:
if wf:
# print(i)
wf.write(par.to_conll_2009_SRL(sentence))
sentence['tokens'] = []
sentence['links'] = {}
# wf.flush()
# if i > num_lines_per_part * (curr_part + 1) and l_gof == '\n':
if i > num_lines_per_part * (curr_part + 1):
curr_part += 1
# if wf doesn't exist (first one)
if wf:
wf.close()
wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part), 'a')
curr_part += 1
wf.close()
def handle_ssj500k_file2():
"""
Splits the big text file into smaller files, splitting only on empty lines.
"""
gof_generator = ssj500k_orig_generator()
# with open(INDIR_GIGA, 'r') as gof:
with open(INDIR_JOS, 'r') as gjf:
sentence = {}
sentence['tokens'] = []
sentence['links'] = {}
if os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % 0)):
ignore_lines = True
wf = False
else:
wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part), 'a')
ignore_lines = False
# for i, (l_gof, l_gjf) in enumerate(zip(gof, gjf)):
for i, l_gjf in enumerate(gjf):
l_gof = next(gof_generator)
if ignore_lines:
if i > num_lines_per_part * curr_part and l_gof == '\n':
if not os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % (curr_part + 2))):
ignore_lines = False
# delete last file (probably not whole)
os.remove(os.path.join(OUTDIR, 'giga%07d.tsv' % (curr_part + 1)))
if ignore_lines:
print(curr_part)
curr_part += 1
continue
else:
continue
l_gof_split = l_gof.split('\t')
l_gjf_split = l_gjf.split('\t')
# if punctuation
if l_gof != '\n':
if l_gof_split[1][-1] == 'u':
# print(l_gjf_split)
sentence['tokens'].append(('c', l_gjf_split[0], l_gjf_split[1]))
else:
sentence['tokens'].append(('w', l_gjf_split[0], l_gof_split[0], l_gof_split[1][:-2], l_gof_split[3][:-1]))
sentence['links'][l_gjf_split[0]] = (l_gjf_split[7], l_gjf_split[0], l_gjf_split[6])
# if l_gof == '\n':
else:
if wf:
# print(i)
wf.write(par.to_conll_2009_SRL(sentence))
sentence['tokens'] = []
sentence['links'] = {}
# wf.flush()
# if i > num_lines_per_part * (curr_part + 1) and l_gof == '\n':
if i > num_lines_per_part * (curr_part + 1):
curr_part += 1
# if wf doesn't exist (first one)
if wf:
wf.close()
wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part), 'a')
curr_part += 1
wf.close()
import time
def handle_giga_file(ran):
"""
Splits the big text file into smaller files, splitting only on empty lines.
"""
# with open(INDIR_GIGA, 'r') as gof:
# with open(INDIR_JOS, 'r') as gjf:
# for i, (l_gof, l_gjf) in enumerate(zip(gof, gjf)):
# pass
# num_lines = i + 1
# print(num_lines)
num_lines = 1393184026
# 1393184026
# 1393184033
# return
num_lines_per_part = num_lines / GIGA_PARTS
curr_part = 0
gof_generator = giga_orig_generator()
# with open(INDIR_GIGA, 'r') as gof:
with open(INDIR_JOS, 'r') as gjf:
sentence = {}
sentence['tokens'] = []
sentence['links'] = {}
wf = None
if curr_part in file_indices:
if os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % ran[0])):
os.remove(os.path.join(OUTDIR, 'giga%07d.tsv' % ran[0]))
wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % ran[0]), 'a')
# for i, (l_gof, l_gjf) in enumerate(zip(gof, gjf)):
for i, l_gjf in enumerate(gjf):
l_gof = next(gof_generator)
if curr_part < ran[0]:
if i > num_lines_per_part * (curr_part + 1) and l_gof == '\n' :
if curr_part < ran[0]:
print(curr_part)
curr_part += 1
continue
else:
continue
l_gof_split = l_gof.split('\t')
l_gjf_split = l_gjf.split('\t')
# if punctuation
if l_gof != '\n':
if curr_part not in file_indices:
continue
if l_gof_split[1][-1] == 'u':
# print(l_gjf_split)
sentence['tokens'].append(('c', l_gjf_split[0], l_gjf_split[1]))
else:
sentence['tokens'].append(('w', l_gjf_split[0], l_gof_split[0], l_gof_split[1][:-2], l_gof_split[3][:-1]))
sentence['links'][l_gjf_split[0]] = (l_gjf_split[7], l_gjf_split[0], l_gjf_split[6])
# if l_gof == '\n':
else:
if curr_part in file_indices:
wf.write(par.to_conll_2009_SRL(sentence))
sentence['tokens'] = []
sentence['links'] = {}
# wf.flush()
# if i > num_lines_per_part * (curr_part + 1) and l_gof == '\n':
if i > num_lines_per_part * (curr_part + 1):
curr_part += 1
# if wf doesn't exist (first one)
if curr_part in file_indices and wf:
wf.close()
if curr_part >= ran[1]:
break
if curr_part in file_indices:
if os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part)):
os.remove(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part))
wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part), 'a')
curr_part += 1
wf.close()
def handle_giga_file_selected_sentences(error_sentences):
"""
Splits the big text file into smaller files, splitting only on empty lines.
"""
# with open(INDIR_GIGA, 'r') as gof:
# with open(INDIR_JOS, 'r') as gjf:
# for i, (l_gof, l_gjf) in enumerate(zip(gof, gjf)):
# pass
# num_lines = i + 1
# print(num_lines)
# print('num_lines' + 3)
# num_lines = 1393184026
num_lines = 1393222523
# 1393184026
# 1393184033
# return
# num_lines_per_part = num_lines / GIGA_PARTS
# curr_part = 0
gof_generator = giga_orig_generator()
# with open(INDIR_GIGA, 'r') as gof:
with open(INDIR_JOS, 'r') as gjf:
sentence = {}
sentence['tokens'] = []
sentence['links'] = {}
wf = None
if os.path.exists(os.path.join(OUTDIR, 'giga_errors')):
os.remove(os.path.join(OUTDIR, 'giga_errors'))
wf = open(os.path.join(OUTDIR, 'giga_errors'), 'a')
with open(os.path.join(INTERNAL_DATA, 'sentence_ids_list.pkl'), 'rb') as pkl_file:
sentence_ids_list = pickle.load(pkl_file)
sentence_id = 0
skip_sentence = not sentence_ids_list[sentence_id] in error_sentences
# for i, (l_gof, l_gjf) in enumerate(zip(gof, gjf)):
for i, l_gjf in enumerate(gjf):
l_gof = next(gof_generator)
if l_gjf == '\n':
if not skip_sentence:
wf.write(par.to_conll_2009_SRL(sentence))
sentence['tokens'] = []
sentence['links'] = {}
sentence_id += 1
if sentence_ids_list[sentence_id] in error_sentences:
print(sentence_ids_list[sentence_id])
skip_sentence = False
else:
skip_sentence = True
if skip_sentence:
continue
# if curr_part < ran[0]:
# if i > num_lines_per_part * (curr_part + 1) and l_gof == '\n' :
# if curr_part < ran[0]:
# print(curr_part)
# curr_part += 1
# continue
# else:
# continue
l_gof_split = l_gof.split('\t')
l_gjf_split = l_gjf.split('\t')
# if punctuation
if l_gof != '\n':
if l_gof_split[1][-1] == 'u':
# print(l_gjf_split)
sentence['tokens'].append(('c', l_gjf_split[0], l_gjf_split[1]))
else:
sentence['tokens'].append(('w', l_gjf_split[0], l_gof_split[0], l_gof_split[1][:-2], l_gof_split[3][:-1]))
sentence['links'][l_gjf_split[0]] = (l_gjf_split[7], l_gjf_split[0], l_gjf_split[6])
# if l_gof == '\n':
# wf.flush()
# if i > num_lines_per_part * (curr_part + 1) and l_gof == '\n':
# if i > num_lines_per_part * (curr_part + 1):
# curr_part += 1
# # if wf doesn't exist (first one)
# if curr_part in file_indices and wf:
# wf.close()
# if curr_part >= ran[1]:
# break
# if curr_part in file_indices:
# if os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part)):
# os.remove(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part))
#
# wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part), 'a')
# curr_part += 1
wf.close()
handle_ssj500k_file()
with Pool(CPU_CORES) as p:
p.map(handle_file, infiles)
logging.info("end parsing kres")

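Which corpus parse_all.py processes is decided purely by which keys are present in the config file it reads; a minimal sketch of that dispatch, using a hypothetical minimal tools.cfg:

```python
import configparser

config = configparser.ConfigParser()
config.read_string("""
[tools]
kres_orig = ../data/kres_parsed/tei
kres_tsv = ../data/kres_out/1_tsv
""")  # hypothetical minimal config

tools = config["tools"]
if "kres_orig" in tools:
    analysis = "kres"
elif "giga_orig" in tools:
    analysis = "giga"
elif "ssj500k_orig" in tools:
    analysis = "ssj500k"
print(analysis)  # 'kres'
```
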
@@ -1,5 +1,3 @@
import copy
from lxml import etree
import re
from parser.msd.msdmap import Msdmap
@@ -7,7 +5,6 @@ import pickle
from pathlib import Path
from fillpred_model.step1 import build_model_row
import sys
import xml.etree.ElementTree as ET
class Parser:
# reads a TEI xml file and returns a dictionary:
@@ -32,23 +29,17 @@ class Parser:
def parse_tei(self, filepath):
def parse_links(s_el):
sent_id = '#' + s_el.get('id')
lgrps = s_el.findall(".//linkGrp")
lgrps = s_el.findall(".//links")
if len(lgrps) < 1:
raise IOError("Can't find links.")
res_links = {}
for lgrp in lgrps:
if lgrp.get("type") == "JOS-SYN":
for link in lgrp:
jos_type = link.get("ana").split(":")[-1]
link_data = link.get("target").split(" ")
link_from = int(link_data[1].split('.')[-1][1:])
link_to = int(link_data[0].split('.')[-1][1:]) if sent_id != link_data[0] else 0
res_links[link_from] = (
jos_type,
link_from,
link_to,
)
for link in lgrps[0]:
dep = int(link.get("dep").split(".")[-1])
res_links[dep] = (
link.get("afun"),
dep,
int(link.get("from").split(".")[-1]),
)
return res_links
guess_corpus = None # SSJ | KRES
@@ -66,206 +57,7 @@ class Parser:
divs = [] # in ssj, there are divs, in Kres, there are separate files
if "id" in root.keys():
# Kres files start with <TEI id=...>
if root.get("id")[0:2] == 'GF':
guess_corpus = "GIGA"
else:
guess_corpus = "KRES"
divs = [root]
else:
guess_corpus = "SSJ"
divs = root.findall(".//div")
# parse divs
for div in divs:
f_id = div.get("id")[:-6]
if guess_corpus == "GIGA":
div = div.findall(".//body")[0]
# parse paragraphs
for p in div.findall(".//p"):
p_id = p.get("id").split(".")[-1]
# parse sentences
for s in p.findall(".//s"):
# test if sentence has jos-syn annotations and doesn't have SRL
sent_annot_type_list = [links.get('type') for links in s.findall(".//linkGrp")]
if 'JOS-SYN' not in sent_annot_type_list or 'UD-SYN' not in sent_annot_type_list or 'SRL' in sent_annot_type_list:
continue
s_id = s.get("id").split(".")[-1]
sentence_text = ""
sentence_list = []
sentence_tokens = []
# parse tokens
for el in s.iter():
if el.tag in self.W_TAGS:
el_id = el.get("id").split(".")[-1]
if el_id[0] == 't':
el_id = el_id[1:] # ssj W_TAG ids start with t
sentence_text += el.text
uPosTag = None
uPosFeats = []
for msd_el in el.get("msd").split('|'):
key, val = msd_el.split('=')
if key == 'UPosTag':
uPosTag = val
else:
uPosFeats.append(msd_el)
uPosFeats = '|'.join(uPosFeats)
sentence_tokens += [(
"w",
int(el_id),
el.text,
el.get("lemma"),
(el.get("msd") if guess_corpus == "KRES" or guess_corpus == "GIGA"
else el.get("ana").split(":")[-1]),
uPosTag,
uPosFeats
)]
elif el.tag in self.C_TAGS:
# only Kres' C_TAGS have ids
if guess_corpus != "GIGA":
el_id = el.get("id") or "none"
el_id = el_id.split(".")[-1]
sentence_text += el.text
sentence_tokens += [("c", el_id, el.text,)]
elif el.tag in self.S_TAGS:
el_id = el.get("id").split(".")[-1]
if el_id[0] == 't':
el_id = el_id[1:] # ssj W_TAG ids start with t
sentence_text += el.text
uPosTag = None
uPosFeats = []
for msd_el in el.get("msd").split('|'):
key, val = msd_el.split('=')
if key == 'UPosTag':
uPosTag = val
else:
uPosFeats.append(msd_el)
uPosFeats = '|'.join(uPosFeats)
sentence_tokens += [(
"pc",
int(el_id),
el.text,
el.text,
(el.get("msd") if guess_corpus == "KRES" or guess_corpus == "GIGA"
else el.get("ana").split(":")[-1]),
uPosTag,
uPosFeats
)]
else:
# pass links and linkGroups
pass
sentence_id = s.get("id")
if sentence_id in res_dict:
raise KeyError("duplicated id: {}".format(sentence_id))
res_dict[sentence_id] = {
"sid": sentence_id,
"text": sentence_text,
"tokens": sentence_tokens,
"links": (
parse_links(s)
)
}
fp.close()
return res_dict
def minimize_tei(self, filepath, jsondata):
def set_xml_attr(node, attribute, value):
node.attrib['{http://www.w3.org/XML/1998/namespace}' + attribute] = value
def parse_links(s_el):
sent_id = '#' + s_el.get('id')
lgrps = s_el.findall(".//linkGrp")
if len(lgrps) < 1:
raise IOError("Can't find links.")
res_links = {}
for lgrp in lgrps:
if lgrp.get("type") == "JOS-SYN":
for link in lgrp:
jos_type = link.get("ana").split(":")[-1]
link_data = link.get("target").split(" ")
link_from = int(link_data[1].split('.')[-1][1:])
link_to = int(link_data[0].split('.')[-1][1:]) if sent_id != link_data[0] else 0
res_links[link_from] = (
jos_type,
link_from,
link_to,
)
return res_links
guess_corpus = None # SSJ | KRES
res_dict = {}
# with filepath.open("rb") as fp, open("../data/ssj500k2.3/final_tei/res.xml", 'w') as sf:
with filepath.open("rb") as fp:
used_ssj_documents = set([k.split('.')[0] for k, v in jsondata.items()])
used_ssj_paragraphs = set(['.'.join(k.split('.')[:-1]) for k, v in jsondata.items()])
used_ssj_sentences = set([k for k, v in jsondata.items()])
ET.register_namespace("", "http://www.tei-c.org/ns/1.0")
tree = ET.parse(fp)
root_res = tree.getroot()
# root_res = copy.deepcopy(root)
ns = '{http://www.w3.org/XML/1998/namespace}'
ns2 = '{http://www.tei-c.org/ns/1.0}'
for doc in list(root_res):
doc_id = doc.get(ns + 'id')
if doc_id not in used_ssj_documents:
root_res.remove(doc)
continue
for par in list(doc):
par_id = par.get(ns + 'id')
if par_id not in used_ssj_paragraphs:
if par.tag != ns2 + 'bibl':
doc.remove(par)
continue
for sen in list(par):
sen_id = sen.get(ns + 'id')
if sen_id not in used_ssj_sentences:
par.remove(sen)
continue
linkGrp = ET.Element(f'{ns2}linkGrp')
linkGrp.attrib[f'targFunc'] = 'head argument'
linkGrp.attrib[f'type'] = 'SRL'
for srl_el in jsondata[sen_id]:
link = ET.Element(f'{ns2}link')
link.attrib['ana'] = f'srl:{srl_el["arg"]}'
link.attrib['target'] = f'#{sen_id}.t{srl_el["from"]} #{sen_id}.t{srl_el["dep"]}'
linkGrp.append(link)
sen.append(linkGrp)
# <linkGrp corresp="#ssj1.1.1" targFunc="head argument" type="SRL">
# <link ana="srl:TIME" target="#ssj1.1.1.t6 #ssj1.1.1.t3"/>
# <link ana="srl:QUANT" target="#ssj1.1.1.t6 #ssj1.1.1.t5"/>
# <link ana="srl:TIME" target="#ssj1.1.1.t8 #ssj1.1.1.t11"/>
# <link ana="srl:PAT" target="#ssj1.1.1.t23 #ssj1.1.1.t21"/>
# <link ana="srl:ACT" target="#ssj1.1.1.t23 #ssj1.1.1.t22"/>
# <link ana="srl:RESLT" target="#ssj1.1.1.t18 #ssj1.1.1.t23"/>
# </linkGrp>
# print('aaa')
# sf.write(etree.tostring(tree, pretty_print=True, encoding='utf-8').decode())
tree.write("../data/ssj500k2.3/final_tei/res.xml", encoding='utf-8')
return
divs = [] # in ssj, there are divs, in Kres, there are separate files
if "id" in root.keys():
# Kres files start with <TEI id=...>
if root.get("id")[0:2] == 'GF':
guess_corpus = "GIGA"
else:
guess_corpus = "KRES"
guess_corpus = "KRES"
divs = [root]
else:
guess_corpus = "SSJ"
@@ -275,24 +67,14 @@ class Parser:
for div in divs:
f_id = div.get("id")
if guess_corpus == "GIGA":
div = div.findall(".//body")[0]
# parse paragraphs
for p in div.findall(".//p"):
p_id = p.get("id").split(".")[-1]
# parse sentences
for s in p.findall(".//s"):
# test if sentence has jos-syn annotations and doesn't have SRL
sent_annot_type_list = [links.get('type') for links in s.findall(".//linkGrp")]
if 'JOS-SYN' not in sent_annot_type_list or 'UD-SYN' not in sent_annot_type_list or 'SRL' in sent_annot_type_list:
del s
continue
s_id = s.get("id").split(".")[-1]
sentence_text = ""
sentence_list = []
sentence_tokens = []
# parse tokens
@@ -302,73 +84,37 @@ class Parser:
if el_id[0] == 't':
el_id = el_id[1:] # ssj W_TAG ids start with t
sentence_text += el.text
uPosTag = None
uPosFeats = []
for msd_el in el.get("msd").split('|'):
key, val = msd_el.split('=')
if key == 'UPosTag':
uPosTag = val
else:
uPosFeats.append(msd_el)
uPosFeats = '|'.join(uPosFeats)
sentence_tokens += [(
"w",
int(el_id),
el.text,
el.get("lemma"),
(el.get("msd") if guess_corpus == "KRES" or guess_corpus == "GIGA"
else el.get("ana").split(":")[-1]),
uPosTag,
uPosFeats
(el.get("msd") if guess_corpus == "KRES"
else el.get("ana").split(":")[-1]),
)]
elif el.tag in self.C_TAGS:
# only Kres' C_TAGS have ids
if guess_corpus != "GIGA":
el_id = el.get("id") or "none"
el_id = el_id.split(".")[-1]
sentence_text += el.text
sentence_tokens += [("c", el_id, el.text,)]
elif el.tag in self.S_TAGS:
el_id = el.get("id").split(".")[-1]
if el_id[0] == 't':
el_id = el_id[1:] # ssj W_TAG ids start with t
el_id = el.get("id") or "none"
el_id = el_id.split(".")[-1]
sentence_text += el.text
uPosTag = None
uPosFeats = []
for msd_el in el.get("msd").split('|'):
key, val = msd_el.split('=')
if key == 'UPosTag':
uPosTag = val
else:
uPosFeats.append(msd_el)
uPosFeats = '|'.join(uPosFeats)
sentence_tokens += [(
"pc",
int(el_id),
el.text,
el.text,
(el.get("msd") if guess_corpus == "KRES" or guess_corpus == "GIGA"
else el.get("ana").split(":")[-1]),
uPosTag,
uPosFeats
)]
sentence_tokens += [("c", el_id, el.text,)]
elif el.tag in self.S_TAGS:
# Kres' <S /> doesn't contain .text
sentence_text += " "
else:
# pass links and linkGroups
pass
sentence_id = s.get("id")
sentence_id = "{}.{}.{}".format(f_id, p_id, s_id)
if sentence_id in res_dict:
raise KeyError("duplicated id: {}".format(sentence_id))
res_dict[sentence_id] = {
"sid": sentence_id,
"text": sentence_text,
"tokens": sentence_tokens,
"links": (
parse_links(s)
parse_links(s) if guess_corpus == "KRES" else None
)
}
et = etree.ElementTree(root)
et.write("../data/ssj500k2.3/final_tei/res.xml", pretty_print=True, encoding='unicode')
fp.close()
return res_dict
@@ -377,7 +123,7 @@ class Parser:
def fillpred(tsv_row):
mrow = build_model_row(tsv_row)
x = mrow[:-1]
x = mrow[:-1]
y = self.fillpred_model.predict([x])
return y[0] # bool
@@ -389,8 +135,12 @@ class Parser:
# handle stop signs
if token[0] != "w":
out_list = [t_id] + [form for x in range(7)] + ["0", "0", "modra", "modra", "_", "_"] + ["\n"]
out_str += '\t'.join(map(str, out_list))
out_str += '\t'.join(
[t_id] +
[form for x in range(7)] +
["0", "0", "modra", "modra", "_", "_"] +
["\n"]
)
continue
pos = self.msdmap.slo_msd_to_eng_pos(token[4])

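A toy illustration (hypothetical TEI fragment) of the JOS-SYN link parsing introduced in parse_links above: the second token referenced in target is the dependent, the first is the head, and a head equal to the sentence id is mapped to 0 (the root):

```python
import xml.etree.ElementTree as ET

# hypothetical JOS-SYN link group for sentence ssj1.1.1
xml = """
<linkGrp type="JOS-SYN">
  <link ana="jos-syn:Sb" target="#ssj1.1.1.t2 #ssj1.1.1.t1"/>
  <link ana="jos-syn:Root" target="#ssj1.1.1 #ssj1.1.1.t2"/>
</linkGrp>
"""
sent_id = "#ssj1.1.1"
res_links = {}
for link in ET.fromstring(xml):
    jos_type = link.get("ana").split(":")[-1]
    head, dep = link.get("target").split(" ")
    link_from = int(dep.split(".")[-1][1:])                            # dependent token index
    link_to = int(head.split(".")[-1][1:]) if head != sent_id else 0   # 0 = sentence root
    res_links[link_from] = (jos_type, link_from, link_to)
print(res_links)  # {1: ('Sb', 1, 2), 2: ('Root', 2, 0)}
```
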
@@ -34,8 +34,7 @@ JVM_ARGS="-cp $CP -Xmx$MEM"
NOPI="-nopi" #Uncomment this if you want to skip the predicate identification step. This setting is equivalent to the CoNLL 2009 ST.
$JAVA $JVM_ARGS se.lth.cs.srl.Parse $Lang "$INPUT" $MODEL $RERANKER $NOPI "$OUTPUT"
# CMD="$JAVA $JVM_ARGS se.lth.cs.srl.Parse $Lang '$INPUT' $MODEL $RERANKER $NOPI '$OUTPUT'"
# echo "Executing: $CMD"
CMD="$JAVA $JVM_ARGS se.lth.cs.srl.Parse $Lang $INPUT $MODEL $RERANKER $NOPI $OUTPUT"
echo "Executing: $CMD"
# $CMD
$CMD

@@ -1,29 +0,0 @@
#!/bin/bash
# parsing tools.cfg values
IN_FOLDER="$(sed -n -e 's/^\s*giga_tsv\s*=\s*//p' ../tools.cfg)"
echo "input folder: $IN_FOLDER"
OUT_FOLDER="$(sed -n -e 's/^\s*giga_srl\s*=\s*//p' ../tools.cfg)"
echo "output folder: $OUT_FOLDER"
SUFFIX="srl.tsv"
mkdir -p "$OUT_FOLDER"
rm "$OUT_FOLDER/*${SUFFIX}" &> /dev/null
for infile in "$IN_FOLDER/*"; do
echo "Tagging: ${infile}"
base=$(basename $infile | cut -d'.' -f1)
outfile="${OUT_FOLDER}/${base}.${SUFFIX}"
# mate-tools tagger
./scripts/parse_srl_only_mod.sh "$infile" "$outfile"
if [ $? -eq 0 ]; then
echo "Saved as ${outfile}"
else
echo "ERR"
exit 1
fi
done

@@ -1,29 +0,0 @@
#!/bin/bash
# parsing tools.cfg values
IN_FOLDER="../$(sed -n -e 's/^\s*kres_tsv\s*=\s*//p' ../tools.cfg)"
echo "input folder: $IN_FOLDER"
OUT_FOLDER="../$(sed -n -e 's/^\s*kres_srl\s*=\s*//p' ../tools.cfg)"
echo "output folder: $OUT_FOLDER"
SUFFIX="srl.tsv"
mkdir -p $OUT_FOLDER
rm $OUT_FOLDER/*${SUFFIX} &> /dev/null
for infile in $IN_FOLDER/*; do
echo "Tagging: ${infile}"
base=$(basename $infile | cut -d'.' -f1)
outfile=${OUT_FOLDER}/${base}.${SUFFIX}
# mate-tools tagger
./scripts/parse_srl_only_mod.sh $infile $outfile
if [ $? -eq 0 ]; then
echo "Saved as ${outfile}"
else
echo "ERR"
exit 1
fi
done

@@ -1,16 +1,15 @@
#!/bin/bash
# parsing tools.cfg values
IN_FOLDER="../$(sed -n -e 's/^\s*giga_tsv\s*=\s*//p' ../tools.cfg.kres_new)"
IN_FOLDER=$IN_FOLDER$1
IN_FOLDER="../$(sed -n -e 's/^\s*kres_tsv\s*=\s*//p' ../tools.cfg)"
echo "input folder: $IN_FOLDER"
OUT_FOLDER="../$(sed -n -e 's/^\s*giga_srl\s*=\s*//p' ../tools.cfg.kres_new)"
OUT_FOLDER="../$(sed -n -e 's/^\s*kres_srl\s*=\s*//p' ../tools.cfg)"
echo "output folder: $OUT_FOLDER"
SUFFIX="srl.tsv"
mkdir -p $OUT_FOLDER
# rm $OUT_FOLDER/*${SUFFIX} &> /dev/null
rm $OUT_FOLDER/*${SUFFIX} &> /dev/null
for infile in $IN_FOLDER/*; do
echo "Tagging: ${infile}"

@@ -1,30 +0,0 @@
#!/bin/bash
# parsing tools.cfg values
IN_FOLDER="../$(sed -n -e 's/^\s*ssj500k_tsv_folder\s*=\s*//p' ../tools.cfg.ssj500k2.3)"
IN_FOLDER=$IN_FOLDER$1
echo "input folder: $IN_FOLDER"
OUT_FOLDER="../$(sed -n -e 's/^\s*ssj500k_srl\s*=\s*//p' ../tools.cfg.ssj500k2.3)"
echo "output folder: $OUT_FOLDER"
SUFFIX="srl.tsv"
mkdir -p $OUT_FOLDER
# rm $OUT_FOLDER/*${SUFFIX} &> /dev/null
for infile in $IN_FOLDER/*; do
echo "Tagging: ${infile}"
base=$(basename $infile | cut -d'.' -f1)
outfile=${OUT_FOLDER}/${base}.${SUFFIX}
# mate-tools tagger
./scripts/parse_srl_only_mod.sh $infile $outfile
if [ $? -eq 0 ]; then
echo "Saved as ${outfile}"
else
echo "ERR"
exit 1
fi
done

@@ -1,13 +1,8 @@
[tools]
giga = ../data/gf_example/gf2_orig
giga_orig = ../data/gf_example/gf2-dedup.patch0001
giga_jos = ../data/gf_example/gf2-dedup.jos.patch0001
giga_tsv = ../data/gf_example/gf_files_part
giga_srl = ../data/gf_example/2_srl
;giga_srl_errors = /media/luka/Portable Disk/Datasets/gigafida_jos/2_srl_errors/giga_errors.srl.tsv
giga_json = ../data/gf_example/final_json
internal_data = ../data/gf_example/internal_data
giga_parts = 100000
logfile = ../data/gf_example/progress.log
cpu_cores = 1
debug = True
kres_orig = /kres_mount/kres_parsed/tei
kres_tsv = ../data/kres_out/1_tsv
kres_srl = ../data/kres_out/2_srl
kres_json = ../data/kres_out/final_json
logfile = ../progress.log
cpu_cores = 5
debug = False

@@ -1,16 +0,0 @@
[tools]
giga = /media/luka/Portable Disk/Datasets/gigafida_jos/gf2_orig
giga_orig = /media/luka/Portable Disk/Datasets/gigafida_jos/gf2-dedup.patch0001
; giga_orig_old = /media/luka/Portable Disk/Datasets/gigafida_jos/gf2-dedup
giga_jos = /media/luka/Portable Disk/Datasets/gigafida_jos/gf2-dedup.jos.patch0001
giga_tsv = /media/luka/Portable Disk/Datasets/gigafida_jos/gf_files_part
; giga_tsv = /media/luka/Portable Disk/Datasets/gigafida_jos/TEMP
; giga_tsv = /media/luka/Portable Disk/Datasets/gigafida_jos/gf2_files_copy
; giga_srl = /media/luka/Portable Disk/Datasets/gigafida_jos/TEMP
giga_srl = /media/luka/Portable Disk/Datasets/gigafida_jos/2_srl
giga_json = /media/luka/Portable Disk/Datasets/gigafida_jos/final_json
internal_data = /media/luka/Portable Disk/Datasets/gigafida_jos/internal_data
giga_parts = 100000
logfile = ../progress.log
cpu_cores = 1
debug = False

@@ -1,8 +0,0 @@
[tools]
kres_orig = /home/luka/Development/srl/data/kres_parsed/tei
kres_tsv = ../data/kres_out/1_tsv
kres_srl = ../data/kres_out/2_srl
kres_json = ../data/kres_out/final_json
logfile = ../progress.log
cpu_cores = 5
debug = False

@@ -1,8 +0,0 @@
[tools]
kres_orig = /home/luka/Development/srl/data/kres_parsed/tei
giga_tsv = ../data/giga_out/1_tsv
giga_srl = ../data/giga_out/2_srl
kres_json = ../data/giga_out/final_json
logfile = ../progress.log
cpu_cores = 5
debug = False

@@ -1,15 +0,0 @@
[tools]
ssj500k = ../data/ssj500k2.3/orig/ssj500k-sl.body.xml
ssj500k_orig = ../data/ssj500k2.3/orig/ssj500k-sl.body.xml
ssj500k_orig_folder = ../data/ssj500k2.3/orig
ssj500k_jos = ../data/ssj500k2.3/orig/ssj500k-sl.body.xml
ssj500k_tsv = ../data/ssj500k2.3/tsvs/tsvs.tsv
ssj500k_tsv_folder = ../data/ssj500k2.3/tsvs
ssj500k_srl = ../data/ssj500k2.3/srls
ssj500k_json = ../data/ssj500k2.3/final_json
ssj500k_tei = ../data/ssj500k2.3/final_tei
internal_data = ../data/ssj500k2.3/internal_data
;internal_data = ../data/gf_example/internal_data
logfile = ../data/ssj500k2.3/progress.log
cpu_cores = 1
debug = True