Big changes

2022-02-04 11:24:47 +01:00
18 changed files with 1384 additions and 53 deletions
--- a/.gitignore
+++ b/.gitignore
@ -6,3 +6,6 @@ nohup.out

 data/kres_out/*
 data/kres_example/
+venv/
+.idea/
+data/
--- a/dockerfiles/python-java/Makefile
+++ b/dockerfiles/python-java/Makefile
@ -15,6 +15,6 @@ run:
 	-v /etc/group:/etc/group \
    	-v $(shell pwd)/../../:/cjvt-srl-tagging \
 	-w /cjvt-srl-tagging \
-	-v /home/kristjan/kres_mount:/kres_mount:ro \
+	-v /home/luka/Development/srl/data:/kres_mount:ro \
    python-java \
    /bin/bash
--- a/tools/check_all_files_existence.py
+++ b/tools/check_all_files_existence.py
@ -0,0 +1,19 @@
+import os
+from shutil import copyfile
+
+# Source chunks, looked up as giga.NNNNNNN.tsv.
+SOURCEPATH = '/media/luka/Portable Disk/Datasets/gigafida_jos/gf2_files'
+# SRL results, looked up as gigaNNNNNNN.srl.tsv.
+INPATH = '/media/luka/Portable Disk/Datasets/gigafida_jos/2_srl'
+# Destination for the chunks whose SRL result is missing.
+OUTPATH = '/home/luka/Development/srl/cjvt-srl-tagging/data/giga_out/1_tsv'
+
+# Walk over every chunk index; whenever the SRL result is absent, copy the
+# source tsv into OUTPATH and print the name of the missing result.
+for chunk_no in range(100000):
+    missing_name = 'giga%07d.srl.tsv' % chunk_no
+    if not os.path.exists(os.path.join(INPATH, missing_name)):
+        source_tsv = os.path.join(SOURCEPATH, 'giga.%07d.tsv' % chunk_no)
+        target_tsv = os.path.join(OUTPATH, 'giga%07d.tsv' % chunk_no)
+        copyfile(source_tsv, target_tsv)
+        print(missing_name)
+
+    # progress marker every 1000 chunks
+    if chunk_no % 1000 == 0:
+        print(chunk_no)
--- a/tools/fillpred_model.srl.tsv
+++ b/tools/fillpred_model.srl.tsv
--- a/tools/find_diff_sentence_ids.py
+++ b/tools/find_diff_sentence_ids.py
@ -0,0 +1,192 @@
+import pickle
+
+from parser.parser import Parser
+import os
+from os.path import join, dirname
+from pathlib import Path
+import re
+import sys
+import cProfile
+import configparser
+import logging
+from multiprocessing import Pool
+
+SSJ500K_2_1 = 27829  # number of sentences
+par = Parser()
+
+# path to data
+config = configparser.ConfigParser()
+config.read("tools.cfg")
+analysis = ''
+if 'kres_orig' in config["tools"]:
+    analysis = 'kres'
+    INDIR = Path(config["tools"]["kres_orig"])
+    OUTDIR = Path(config["tools"]["kres_tsv"])
+elif 'giga_orig' in config["tools"]:
+    analysis = 'giga'
+    INDIR_GIGA = Path(config["tools"]["giga_orig"])
+    INDIR_GIGA_ORIG = Path(config["tools"]["giga"])
+    INDIR_JOS = Path(config["tools"]["giga_jos"])
+    OUTDIR = Path(config["tools"]["giga_tsv"])
+    GIGA_PARTS = int(config["tools"]["giga_parts"])
+    INTERNAL_DATA = config["tools"]["internal_data"]
+
+# Everything below reads INDIR_GIGA, INDIR_GIGA_ORIG and INTERNAL_DATA, which
+# only the 'giga' branch defines. Stop with a readable message instead of a
+# NameError further down.
+if analysis != 'giga':
+    raise SystemExit(
+        "find_diff_sentence_ids.py needs the giga_* keys in tools.cfg "
+        "(and no kres_orig key); detected analysis: %r" % analysis)
+
+CPU_CORES = int(config["tools"]["cpu_cores"])
+
+# Create the log file if needed before logging is pointed at it.
+LOGFILE = Path(config["tools"]["logfile"]).absolute()
+LOGFILE.touch(exist_ok=True)
+
+logging.basicConfig(filename=str(LOGFILE), level=logging.INFO)
+
+# Every file below INDIR_GIGA_ORIG, sorted by path and numbered from 0.
+origfiles = []
+for subdir, dirs, files in os.walk(INDIR_GIGA_ORIG):
+    for file in files:
+        origfiles.append(Path(os.path.join(subdir, file)))
+origfiles=list(enumerate(sorted(origfiles)))
+
+def giga_orig_sentence_generator():
+    """Yield the sentences of the INDIR_GIGA file as space-joined words.
+
+    The file is read line by line. The first tab-separated column of each
+    line is taken as the word; a line consisting of a bare newline closes
+    the current sentence. A final sentence that is not followed by such a
+    line is yielded as well, so it is not silently lost.
+    """
+    with open(INDIR_GIGA, 'r') as gof:
+        sentence_words = []
+        for l_gof in gof:
+            if l_gof == '\n':
+                yield ' '.join(sentence_words)
+                sentence_words = []
+            else:
+                sentence_words.append(l_gof.split('\t')[0])
+    # the file may end without a closing blank line
+    if sentence_words:
+        yield ' '.join(sentence_words)
+
+sentence_generator = giga_orig_sentence_generator()
+
+# Pair every sentence parsed from the TEI files with the next sentence of the
+# single big file: keep the id when the texts agree, print a report otherwise.
+sentence_ids = []
+for file_no, tei_path in origfiles:
+    split_file_sentences = par.parse_tei(tei_path)
+    for _, parsed in split_file_sentences.items():
+        one_file_sentence = next(sentence_generator)
+        if one_file_sentence != parsed['text']:
+            print('----------------')
+            print('ERROR')
+            print(parsed['sid'])
+            print(one_file_sentence)
+            print(parsed['text'])
+            continue
+        sentence_ids.append(parsed['sid'])
+    print(file_no)
+
+# replace any previous pickle with the freshly collected ids
+ids_pickle = os.path.join(INTERNAL_DATA, 'sentence_ids_list.pkl')
+if os.path.exists(ids_pickle):
+    os.remove(ids_pickle)
+
+with open(ids_pickle, 'wb') as output:
+    pickle.dump(sentence_ids, output)
+
+# def giga_orig_generator():
+#     with open(INDIR_GIGA, 'r') as gof:
+#         previous_new_line = False
+#         for l_gof in gof:
+#             if l_gof == '\n':
+#                 if previous_new_line:
+#                     continue
+#                 previous_new_line = True
+#             elif previous_new_line:
+#                 previous_new_line = False
+#             yield l_gof
+
+# import  time
+# def handle_giga_file(ran):
+#     """
+#     File that splits big text file into more minor files. Only split on empty lines.
+#     """
+#     # with open(INDIR_GIGA, 'r') as gof:
+#     #     with open(INDIR_JOS, 'r') as gjf:
+#     #         for i, (l_gof, l_gjf) in enumerate(zip(gof, gjf)):
+#     #             pass
+#     #     num_lines = i + 1
+#     # print(num_lines)
+#     num_lines = 1393184026
+#     # 1393184026
+#     # 1393184033
+#     # return
+#     num_lines_per_part = num_lines / GIGA_PARTS
+#     curr_part = 0
+#     gof_generator = giga_orig_generator()
+#
+#     diff_files = set()
+#     # with open(INDIR_GIGA, 'r') as gof:
+#     with open(INDIR_GIGA_OLD, 'r') as gjf:
+#         # sentence = {}
+#         # sentence['tokens'] = []
+#         # sentence['links'] = {}
+#         # if os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % ran[0])):
+#         #     os.remove(os.path.join(OUTDIR, 'giga%07d.tsv' % ran[0]))
+#
+#         # wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % ran[0]), 'a')
+#
+#         # for i, (l_gof, l_gjf) in enumerate(zip(gof, gjf)):
+#         for i, l_gjf in enumerate(gjf):
+#             l_gof = next(gof_generator)
+#             if curr_part < ran[0]:
+#                 if i > num_lines_per_part * (curr_part + 1) and l_gof == '\n' :
+#                     if curr_part < ran[0]:
+#                         print(curr_part)
+#                         curr_part += 1
+#                         continue
+#                 else:
+#                     continue
+#
+#             l_gof_split = l_gof.split('\t')
+#             l_gjf_split = l_gjf.split('\t')
+#
+#             # if punctuation
+#             if l_gof != '\n':
+#                 if l_gof_split != l_gjf_split:
+#                     print(curr_part)
+#                     diff_files.add(curr_part)
+#                     l_gof = next(gof_generator)
+#
+#
+#             # if l_gof == '\n':
+#             else:
+#             # wf.flush()
+#             # if i > num_lines_per_part * (curr_part + 1) and l_gof == '\n':
+#                 if i > num_lines_per_part * (curr_part + 1):
+#                     curr_part += 1
+#                     # if wf doesn't exist (first one)
+#                     # wf.close()
+#                     if curr_part >= ran[1]:
+#                         break
+#                     # if os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part)):
+#                     #     os.remove(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part))
+#
+#                     # wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part), 'a')
+#
+#         curr_part += 1
+#     return diff_files
+#         # wf.close()
+#
+# with Pool(CPU_CORES) as p:
+#     final_range = [0, 100000]
+#     # final_range = [0, 150]
+#     # size_per_proc = (final_range[1] - final_range[0]) / CPU_CORES
+#     # # splits = [int(final_range[0] + size_per_proc) for i in range(CPU_CORES)]
+#     # ranges = []
+#     # ps = None
+#     # for i in range(CPU_CORES):
+#     #     s = int(final_range[0] + size_per_proc * i)
+#     #     ns = int(final_range[0] + size_per_proc * (i + 1))
+#     #     ranges.append([s, ns])
+#     # # ranges = [[0, 1]]
+#     # res = p.map(handle_giga_file, ranges)
+#
+#     res = handle_giga_file(final_range)
+#     res = sorted(list(res))
+#     if os.path.exists(os.path.join(INTERNAL_DATA, 'diffs_updated_gigafida.pkl')):
+#         os.remove(os.path.join(INTERNAL_DATA, 'diffs_updated_gigafida.pkl'))
+#     with open(os.path.join(INTERNAL_DATA, 'diffs_updated_gigafida.pkl'), 'wb') as pkl_file:
+#         pickle.dump(res, pkl_file)
+#     # with open(os.path.join(INTERNAL_DATA, 'diffs_updated_gigafida.pkl'), 'rb') as pkl_file:
+#     #     mydict2 = pickle.load(pkl_file)
+#     print('test')
--- a/tools/gen_json.kres.py
+++ b/tools/gen_json.kres.py
@ -0,0 +1,114 @@
+from pathlib import Path
+from parser.parser import Parser
+import configparser
+import json
+import sys
+import logging
+from multiprocessing import Pool
+
+# parse config
+config = configparser.ConfigParser()
+config.read("tools.cfg")
+# get_origfile() iterates ORIGPATH, so it has to be defined; with the line
+# commented out every handle_file() call failed with a NameError.
+ORIGPATH = Path(config["tools"]["kres_orig"])
+# NOTE(review): this kres variant reads its input from the giga_srl key -
+# confirm that this is intended.
+INPATH = Path(config["tools"]["giga_srl"])
+OUTPATH = Path(config["tools"]["kres_json"])
+DEBUG = config["tools"]["debug"] == "True"
+CPU_CORES = int(config["tools"]["cpu_cores"])
+
+# Create the log file if needed before logging is pointed at it.
+LOGFILE = Path(config["tools"]["logfile"]).absolute()
+LOGFILE.touch(exist_ok=True)
+LOGFILE.resolve()
+
+logging.basicConfig(filename=str(LOGFILE), level=logging.INFO)
+
+def get_origfile(filename):
+    """Return the entry of ORIGPATH that shares its base name with *filename*.
+
+    The base name is the part of the file name before the first dot.
+
+    Raises:
+        FileNotFoundError: if no entry of ORIGPATH has the same base name.
+    """
+    wanted = filename.name.split('.')[0]  # computed once, not per candidate
+    for origfile in ORIGPATH.iterdir():
+        if origfile.name.split('.')[0] == wanted:
+            return origfile
+    raise FileNotFoundError(
+        "no original file for {} in {}".format(filename.name, ORIGPATH))
+
+def extract_sentences(line_reader):
+    """Group the raw lines of a tsv file into sentences.
+
+    Args:
+        line_reader: iterable of ``bytes`` lines, UTF-8 encoded.
+
+    Yields:
+        One list per sentence; each element is the list of tab-separated
+        fields of one token line. A line without any tab (normally the
+        blank separator line) closes the current sentence. Token lines
+        after the last separator are not yielded.
+    """
+    acc = []
+    # decode line by line instead of splitting the whole input up front
+    for raw in line_reader:
+        text = raw.decode("utf-8")
+        # drop the line terminator only when it is really there, so an
+        # unterminated line keeps its last character
+        if text.endswith("\n"):
+            text = text[:-1]
+        line = text.split('\t')
+        if len(line) == 1:  # empty line
+            tmp = acc
+            acc = []
+            yield tmp
+        else:
+            acc.append(line)
+
+def to_sentence(sentence_arr):
+    """Join field 1 of every token row into one space-separated string."""
+    words = (row[1] for row in sentence_arr)
+    return " ".join(words)
+
+def match_sentence_id(sentence, orig_dict):
+    """Return the key of the first entry whose tokens spell *sentence*.
+
+    For each entry, field 2 of every item in entry["tokens"] is joined with
+    single spaces and compared with *sentence*.
+
+    Raises:
+        KeyError: if no entry matches.
+    """
+    for sid, entry in orig_dict.items():
+        words = [token[2] for token in entry["tokens"]]
+        if sentence == " ".join(words):
+            return sid
+    raise KeyError
+
+def get_dep_rel(token):
+    """Return the first argument label found on a token row, or None.
+
+    The fields from index 14 onwards are scanned and "_" counts as empty.
+    For the first other value a dict is returned with "arg" (the field
+    value), "from" (offset of that field counted from index 14, i.e. the
+    i-th predicate in the sentence) and "dep" (token[0]).
+    """
+    logging.debug(token)
+    labelled = (
+        (offset, field)
+        for offset, field in enumerate(token[14:])
+        if field != "_"
+    )
+    hit = next(labelled, None)
+    if hit is None:
+        return None
+    offset, field = hit
+    return {
+        "arg":  field,
+        "from": offset,
+        "dep":  token[0],
+    }
+
+def handle_file(infile_tpl):
+    """Write the SRL relations of one tsv file into OUTPATH as JSON.
+
+    Args:
+        infile_tpl: (index, path) pair; only the path is used.
+
+    Each sentence of the tsv file is matched to its id through the parsed
+    original file, and its relations are stored under that id. The output
+    file carries the input's name with a ".json" suffix.
+    """
+    _index, infile = infile_tpl[0], infile_tpl[1]
+    outfile = (OUTPATH / infile.name).with_suffix(".json")
+    orig_dict = par.parse_tei(get_origfile(infile))
+
+    with infile.open("rb") as fp:
+        outdata = {}
+        for sentence_arr in extract_sentences(fp.readlines()):
+            # tsv dropped sentence ids, match the ID, using original data
+            sentence_text = to_sentence(sentence_arr)
+            sid = match_sentence_id(sentence_text, orig_dict)
+
+            relations = []
+            outdata[sid] = relations
+
+            # token ids of the rows flagged "Y" in field 12, in sentence order
+            predicates = []
+            for token in sentence_arr:
+                if token[12] == "Y":
+                    predicates.append(token[0])
+
+                deprel = get_dep_rel(token)
+                if deprel is not None:
+                    relations.append(deprel)
+
+            # "from" holds the n-th predicate; swap in that predicate's token id
+            for deprel in relations:
+                deprel["from"] = predicates[deprel["from"]]
+
+            if DEBUG:
+                print(sentence_text)
+                print(relations)
+                print(sid)
+                print()
+                print()
+
+    with outfile.open("w") as fp:
+        json.dump(outdata, fp)
+        logging.info("SRL relations written to: {}".format(outfile))
+
+
+# main
+par = Parser()
+OUTPATH.mkdir(exist_ok=True)
+
+# The pool is started only when the file runs as a script: with the "spawn"
+# and "forkserver" start methods the worker processes re-import this module
+# and must not try to start pools of their own.
+if __name__ == "__main__":
+    infiles = list(enumerate([x for x in INPATH.iterdir() if x.is_file()]))
+    logging.info("Generating JSON SRL files from {} tsv files.".format(len(infiles)))
+
+    with Pool(CPU_CORES) as p:
+        p.map(handle_file, infiles)
+
+    logging.info("Finished generating .json files.")
--- a/tools/gen_json.py
+++ b/tools/gen_json.py
@ -1,3 +1,8 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import os
+import pickle
 from pathlib import Path
 from parser.parser import Parser
 import configparser
@ -9,9 +14,10 @@ from multiprocessing import Pool
 # parse config
 config = configparser.ConfigParser()
 config.read("tools.cfg")
-ORIGPATH = Path(config["tools"]["kres_orig"])
-INPATH = Path(config["tools"]["kres_srl"])
-OUTPATH = Path(config["tools"]["kres_json"])
+ORIGPATH = Path(config["tools"]["giga"])
+INPATH = Path(config["tools"]["giga_srl"])
+OUTPATH = Path(config["tools"]["giga_json"])
+INTERNAL_DATA = Path(config["tools"]["internal_data"])
 DEBUG = config["tools"]["debug"] == "True"
 CPU_CORES = int(config["tools"]["cpu_cores"])

@ -48,6 +54,13 @@ def match_sentence_id(sentence, orig_dict):
            return k
    raise KeyError

+def match_sentence_id_giga(sentence, orig_dict):
+    """Return the key of the first entry whose "text" equals *sentence*.
+
+    Raises:
+        KeyError: if no entry matches.
+    """
+    matching_keys = (
+        key for key, entry in orig_dict.items() if sentence == entry["text"]
+    )
+    for key in matching_keys:
+        return key
+    raise KeyError
+
 def get_dep_rel(token):
    logging.debug(token)
    for i, field in enumerate(token[14:]):
@ -59,7 +72,7 @@ def get_dep_rel(token):
            }
    return None

-def handle_file(infile_tpl):
+def handle_file_old(infile_tpl):
    i = infile_tpl[0]
    infile = infile_tpl[1]
    outfile = (OUTPATH / infile.name).with_suffix(".json")
@ -101,14 +114,283 @@ def handle_file(infile_tpl):
        logging.info("SRL relations written to: {}".format(outfile))


+def _sentence_relations(sentence_arr):
+    """Collect the SRL relations of one sentence.
+
+    Each relation's "from" (position among the sentence's predicates) is
+    replaced by the token index (field 0) of that predicate.
+    """
+    predicates = []
+    relations = []
+    for token in sentence_arr:
+        if token[12] == "Y":
+            predicates.append(token[0])  # idx
+
+        deprel = get_dep_rel(token)
+        if deprel is not None:
+            relations.append(deprel)
+
+    for deprel in relations:
+        deprel["from"] = predicates[deprel["from"]]
+    return relations
+
+
+def handle_file(whole_input):
+    """Write the SRL relations for one original file into OUTPATH as JSON.
+
+    Args:
+        whole_input: one entry of orig_file_sizes; index 1 is the path of
+            the original file and index 3 the global index of its first
+            sentence.
+
+    For every sentence of the original file up to 100 upcoming SRL
+    sentences are compared with its text. On a match the relations are
+    stored under the sentence id. Otherwise the id gets an empty list, the
+    sentence counts as a mismatch, and the SRL stream is restarted near the
+    current position. Nothing is done when the output file already exists.
+    """
+    sentence_id = whole_input[3]
+    orig_infile = whole_input[1]
+    outfile = (OUTPATH / orig_infile.name).with_suffix(".json")
+
+    if outfile.exists():
+        return
+    orig_dict = par.parse_tei(orig_infile)
+    outdata = {}
+
+    gen = srl_multiple_files_sentences_generator(sentence_id)
+
+    mismatch_sentences = 0
+
+    for sentence_i, (orig_id, orig_val) in enumerate(orig_dict.items()):
+        matched = False
+        # look at neighbouring sentences if they are correct
+        for _ in range(100):
+            # the stream ends with a None marker and then stops for good;
+            # either way there is nothing left to compare
+            candidate = next(gen, None)
+            if candidate is None:
+                break
+            sentence, sentence_arr = candidate
+            if sentence == orig_val["text"]:
+                outdata[orig_id] = _sentence_relations(sentence_arr)
+
+                if DEBUG:
+                    print(to_sentence(sentence_arr))
+                    print(outdata[orig_id])
+                    print(orig_id)
+                    print()
+                    print()
+                matched = True
+                break
+
+        if not matched:
+            mismatch_sentences += 1
+            outdata[orig_id] = []
+            # resynchronise the SRL stream around the current sentence
+            gen = srl_multiple_files_sentences_generator(sentence_id + sentence_i)
+
+    if mismatch_sentences > 0:
+        if mismatch_sentences / len(orig_dict) < 0.1:
+            print('Slight mismatch - %d' % sentence_id)
+            print(whole_input)
+            print('ABS mitigated %d' % mismatch_sentences)
+            print('------------------------------------------------')
+        else:
+            print('ERRRRRRRRRRRRRRRROOOOOOORRRRRRRRRRR')
+            print('Big mismatch - %d' % sentence_id)
+            print(whole_input)
+            print('ABS mitigated errors:')
+            print(mismatch_sentences)
+            print('------------------------------------------------')
+
+
+    with outfile.open("w") as fp:
+        json.dump(outdata, fp)
+        logging.info("SRL relations written to: {}".format(outfile))
+
+def count_orig_file_sentences(filename):
+    """Pickle (index, path, sentence count) for one original file.
+
+    *filename* is an (index, path) pair. The tuple is written to
+    INTERNAL_DATA/orig_chunks/<file name>; nothing happens when that file
+    already exists.
+    """
+    index, path = filename[0], filename[1]
+    chunk_path = os.path.join(INTERNAL_DATA, 'orig_chunks', path.name)
+    if os.path.exists(chunk_path):
+        return
+
+    print(index)
+    sentence_count = len(par.parse_tei(path))
+    with open(chunk_path, 'wb') as output:
+        pickle.dump((index, path, sentence_count), output)
+
+
+def count_srl_file_sentences(filename):
+    """Pickle (index, path, sentence count) for one SRL tsv file.
+
+    *filename* is an (index, path) pair. A sentence is counted for every
+    line consisting of a bare newline. The tuple is written to
+    INTERNAL_DATA/srl_chunks/<file name>; nothing happens when that file
+    already exists.
+    """
+    index, path = filename[0], filename[1]
+    chunk_path = os.path.join(INTERNAL_DATA, 'srl_chunks', path.name)
+    if os.path.exists(chunk_path):
+        return
+
+    print(index)
+    with path.open("r") as fp:
+        num_sentences = sum(1 for line in fp if line == '\n')
+
+    with open(chunk_path, 'wb') as output:
+        pickle.dump((index, path, num_sentences), output)
+
+def srl_sentences_generator(infile, curr_index, sen_start_index):
+    """Yield (sentence text, token rows) for the sentences of one SRL file.
+
+    curr_index is advanced once per skipped sentence until it reaches
+    sen_start_index; from then on every sentence is yielded. After the last
+    sentence a single None is yielded as an end marker.
+    """
+    with infile.open("rb") as fp:
+        for sentence_arr in extract_sentences(fp.readlines()):
+            if curr_index >= sen_start_index:
+                yield to_sentence(sentence_arr), sentence_arr
+            else:
+                curr_index += 1
+    yield None
+
+
+def srl_multiple_files_sentences_generator(sentence_id): # srl_files):
+    """Yield SRL sentences across files, starting 10 before *sentence_id*.
+
+    The start index is max(0, sentence_id - 10). The first entry of
+    srl_file_sizes whose range [entry[3], entry[3] + entry[2]) contains the
+    start index is located, and the sentences of that file and of all
+    following ones are yielded as (sentence text, token rows). A single None
+    is yielded at the end; it is also the only item when no file covers the
+    start index.
+    """
+    sentence_id = max(0, sentence_id - 10)
+    # stays empty when no file covers sentence_id (was an unbound local)
+    srl_files = []
+    for i, srl_file in enumerate(srl_file_sizes):
+        if srl_file[3] <= sentence_id < srl_file[3] + srl_file[2]:
+            srl_files = srl_file_sizes[i:]
+            break
+
+    for file_info in srl_files:
+        for el in srl_sentences_generator(file_info[1], file_info[3], sentence_id):
+            # None marks the end of this file; move on to the next one
+            if el is None:
+                break
+            yield el
+
+    yield None
+
+
 # main
 par = Parser()
 OUTPATH.mkdir(exist_ok=True)

-infiles = list(enumerate([x for x in INPATH.iterdir() if x.is_file()]))
+infiles = list(enumerate([x for x in iter(sorted(INPATH.iterdir())) if x.is_file()]))
 logging.info("Generating JSON SRL files from {} tsv files.".format(len(infiles)))

+# every file below ORIGPATH, sorted by path and numbered from 0
+origfiles = [
+    Path(os.path.join(subdir, file))
+    for subdir, dirs, files in os.walk(ORIGPATH)
+    for file in files
+]
+origfiles = list(enumerate(sorted(origfiles)))
+##### REMOVE ############
+# origfiles = origfiles[:3]
+
+# count sentences in orig (if not counted before)
+orig_counts_pkl = os.path.join(INTERNAL_DATA, 'orig_counted_sentences.pkl')
+if os.path.exists(orig_counts_pkl):
+    # reuse the table saved by an earlier run
+    with open(orig_counts_pkl, 'rb') as pkl_file:
+        orig_file_sizes = pickle.load(pkl_file)
+else:
+    orig_chunks_dir = os.path.join(INTERNAL_DATA, 'orig_chunks')
+    if not os.path.exists(orig_chunks_dir):
+        os.makedirs(orig_chunks_dir)
+    # one small pickle per original file; files already counted are skipped
+    for origfile in origfiles:
+        count_orig_file_sentences(origfile)
+    orig_file_sizes = []
+    for x in sorted(Path(orig_chunks_dir).iterdir()):
+        print(x.name)
+        if x.is_file():
+            with x.open('rb') as pkl_small_file:
+                orig_file_sizes.append(pickle.load(pkl_small_file))
+    print("Sorting orig files")
+    orig_file_sizes.sort()
+    print("Calculating orig files size")
+    # extend every (index, path, count) with the number of sentences before it
+    first_sentence = 0
+    with_offsets = []
+    for file_no, file_path, count in orig_file_sizes:
+        with_offsets.append((file_no, file_path, count, first_sentence))
+        first_sentence += count
+    orig_file_sizes = with_offsets
+    print("Saving orig files size")
+    with open(orig_counts_pkl, 'wb') as output:
+        pickle.dump(orig_file_sizes, output)
+    print("Orig files saved")
+
+
+# count sentences in srl (if not counted before)
+srl_counts_pkl = os.path.join(INTERNAL_DATA, 'srl_counted_sentences.pkl')
+if os.path.exists(srl_counts_pkl):
+    # reuse the table saved by an earlier run
+    with open(srl_counts_pkl, 'rb') as pkl_file:
+        srl_file_sizes = pickle.load(pkl_file)
+else:
+    srl_chunks_dir = os.path.join(INTERNAL_DATA, 'srl_chunks')
+    if not os.path.exists(srl_chunks_dir):
+        os.makedirs(srl_chunks_dir)
+
+    # one small pickle per tsv file; files already counted are skipped
+    for infile in infiles:
+        count_srl_file_sentences(infile)
+
+    srl_file_sizes = []
+    for x in sorted(Path(srl_chunks_dir).iterdir()):
+        print(x.name)
+        if x.is_file():
+            with x.open('rb') as pkl_small_file:
+                srl_file_sizes.append(pickle.load(pkl_small_file))
+    print("Sorting srl files")
+    srl_file_sizes.sort()
+    print("Calculating srl files size")
+    # extend every (index, path, count) with the number of sentences before it
+    first_sentence = 0
+    with_offsets = []
+    for file_no, file_path, count in srl_file_sizes:
+        with_offsets.append((file_no, file_path, count, first_sentence))
+        first_sentence += count
+    srl_file_sizes = with_offsets
+    print("Saving srl files size")
+    with open(srl_counts_pkl, 'wb') as output:
+        pickle.dump(srl_file_sizes, output)
+    print("Srl files saved")
+
+
+# print(len(orig_file_sizes))
+# print('asd' + 2)
+
+# inputs = []
+# srl_i = 0
+# srl_file = srl_file_sizes[srl_i]
+# for orig_i, orig_path, orig_size, orig_first_sent_i in orig_file_sizes:
+#     interesting_srl_files = []
+#     # beginning of srl chunk in range of orig chunk or ending of srl chunk in range of orig chunk
+#     # while srl_file[3] >= orig_first_sent_i and srl_file[3] < orig_first_sent_i + orig_size or \
+#     #     srl_file[3] + srl_file[2] - 1 >= orig_first_sent_i and srl_file[3] + srl_file[2] - 1 < orig_first_sent_i + orig_size:
+#     while srl_file[3] < orig_first_sent_i + orig_size and srl_file[3] + srl_file[2] > orig_first_sent_i:
+#         # if beginning of file is in
+#         if srl_file[3] > orig_first_sent_i:
+#             interesting_srl_files.append((srl_file[0], srl_file[1], srl_file[2], srl_file[3], srl_file[3]))
+#             # print('if %d' % srl_file[3])
+#         else:
+#             interesting_srl_files.append((srl_file[0], srl_file[1], srl_file[2], srl_file[3], orig_first_sent_i))
+#             # print('else %d' % orig_first_sent_i)
+#
+#         if orig_first_sent_i + orig_size >= srl_file[3] + srl_file[2]:
+#             srl_i += 1
+#             if srl_i < len(srl_file_sizes):
+#                 srl_file = srl_file_sizes[srl_i]
+#             else:
+#                 break
+#                 # print(srl_i)
+#                 # print('a ' + 2)
+#         else:
+#             break
+#
+#     inputs.append([[orig_i, orig_path, orig_size, orig_first_sent_i], interesting_srl_files])
+    # print(inputs[-1])
+
+
+
+# srl_gen = srl_sentences_generator(srl_file_sizes[0][1], 0, 533)
+# a = next(srl_gen)
+# b = next(srl_gen)
+# c = next(srl_gen)
+
+print('beginning processing')
 with Pool(CPU_CORES) as p:
-    p.map(handle_file, infiles)
+    # p.map(handle_file, inputs)
+    p.map(handle_file, orig_file_sizes)
+
+# for of in orig_file_sizes:
+#     handle_file(of)

 logging.info("Finished generating .json files.")
--- a/tools/gen_json_fix_errors.py
+++ b/tools/gen_json_fix_errors.py
@ -0,0 +1,294 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import os
+import pickle
+from pathlib import Path
+from parser.parser import Parser
+import configparser
+import json
+import sys
+import logging
+from multiprocessing import Pool
+
+# parse config
+config = configparser.ConfigParser()
+config.read("tools.cfg")
+ORIGPATH = Path(config["tools"]["giga"])
+INPATH = Path(config["tools"]["giga_srl_errors"])
+OUTPATH = Path(config["tools"]["giga_json"])
+INTERNAL_DATA = Path(config["tools"]["internal_data"])
+DEBUG = config["tools"]["debug"] == "True"
+CPU_CORES = int(config["tools"]["cpu_cores"])
+
+# Create the log file if needed before logging is pointed at it.
+LOGFILE = Path(config["tools"]["logfile"]).absolute()
+LOGFILE.touch(exist_ok=True)
+LOGFILE.resolve()
+
+logging.basicConfig(filename=str(LOGFILE), level=logging.INFO)
+# One entry per line of the listing; the with-block closes the file again
+# (it used to stay open for the whole run).
+with open(os.path.join(INTERNAL_DATA, 'sentences_with_less_than_token.txt')) as error_file:
+    error_sentences = [line.rstrip('\n') for line in error_file]
+
+
+
+
+def get_origfile(filename):
+    """Return the entry of ORIGPATH that shares its base name with *filename*.
+
+    The base name is the part of the file name before the first dot.
+
+    Raises:
+        FileNotFoundError: if no entry of ORIGPATH has the same base name.
+    """
+    wanted = filename.name.split('.')[0]  # computed once, not per candidate
+    for origfile in ORIGPATH.iterdir():
+        if origfile.name.split('.')[0] == wanted:
+            return origfile
+    raise FileNotFoundError(
+        "no original file for {} in {}".format(filename.name, ORIGPATH))
+
+def extract_sentences(line_reader):
+    """Group the raw lines of a tsv file into sentences.
+
+    Args:
+        line_reader: iterable of ``bytes`` lines, UTF-8 encoded.
+
+    Yields:
+        One list per sentence; each element is the list of tab-separated
+        fields of one token line. A line without any tab (normally the
+        blank separator line) closes the current sentence. Token lines
+        after the last separator are not yielded.
+    """
+    acc = []
+    # decode line by line instead of splitting the whole input up front
+    for raw in line_reader:
+        text = raw.decode("utf-8")
+        # drop the line terminator only when it is really there, so an
+        # unterminated line keeps its last character
+        if text.endswith("\n"):
+            text = text[:-1]
+        line = text.split('\t')
+        if len(line) == 1:  # empty line
+            tmp = acc
+            acc = []
+            yield tmp
+        else:
+            acc.append(line)
+
+def to_sentence(sentence_arr):
+    """Join field 1 of every token row into one space-separated string."""
+    words = (row[1] for row in sentence_arr)
+    return " ".join(words)
+
+def match_sentence_id(sentence, orig_dict):
+    """Return the key of the first entry whose tokens spell *sentence*.
+
+    For each entry, field 2 of every item in entry["tokens"] is joined with
+    single spaces and compared with *sentence*.
+
+    Raises:
+        KeyError: if no entry matches.
+    """
+    for sid, entry in orig_dict.items():
+        words = [token[2] for token in entry["tokens"]]
+        if sentence == " ".join(words):
+            return sid
+    raise KeyError
+
+def match_sentence_id_giga(sentence, orig_dict):
+    """Return the key of the first entry whose "text" equals *sentence*.
+
+    Raises:
+        KeyError: if no entry matches.
+    """
+    matching_keys = (
+        key for key, entry in orig_dict.items() if sentence == entry["text"]
+    )
+    for key in matching_keys:
+        return key
+    raise KeyError
+
+def get_dep_rel(token):
+    """Return the first argument label found on a token row, or None.
+
+    The fields from index 14 onwards are scanned and "_" counts as empty.
+    For the first other value a dict is returned with "arg" (the field
+    value), "from" (offset of that field counted from index 14, i.e. the
+    i-th predicate in the sentence) and "dep" (token[0]).
+    """
+    logging.debug(token)
+    labelled = (
+        (offset, field)
+        for offset, field in enumerate(token[14:])
+        if field != "_"
+    )
+    hit = next(labelled, None)
+    if hit is None:
+        return None
+    offset, field = hit
+    return {
+        "arg":  field,
+        "from": offset,
+        "dep":  token[0],
+    }
+
+def handle_file_old(infile_tpl):
+    """Write the SRL relations of one tsv file into OUTPATH as JSON.
+
+    Args:
+        infile_tpl: (index, path) pair; only the path is used.
+
+    Each sentence of the tsv file is matched to its id through the parsed
+    original file, and its relations are stored under that id. The output
+    file carries the input's name with a ".json" suffix.
+    """
+    _index, infile = infile_tpl[0], infile_tpl[1]
+    outfile = (OUTPATH / infile.name).with_suffix(".json")
+    orig_dict = par.parse_tei(get_origfile(infile))
+
+    with infile.open("rb") as fp:
+        outdata = {}
+        for sentence_arr in extract_sentences(fp.readlines()):
+            # tsv dropped sentence ids, match the ID, using original data
+            sentence_text = to_sentence(sentence_arr)
+            sid = match_sentence_id(sentence_text, orig_dict)
+
+            relations = []
+            outdata[sid] = relations
+
+            # token ids of the rows flagged "Y" in field 12, in sentence order
+            predicates = []
+            for token in sentence_arr:
+                if token[12] == "Y":
+                    predicates.append(token[0])
+
+                deprel = get_dep_rel(token)
+                if deprel is not None:
+                    relations.append(deprel)
+
+            # "from" holds the n-th predicate; swap in that predicate's token id
+            for deprel in relations:
+                deprel["from"] = predicates[deprel["from"]]
+
+            if DEBUG:
+                print(sentence_text)
+                print(relations)
+                print(sid)
+                print()
+                print()
+
+    with outfile.open("w") as fp:
+        json.dump(outdata, fp)
+        logging.info("SRL relations written to: {}".format(outfile))
+
+
+def fix_json(srl_gen, error_sentence, orig_json_data):
+    """Replace the relations stored for one sentence with re-tagged ones.
+
+    Args:
+        srl_gen: generator of (sentence text, token rows) pairs; its next
+            item is used as the new analysis of *error_sentence*.
+        error_sentence: key of orig_json_data whose relations are rebuilt.
+        orig_json_data: dict mapping sentence ids to relation lists; it is
+            modified in place.
+
+    Returns:
+        The same orig_json_data object.
+
+    Raises:
+        ValueError: if srl_gen has no sentence left.
+        KeyError: if error_sentence is not a key of orig_json_data.
+    """
+    # the generator ends with a None marker; fail with a clear message
+    # instead of an unpacking error
+    entry = next(srl_gen, None)
+    if entry is None:
+        raise ValueError(
+            "no corrected SRL sentence left for {}".format(error_sentence))
+    sentence, sentence_arr = entry
+    sid = error_sentence
+    # start from an empty relation list for this sentence
+    if orig_json_data[sid] != []:
+        orig_json_data[sid] = []
+
+    # find all predicate indices in the sentence
+    predicates = []
+    for token in sentence_arr:
+        if token[12] == "Y":
+            predicates.append(token[0])  # idx
+
+        deprel = get_dep_rel(token)
+        if deprel is not None:
+            orig_json_data[sid].append(deprel)
+
+    # deprel["from"] points to n-th predicate
+    # replace with predicate's token index
+    for deprel in orig_json_data[sid]:
+        deprel["from"] = predicates[deprel["from"]]
+
+    if DEBUG:
+        print(to_sentence(sentence_arr))
+        print(orig_json_data[sid])
+        print(sid)
+        print()
+        print()
+    return orig_json_data
+
+def count_orig_file_sentences(filename):
+    """Pickle (index, path, sentence count) for one original file.
+
+    *filename* is an (index, path) pair. The tuple is written to
+    INTERNAL_DATA/orig_chunks/<file name>; nothing happens when that file
+    already exists.
+    """
+    index, path = filename[0], filename[1]
+    chunk_path = os.path.join(INTERNAL_DATA, 'orig_chunks', path.name)
+    if os.path.exists(chunk_path):
+        return
+
+    print(index)
+    sentence_count = len(par.parse_tei(path))
+    with open(chunk_path, 'wb') as output:
+        pickle.dump((index, path, sentence_count), output)
+
+
+def count_srl_file_sentences(filename):
+    """Pickle (index, path, sentence count) for one SRL tsv file.
+
+    *filename* is an (index, path) pair. A sentence is counted for every
+    line consisting of a bare newline. The tuple is written to
+    INTERNAL_DATA/srl_chunks/<file name>; nothing happens when that file
+    already exists.
+    """
+    index, path = filename[0], filename[1]
+    chunk_path = os.path.join(INTERNAL_DATA, 'srl_chunks', path.name)
+    if os.path.exists(chunk_path):
+        return
+
+    print(index)
+    with path.open("r") as fp:
+        num_sentences = sum(1 for line in fp if line == '\n')
+
+    with open(chunk_path, 'wb') as output:
+        pickle.dump((index, path, num_sentences), output)
+
+def srl_error_fix_generator(infile):
+    """Yield (sentence text, token rows) for every sentence of *infile*.
+
+    A single None is yielded after the last sentence as an end marker.
+    """
+    with infile.open("rb") as fp:
+        raw_lines = fp.readlines()
+        for rows in extract_sentences(raw_lines):
+            text = to_sentence(rows)
+            yield text, rows
+    yield None
+
+def srl_sentences_generator(infile, curr_index, sen_start_index):
+    """Yield (sentence text, token rows) for the sentences of one SRL file.
+
+    curr_index is advanced once per skipped sentence until it reaches
+    sen_start_index; from then on every sentence is yielded. After the last
+    sentence a single None is yielded as an end marker.
+    """
+    with infile.open("rb") as fp:
+        for sentence_arr in extract_sentences(fp.readlines()):
+            if curr_index >= sen_start_index:
+                yield to_sentence(sentence_arr), sentence_arr
+            else:
+                curr_index += 1
+    yield None
+
+
+def srl_multiple_files_sentences_generator(sentence_id): # srl_files):
+    """Yield SRL sentences across files, starting 10 before *sentence_id*.
+
+    The start index is max(0, sentence_id - 10). The first entry of
+    srl_file_sizes whose range [entry[3], entry[3] + entry[2]) contains the
+    start index is located, and the sentences of that file and of all
+    following ones are yielded as (sentence text, token rows). A single None
+    is yielded at the end; it is also the only item when no file covers the
+    start index.
+
+    NOTE(review): no assignment to srl_file_sizes is visible in this file -
+    confirm where it is expected to come from.
+    """
+    sentence_id = max(0, sentence_id - 10)
+    # stays empty when no file covers sentence_id (was an unbound local)
+    srl_files = []
+    for i, srl_file in enumerate(srl_file_sizes):
+        if srl_file[3] <= sentence_id < srl_file[3] + srl_file[2]:
+            srl_files = srl_file_sizes[i:]
+            break
+
+    for file_info in srl_files:
+        for el in srl_sentences_generator(file_info[1], file_info[3], sentence_id):
+            # None marks the end of this file; move on to the next one
+            if el is None:
+                break
+            yield el
+
+    yield None
+
+# group sentences by their files: consecutive ids sharing the same first
+# 9 characters end up in one list
+error_sentences_grouped = []
+group = []
+prev_name = ''
+for name in error_sentences:
+    if group and name[:9] == prev_name:
+        group.append(name)
+    else:
+        # close the previous group (if any) and open a new one
+        if group:
+            error_sentences_grouped.append(group)
+        prev_name = name[:9]
+        group = [name]
+# the last group is still open; an empty listing leaves nothing to add
+if group:
+    error_sentences_grouped.append(group)
+
+srl_gen = srl_error_fix_generator(INPATH)
+
+# find errors in json files:
+# with open(os.path.join(INTERNAL_DATA, 'sentence_ids_list.pkl'), 'rb') as output:
+#     sentence_ids = pickle.load(output)
+#
+#
+#
+# origfiles = []
+# for subdir, dirs, files in os.walk(OUTPATH):
+#     for file in files:
+#         origfiles.append(Path(os.path.join(subdir, file)))
+# origfiles=sorted(origfiles)
+#
+#
+#
+# for sent in origfiles:
+# # for sent in sentence_ids:
+# #     outfile = Path(OUTPATH, sent[:9] + '-dedup.json')
+#     outfile = sent
+#
+#     try:
+#         with outfile.open() as json_file:
+#             json.load(json_file)
+#             pass
+#     except:
+#         print(outfile.name)
+#
+#
+# raise Exception('test')
+# Go through the faulty sentences file by file: load the JSON written for the
+# file, rebuild the entry of every listed sentence and write the JSON back.
+for errors_in_file in error_sentences_grouped:
+    json_name = errors_in_file[0][:9] + '-dedup.json'
+    outfile = Path(OUTPATH, json_name)
+    with outfile.open() as json_file:
+        print(outfile.name)
+        orig_json_data = json.load(json_file)
+        for error_sentence in errors_in_file:
+            orig_json_data = fix_json(srl_gen, error_sentence, orig_json_data)
+
+    with outfile.open('w') as json_file:
+        json.dump(orig_json_data, json_file)
+        logging.info("SRL relations written to: {}".format(outfile))
--- a/tools/parse_all.py
+++ b/tools/parse_all.py
@ -1,3 +1,5 @@
+import pickle
+
 from parser.parser import Parser
 import os
 from os.path import join, dirname
@ -15,8 +17,21 @@ par = Parser()
 # path to data
 config = configparser.ConfigParser()
 config.read("tools.cfg")
-INDIR = Path(config["tools"]["kres_orig"])
-OUTDIR = Path(config["tools"]["kres_tsv"])
+# The corpus to process is chosen by which "*_orig" key the [tools] section
+# of tools.cfg provides; kres wins when both are present.
+analysis = ''
+if 'kres_orig' in config["tools"]:
+    analysis = 'kres'
+    INDIR = Path(config["tools"]["kres_orig"])
+    OUTDIR = Path(config["tools"]["kres_tsv"])
+elif 'giga_orig' in config["tools"]:
+    # analysis = 'gigafida'
+    analysis = 'giga'
+    # NOTE(review): INDIR_GIGA_ORIG is read from key "giga" and INDIR_GIGA
+    # from key "giga_orig" - the names look swapped; confirm before renaming.
+    INDIR_GIGA_ORIG = Path(config["tools"]["giga"])
+    INDIR_GIGA = Path(config["tools"]["giga_orig"])
+    INDIR_JOS = Path(config["tools"]["giga_jos"])
+    OUTDIR = Path(config["tools"]["giga_tsv"])
+    GIGA_PARTS = int(config["tools"]["giga_parts"])
+    INTERNAL_DATA = config["tools"]["internal_data"]
+else:
+    # Fail right away instead of hitting a NameError on OUTDIR further down.
+    raise KeyError("tools.cfg [tools] must define either 'kres_orig' or 'giga_orig'")
+
 CPU_CORES = int(config["tools"]["cpu_cores"])

 LOGFILE = Path(config["tools"]["logfile"]).absolute()
@ -36,8 +51,9 @@ print("end parsing ssj")
 # kres_file = "../data/kres_example/F0019343.xml.parsed.xml"
 OUTDIR.mkdir(exist_ok=True)

-infiles = list(enumerate([x for x in INDIR.iterdir() if x.is_file()]))
-logging.info("Parsing kres: {} files.".format(len(infiles)))
+# Only the kres corpus is processed file by file; enumerate() pairs every
+# input file with a running index, which handle_file reads as infile[0].
+if analysis == 'kres':
+    infiles = list(enumerate([x for x in INDIR.iterdir() if x.is_file()]))
+    logging.info("Parsing kres: {} files.".format(len(infiles)))

 def handle_file(infile):
    i = infile[0]
@ -65,10 +81,297 @@ def handle_file(infile):
        return True
    return False

+def giga_orig_generator():
+    """
+    Yield the lines of INDIR_GIGA, collapsing every run of empty lines into one.
+
+    A line counts as empty only when it is exactly '\n'; the first empty line
+    of a run is still yielded, the following ones are dropped.
+    """
+    with open(INDIR_GIGA, 'r') as gof:
+        last_was_blank = False
+        for line in gof:
+            is_blank = line == '\n'
+            if is_blank and last_was_blank:
+                # second or later empty line of a run
+                continue
+            last_was_blank = is_blank
+            yield line
+
+
+def handle_gigafida_file():
+    """
+    Split the giga corpus into GIGA_PARTS 'giga%07d.tsv' files inside OUTDIR.
+
+    INDIR_JOS and the blank-line-normalised INDIR_GIGA stream are read in
+    lockstep, one line of each per iteration. Tokens are collected into a
+    sentence until an empty INDIR_GIGA line is seen; the sentence is then
+    written through par.to_conll_2009_SRL. A new part file is started only
+    on such an empty line, so sentences are never split across files.
+
+    If 'giga0000000.tsv' already exists the function starts in a resume mode
+    that skips input until it reaches a part whose successor file is missing.
+    """
+    # with open(INDIR_GIGA, 'r') as gof:
+    #     with open(INDIR_JOS, 'r') as gjf:
+    #         for i, (l_gof, l_gjf) in enumerate(zip(gof, gjf)):
+    #             pass
+    #     num_lines = i + 1
+    # print(num_lines)
+    # Hard-coded total line count; the commented-out loop above is what would
+    # recount it.
+    num_lines = 1393184026
+    # 1393184026
+    # 1393184033
+    # return
+    num_lines_per_part = num_lines / GIGA_PARTS
+    curr_part = 0
+    gof_generator = giga_orig_generator()
+    # with open(INDIR_GIGA, 'r') as gof:
+    with open(INDIR_JOS, 'r') as gjf:
+        sentence = {}
+        sentence['tokens'] = []
+        sentence['links'] = {}
+        # Resume mode: the first part exists, so skip input (wf = False means
+        # "no output file open yet").
+        if os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % 0)):
+            ignore_lines = True
+            wf = False
+        else:
+            wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part), 'a')
+            ignore_lines = False
+        # for i, (l_gof, l_gjf) in enumerate(zip(gof, gjf)):
+        for i, l_gjf in enumerate(gjf):
+            # NOTE(review): raises StopIteration if INDIR_GIGA (after blank
+            # line collapsing) has fewer lines than INDIR_JOS.
+            l_gof = next(gof_generator)
+            if ignore_lines:
+                # Only an empty line past the current part boundary can end
+                # the skipping.
+                if i > num_lines_per_part * curr_part and l_gof == '\n':
+                    if not os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % (curr_part + 2))):
+                        ignore_lines = False
+                        # delete last file (probably not whole)
+                        os.remove(os.path.join(OUTDIR, 'giga%07d.tsv' % (curr_part + 1)))
+                    if ignore_lines:
+                        print(curr_part)
+                        curr_part += 1
+                        continue
+                else:
+                    continue
+            l_gof_split = l_gof.split('\t')
+            l_gjf_split = l_gjf.split('\t')
+
+            # if punctuation
+            if l_gof != '\n':
+                # second INDIR_GIGA column ending in 'u' marks the token as
+                # punctuation ('c'); everything else is stored as a word ('w')
+                if l_gof_split[1][-1] == 'u':
+                    # print(l_gjf_split)
+                    sentence['tokens'].append(('c', l_gjf_split[0], l_gjf_split[1]))
+                else:
+                    sentence['tokens'].append(('w', l_gjf_split[0], l_gof_split[0], l_gof_split[1][:-2], l_gof_split[3][:-1]))
+
+                # presumably columns 6/7 of INDIR_JOS are the dependency head
+                # and label - TODO confirm against the JOS file format
+                sentence['links'][l_gjf_split[0]] = (l_gjf_split[7], l_gjf_split[0], l_gjf_split[6])
+
+            # if l_gof == '\n':
+            else:
+                # end of sentence: emit it (unless no file is open yet) and
+                # start collecting the next one
+                if wf:
+                    # print(i)
+                    wf.write(par.to_conll_2009_SRL(sentence))
+                sentence['tokens'] = []
+                sentence['links'] = {}
+            # wf.flush()
+            # if i > num_lines_per_part * (curr_part + 1) and l_gof == '\n':
+                # part boundary reached: switch to the next output file
+                if i > num_lines_per_part * (curr_part + 1):
+                    curr_part += 1
+                    # if wf doesn't exist (first one)
+                    if wf:
+                        wf.close()
+                    wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part), 'a')
+        curr_part += 1
+        # NOTE(review): wf is still False here when resume mode never opened
+        # a file, in which case this call raises AttributeError.
+        wf.close()
+
+import  time
+def handle_giga_file(ran):
+    """
+    Write the 'giga%07d.tsv' parts of the giga corpus for one range of parts.
+
+    INDIR_JOS and the blank-line-normalised INDIR_GIGA stream are read in
+    lockstep. Input belonging to parts before ran[0] is skipped and reading
+    stops as soon as part ran[1] is reached. Within the range only parts
+    listed in the module-level file_indices are written; an existing file of
+    such a part is deleted and regenerated from scratch.
+
+    :param ran: two-element sequence (first_part, end_part); end_part is
+        exclusive.
+    """
+    # Hard-coded total number of input lines, used to size the parts.
+    num_lines = 1393184026
+    num_lines_per_part = num_lines / GIGA_PARTS
+    curr_part = 0
+    gof_generator = giga_orig_generator()
+
+    def open_part(part):
+        # Remove a stale file first so the part never mixes old and new rows.
+        path = os.path.join(OUTDIR, 'giga%07d.tsv' % part)
+        if os.path.exists(path):
+            os.remove(path)
+        return open(path, 'a')
+
+    # Invariant once skipping is over: wf is open exactly when curr_part is
+    # in file_indices.
+    wf = None
+    try:
+        with open(INDIR_JOS, 'r') as gjf:
+            sentence = {'tokens': [], 'links': {}}
+            # The first part that gets processed is ran[0], so that is the
+            # index to test (testing part 0 here either clobbered a file that
+            # must stay untouched or left no file open for a wanted part).
+            if ran[0] in file_indices:
+                wf = open_part(ran[0])
+
+            for i, l_gjf in enumerate(gjf):
+                l_gof = next(gof_generator)
+
+                if curr_part < ran[0]:
+                    # Fast-forward: only count the part boundaries, which are
+                    # recognised on the first empty line past the line quota.
+                    if i > num_lines_per_part * (curr_part + 1) and l_gof == '\n':
+                        print(curr_part)
+                        curr_part += 1
+                    continue
+
+                if l_gof != '\n':
+                    if curr_part not in file_indices:
+                        continue
+                    l_gof_split = l_gof.split('\t')
+                    l_gjf_split = l_gjf.split('\t')
+                    # second INDIR_GIGA column ending in 'u' marks punctuation
+                    if l_gof_split[1][-1] == 'u':
+                        sentence['tokens'].append(('c', l_gjf_split[0], l_gjf_split[1]))
+                    else:
+                        sentence['tokens'].append(('w', l_gjf_split[0], l_gof_split[0], l_gof_split[1][:-2], l_gof_split[3][:-1]))
+
+                    sentence['links'][l_gjf_split[0]] = (l_gjf_split[7], l_gjf_split[0], l_gjf_split[6])
+                    continue
+
+                # Empty line: the sentence is complete.
+                if wf is not None:
+                    wf.write(par.to_conll_2009_SRL(sentence))
+                    sentence['tokens'] = []
+                    sentence['links'] = {}
+
+                if i > num_lines_per_part * (curr_part + 1):
+                    # Part boundary: close the finished part before moving on,
+                    # whether or not the next part is wanted.
+                    if wf is not None:
+                        wf.close()
+                        wf = None
+                    curr_part += 1
+                    if curr_part >= ran[1]:
+                        break
+                    if curr_part in file_indices:
+                        wf = open_part(curr_part)
+    finally:
+        # Also reached when nothing was ever opened, hence the guard.
+        if wf is not None:
+            wf.close()
+
+def handle_giga_file_selected_sentences(error_sentences):
+    """
+    Export only the selected sentences of the giga corpus to OUTDIR/giga_errors.
+
+    INDIR_JOS and the blank-line-normalised INDIR_GIGA stream are read in
+    lockstep and sentences are counted by the empty lines of INDIR_JOS. The
+    n-th sentence is written (via par.to_conll_2009_SRL) when the n-th entry
+    of 'sentence_ids_list.pkl' in INTERNAL_DATA is contained in
+    error_sentences. An existing giga_errors file is replaced.
+
+    :param error_sentences: container of sentence ids to export; it is probed
+        once per sentence, so pass a set.
+    """
+    # Load the id list before touching the output, so a missing or broken
+    # pickle does not destroy a previous result.
+    # NOTE(review): pickle.load runs arbitrary code from the file - keep
+    # INTERNAL_DATA restricted to trusted, locally produced data.
+    with open(os.path.join(INTERNAL_DATA, 'sentence_ids_list.pkl'), 'rb') as pkl_file:
+        sentence_ids_list = pickle.load(pkl_file)
+
+    def is_selected(idx):
+        # The input may hold more empty lines than there are ids (at the very
+        # least the one closing the last sentence); anything past the end of
+        # the list is simply not selected instead of raising IndexError.
+        return idx < len(sentence_ids_list) and sentence_ids_list[idx] in error_sentences
+
+    gof_generator = giga_orig_generator()
+    # Mode 'w' starts from an empty file, like the former remove-then-append.
+    with open(INDIR_JOS, 'r') as gjf, open(os.path.join(OUTDIR, 'giga_errors'), 'w') as wf:
+        sentence = {'tokens': [], 'links': {}}
+        sentence_id = 0
+        skip_sentence = not is_selected(sentence_id)
+
+        for l_gjf in gjf:
+            l_gof = next(gof_generator)
+
+            if l_gjf == '\n':
+                # Sentence boundary: emit the finished sentence if it was
+                # selected, then decide about the one that starts next.
+                if not skip_sentence:
+                    wf.write(par.to_conll_2009_SRL(sentence))
+                    sentence['tokens'] = []
+                    sentence['links'] = {}
+                sentence_id += 1
+                skip_sentence = not is_selected(sentence_id)
+                if not skip_sentence:
+                    print(sentence_ids_list[sentence_id])
+
+            if skip_sentence:
+                continue
+
+            if l_gof != '\n':
+                l_gof_split = l_gof.split('\t')
+                l_gjf_split = l_gjf.split('\t')
+                # second INDIR_GIGA column ending in 'u' marks punctuation
+                if l_gof_split[1][-1] == 'u':
+                    sentence['tokens'].append(('c', l_gjf_split[0], l_gjf_split[1]))
+                else:
+                    sentence['tokens'].append(('w', l_gjf_split[0], l_gof_split[0], l_gof_split[1][:-2], l_gof_split[3][:-1]))
+
+                sentence['links'][l_gjf_split[0]] = (l_gjf_split[7], l_gjf_split[0], l_gjf_split[6])
+
+# Part indices handle_giga_file may (re)write; all parts unless the giga
+# configuration supplies an explicit list.
+file_indices = set(range(0, 100000))
+if analysis in ('giga', 'gigafida'):
+    # INTERNAL_DATA is only defined for the giga configuration; reading it
+    # unconditionally made every kres run die with a NameError.
+    with open(os.path.join(INTERNAL_DATA, 'diffs_updated_gigafida.pkl'), 'rb') as pkl_file:
+        file_indices = set(pickle.load(pkl_file))
+
 with Pool(CPU_CORES) as p:
-    p.map(handle_file, infiles)
+    if analysis == 'kres':
+        p.map(handle_file, infiles)
+    elif analysis == 'gigafida':
+        # NOTE(review): unreachable at the moment - analysis is never set to
+        # 'gigafida' by the configuration code.
+        handle_gigafida_file()
+    elif analysis == 'giga':
+        # Split the part indices into one contiguous [start, end) range per
+        # worker; only needed by the p.map variant that is disabled below.
+        final_range = [0, 100000]
+        size_per_proc = (final_range[1] - final_range[0]) / CPU_CORES
+        # splits = [int(final_range[0] + size_per_proc) for i in range(CPU_CORES)]
+        ranges = []
+        for i in range(CPU_CORES):
+            s = int(final_range[0] + size_per_proc * i)
+            ns = int(final_range[0] + size_per_proc * (i + 1))
+            ranges.append([s, ns])
+        # ranges = [[0, 1]]
+
+        # p.map(handle_giga_file, ranges)
+        # p.map(handle_giga_file, ranges)
+        # One sentence id per line; the file is closed as soon as it is read.
+        with open(os.path.join(INTERNAL_DATA, 'sentences_with_less_than_token.txt')) as error_file:
+            error_sentences = [line.rstrip('\n') for line in error_file]
+        handle_giga_file_selected_sentences(set(error_sentences))


 logging.info("end parsing kres")
-
-
--- a/tools/parser/parser.py
+++ b/tools/parser/parser.py
@ -57,7 +57,10 @@ class Parser:
            divs = []  # in ssj, there are divs, in Kres, there are separate files
            if "id" in root.keys():
                # Kres files start with <TEI id=...>
-                guess_corpus = "KRES"
+                if root.get("id")[0:2] == 'GF':
+                    guess_corpus = "GIGA"
+                else:
+                    guess_corpus = "KRES"
                divs = [root]
            else:
                guess_corpus = "SSJ"
@ -65,7 +68,10 @@ class Parser:

            # parse divs
            for div in divs:
-                f_id = div.get("id")
+                f_id = div.get("id")[:-6]
+
+                if guess_corpus == "GIGA":
+                    div = div.findall(".//body")[0]

                # parse paragraphs
                for p in div.findall(".//p"):
@ -75,46 +81,62 @@ class Parser:
                    for s in p.findall(".//s"):
                        s_id = s.get("id").split(".")[-1]
                        sentence_text = ""
+                        sentence_list = []
                        sentence_tokens = []

                        # parse tokens
                        for el in s.iter():
                            if el.tag in self.W_TAGS:
-                                el_id = el.get("id").split(".")[-1]
-                                if el_id[0] == 't':
-                                    el_id = el_id[1:]  # ssj W_TAG ids start with t
-                                sentence_text += el.text
-                                sentence_tokens += [(
-                                    "w",
-                                    int(el_id),
-                                    el.text,
-                                    el.get("lemma"),
-                                    (el.get("msd") if guess_corpus == "KRES"
-                                        else el.get("ana").split(":")[-1]),
-                                )]
+                                if guess_corpus != "GIGA":
+                                    el_id = el.get("id").split(".")[-1]
+                                    if el_id[0] == 't':
+                                        el_id = el_id[1:]  # ssj W_TAG ids start with t
+                                    sentence_text += el.text
+                                    sentence_tokens += [(
+                                        "w",
+                                        int(el_id),
+                                        el.text,
+                                        el.get("lemma"),
+                                        (el.get("msd") if guess_corpus == "KRES" or guess_corpus == "GIGA"
+                                         else el.get("ana").split(":")[-1]),
+                                    )]
+                                else:
+                                    sentence_list.append(el.text)
                            elif el.tag in self.C_TAGS:
                                # only Kres' C_TAGS have ids
-                                el_id = el.get("id") or "none"
-                                el_id = el_id.split(".")[-1]
-                                sentence_text += el.text
-                                sentence_tokens += [("c", el_id, el.text,)]
+                                if guess_corpus != "GIGA":
+                                    el_id = el.get("id") or "none"
+                                    el_id = el_id.split(".")[-1]
+                                    sentence_text += el.text
+                                    sentence_tokens += [("c", el_id, el.text,)]
                            elif el.tag in self.S_TAGS:
                                # Kres' <S /> doesn't contain .text
-                                sentence_text += " "
+                                if guess_corpus == "GIGA":
+                                    sentence_list.append(el.text)
+                                else:
+                                    sentence_text += " "
                            else:
                                # pass links and linkGroups
                                pass
                        sentence_id = "{}.{}.{}".format(f_id, p_id, s_id)
                        if sentence_id in res_dict:
                            raise KeyError("duplicated id: {}".format(sentence_id))
-                        res_dict[sentence_id] = {
-                            "sid": sentence_id,
-                            "text": sentence_text,
-                            "tokens": sentence_tokens,
-                            "links": (
-                                parse_links(s) if guess_corpus == "KRES" else None
-                            )
-                        }
+                        if guess_corpus == "GIGA":
+                            res_dict[sentence_id] = {
+                                "sid": sentence_id,
+                                "text": ' '.join(sentence_list),
+                                "tokens": None,
+                                "links": None
+                            }
+                        else:
+                            res_dict[sentence_id] = {
+                                "sid": sentence_id,
+                                "text": sentence_text,
+                                "tokens": sentence_tokens,
+                                "links": (
+                                    parse_links(s) if guess_corpus == "KRES" else None
+                                )
+                            }
        fp.close()
        return res_dict

@ -123,7 +145,7 @@ class Parser:

        def fillpred(tsv_row):
            mrow = build_model_row(tsv_row)
-            x = mrow[:-1] 
+            x = mrow[:-1]
            y = self.fillpred_model.predict([x])
            return y[0]  # bool

--- a/tools/srl-20131216/scripts/parse_srl_only_mod.sh
+++ b/tools/srl-20131216/scripts/parse_srl_only_mod.sh
@ -34,7 +34,8 @@ JVM_ARGS="-cp $CP -Xmx$MEM"
 NOPI="-nopi" #Uncomment this if you want to skip the predicate identification step. This setting is equivalent to the CoNLL 2009 ST.


-CMD="$JAVA $JVM_ARGS se.lth.cs.srl.Parse $Lang $INPUT $MODEL $RERANKER $NOPI $OUTPUT"
-echo "Executing: $CMD"
+$JAVA $JVM_ARGS se.lth.cs.srl.Parse $Lang "$INPUT" $MODEL $RERANKER $NOPI "$OUTPUT"
+# CMD="$JAVA $JVM_ARGS se.lth.cs.srl.Parse $Lang '$INPUT' $MODEL $RERANKER $NOPI '$OUTPUT'"
+# echo "Executing: $CMD"

-$CMD
+# $CMD
--- a/tools/srl-20131216/tag_all.gigafida.sh
+++ b/tools/srl-20131216/tag_all.gigafida.sh
@ -0,0 +1,29 @@
+#!/bin/bash
+
+# Runs the mate-tools SRL tagger on every file of the giga_tsv folder and
+# stores the result as <name>.srl.tsv in the giga_srl folder.
+# Both folders are read from ../tools.cfg and may contain spaces, so every
+# expansion below is quoted.
+
+# parsing tools.cfg values
+IN_FOLDER="$(sed -n -e 's/^\s*giga_tsv\s*=\s*//p' ../tools.cfg)"
+echo "input folder: $IN_FOLDER"
+OUT_FOLDER="$(sed -n -e 's/^\s*giga_srl\s*=\s*//p' ../tools.cfg)"
+echo "output folder: $OUT_FOLDER"
+
+SUFFIX="srl.tsv"
+
+mkdir -p "$OUT_FOLDER"
+# The * has to stay outside the quotes, otherwise the shell passes the
+# literal pattern to rm and no earlier result is removed.
+rm "$OUT_FOLDER"/*"${SUFFIX}" &> /dev/null
+
+# Same here: a quoted glob would run the loop once with the unexpanded pattern.
+for infile in "$IN_FOLDER"/*; do
+	# an empty input folder leaves the pattern unexpanded
+	[ -e "$infile" ] || continue
+	echo "Tagging: ${infile}"
+	base=$(basename "$infile" | cut -d'.' -f1)
+	outfile="${OUT_FOLDER}/${base}.${SUFFIX}"
+
+	# mate-tools tagger
+	if ./scripts/parse_srl_only_mod.sh "$infile" "$outfile"; then
+		echo "Saved as ${outfile}"
+	else
+		echo "ERR"
+		exit 1
+	fi
+done
+
--- a/tools/srl-20131216/tag_all.kres.sh
+++ b/tools/srl-20131216/tag_all.kres.sh
@ -0,0 +1,29 @@
+#!/bin/bash
+
+# Runs the mate-tools SRL tagger on every file of the kres_tsv folder and
+# stores the result as <name>.srl.tsv in the kres_srl folder.
+# Both folders are read from ../tools.cfg (relative to the tools directory,
+# hence the "../" prefix). All expansions are quoted so that paths with
+# spaces do not get split into several words.
+
+# parsing tools.cfg values
+IN_FOLDER="../$(sed -n -e 's/^\s*kres_tsv\s*=\s*//p' ../tools.cfg)"
+echo "input folder: $IN_FOLDER"
+OUT_FOLDER="../$(sed -n -e 's/^\s*kres_srl\s*=\s*//p' ../tools.cfg)"
+echo "output folder: $OUT_FOLDER"
+
+SUFFIX="srl.tsv"
+
+mkdir -p "$OUT_FOLDER"
+# drop the results of an earlier run; the glob stays outside the quotes
+rm "$OUT_FOLDER"/*"${SUFFIX}" &> /dev/null
+
+for infile in "$IN_FOLDER"/*; do
+	# an empty input folder leaves the pattern unexpanded
+	[ -e "$infile" ] || continue
+	echo "Tagging: ${infile}"
+	base=$(basename "$infile" | cut -d'.' -f1)
+	outfile="${OUT_FOLDER}/${base}.${SUFFIX}"
+
+	# mate-tools tagger
+	if ./scripts/parse_srl_only_mod.sh "$infile" "$outfile"; then
+		echo "Saved as ${outfile}"
+	else
+		echo "ERR"
+		exit 1
+	fi
+done
+
--- a/tools/srl-20131216/tag_all.sh
+++ b/tools/srl-20131216/tag_all.sh
@ -1,15 +1,16 @@
 #!/bin/bash

 # parsing tools.cfg values
-IN_FOLDER="../$(sed -n -e 's/^\s*kres_tsv\s*=\s*//p' ../tools.cfg)"
+IN_FOLDER="../$(sed -n -e 's/^\s*giga_tsv\s*=\s*//p' ../tools.cfg.kres_new)"
+IN_FOLDER=$IN_FOLDER$1
 echo "input folder: $IN_FOLDER"
-OUT_FOLDER="../$(sed -n -e 's/^\s*kres_srl\s*=\s*//p' ../tools.cfg)"
+OUT_FOLDER="../$(sed -n -e 's/^\s*giga_srl\s*=\s*//p' ../tools.cfg.kres_new)"
 echo "output folder: $OUT_FOLDER"

 SUFFIX="srl.tsv"

 mkdir -p $OUT_FOLDER
-rm $OUT_FOLDER/*${SUFFIX} &> /dev/null
+# rm $OUT_FOLDER/*${SUFFIX} &> /dev/null

 for infile in $IN_FOLDER/*; do
 	echo "Tagging: ${infile}"
--- a/tools/tools.cfg
+++ b/tools/tools.cfg
@ -1,8 +1,18 @@
 [tools]
-kres_orig = /kres_mount/kres_parsed/tei
-kres_tsv = ../data/kres_out/1_tsv
-kres_srl = ../data/kres_out/2_srl
-kres_json = ../data/kres_out/final_json
+giga = /media/luka/Portable Disk/Datasets/gigafida_jos/gf2_orig
+giga_orig = /media/luka/Portable Disk/Datasets/gigafida_jos/gf2-dedup.patch0001
+; giga_orig_old = /media/luka/Portable Disk/Datasets/gigafida_jos/gf2-dedup
+giga_jos = /media/luka/Portable Disk/Datasets/gigafida_jos/gf2-dedup.jos.patch0001
+giga_tsv = /media/luka/Portable Disk/Datasets/gigafida_jos/gf_files_part
+; giga_tsv = /media/luka/Portable Disk/Datasets/gigafida_jos/TEMP
+; giga_tsv = /media/luka/Portable Disk/Datasets/gigafida_jos/gf2_files_copy
+; giga_srl = /media/luka/Portable Disk/Datasets/gigafida_jos/TEMP
+giga_srl = /media/luka/Portable Disk/Datasets/gigafida_jos/2_srl
+giga_srl_errors = /media/luka/Portable Disk/Datasets/gigafida_jos/2_srl_errors/giga_errors.srl.tsv
+; giga_json = /media/luka/Portable Disk/Datasets/gigafida_jos/final_json_TEMP
+giga_json = /media/luka/Portable Disk/Datasets/gigafida_jos/final_json
+internal_data = /media/luka/Portable Disk/Datasets/gigafida_jos/internal_data
+giga_parts = 100000
 logfile = ../progress.log
-cpu_cores = 5
+cpu_cores = 16
 debug = False
--- a/tools/tools.cfg.gigafida
+++ b/tools/tools.cfg.gigafida
@ -0,0 +1,16 @@
+[tools]
+giga = /media/luka/Portable Disk/Datasets/gigafida_jos/gf2_orig
+giga_orig = /media/luka/Portable Disk/Datasets/gigafida_jos/gf2-dedup.patch0001
+; giga_orig_old = /media/luka/Portable Disk/Datasets/gigafida_jos/gf2-dedup
+giga_jos = /media/luka/Portable Disk/Datasets/gigafida_jos/gf2-dedup.jos.patch0001
+giga_tsv = /media/luka/Portable Disk/Datasets/gigafida_jos/gf_files_part
+; giga_tsv = /media/luka/Portable Disk/Datasets/gigafida_jos/TEMP
+; giga_tsv = /media/luka/Portable Disk/Datasets/gigafida_jos/gf2_files_copy
+; giga_srl = /media/luka/Portable Disk/Datasets/gigafida_jos/TEMP
+giga_srl = /media/luka/Portable Disk/Datasets/gigafida_jos/2_srl
+giga_json = /media/luka/Portable Disk/Datasets/gigafida_jos/final_json
+internal_data = /media/luka/Portable Disk/Datasets/gigafida_jos/internal_data
+giga_parts = 100000
+logfile = ../progress.log
+cpu_cores = 1
+debug = False
--- a/tools/tools.cfg.kres
+++ b/tools/tools.cfg.kres
@ -0,0 +1,8 @@
+[tools]
+kres_orig = /home/luka/Development/srl/data/kres_parsed/tei
+kres_tsv = ../data/kres_out/1_tsv
+kres_srl = ../data/kres_out/2_srl
+kres_json = ../data/kres_out/final_json
+logfile = ../progress.log
+cpu_cores = 5
+debug = False
--- a/tools/tools.cfg.kres_new
+++ b/tools/tools.cfg.kres_new
@ -0,0 +1,8 @@
+[tools]
+kres_orig = /home/luka/Development/srl/data/kres_parsed/tei
+giga_tsv = ../data/giga_out/1_tsv
+giga_srl = ../data/giga_out/2_srl
+kres_json = ../data/giga_out/final_json
+logfile = ../progress.log
+cpu_cores = 5
+debug = False