Big changes

18 changed files with 1384 additions and 53 deletions
--- a/.gitignore
+++ b/.gitignore
@ -6,3 +6,6 @@ nohup.out
 data/kres_out/*
 data/kres_example/
 venv/
 .idea/
 data/
--- a/dockerfiles/python-java/Makefile
+++ b/dockerfiles/python-java/Makefile
@ -15,6 +15,6 @@ run:
 	-v /etc/group:/etc/group \
    	-v $(shell pwd)/../../:/cjvt-srl-tagging \
 	-w /cjvt-srl-tagging \
-	-v /home/kristjan/kres_mount:/kres_mount:ro \
+	-v /home/luka/Development/srl/data:/kres_mount:ro \
    python-java \
    /bin/bash
--- a/tools/check_all_files_existence.py
+++ b/tools/check_all_files_existence.py
@ -0,0 +1,19 @@
 import os
 from shutil import copyfile
 # All three locations are hard-coded absolute paths.
 # SOURCEPATH holds the input chunks, named 'giga.NNNNNNN.tsv'.
 SOURCEPATH = '/media/luka/Portable Disk/Datasets/gigafida_jos/gf2_files'
 # INPATH is where finished SRL output 'gigaNNNNNNN.srl.tsv' is looked up.
 INPATH = '/media/luka/Portable Disk/Datasets/gigafida_jos/2_srl'
 # OUTPATH receives a copy of every chunk that has no SRL output yet.
 OUTPATH = '/home/luka/Development/srl/cjvt-srl-tagging/data/giga_out/1_tsv'
 for chunk_no in range(100000):
    srl_name = 'giga%07d.srl.tsv' % chunk_no
    srl_missing = not os.path.exists(os.path.join(INPATH, srl_name))
    if srl_missing:
        # Note the naming difference: the source has a dot after 'giga',
        # the copy does not.
        source_chunk = os.path.join(SOURCEPATH, 'giga.%07d.tsv' % chunk_no)
        target_chunk = os.path.join(OUTPATH, 'giga%07d.tsv' % chunk_no)
        copyfile(source_chunk, target_chunk)
        print(srl_name)
    # progress marker every 1000 chunks
    if chunk_no % 1000 == 0:
        print(chunk_no)
--- a/tools/fillpred_model.srl.tsv
+++ b/tools/fillpred_model.srl.tsv
--- a/tools/find_diff_sentence_ids.py
+++ b/tools/find_diff_sentence_ids.py
@ -0,0 +1,192 @@
 import pickle
 from parser.parser import Parser
 import os
 from os.path import join, dirname
 from pathlib import Path
 import re
 import sys
 import cProfile
 import configparser
 import logging
 from multiprocessing import Pool
 # Module-level setup: read tools.cfg, configure logging and collect the
 # original corpus files. Statement order matters, later lines use the
 # globals defined here.
 SSJ500K_2_1 = 27829  # number of sentences
 # Shared parser instance; the driver code below calls par.parse_tei().
 par = Parser()
 # path to data
 config = configparser.ConfigParser()
 config.read("tools.cfg")
 # `analysis` records which corpus section was found in the config.
 analysis = ''
 if 'kres_orig' in config["tools"]:
    # NOTE(review): this branch does not define INDIR_GIGA_ORIG, INDIR_GIGA
    # or INTERNAL_DATA, all of which are used further down in this script,
    # so a config containing 'kres_orig' ends in a NameError at the
    # os.walk() call below.
    analysis = 'kres'
    INDIR = Path(config["tools"]["kres_orig"])
    OUTDIR = Path(config["tools"]["kres_tsv"])
 elif 'giga_orig' in config["tools"]:
    # analysis = 'gigafida'
    analysis = 'giga'
    # INDIR_GIGA is opened as a single text file by
    # giga_orig_sentence_generator(); INDIR_GIGA_ORIG is walked as a
    # directory tree below.
    INDIR_GIGA = Path(config["tools"]["giga_orig"])
    INDIR_GIGA_ORIG = Path(config["tools"]["giga"])
    INDIR_JOS = Path(config["tools"]["giga_jos"])
    OUTDIR = Path(config["tools"]["giga_tsv"])
    GIGA_PARTS = int(config["tools"]["giga_parts"])
    # Kept as a plain string here (not wrapped in Path).
    INTERNAL_DATA = config["tools"]["internal_data"]
 CPU_CORES = int(config["tools"]["cpu_cores"])
 LOGFILE = Path(config["tools"]["logfile"]).absolute()
 LOGFILE.touch(exist_ok=True)
 # The return value of resolve() is discarded, so this line does not
 # change LOGFILE.
 LOGFILE.resolve()
 logging.basicConfig(filename=str(LOGFILE), level=logging.INFO)
 # Collect every file below INDIR_GIGA_ORIG, sort the paths and number them:
 # origfiles becomes a list of (index, Path) tuples.
 origfiles = []
 for subdir, dirs, files in os.walk(INDIR_GIGA_ORIG):
    for file in files:
        origfiles.append(Path(os.path.join(subdir, file)))
 origfiles=list(enumerate(sorted(origfiles)))
 def giga_orig_sentence_generator():
    """Yield the sentences of the INDIR_GIGA file as space-joined strings.

    The file is read line by line. A line consisting only of a newline ends
    the current sentence; for every other line the text before the first
    tab is taken as one word of the sentence.

    Yields:
        str: the words of one sentence joined by single spaces. Consecutive
        blank lines produce empty strings, exactly as before.
    """
    sentence_words = []
    with open(INDIR_GIGA, 'r') as gof:
        for l_gof in gof:
            if l_gof == '\n':
                yield ' '.join(sentence_words)
                sentence_words = []
            else:
                sentence_words.append(l_gof.split('\t')[0])
    # A file that does not end with a blank line still holds one pending
    # sentence; emit it instead of silently dropping it.
    if sentence_words:
        yield ' '.join(sentence_words)
 # Walk the parsed TEI files and the flat giga file in lockstep: the n-th
 # parsed sentence is compared with the n-th sentence of the generator.
 sentence_generator = giga_orig_sentence_generator()
 sentence_ids = []
 for file_no, tei_path in origfiles:
    for _key, parsed in par.parse_tei(tei_path).items():
        flat_sentence = next(sentence_generator)
        if flat_sentence != parsed['text']:
            # Report the mismatch; the id is not recorded in this case.
            print('----------------')
            print('ERROR')
            print(parsed['sid'])
            print(flat_sentence)
            print(parsed['text'])
            continue
        sentence_ids.append(parsed['sid'])
    # progress: index of the file that was just finished
    print(file_no)
 # Store the matched sentence ids, replacing any earlier pickle.
 sentence_ids_pkl = os.path.join(INTERNAL_DATA, 'sentence_ids_list.pkl')
 if os.path.exists(sentence_ids_pkl):
    os.remove(sentence_ids_pkl)
 with open(sentence_ids_pkl, 'wb') as output:
    pickle.dump(sentence_ids, output)
 # def giga_orig_generator():
 #     with open(INDIR_GIGA, 'r') as gof:
 #         previous_new_line = False
 #         for l_gof in gof:
 #             if l_gof == '\n':
 #                 if previous_new_line:
 #                     continue
 #                 previous_new_line = True
 #             elif previous_new_line:
 #                 previous_new_line = False
 #             yield l_gof
 # import  time
 # def handle_giga_file(ran):
 #     """
 #     File that splits big text file into more minor files. Only split on empty lines.
 #     """
 #     # with open(INDIR_GIGA, 'r') as gof:
 #     #     with open(INDIR_JOS, 'r') as gjf:
 #     #         for i, (l_gof, l_gjf) in enumerate(zip(gof, gjf)):
 #     #             pass
 #     #     num_lines = i + 1
 #     # print(num_lines)
 #     num_lines = 1393184026
 #     # 1393184026
 #     # 1393184033
 #     # return
 #     num_lines_per_part = num_lines / GIGA_PARTS
 #     curr_part = 0
 #     gof_generator = giga_orig_generator()
 #
 #     diff_files = set()
 #     # with open(INDIR_GIGA, 'r') as gof:
 #     with open(INDIR_GIGA_OLD, 'r') as gjf:
 #         # sentence = {}
 #         # sentence['tokens'] = []
 #         # sentence['links'] = {}
 #         # if os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % ran[0])):
 #         #     os.remove(os.path.join(OUTDIR, 'giga%07d.tsv' % ran[0]))
 #
 #         # wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % ran[0]), 'a')
 #
 #         # for i, (l_gof, l_gjf) in enumerate(zip(gof, gjf)):
 #         for i, l_gjf in enumerate(gjf):
 #             l_gof = next(gof_generator)
 #             if curr_part < ran[0]:
 #                 if i > num_lines_per_part * (curr_part + 1) and l_gof == '\n' :
 #                     if curr_part < ran[0]:
 #                         print(curr_part)
 #                         curr_part += 1
 #                         continue
 #                 else:
 #                     continue
 #
 #             l_gof_split = l_gof.split('\t')
 #             l_gjf_split = l_gjf.split('\t')
 #
 #             # if punctuation
 #             if l_gof != '\n':
 #                 if l_gof_split != l_gjf_split:
 #                     print(curr_part)
 #                     diff_files.add(curr_part)
 #                     l_gof = next(gof_generator)
 #
 #
 #             # if l_gof == '\n':
 #             else:
 #             # wf.flush()
 #             # if i > num_lines_per_part * (curr_part + 1) and l_gof == '\n':
 #                 if i > num_lines_per_part * (curr_part + 1):
 #                     curr_part += 1
 #                     # if wf doesn't exist (first one)
 #                     # wf.close()
 #                     if curr_part >= ran[1]:
 #                         break
 #                     # if os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part)):
 #                     #     os.remove(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part))
 #
 #                     # wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part), 'a')
 #
 #         curr_part += 1
 #     return diff_files
 #         # wf.close()
 #
 # with Pool(CPU_CORES) as p:
 #     final_range = [0, 100000]
 #     # final_range = [0, 150]
 #     # size_per_proc = (final_range[1] - final_range[0]) / CPU_CORES
 #     # # splits = [int(final_range[0] + size_per_proc) for i in range(CPU_CORES)]
 #     # ranges = []
 #     # ps = None
 #     # for i in range(CPU_CORES):
 #     #     s = int(final_range[0] + size_per_proc * i)
 #     #     ns = int(final_range[0] + size_per_proc * (i + 1))
 #     #     ranges.append([s, ns])
 #     # # ranges = [[0, 1]]
 #     # res = p.map(handle_giga_file, ranges)
 #
 #     res = handle_giga_file(final_range)
 #     res = sorted(list(res))
 #     if os.path.exists(os.path.join(INTERNAL_DATA, 'diffs_updated_gigafida.pkl')):
 #         os.remove(os.path.join(INTERNAL_DATA, 'diffs_updated_gigafida.pkl'))
 #     with open(os.path.join(INTERNAL_DATA, 'diffs_updated_gigafida.pkl'), 'wb') as pkl_file:
 #         pickle.dump(res, pkl_file)
 #     # with open(os.path.join(INTERNAL_DATA, 'diffs_updated_gigafida.pkl'), 'rb') as pkl_file:
 #     #     mydict2 = pickle.load(pkl_file)
 #     print('test')
--- a/tools/gen_json.kres.py
+++ b/tools/gen_json.kres.py
@ -0,0 +1,114 @@
 from pathlib import Path
 from parser.parser import Parser
 import configparser
 import json
 import sys
 import logging
 from multiprocessing import Pool
 # parse config
 # All settings come from the [tools] section of tools.cfg, read relative
 # to the current working directory.
 config = configparser.ConfigParser()
 config.read("tools.cfg")
 # NOTE(review): ORIGPATH is commented out here, but get_origfile() below
 # still reads it, so that function raises NameError as the file stands.
 # ORIGPATH = Path(config["tools"]["kres_orig"])
 # NOTE(review): input is taken from the "giga_srl" key while output goes to
 # "kres_json" - confirm this mix is intended.
 INPATH = Path(config["tools"]["giga_srl"])
 OUTPATH = Path(config["tools"]["kres_json"])
 # Only the exact string "True" enables debug printing.
 DEBUG = config["tools"]["debug"] == "True"
 CPU_CORES = int(config["tools"]["cpu_cores"])
 LOGFILE = Path(config["tools"]["logfile"]).absolute()
 LOGFILE.touch(exist_ok=True)
 # The result of resolve() is not stored, so LOGFILE is unchanged by it.
 LOGFILE.resolve()
 logging.basicConfig(filename=str(LOGFILE), level=logging.INFO)
 def get_origfile(filename):
    """Return the file in ORIGPATH whose name matches `filename`.

    Two names match when the part before their first '.' is equal.

    Args:
        filename: path-like object with a `.name` attribute.

    Returns:
        The first matching entry yielded by ORIGPATH.iterdir().

    Raises:
        FileNotFoundError: no entry in ORIGPATH has the same name prefix.
    """
    # The wanted prefix does not change while scanning; compute it once.
    wanted_prefix = filename.name.split('.')[0]
    for origfile in ORIGPATH.iterdir():
        if origfile.name.split('.')[0] == wanted_prefix:
            return origfile
    raise FileNotFoundError(
        "no original file with prefix {!r} found in {}".format(wanted_prefix, ORIGPATH))
 def extract_sentences(line_reader):
    """Group raw TSV lines into sentences.

    Args:
        line_reader: iterable of UTF-8 encoded `bytes` lines.

    Yields:
        list[list[str]]: one sentence per item, each token being the list
        of tab-separated fields of its line. A line without any tab acts as
        a sentence separator; consecutive separators yield empty lists.
        A final sentence that is not followed by a separator is yielded as
        well.
    """
    acc = []
    for raw_line in line_reader:
        # Remove only the line terminator. Slicing with [:-1] also removed
        # the last real character of a final line without a newline.
        line = raw_line.decode("utf-8").rstrip("\n").split('\t')
        if len(line) == 1:  # empty line
            tmp = acc
            acc = []
            yield tmp
        else:
            acc.append(line)
    # Do not lose the last sentence when the input lacks a closing blank line.
    if acc:
        yield acc
 def to_sentence(sentence_arr):
    """Return the second field of every token, joined by single spaces."""
    words = []
    for token in sentence_arr:
        words.append(token[1])
    return " ".join(words)
 def match_sentence_id(sentence, orig_dict):
    """Find the key of the entry in `orig_dict` whose text equals `sentence`.

    The text of an entry is rebuilt by joining element 2 of each item in
    its "tokens" list with spaces. The first matching key wins.

    Raises:
        KeyError: no entry matches.
    """
    for sid, entry in orig_dict.items():
        candidate = " ".join(tok[2] for tok in entry["tokens"])
        if candidate != sentence:
            continue
        return sid
    raise KeyError
 def get_dep_rel(token):
    """Build the relation dict for the first filled column from index 14 on.

    Columns equal to "_" are skipped. Only the first other column is
    reported; later ones on the same token are ignored.

    Returns:
        dict with "arg" (the column value), "from" (its offset counted from
        column 14) and "dep" (token[0]), or None when all columns are "_".
    """
    logging.debug(token)
    for offset, label in enumerate(token[14:]):
        if label == "_":
            continue
        # offset selects the i-th predicate in the sentence
        return {"arg": label, "from": offset, "dep": token[0]}
    return None
 def handle_file(infile_tpl):
    """Convert one SRL tsv file into a JSON file of relations per sentence.

    Args:
        infile_tpl: (index, path) pair; only the path is used.

    The output is written to OUTPATH under the input's name with a ".json"
    suffix. It maps each sentence id to a list of relation dicts as built
    by get_dep_rel(), with "from" replaced by a predicate's token index.
    """
    infile = infile_tpl[1]
    outfile = (OUTPATH / infile.name).with_suffix(".json")
    orig_dict = par.parse_tei(get_origfile(infile))
    outdata = {}
    with infile.open("rb") as fp:
        for sentence_arr in extract_sentences(fp.readlines()):
            # The tsv has no sentence ids; recover the id by comparing the
            # sentence text with the original data.
            sid = match_sentence_id(to_sentence(sentence_arr), orig_dict)
            relations = []
            outdata[sid] = relations
            # token[0] of every token whose column 12 is "Y" (a predicate)
            predicate_ids = []
            for token in sentence_arr:
                if token[12] == "Y":
                    predicate_ids.append(token[0])
                relation = get_dep_rel(token)
                if relation is not None:
                    relations.append(relation)
            # "from" so far holds n (the n-th predicate); swap in that
            # predicate's token index.
            for relation in relations:
                relation["from"] = predicate_ids[relation["from"]]
            if DEBUG:
                print(to_sentence(sentence_arr))
                print(relations)
                print(sid)
                print()
                print()
    with outfile.open("w") as fp:
        json.dump(outdata, fp)
        logging.info("SRL relations written to: {}".format(outfile))
 # main
 # `par` is created at module level because handle_file() reads it as a
 # global; it has to exist before the pool below starts mapping.
 par = Parser()
 # Make sure the output directory exists (no error if it already does).
 OUTPATH.mkdir(exist_ok=True)
 # Number the regular files found directly in INPATH: (index, path) tuples.
 infiles = list(enumerate([x for x in INPATH.iterdir() if x.is_file()]))
 logging.info("Generating JSON SRL files from {} tsv files.".format(len(infiles)))
 # Run handle_file once per input file on a pool of CPU_CORES workers.
 with Pool(CPU_CORES) as p:
    p.map(handle_file, infiles)
 logging.info("Finished generating .json files.")
--- a/tools/gen_json.py
+++ b/tools/gen_json.py
@ -1,3 +1,8 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 import os
 import pickle
 from pathlib import Path
 from parser.parser import Parser
 import configparser
@ -9,9 +14,10 @@ from multiprocessing import Pool
 # parse config
 config = configparser.ConfigParser()
 config.read("tools.cfg")
-ORIGPATH = Path(config["tools"]["kres_orig"])
+ORIGPATH = Path(config["tools"]["giga"])
-INPATH = Path(config["tools"]["kres_srl"])
+INPATH = Path(config["tools"]["giga_srl"])
-OUTPATH = Path(config["tools"]["kres_json"])
+OUTPATH = Path(config["tools"]["giga_json"])
 INTERNAL_DATA = Path(config["tools"]["internal_data"])
 DEBUG = config["tools"]["debug"] == "True"
 CPU_CORES = int(config["tools"]["cpu_cores"])
@ -48,6 +54,13 @@ def match_sentence_id(sentence, orig_dict):
            return k
    raise KeyError
 def match_sentence_id_giga(sentence, orig_dict):
    """Return the first key of `orig_dict` whose entry["text"] equals `sentence`.

    Raises:
        KeyError: no entry carries that text.
    """
    for sid, entry in orig_dict.items():
        if entry["text"] != sentence:
            continue
        return sid
    raise KeyError
 def get_dep_rel(token):
    logging.debug(token)
    for i, field in enumerate(token[14:]):
@ -59,7 +72,7 @@ def get_dep_rel(token):
            }
    return None
-def handle_file(infile_tpl):
+def handle_file_old(infile_tpl):
    i = infile_tpl[0]
    infile = infile_tpl[1]
    outfile = (OUTPATH / infile.name).with_suffix(".json")
@ -101,14 +114,283 @@ def handle_file(infile_tpl):
        logging.info("SRL relations written to: {}".format(outfile))
 def handle_file(whole_input):
    """Write the JSON relations file for one original (TEI) file.

    Args:
        whole_input: one entry of `orig_file_sizes`. Element 1 is the path
            of the original file and element 3 is the running sentence
            total before this file, i.e. the global index of its first
            sentence (see the module-level counting code).

    Nothing is done when the output file already exists. Otherwise every
    sentence of the original file is searched for in the stream of SRL
    sentences, and the resulting mapping of sentence id to relation list is
    dumped to OUTPATH/<original name>.json.
    """
    sentence_id = whole_input[3]
    orig_infile = whole_input[1]
    outfile = (OUTPATH / orig_infile.name).with_suffix(".json")
    # Skip files that were already converted in an earlier run.
    if outfile.exists():
        return
    orig_dict = par.parse_tei(orig_infile)
    outdata = {}
    # The generator itself starts 10 sentences before sentence_id.
    gen = srl_multiple_files_sentences_generator(sentence_id)
    mismatch_sentences = 0
    for sentence_i, (orig_id, orig_val) in enumerate(orig_dict.items()):
        # NOTE(review): looks like a leftover debugging hook for one
        # specific sentence id.
        if orig_id == 'GF0014802.2685.7':
            print('PAUSE')
        # Pull up to 100 SRL sentences while looking for the one whose text
        # equals the current original sentence.
        for i in range(100):
            # NOTE(review): the generators end by yielding None; unpacking
            # that None here would raise TypeError.
            sentence, sentence_arr = next(gen)
            if sentence == orig_val["text"]:
                sid = orig_id
                outdata[sid] = []
                # find all predicate indices in the sentence
                predicates = []
                for token in sentence_arr:
                    if token[12] == "Y":
                        predicates += [token[0]]  # idx
                    deprel = get_dep_rel(token)
                    if deprel is not None:
                        outdata[sid].append(deprel)
                # deprel["from"] points to n-th predicate
                # replace with predicate's token index
                for deprel in outdata[sid]:
                    deprel["from"] = predicates[deprel["from"]]
                if DEBUG:
                    print(to_sentence(sentence_arr))
                    print(outdata[sid])
                    print(sid)
                    print()
                    print()
                break
            else:
                # Last attempt failed: record an empty relation list for
                # this sentence and restart the SRL stream near the
                # position of the current original sentence.
                if i == 99:
                    mismatch_sentences += 1
                    sid = orig_id
                    outdata[sid] = []
                    gen = srl_multiple_files_sentences_generator(sentence_id + sentence_i)
    # Report how many sentences could not be matched; below 10 % of the
    # file counts as a slight mismatch.
    if mismatch_sentences > 0:
        if mismatch_sentences / len(orig_dict.items()) < 0.1:
            print('Slight mismatch - %d' % sentence_id)
            print(whole_input)
            print('ABS mitigated %d' % mismatch_sentences)
            print('------------------------------------------------')
        else:
            print('ERRRRRRRRRRRRRRRROOOOOOORRRRRRRRRRR')
            print('Big mismatch - %d' % sentence_id)
            print(whole_input)
            print('ABS mitigated errors:')
            print(mismatch_sentences)
            print('------------------------------------------------')
    with outfile.open("w") as fp:
        json.dump(outdata, fp)
        logging.info("SRL relations written to: {}".format(outfile))
 def count_orig_file_sentences(filename):
    """Pickle (index, path, sentence count) for one original file.

    Args:
        filename: (index, path) pair.

    The result goes to INTERNAL_DATA/orig_chunks/<file name>. If that file
    already exists the function returns without parsing anything.
    """
    chunk_path = os.path.join(INTERNAL_DATA, 'orig_chunks', filename[1].name)
    if os.path.exists(chunk_path):
        return
    print(filename[0])
    # The sentence count is the number of entries parse_tei() returns.
    sentence_count = len(par.parse_tei(filename[1]))
    with open(chunk_path, 'wb') as output:
        pickle.dump((filename[0], filename[1], sentence_count), output)
 def count_srl_file_sentences(filename):
    """Pickle (index, path, sentence count) for one SRL tsv file.

    Args:
        filename: (index, path) pair.

    Sentences are counted as the number of lines that consist of a newline
    only. The result goes to INTERNAL_DATA/srl_chunks/<file name>; an
    existing result file makes the function return immediately.
    """
    chunk_path = os.path.join(INTERNAL_DATA, 'srl_chunks', filename[1].name)
    if os.path.exists(chunk_path):
        return
    print(filename[0])
    with filename[1].open("r") as fp:
        num_sentences = sum(1 for line in fp if line == '\n')
    with open(chunk_path, 'wb') as output:
        pickle.dump((filename[0], filename[1], num_sentences), output)
 def srl_sentences_generator(infile, curr_index, sen_start_index):
    """Yield (text, tokens) for the sentences of one SRL file, then None.

    Args:
        infile: path object of the SRL file.
        curr_index: index assigned to the first sentence of the file.
        sen_start_index: sentences are skipped until curr_index reaches
            this value.

    The final None marks the end of the file for the caller.
    """
    with infile.open("rb") as fp:
        for sentence_arr in extract_sentences(fp.readlines()):
            if curr_index >= sen_start_index:
                yield to_sentence(sentence_arr), sentence_arr
            else:
                # still before the requested start: count and skip
                curr_index += 1
    yield None
 def srl_multiple_files_sentences_generator(sentence_id): # srl_files):
    """Yield SRL sentences across files, starting shortly before `sentence_id`.

    Args:
        sentence_id: global sentence index; the stream starts 10 sentences
            earlier (but never below 0).

    Yields:
        (text, tokens) pairs as produced by srl_sentences_generator(), and
        finally None once all files are exhausted.

    Entries of `srl_file_sizes` are read as: [1] path, [2] sentence count,
    [3] global index of the file's first sentence.
    """
    sentence_id = max(0, sentence_id - 10)
    # Default to "no files" so an index outside every file ends the stream
    # with the usual None marker instead of an UnboundLocalError.
    srl_files = []
    for i, srl_file in enumerate(srl_file_sizes):
        if srl_file[3] <= sentence_id < srl_file[3] + srl_file[2]:
            # start with the file containing the index, continue with the rest
            srl_files = srl_file_sizes[i:]
            break
    for file_info in srl_files:
        srl_gen = srl_sentences_generator(file_info[1], file_info[3], sentence_id)
        el = next(srl_gen)
        # None is the per-file end marker; move on to the next file
        while el is not None:
            yield el
            el = next(srl_gen)
    yield None
 # main
 par = Parser()
 OUTPATH.mkdir(exist_ok=True)
-infiles = list(enumerate([x for x in INPATH.iterdir() if x.is_file()]))
+infiles = list(enumerate([x for x in iter(sorted(INPATH.iterdir())) if x.is_file()]))
 logging.info("Generating JSON SRL files from {} tsv files.".format(len(infiles)))
 # Every file below ORIGPATH, sorted by path and numbered: (index, Path).
 origfiles = list(enumerate(sorted(
    Path(os.path.join(subdir, file))
    for subdir, dirs, files in os.walk(ORIGPATH)
    for file in files
 )))
 ##### REMOVE ############
 # origfiles = origfiles[:3]
 # count sentences in orig (if not counted before)
 # os.remove(os.path.join(INTERNAL_DATA, 'orig_counted_sentences.pkl'))
 # Build (or load) orig_file_sizes: a list of
 # (index, path, sentence_count, first_sentence_index) per original file.
 orig_counts_pkl = os.path.join(INTERNAL_DATA, 'orig_counted_sentences.pkl')
 if os.path.exists(orig_counts_pkl):
    with open(orig_counts_pkl, 'rb') as pkl_file:
        orig_file_sizes = pickle.load(pkl_file)
 else:
    orig_chunks_dir = os.path.join(INTERNAL_DATA, 'orig_chunks')
    if not os.path.exists(orig_chunks_dir):
        os.makedirs(orig_chunks_dir)
    # One small pickle per original file; existing ones are left alone.
    for indexed_origfile in origfiles:
        count_orig_file_sentences(indexed_origfile)
    orig_file_sizes = []
    for x in sorted(Path(orig_chunks_dir).iterdir()):
        print(x.name)
        if not x.is_file():
            continue
        with x.open('rb') as pkl_small_file:
            orig_file_sizes.append(pickle.load(pkl_small_file))
    print("Sorting orig files")
    orig_file_sizes.sort()
    print("Calculating orig files size")
    # Extend every entry with the number of sentences that precede it.
    total_size = 0
    orig_file_sizes_final = []
    for n, pa, si in orig_file_sizes:
        orig_file_sizes_final.append((n, pa, si, total_size))
        total_size += si
    orig_file_sizes = orig_file_sizes_final
    print("Saving orig files size")
    with open(orig_counts_pkl, 'wb') as output:
        pickle.dump(orig_file_sizes, output)
    print("Orig files saved")
 # count sentences in srl (if not counted before)
 # os.remove(os.path.join(INTERNAL_DATA, 'srl_counted_sentences.pkl'))
 # Build (or load) srl_file_sizes: a list of
 # (index, path, sentence_count, first_sentence_index) per SRL file.
 srl_counts_pkl = os.path.join(INTERNAL_DATA, 'srl_counted_sentences.pkl')
 if os.path.exists(srl_counts_pkl):
    with open(srl_counts_pkl, 'rb') as pkl_file:
        srl_file_sizes = pickle.load(pkl_file)
 else:
    srl_chunks_dir = os.path.join(INTERNAL_DATA, 'srl_chunks')
    if not os.path.exists(srl_chunks_dir):
        os.makedirs(srl_chunks_dir)
    # One small pickle per SRL file; existing ones are left alone.
    for indexed_infile in infiles:
        count_srl_file_sentences(indexed_infile)
    srl_file_sizes = []
    for x in sorted(Path(srl_chunks_dir).iterdir()):
        print(x.name)
        if not x.is_file():
            continue
        with x.open('rb') as pkl_small_file:
            srl_file_sizes.append(pickle.load(pkl_small_file))
    print("Sorting srl files")
    srl_file_sizes.sort()
    print("Calculating srl files size")
    # Extend every entry with the number of sentences that precede it.
    total_size = 0
    srl_file_sizes_final = []
    for n, pa, si in srl_file_sizes:
        srl_file_sizes_final.append((n, pa, si, total_size))
        total_size += si
    srl_file_sizes = srl_file_sizes_final
    print("Saving srl files size")
    with open(srl_counts_pkl, 'wb') as output:
        pickle.dump(srl_file_sizes, output)
    print("Srl files saved")
 # print(len(orig_file_sizes))
 # print('asd' + 2)
 # inputs = []
 # srl_i = 0
 # srl_file = srl_file_sizes[srl_i]
 # for orig_i, orig_path, orig_size, orig_first_sent_i in orig_file_sizes:
 #     interesting_srl_files = []
 #     # beginning of srl chunk in range of orig chunk or ending of srl chunk in range of orig chunk
 #     # while srl_file[3] >= orig_first_sent_i and srl_file[3] < orig_first_sent_i + orig_size or \
 #     #     srl_file[3] + srl_file[2] - 1 >= orig_first_sent_i and srl_file[3] + srl_file[2] - 1 < orig_first_sent_i + orig_size:
 #     while srl_file[3] < orig_first_sent_i + orig_size and srl_file[3] + srl_file[2] > orig_first_sent_i:
 #         # if beginning of file is in
 #         if srl_file[3] > orig_first_sent_i:
 #             interesting_srl_files.append((srl_file[0], srl_file[1], srl_file[2], srl_file[3], srl_file[3]))
 #             # print('if %d' % srl_file[3])
 #         else:
 #             interesting_srl_files.append((srl_file[0], srl_file[1], srl_file[2], srl_file[3], orig_first_sent_i))
 #             # print('else %d' % orig_first_sent_i)
 #
 #         if orig_first_sent_i + orig_size >= srl_file[3] + srl_file[2]:
 #             srl_i += 1
 #             if srl_i < len(srl_file_sizes):
 #                 srl_file = srl_file_sizes[srl_i]
 #             else:
 #                 break
 #                 # print(srl_i)
 #                 # print('a ' + 2)
 #         else:
 #             break
 #
 #     inputs.append([[orig_i, orig_path, orig_size, orig_first_sent_i], interesting_srl_files])
    # print(inputs[-1])
 # srl_gen = srl_sentences_generator(srl_file_sizes[0][1], 0, 533)
 # a = next(srl_gen)
 # b = next(srl_gen)
 # c = next(srl_gen)
 print('beginning processing')
 with Pool(CPU_CORES) as p:
-    p.map(handle_file, infiles)
+    # p.map(handle_file, inputs)
    p.map(handle_file, orig_file_sizes)
 # for of in orig_file_sizes:
 #     handle_file(of)
 logging.info("Finished generating .json files.")
--- a/tools/gen_json_fix_errors.py
+++ b/tools/gen_json_fix_errors.py
@ -0,0 +1,294 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 import os
 import pickle
 from pathlib import Path
 from parser.parser import Parser
 import configparser
 import json
 import sys
 import logging
 from multiprocessing import Pool
 # parse config
 # All settings come from the [tools] section of tools.cfg, read relative
 # to the current working directory.
 config = configparser.ConfigParser()
 config.read("tools.cfg")
 ORIGPATH = Path(config["tools"]["giga"])
 # INPATH is opened as a single file by srl_error_fix_generator().
 INPATH = Path(config["tools"]["giga_srl_errors"])
 OUTPATH = Path(config["tools"]["giga_json"])
 INTERNAL_DATA = Path(config["tools"]["internal_data"])
 # Only the exact string "True" enables debug printing.
 DEBUG = config["tools"]["debug"] == "True"
 CPU_CORES = int(config["tools"]["cpu_cores"])
 LOGFILE = Path(config["tools"]["logfile"]).absolute()
 LOGFILE.touch(exist_ok=True)
 # The result of resolve() is not stored, so LOGFILE is unchanged by it.
 LOGFILE.resolve()
 logging.basicConfig(filename=str(LOGFILE), level=logging.INFO)
 # One entry per line of the listing; each entry is later used as a sentence
 # id. The file is read inside a `with` block so the handle is closed
 # deterministically.
 with open(os.path.join(INTERNAL_DATA, 'sentences_with_less_than_token.txt')) as error_sentences_file:
    error_sentences = [line.rstrip('\n') for line in error_sentences_file]
 def get_origfile(filename):
    """Return the file in ORIGPATH whose name matches `filename`.

    Two names match when the part before their first '.' is equal.

    Args:
        filename: path-like object with a `.name` attribute.

    Returns:
        The first matching entry yielded by ORIGPATH.iterdir().

    Raises:
        FileNotFoundError: no entry in ORIGPATH has the same name prefix.
    """
    # The wanted prefix does not change while scanning; compute it once.
    wanted_prefix = filename.name.split('.')[0]
    for origfile in ORIGPATH.iterdir():
        if origfile.name.split('.')[0] == wanted_prefix:
            return origfile
    raise FileNotFoundError(
        "no original file with prefix {!r} found in {}".format(wanted_prefix, ORIGPATH))
 def extract_sentences(line_reader):
    """Group raw TSV lines into sentences.

    Args:
        line_reader: iterable of UTF-8 encoded `bytes` lines.

    Yields:
        list[list[str]]: one sentence per item, each token being the list
        of tab-separated fields of its line. A line without any tab acts as
        a sentence separator; consecutive separators yield empty lists.
        A final sentence that is not followed by a separator is yielded as
        well.
    """
    acc = []
    for raw_line in line_reader:
        # Remove only the line terminator. Slicing with [:-1] also removed
        # the last real character of a final line without a newline.
        line = raw_line.decode("utf-8").rstrip("\n").split('\t')
        if len(line) == 1:  # empty line
            tmp = acc
            acc = []
            yield tmp
        else:
            acc.append(line)
    # Do not lose the last sentence when the input lacks a closing blank line.
    if acc:
        yield acc
 def to_sentence(sentence_arr):
    """Join field 1 of each token in `sentence_arr` with single spaces."""
    word_forms = (token[1] for token in sentence_arr)
    return " ".join(word_forms)
 def match_sentence_id(sentence, orig_dict):
    """Find the key of the entry in `orig_dict` whose text equals `sentence`.

    The text of an entry is rebuilt by joining element 2 of each item in
    its "tokens" list with spaces. The first matching key wins.

    Raises:
        KeyError: no entry matches.
    """
    for sid, entry in orig_dict.items():
        rebuilt = " ".join(tok[2] for tok in entry["tokens"])
        if rebuilt != sentence:
            continue
        return sid
    raise KeyError
 def match_sentence_id_giga(sentence, orig_dict):
    """Return the first key of `orig_dict` whose entry["text"] equals `sentence`.

    Raises:
        KeyError: no entry carries that text.
    """
    for sid, entry in orig_dict.items():
        if entry["text"] != sentence:
            continue
        return sid
    raise KeyError
 def get_dep_rel(token):
    """Build the relation dict for the first filled column from index 14 on.

    Columns equal to "_" are skipped. Only the first other column is
    reported; later ones on the same token are ignored.

    Returns:
        dict with "arg" (the column value), "from" (its offset counted from
        column 14) and "dep" (token[0]), or None when all columns are "_".
    """
    logging.debug(token)
    for offset, label in enumerate(token[14:]):
        if label == "_":
            continue
        # offset selects the i-th predicate in the sentence
        return {"arg": label, "from": offset, "dep": token[0]}
    return None
 def handle_file_old(infile_tpl):
    """Convert one SRL tsv file into a JSON file of relations per sentence.

    Args:
        infile_tpl: (index, path) pair; only the path is used.

    The output is written to OUTPATH under the input's name with a ".json"
    suffix. It maps each sentence id to a list of relation dicts as built
    by get_dep_rel(), with "from" replaced by a predicate's token index.
    """
    infile = infile_tpl[1]
    outfile = (OUTPATH / infile.name).with_suffix(".json")
    orig_dict = par.parse_tei(get_origfile(infile))
    outdata = {}
    with infile.open("rb") as fp:
        for sentence_arr in extract_sentences(fp.readlines()):
            # The tsv has no sentence ids; recover the id by comparing the
            # sentence text with the original data.
            sid = match_sentence_id(to_sentence(sentence_arr), orig_dict)
            relations = []
            outdata[sid] = relations
            # token[0] of every token whose column 12 is "Y" (a predicate)
            predicate_ids = []
            for token in sentence_arr:
                if token[12] == "Y":
                    predicate_ids.append(token[0])
                relation = get_dep_rel(token)
                if relation is not None:
                    relations.append(relation)
            # "from" so far holds n (the n-th predicate); swap in that
            # predicate's token index.
            for relation in relations:
                relation["from"] = predicate_ids[relation["from"]]
            if DEBUG:
                print(to_sentence(sentence_arr))
                print(relations)
                print(sid)
                print()
                print()
    with outfile.open("w") as fp:
        json.dump(outdata, fp)
        logging.info("SRL relations written to: {}".format(outfile))
 def fix_json(srl_gen, error_sentence, orig_json_data):
    """Replace the relations stored for one sentence with freshly parsed ones.

    Args:
        srl_gen: generator yielding (text, tokens) pairs; exactly one item
            is consumed per call. Its closing None marker cannot be
            unpacked and would raise TypeError here.
        error_sentence: sentence id whose entry is rebuilt.
        orig_json_data: mapping of sentence id to relation list; it is
            modified in place. A missing id raises KeyError.

    Returns:
        The same `orig_json_data` object.
    """
    sentence, sentence_arr = next(srl_gen)
    sid = error_sentence
    # Throw away whatever was stored for this sentence before.
    if orig_json_data[sid] != []:
        orig_json_data[sid] = []
    relations = orig_json_data[sid]
    # token[0] of every token whose column 12 is "Y" (a predicate)
    predicate_ids = []
    for token in sentence_arr:
        if token[12] == "Y":
            predicate_ids.append(token[0])
        relation = get_dep_rel(token)
        if relation is not None:
            relations.append(relation)
    # "from" so far holds n (the n-th predicate); swap in that predicate's
    # token index.
    for relation in relations:
        relation["from"] = predicate_ids[relation["from"]]
    if DEBUG:
        print(to_sentence(sentence_arr))
        print(relations)
        print(sid)
        print()
        print()
    return orig_json_data
 def count_orig_file_sentences(filename):
    """Pickle (index, path, sentence count) for one original file.

    Args:
        filename: (index, path) pair.

    The result goes to INTERNAL_DATA/orig_chunks/<file name>. If that file
    already exists the function returns without parsing anything.
    """
    chunk_path = os.path.join(INTERNAL_DATA, 'orig_chunks', filename[1].name)
    if os.path.exists(chunk_path):
        return
    print(filename[0])
    # The sentence count is the number of entries parse_tei() returns.
    sentence_count = len(par.parse_tei(filename[1]))
    with open(chunk_path, 'wb') as output:
        pickle.dump((filename[0], filename[1], sentence_count), output)
 def count_srl_file_sentences(filename):
    """Pickle (index, path, sentence count) for one SRL tsv file.

    Args:
        filename: (index, path) pair.

    Sentences are counted as the number of lines that consist of a newline
    only. The result goes to INTERNAL_DATA/srl_chunks/<file name>; an
    existing result file makes the function return immediately.
    """
    chunk_path = os.path.join(INTERNAL_DATA, 'srl_chunks', filename[1].name)
    if os.path.exists(chunk_path):
        return
    print(filename[0])
    with filename[1].open("r") as fp:
        num_sentences = sum(1 for line in fp if line == '\n')
    with open(chunk_path, 'wb') as output:
        pickle.dump((filename[0], filename[1], num_sentences), output)
 def srl_error_fix_generator(infile):
    """Yield (text, tokens) for every sentence in `infile`, then None.

    Args:
        infile: path object of an SRL tsv file, opened in binary mode.
    """
    with infile.open("rb") as fp:
        raw_lines = fp.readlines()
        for sentence_arr in extract_sentences(raw_lines):
            sentence_text = to_sentence(sentence_arr)
            yield sentence_text, sentence_arr
    # end-of-data marker
    yield None
 def srl_sentences_generator(infile, curr_index, sen_start_index):
    """Yield (text, tokens) for the sentences of one SRL file, then None.

    Args:
        infile: path object of the SRL file.
        curr_index: index assigned to the first sentence of the file.
        sen_start_index: sentences are skipped until curr_index reaches
            this value.

    The final None marks the end of the file for the caller.
    """
    with infile.open("rb") as fp:
        for sentence_arr in extract_sentences(fp.readlines()):
            if curr_index >= sen_start_index:
                yield to_sentence(sentence_arr), sentence_arr
            else:
                # still before the requested start: count and skip
                curr_index += 1
    yield None
 def srl_multiple_files_sentences_generator(sentence_id): # srl_files):
    """Yield SRL sentences across files, starting shortly before `sentence_id`.

    Args:
        sentence_id: global sentence index; the stream starts 10 sentences
            earlier (but never below 0).

    Yields:
        (text, tokens) pairs as produced by srl_sentences_generator(), and
        finally None once all files are exhausted.

    Entries of `srl_file_sizes` are read as: [1] path, [2] sentence count,
    [3] global index of the file's first sentence.
    NOTE(review): no assignment to `srl_file_sizes` is visible in this
    file - confirm it is defined before this function is used.
    """
    sentence_id = max(0, sentence_id - 10)
    # Default to "no files" so an index outside every file ends the stream
    # with the usual None marker instead of an UnboundLocalError.
    srl_files = []
    for i, srl_file in enumerate(srl_file_sizes):
        if srl_file[3] <= sentence_id < srl_file[3] + srl_file[2]:
            # start with the file containing the index, continue with the rest
            srl_files = srl_file_sizes[i:]
            break
    for file_info in srl_files:
        srl_gen = srl_sentences_generator(file_info[1], file_info[3], sentence_id)
        el = next(srl_gen)
        # None is the per-file end marker; move on to the next file
        while el is not None:
            yield el
            el = next(srl_gen)
    yield None
 # Group consecutive sentence ids that share the same first 9 characters;
 # that prefix is used further down to build the JSON file name.
 error_sentences_grouped = []
 # `group` starts as an empty list instead of False, so an id with an empty
 # prefix at the start can no longer hit `False.append`, and an empty input
 # no longer adds a bogus `False` entry to the result.
 group = []
 prev_name = None
 for name in error_sentences:
    file_prefix = name[:9]
    if group and file_prefix == prev_name:
        group.append(name)
    else:
        if group:
            error_sentences_grouped.append(group)
        prev_name = file_prefix
        group = [name]
 if group:
    error_sentences_grouped.append(group)
 # Single stream of corrected sentences; fix_json() takes one item per id.
 srl_gen = srl_error_fix_generator(INPATH)
 # find errors in json files:
 # with open(os.path.join(INTERNAL_DATA, 'sentence_ids_list.pkl'), 'rb') as output:
 #     sentence_ids = pickle.load(output)
 #
 #
 #
 # origfiles = []
 # for subdir, dirs, files in os.walk(OUTPATH):
 #     for file in files:
 #         origfiles.append(Path(os.path.join(subdir, file)))
 # origfiles=sorted(origfiles)
 #
 #
 #
 # for sent in origfiles:
 # # for sent in sentence_ids:
 # #     outfile = Path(OUTPATH, sent[:9] + '-dedup.json')
 #     outfile = sent
 #
 #     try:
 #         with outfile.open() as json_file:
 #             json.load(json_file)
 #             pass
 #     except:
 #         print(outfile.name)
 #
 #
 # raise Exception('test')
 # For every group of faulty sentence ids, load the JSON file of that group,
 # rebuild the listed sentences and write the file back.
 for errors_in_file in error_sentences_grouped:
    file_prefix = errors_in_file[0][:9]
    outfile = Path(OUTPATH, file_prefix + '-dedup.json')
    with outfile.open() as json_file:
        print(outfile.name)
        orig_json_data = json.load(json_file)
    # Each call consumes the next sentence from srl_gen, so ids are
    # processed strictly in listing order.
    for error_sentence in errors_in_file:
        orig_json_data = fix_json(srl_gen, error_sentence, orig_json_data)
    with outfile.open('w') as json_file:
        json.dump(orig_json_data, json_file)
        logging.info("SRL relations written to: {}".format(outfile))
--- a/tools/parse_all.py
+++ b/tools/parse_all.py
@ -1,3 +1,5 @@
 import pickle
 from parser.parser import Parser
 import os
 from os.path import join, dirname
@ -15,8 +17,21 @@ par = Parser()
 # path to data
 config = configparser.ConfigParser()
 config.read("tools.cfg")
-INDIR = Path(config["tools"]["kres_orig"])
+analysis = ''
-OUTDIR = Path(config["tools"]["kres_tsv"])
+if 'kres_orig' in config["tools"]:
    # Kres run: one input directory of files, one output directory of tsv files.
    analysis = 'kres'
    INDIR = Path(config["tools"]["kres_orig"])
    OUTDIR = Path(config["tools"]["kres_tsv"])
 elif 'giga_orig' in config["tools"]:
    # Giga run: selected only when the config has no 'kres_orig' key.
    # analysis = 'gigafida'
    analysis = 'giga'
    # NOTE(review): INDIR_GIGA_ORIG is read from the "giga" key and INDIR_GIGA
    # from the "giga_orig" key - the names look swapped; confirm this is intended.
    INDIR_GIGA_ORIG = Path(config["tools"]["giga"])
    INDIR_GIGA = Path(config["tools"]["giga_orig"])
    INDIR_JOS = Path(config["tools"]["giga_jos"])
    OUTDIR = Path(config["tools"]["giga_tsv"])
    GIGA_PARTS = int(config["tools"]["giga_parts"])
    INTERNAL_DATA = config["tools"]["internal_data"]
 # NOTE(review): if neither key is present, `analysis` stays '' and OUTDIR is
 # never assigned.
 CPU_CORES = int(config["tools"]["cpu_cores"])
 LOGFILE = Path(config["tools"]["logfile"]).absolute()
@ -36,8 +51,9 @@ print("end parsing ssj")
 # kres_file = "../data/kres_example/F0019343.xml.parsed.xml"
 OUTDIR.mkdir(exist_ok=True)
-infiles = list(enumerate([x for x in INDIR.iterdir() if x.is_file()]))
+if analysis == 'kres':
    # `infiles` is only built for the kres run: a list of (index, path) pairs
    # for every regular file directly inside INDIR.
-logging.info("Parsing kres: {} files.".format(len(infiles)))
+    infiles = list(enumerate([x for x in INDIR.iterdir() if x.is_file()]))
    logging.info("Parsing kres: {} files.".format(len(infiles)))
 def handle_file(infile):
    i = infile[0]
@ -65,10 +81,297 @@ def handle_file(infile):
        return True
    return False
-with Pool(CPU_CORES) as p:
+def giga_orig_generator():
    # Yields the lines of the file at INDIR_GIGA one by one, collapsing every
    # run of consecutive '\n' lines into a single '\n' line.
-    p.map(handle_file, infiles)
+    with open(INDIR_GIGA, 'r') as gof:
        # True while the most recently seen line was a bare newline
        previous_new_line = False
        for l_gof in gof:
            if l_gof == '\n':
                # second or later blank line in a row: drop it
                if previous_new_line:
                    continue
                previous_new_line = True
            elif previous_new_line:
                # a non-blank line ends the run of blank lines
                previous_new_line = False
            yield l_gof
-logging.info("end parsing kres")
+def handle_gigafida_file():
    """
    Split the big corpus file into `giga%07d.tsv` part files inside OUTDIR.

    The file at INDIR_JOS is read line by line in lockstep with
    giga_orig_generator(). A '\n' line from the generator ends a sentence,
    which is written through par.to_conll_2009_SRL(). Parts are only switched
    on such sentence ends, once the line index has passed the next multiple of
    num_lines / GIGA_PARTS.

    If part 0 already exists the run is treated as a resume: lines are skipped
    until a part boundary is found whose file `curr_part + 2` is missing.
    """
    # with open(INDIR_GIGA, 'r') as gof:
    #     with open(INDIR_JOS, 'r') as gjf:
    #         for i, (l_gof, l_gjf) in enumerate(zip(gof, gjf)):
    #             pass
    #     num_lines = i + 1
    # print(num_lines)
    # hard-coded total line count (the commented-out loop above counts it)
    num_lines = 1393184026
    # 1393184026
    # 1393184033
    # return
    num_lines_per_part = num_lines / GIGA_PARTS
    curr_part = 0
    gof_generator = giga_orig_generator()
    # with open(INDIR_GIGA, 'r') as gof:
    with open(INDIR_JOS, 'r') as gjf:
        # sentence currently being collected
        sentence = {}
        sentence['tokens'] = []
        sentence['links'] = {}
        if os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % 0)):
            # resume mode: nothing is written until skipping stops
            ignore_lines = True
            wf = False
        else:
            wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part), 'a')
            ignore_lines = False
        # for i, (l_gof, l_gjf) in enumerate(zip(gof, gjf)):
        for i, l_gjf in enumerate(gjf):
            l_gof = next(gof_generator)
            if ignore_lines:
                if i > num_lines_per_part * curr_part and l_gof == '\n':
                    # at a part boundary: stop skipping once the file two parts
                    # ahead does not exist yet
                    if not os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % (curr_part + 2))):
                        ignore_lines = False
                        # delete last file (probably not whole)
                        os.remove(os.path.join(OUTDIR, 'giga%07d.tsv' % (curr_part + 1)))
                    if ignore_lines:
                        print(curr_part)
                        curr_part += 1
                        continue
                else:
                    continue
            l_gof_split = l_gof.split('\t')
            l_gjf_split = l_gjf.split('\t')
            # if punctuation
            if l_gof != '\n':
                # a second column ending in 'u' produces a 'c' token
                # (punctuation, per the comment above); anything else a 'w' token
                if l_gof_split[1][-1] == 'u':
                    # print(l_gjf_split)
                    sentence['tokens'].append(('c', l_gjf_split[0], l_gjf_split[1]))
                else:
                    sentence['tokens'].append(('w', l_gjf_split[0], l_gof_split[0], l_gof_split[1][:-2], l_gof_split[3][:-1]))
                # link entry keyed by the first jos column
                sentence['links'][l_gjf_split[0]] = (l_gjf_split[7], l_gjf_split[0], l_gjf_split[6])
            # if l_gof == '\n':
            else:
                # sentence end: write it out (if a file is open) and reset
                if wf:
                    # print(i)
                    wf.write(par.to_conll_2009_SRL(sentence))
                sentence['tokens'] = []
                sentence['links'] = {}
            # wf.flush()
            # if i > num_lines_per_part * (curr_part + 1) and l_gof == '\n':
                # enough lines consumed for this part: switch to the next file
                if i > num_lines_per_part * (curr_part + 1):
                    curr_part += 1
                    # if wf doesn't exist (first one)
                    if wf:
                        wf.close()
                    wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part), 'a')
        curr_part += 1
        # NOTE(review): wf is still False here if resume mode never opened a
        # file, in which case this call raises AttributeError - confirm.
        wf.close()
 import  time
 def handle_giga_file(ran):
    """
    Regenerate the `giga%07d.tsv` part files for the parts in [ran[0], ran[1]).

    The file at INDIR_JOS is read line by line in lockstep with
    giga_orig_generator(). A '\n' line from the generator ends a sentence.
    Parts are switched on sentence ends once the line index has passed the
    next multiple of num_lines / GIGA_PARTS. Only parts listed in the
    module-level `file_indices` are written; their files are recreated from
    scratch, all other parts are read past without output.

    Fixes over the previous version:
    * the first output file is opened when `ran[0]` is selected (the old check
      looked at part 0, so it could truncate an unselected file or leave no
      file open before the first write);
    * an open part file is always closed when its part ends;
    * the final close no longer fails when no file was ever opened.

    :param ran: two-element sequence, first part (inclusive) and last part
        (exclusive) handled by this call.
    """
    # hard-coded total number of lines of the input
    num_lines = 1393184026
    num_lines_per_part = num_lines / GIGA_PARTS
    curr_part = 0
    gof_generator = giga_orig_generator()

    def open_part(part):
        # mode 'w' recreates the file, same as remove-then-append
        return open(os.path.join(OUTDIR, 'giga%07d.tsv' % part), 'w')

    with open(INDIR_JOS, 'r') as gjf:
        sentence = {'tokens': [], 'links': {}}
        wf = None
        try:
            if ran[0] in file_indices:
                wf = open_part(ran[0])
            for i, l_gjf in enumerate(gjf):
                l_gof = next(gof_generator)

                # fast-forward over the parts before ran[0]
                if curr_part < ran[0]:
                    if i > num_lines_per_part * (curr_part + 1) and l_gof == '\n':
                        print(curr_part)
                        curr_part += 1
                    continue

                if l_gof != '\n':
                    # token line: only collected for selected parts
                    if curr_part not in file_indices:
                        continue
                    l_gof_split = l_gof.split('\t')
                    l_gjf_split = l_gjf.split('\t')
                    # a second column ending in 'u' marks punctuation
                    if l_gof_split[1][-1] == 'u':
                        sentence['tokens'].append(('c', l_gjf_split[0], l_gjf_split[1]))
                    else:
                        sentence['tokens'].append(('w', l_gjf_split[0], l_gof_split[0], l_gof_split[1][:-2], l_gof_split[3][:-1]))
                    sentence['links'][l_gjf_split[0]] = (l_gjf_split[7], l_gjf_split[0], l_gjf_split[6])
                    continue

                # sentence end
                if curr_part in file_indices:
                    wf.write(par.to_conll_2009_SRL(sentence))
                    sentence['tokens'] = []
                    sentence['links'] = {}

                # part boundary: only ever crossed on a sentence end
                if i > num_lines_per_part * (curr_part + 1):
                    curr_part += 1
                    if wf:
                        wf.close()
                        wf = None
                    if curr_part >= ran[1]:
                        break
                    if curr_part in file_indices:
                        wf = open_part(curr_part)
        finally:
            if wf:
                wf.close()
 def handle_giga_file_selected_sentences(error_sentences):
    """
    Write only the selected sentences to the file `giga_errors` in OUTDIR.

    The file at INDIR_JOS is read line by line in lockstep with
    giga_orig_generator(). A '\n' line in the jos file ends a sentence and
    advances the running sentence index. The id for that index is looked up in
    the pickled list `sentence_ids_list.pkl` (read from INTERNAL_DATA).
    Sentences whose id is in `error_sentences` are written through
    par.to_conll_2009_SRL(); all others are skipped. The output file is
    recreated on every call.

    Fixes over the previous version:
    * an index past the end of the id list is treated as "not selected"
      instead of raising IndexError;
    * the output file is closed even if an error interrupts the run.

    :param error_sentences: container of sentence ids to extract (membership
        is tested with `in`, so a set is the efficient choice).
    """
    gof_generator = giga_orig_generator()

    # NOTE: pickle executes arbitrary code on load - only use trusted files.
    with open(os.path.join(INTERNAL_DATA, 'sentence_ids_list.pkl'), 'rb') as pkl_file:
        sentence_ids_list = pickle.load(pkl_file)

    def is_selected(index):
        # the jos file may contain more sentence ends than there are ids
        return index < len(sentence_ids_list) and sentence_ids_list[index] in error_sentences

    # mode 'w' recreates the file, same as remove-then-append
    with open(INDIR_JOS, 'r') as gjf, open(os.path.join(OUTDIR, 'giga_errors'), 'w') as wf:
        sentence = {'tokens': [], 'links': {}}
        sentence_id = 0
        skip_sentence = not is_selected(sentence_id)
        for l_gjf in gjf:
            l_gof = next(gof_generator)
            if l_gjf == '\n':
                # sentence end: flush the collected sentence, move to the next id
                if not skip_sentence:
                    wf.write(par.to_conll_2009_SRL(sentence))
                    sentence['tokens'] = []
                    sentence['links'] = {}
                sentence_id += 1
                skip_sentence = not is_selected(sentence_id)
                if not skip_sentence:
                    print(sentence_ids_list[sentence_id])
            if skip_sentence:
                continue
            l_gof_split = l_gof.split('\t')
            l_gjf_split = l_gjf.split('\t')
            if l_gof != '\n':
                # a second column ending in 'u' marks punctuation
                if l_gof_split[1][-1] == 'u':
                    sentence['tokens'].append(('c', l_gjf_split[0], l_gjf_split[1]))
                else:
                    sentence['tokens'].append(('w', l_gjf_split[0], l_gof_split[0], l_gof_split[1][:-2], l_gof_split[3][:-1]))
                sentence['links'][l_gjf_split[0]] = (l_gjf_split[7], l_gjf_split[0], l_gjf_split[6])
 # Parts that handle_giga_file is allowed to (re)write; default is every part.
 file_indices = set(range(0, 100000))
 if analysis == 'giga':
    # INTERNAL_DATA is only configured for the giga analysis, so the list of
    # parts to redo can only be loaded there (a kres run used to fail here
    # with NameError).
    with open(os.path.join(INTERNAL_DATA, 'diffs_updated_gigafida.pkl'), 'rb') as pkl_file:
        file_indices = set(pickle.load(pkl_file))
 with Pool(CPU_CORES) as p:
    if analysis == 'kres':
        p.map(handle_file, infiles)
    elif analysis == 'gigafida':
        # not reachable with the current config handling ('giga' is used instead)
        handle_gigafida_file()
    elif analysis == 'giga':
        # split [0, 100000) into CPU_CORES contiguous [start, end) ranges;
        # only needed when the p.map call below is re-enabled
        final_range = [0, 100000]
        size_per_proc = (final_range[1] - final_range[0]) / CPU_CORES
        ranges = [
            [int(final_range[0] + size_per_proc * i), int(final_range[0] + size_per_proc * (i + 1))]
            for i in range(CPU_CORES)
        ]
        # ranges = [[0, 1]]
        # p.map(handle_giga_file, ranges)
        # one sentence id per line; the file is closed as soon as it is read
        with open(os.path.join(INTERNAL_DATA, 'sentences_with_less_than_token.txt')) as error_file:
            error_sentences = [line.rstrip('\n') for line in error_file]
        handle_giga_file_selected_sentences(set(error_sentences))
 logging.info("end parsing kres")
--- a/tools/parser/parser.py
+++ b/tools/parser/parser.py
@ -57,7 +57,10 @@ class Parser:
            divs = []  # in ssj, there are divs, in Kres, there are separate files
            if "id" in root.keys():
                # Kres files start with <TEI id=...>
-                guess_corpus = "KRES"
+                if root.get("id")[0:2] == 'GF':
                    guess_corpus = "GIGA"
                else:
                    guess_corpus = "KRES"
                divs = [root]
            else:
                guess_corpus = "SSJ"
@ -65,7 +68,10 @@ class Parser:
            # parse divs
            for div in divs:
-                f_id = div.get("id")
+                f_id = div.get("id")[:-6]
                if guess_corpus == "GIGA":
                    div = div.findall(".//body")[0]
                # parse paragraphs
                for p in div.findall(".//p"):
@ -75,46 +81,62 @@ class Parser:
                    for s in p.findall(".//s"):
                        s_id = s.get("id").split(".")[-1]
                        sentence_text = ""
                        sentence_list = []
                        sentence_tokens = []
                        # parse tokens
                        for el in s.iter():
                            if el.tag in self.W_TAGS:
-                                el_id = el.get("id").split(".")[-1]
+                                if guess_corpus != "GIGA":
-                                if el_id[0] == 't':
+                                    el_id = el.get("id").split(".")[-1]
-                                    el_id = el_id[1:]  # ssj W_TAG ids start with t
+                                    if el_id[0] == 't':
-                                sentence_text += el.text
+                                        el_id = el_id[1:]  # ssj W_TAG ids start with t
-                                sentence_tokens += [(
+                                    sentence_text += el.text
-                                    "w",
+                                    sentence_tokens += [(
-                                    int(el_id),
+                                        "w",
-                                    el.text,
+                                        int(el_id),
-                                    el.get("lemma"),
+                                        el.text,
-                                    (el.get("msd") if guess_corpus == "KRES"
+                                        el.get("lemma"),
-                                        else el.get("ana").split(":")[-1]),
+                                        (el.get("msd") if guess_corpus == "KRES" or guess_corpus == "GIGA"
-                                )]
+                                         else el.get("ana").split(":")[-1]),
                                    )]
                                else:
                                    sentence_list.append(el.text)
                            elif el.tag in self.C_TAGS:
                                # only Kres' C_TAGS have ids
-                                el_id = el.get("id") or "none"
+                                if guess_corpus != "GIGA":
-                                el_id = el_id.split(".")[-1]
+                                    el_id = el.get("id") or "none"
-                                sentence_text += el.text
+                                    el_id = el_id.split(".")[-1]
-                                sentence_tokens += [("c", el_id, el.text,)]
+                                    sentence_text += el.text
                                    sentence_tokens += [("c", el_id, el.text,)]
                            elif el.tag in self.S_TAGS:
                                # Kres' <S /> doesn't contain .text
-                                sentence_text += " "
+                                if guess_corpus == "GIGA":
                                    sentence_list.append(el.text)
                                else:
                                    sentence_text += " "
                            else:
                                # pass links and linkGroups
                                pass
                        sentence_id = "{}.{}.{}".format(f_id, p_id, s_id)
                        if sentence_id in res_dict:
                            raise KeyError("duplicated id: {}".format(sentence_id))
-                        res_dict[sentence_id] = {
+                        if guess_corpus == "GIGA":
-                            "sid": sentence_id,
+                            res_dict[sentence_id] = {
-                            "text": sentence_text,
+                                "sid": sentence_id,
-                            "tokens": sentence_tokens,
+                                "text": ' '.join(sentence_list),
-                            "links": (
+                                "tokens": None,
-                                parse_links(s) if guess_corpus == "KRES" else None
+                                "links": None
-                            )
+                            }
-                        }
+                        else:
                            res_dict[sentence_id] = {
                                "sid": sentence_id,
                                "text": sentence_text,
                                "tokens": sentence_tokens,
                                "links": (
                                    parse_links(s) if guess_corpus == "KRES" else None
                                )
                            }
        fp.close()
        return res_dict
@ -123,7 +145,7 @@ class Parser:
        def fillpred(tsv_row):
            mrow = build_model_row(tsv_row)
-            x = mrow[:-1] 
+            x = mrow[:-1]
            y = self.fillpred_model.predict([x])
            return y[0]  # bool
--- a/tools/srl-20131216/scripts/parse_srl_only_mod.sh
+++ b/tools/srl-20131216/scripts/parse_srl_only_mod.sh
@ -34,7 +34,8 @@ JVM_ARGS="-cp $CP -Xmx$MEM"
 NOPI="-nopi" #Uncomment this if you want to skip the predicate identification step. This setting is equivalent to the CoNLL 2009 ST.
-CMD="$JAVA $JVM_ARGS se.lth.cs.srl.Parse $Lang $INPUT $MODEL $RERANKER $NOPI $OUTPUT"
+$JAVA $JVM_ARGS se.lth.cs.srl.Parse $Lang "$INPUT" $MODEL $RERANKER $NOPI "$OUTPUT"
-echo "Executing: $CMD"
+# CMD="$JAVA $JVM_ARGS se.lth.cs.srl.Parse $Lang '$INPUT' $MODEL $RERANKER $NOPI '$OUTPUT'"
 # echo "Executing: $CMD"
-$CMD
+# $CMD
--- a/tools/srl-20131216/tag_all.gigafida.sh
+++ b/tools/srl-20131216/tag_all.gigafida.sh
@ -0,0 +1,29 @@
 #!/bin/bash
 # Runs the SRL tagger on every file of the giga_tsv folder and stores the
 # result as <name>.srl.tsv in the giga_srl folder.
 # Both folders are read from ../tools.cfg and may contain spaces, so every
 # expansion is quoted while the glob characters stay outside the quotes
 # (a glob inside double quotes is never expanded).
 IN_FOLDER="$(sed -n -e 's/^\s*giga_tsv\s*=\s*//p' ../tools.cfg)"
 echo "input folder: $IN_FOLDER"
 OUT_FOLDER="$(sed -n -e 's/^\s*giga_srl\s*=\s*//p' ../tools.cfg)"
 echo "output folder: $OUT_FOLDER"
 SUFFIX="srl.tsv"
 mkdir -p "$OUT_FOLDER"
 # remove results of a previous run; errors (e.g. nothing to delete) are ignored
 rm "$OUT_FOLDER"/*"${SUFFIX}" &> /dev/null
 for infile in "$IN_FOLDER"/*; do
 	# an unmatched glob stays literal; skip it instead of tagging a bogus path
 	[ -e "$infile" ] || continue
 	echo "Tagging: ${infile}"
 	base=$(basename "$infile" | cut -d'.' -f1)
 	outfile="${OUT_FOLDER}/${base}.${SUFFIX}"
 	# mate-tools tagger
 	./scripts/parse_srl_only_mod.sh "$infile" "$outfile"
 	if [ $? -eq 0 ]; then
 		echo "Saved as ${outfile}"
 	else
 		echo "ERR"
 		exit 1
 	fi
 done
--- a/tools/srl-20131216/tag_all.kres.sh
+++ b/tools/srl-20131216/tag_all.kres.sh
@ -0,0 +1,29 @@
 #!/bin/bash
 # Runs the SRL tagger on every file of the kres_tsv folder and stores the
 # result as <name>.srl.tsv in the kres_srl folder.
 # Both folders are read from ../tools.cfg and prefixed with "../".
 # All expansions are quoted so that paths containing whitespace are not
 # split into several words; the glob characters stay outside the quotes.
 IN_FOLDER="../$(sed -n -e 's/^\s*kres_tsv\s*=\s*//p' ../tools.cfg)"
 echo "input folder: $IN_FOLDER"
 OUT_FOLDER="../$(sed -n -e 's/^\s*kres_srl\s*=\s*//p' ../tools.cfg)"
 echo "output folder: $OUT_FOLDER"
 SUFFIX="srl.tsv"
 mkdir -p "$OUT_FOLDER"
 # remove results of a previous run; errors (e.g. nothing to delete) are ignored
 rm "$OUT_FOLDER"/*"${SUFFIX}" &> /dev/null
 for infile in "$IN_FOLDER"/*; do
 	echo "Tagging: ${infile}"
 	base=$(basename "$infile" | cut -d'.' -f1)
 	outfile="${OUT_FOLDER}/${base}.${SUFFIX}"
 	# mate-tools tagger
 	./scripts/parse_srl_only_mod.sh "$infile" "$outfile"
 	if [ $? -eq 0 ]; then
 		echo "Saved as ${outfile}"
 	else
 		echo "ERR"
 		exit 1
 	fi
 done
--- a/tools/srl-20131216/tag_all.sh
+++ b/tools/srl-20131216/tag_all.sh
@ -1,15 +1,16 @@
 #!/bin/bash
 # parsing tools.cfg values
-IN_FOLDER="../$(sed -n -e 's/^\s*kres_tsv\s*=\s*//p' ../tools.cfg)"
+IN_FOLDER="../$(sed -n -e 's/^\s*giga_tsv\s*=\s*//p' ../tools.cfg.kres_new)"
 IN_FOLDER=$IN_FOLDER$1
 echo "input folder: $IN_FOLDER"
-OUT_FOLDER="../$(sed -n -e 's/^\s*kres_srl\s*=\s*//p' ../tools.cfg)"
+OUT_FOLDER="../$(sed -n -e 's/^\s*giga_srl\s*=\s*//p' ../tools.cfg.kres_new)"
 echo "output folder: $OUT_FOLDER"
 SUFFIX="srl.tsv"
 mkdir -p $OUT_FOLDER
-rm $OUT_FOLDER/*${SUFFIX} &> /dev/null
+# rm $OUT_FOLDER/*${SUFFIX} &> /dev/null
 for infile in $IN_FOLDER/*; do
 	echo "Tagging: ${infile}"
--- a/tools/tools.cfg
+++ b/tools/tools.cfg
@ -1,8 +1,18 @@
 [tools]
-kres_orig = /kres_mount/kres_parsed/tei
+giga = /media/luka/Portable Disk/Datasets/gigafida_jos/gf2_orig
-kres_tsv = ../data/kres_out/1_tsv
+giga_orig = /media/luka/Portable Disk/Datasets/gigafida_jos/gf2-dedup.patch0001
-kres_srl = ../data/kres_out/2_srl
+; giga_orig_old = /media/luka/Portable Disk/Datasets/gigafida_jos/gf2-dedup
-kres_json = ../data/kres_out/final_json
+giga_jos = /media/luka/Portable Disk/Datasets/gigafida_jos/gf2-dedup.jos.patch0001
 giga_tsv = /media/luka/Portable Disk/Datasets/gigafida_jos/gf_files_part
 ; giga_tsv = /media/luka/Portable Disk/Datasets/gigafida_jos/TEMP
 ; giga_tsv = /media/luka/Portable Disk/Datasets/gigafida_jos/gf2_files_copy
 ; giga_srl = /media/luka/Portable Disk/Datasets/gigafida_jos/TEMP
 giga_srl = /media/luka/Portable Disk/Datasets/gigafida_jos/2_srl
 giga_srl_errors = /media/luka/Portable Disk/Datasets/gigafida_jos/2_srl_errors/giga_errors.srl.tsv
 ; giga_json = /media/luka/Portable Disk/Datasets/gigafida_jos/final_json_TEMP
 giga_json = /media/luka/Portable Disk/Datasets/gigafida_jos/final_json
 internal_data = /media/luka/Portable Disk/Datasets/gigafida_jos/internal_data
 giga_parts = 100000
 logfile = ../progress.log
-cpu_cores = 5
+cpu_cores = 16
 debug = False
--- a/tools/tools.cfg.gigafida
+++ b/tools/tools.cfg.gigafida
@ -0,0 +1,16 @@
 [tools]
 giga = /media/luka/Portable Disk/Datasets/gigafida_jos/gf2_orig
 giga_orig = /media/luka/Portable Disk/Datasets/gigafida_jos/gf2-dedup.patch0001
 ; giga_orig_old = /media/luka/Portable Disk/Datasets/gigafida_jos/gf2-dedup
 giga_jos = /media/luka/Portable Disk/Datasets/gigafida_jos/gf2-dedup.jos.patch0001
 giga_tsv = /media/luka/Portable Disk/Datasets/gigafida_jos/gf_files_part
 ; giga_tsv = /media/luka/Portable Disk/Datasets/gigafida_jos/TEMP
 ; giga_tsv = /media/luka/Portable Disk/Datasets/gigafida_jos/gf2_files_copy
 ; giga_srl = /media/luka/Portable Disk/Datasets/gigafida_jos/TEMP
 giga_srl = /media/luka/Portable Disk/Datasets/gigafida_jos/2_srl
 giga_json = /media/luka/Portable Disk/Datasets/gigafida_jos/final_json
 internal_data = /media/luka/Portable Disk/Datasets/gigafida_jos/internal_data
 giga_parts = 100000
 logfile = ../progress.log
 cpu_cores = 1
 debug = False
--- a/tools/tools.cfg.kres
+++ b/tools/tools.cfg.kres
@ -0,0 +1,8 @@
 [tools]
 kres_orig = /home/luka/Development/srl/data/kres_parsed/tei
 kres_tsv = ../data/kres_out/1_tsv
 kres_srl = ../data/kres_out/2_srl
 kres_json = ../data/kres_out/final_json
 logfile = ../progress.log
 cpu_cores = 5
 debug = False
--- a/tools/tools.cfg.kres_new
+++ b/tools/tools.cfg.kres_new
@ -0,0 +1,8 @@
 [tools]
 kres_orig = /home/luka/Development/srl/data/kres_parsed/tei
 giga_tsv = ../data/giga_out/1_tsv
 giga_srl = ../data/giga_out/2_srl
 kres_json = ../data/giga_out/final_json
 logfile = ../progress.log
 cpu_cores = 5
 debug = False