18 Commits

Author SHA1 Message Date
c1ecc4cdbc Big changes 2022-02-04 11:24:47 +01:00
a6cee3d459 migrated to cjvt-gitea 2019-03-03 21:35:05 +01:00
b32bd3e7c6 Setup that SRL tagged kres 2019-03-03 21:10:23 +01:00
044fae2001 added parallel json output creation 2019-02-28 23:37:47 +01:00
406e88ade8 added msd-not-found exception 2019-02-28 21:49:49 +01:00
bf999a965f sending some pipe-breaking files 2019-02-28 15:05:10 +01:00
d45b6d9f47 added number of cores to config 2019-02-28 13:57:27 +01:00
a61ec8770a parsing... 2019-02-28 11:12:12 +01:00
ff25acd3c7 small cfg fix 2019-02-28 10:54:37 +01:00
3881c74613 added multiprocessing to parse_all.py 2019-02-28 10:53:27 +01:00
17cb0677a7 added logging; paralelize the first part now 2019-02-28 10:34:12 +01:00
fd0f9794f1 added logger 2019-02-28 10:15:14 +01:00
12f3994115 todo: add logger 2019-02-28 09:57:46 +01:00
dcc2935c3c some changes on server 2019-02-28 09:40:25 +01:00
60ac569f40 ready to go 2019-02-28 08:20:21 +01:00
b4c7ac5427 fixed paths 2019-02-27 17:32:19 +01:00
5c9cf59723 testing new config 2019-02-27 17:04:03 +01:00
577c8418d2 tmp 2019-02-27 16:58:04 +01:00
28 changed files with 1725 additions and 114 deletions

10
.gitignore vendored
View File

@@ -1,5 +1,11 @@
*.pyc
*.pickle
*.log
data/*/*.xml
data/*/*.tsv
nohup.out
data/kres_out/*
data/kres_example/
venv/
.idea/
data/

View File

@@ -1,19 +1,22 @@
.PHONY: tsv_files srl_tagged_files json_files env
.PHONY: tsv_files srl_tagged_files json_files env clean
all: json_files
all: tools/fillpred_model/model.pickle tsv_files srl_tagged_files json_files
json_files: #TODO srl_tagged_files
json_files: # srl_tagged_files
cd tools; python3 gen_json.py
srl_tagged_files: tsv_files
srl_tagged_files: # tsv_files
# cd tools/srl-20131216; ./scripts/parse_srl_only_mod.sh; cd -
cd tools/srl-20131216; ./tag_all.sh
tsv_files: fillpred_model/model.pickle
tsv_files: # tools/fillpred_model/model.pickle
cd tools; python3 parse_all.py
fillpred_model/model.pickle:
tools/fillpred_model/model.pickle:
cd tools/fillpred_model; $(MAKE)
env:
cd dockerfiles; cd python-java; $(MAKE)
clean:
rm tools/fillpred_model/model.pickle

View File

@@ -11,7 +11,9 @@ Check out `./tools/srl-20131216/README.md`.
## Scripts
Check all possible xml tags (those that occur after the `<body>` tag):
'cat F0006347.xml.parsed.xml | grep -A 999999999999 -e '<body>' | grep -o -e '<[^" "]*' | sort | uniq'
``` bash
cat F0006347.xml.parsed.xml | grep -A 999999999999 -e '<body>' | grep -o -e '<[^" "]*' | sort | uniq
```
## Tools
* Parser for reading both `SSJ500k 2.1 TEI xml` and `Kres F....xml.parsed.xml"` files found in `./tools/parser/parser.py`.
@@ -26,6 +28,12 @@ $ cd ./cjvt-srl-tagging
$ make
```
If you want to run it on a server overnight, you might want to use `nohup`, so you can close the ssh connection without closing the process.
```
$ nohup make &
```
See progress in generated logfile (check git root).
# Makefile
The Makefile follows certain steps:
1. Create a fillpred model.

Binary file not shown.

View File

@@ -7,6 +7,8 @@ default-jdk \
python3 \
python3-pip
RUN apt-get install -y sshfs
RUN pip3 install lxml pandas sklearn
ENV PYTHONIOENCODING UTF-8

View File

@@ -5,14 +5,16 @@ all: build run
build:
docker build . -t $(IMAGE_NAME)
run:
docker run \
-it \
--user $(shell id -u):$(shell id -g) \
-v /home/${USER}:/home/${USER} \
--user $(shell id -u):$(shell id -g) \
-v /etc/passwd:/etc/passwd \
-v /etc/group:/etc/group \
-v $(shell pwd)/../../:/cjvt-srl-tagging \
-w /cjvt-srl-tagging \
python-java \
/bin/bash
-v /home/luka/Development/srl/data:/kres_mount:ro \
python-java \
/bin/bash

15
parser/tei_to_dict.py Normal file
View File

@@ -0,0 +1,15 @@
#!/usr/bin/python3
from lxml import etree
def tei_to_dict(s_el):
if __name__ == "__main__":
with open("/home/kristjan/kres_mount/kres_parsed/tei/F0025751.xml.parsed.xml") as f:
xmlstring = re.sub(' xmlns="[^"]+"', '', fp.read(), count=1)
xmlstring = xmlstring.replace(' xml:', ' ')
xml_tree = ElementTree.XML(xmlstring)

151
parser/test.py Executable file
View File

@@ -0,0 +1,151 @@
#!/usr/bin/python2
from __future__ import print_function, unicode_literals, division
import sys
import os
import re
import pickle
from pathlib import Path
try:
from lxml import etree as ElementTree
except ImportError:
import xml.etree.ElementTree as ElementTree
# attributes read from TEI word elements
ID_ATTR = "id"
LEMMA_ATTR = "lemma"
ANA_ATTR = "ana"
# tags: TEI element names encountered while walking a sentence
SENTENCE_TAG = 's'
BIBL_TAG = 'bibl'
PARAGRAPH_TAG = 'p'
PC_TAG = 'pc'
WORD_TAG = 'w'       # word token
C_TAG = 'c'          # punctuation token
S_TAG = 'S'          # whitespace marker element <S/>
SEG_TAG = 'seg'      # group of words; recursed into
class Sentence:
    """One TEI <s> sentence flattened into CoNLL-U-style word rows.

    For every word/punctuation child of the sentence element a row of
    [id, token, lemma, xpos] is collected in self.words; the sentence's
    surface text (spaces come from <S/> elements) accumulates in self.text.
    """

    def __init__(self, sentence, s_id):
        # sentence: ElementTree element with word-level children
        # s_id: "paragraph.sentence" position string
        self.id = s_id
        self.words = []
        self.text = ""
        for word in sentence:
            self.handle_word(word)

    def handle_word(self, word):
        # handle space after
        if word.tag == S_TAG:
            # <S/> marks whitespace between tokens and carries no text
            assert(word.text is None)
            self.text += ' '
            return
        # ASK am I handling this correctly?
        elif word.tag == SEG_TAG:
            # <seg> groups several words; recurse into its children
            for segword in word:
                self.handle_word(segword)
            return
        # ASK handle unknown tags (are there others?)
        elif word.tag not in (WORD_TAG, C_TAG):
            return
        # ID: 1-based position within the sentence
        idx = str(len(self.words) + 1)
        # TOKEN: the element's text content
        token = word.text
        # LEMMA: only <w> carries one; punctuation reuses the token
        if word.tag == WORD_TAG:
            lemma = word.get(LEMMA_ATTR)
            assert(lemma is not None)
        else:
            lemma = token
        # XPOS: msd attribute, with special cases
        xpos = word.get('msd')
        if word.tag == C_TAG:
            xpos = "Z"
        elif xpos in ("Gp-ppdzn", "Gp-spmzd"):
            # NOTE(review): these two msd values are remapped to "N";
            # rationale not visible here -- confirm against the tagset.
            xpos = "N"
        elif xpos is None:
            # missing msd on a <w>: log the sentence id to stdout
            print(self.id)
        # save word entry
        self.words.append(['F{}.{}'.format(self.id, idx), token, lemma, xpos])
        # save for text
        self.text += word.text

    def to_conllu(self):
        # Render the collected rows as tab-separated lines; None -> "_".
        lines = []
        # lines.append('# sent_id = ' + self.id)
        # CONLLu does not like spaces at the end of # text
        # lines.append('# text = ' + self.text.strip())
        for word in self.words:
            lines.append('\t'.join('_' if data is None else data for data in word))
        return lines
def convert_file(in_file, out_file):
    """Convert one parsed TEI xml file into a CoNLL-U-like TSV file.

    in_file: path to the TEI xml; out_file: path of the tsv to write.
    Raises RuntimeError when no sentences are found.
    """
    print("Nalaganje xml: {}".format(in_file))
    # Read as *bytes* and decode explicitly: the original opened the file
    # in text mode and called .decode(), which only works on Python 2
    # (str.decode does not exist on Python 3 text strings), while the
    # write path below is already version-aware. Binary read + decode
    # behaves identically on both interpreters.
    with open(str(in_file), 'rb') as fp:
        uni_str = fp.read().decode("utf-8")
    # Drop the default namespace and the "xml:" prefix so plain tag names
    # can be used in iterfind()/tag comparisons below.
    xmlstring = re.sub(' xmlns="[^"]+"', '', uni_str, count=1)
    xmlstring = xmlstring.replace(' xml:', ' ')
    print(xmlstring[:1000])  # debug: peek at the de-namespaced head
    xml_tree = ElementTree.XML(xmlstring)
    print("Pretvarjanje TEI -> TSV-U ...")
    lines = []
    for pidx, paragraph in enumerate(xml_tree.iterfind('.//body/p')):
        sidx = 1
        for sentence in paragraph:
            if sentence.tag != SENTENCE_TAG:
                continue
            sentence = Sentence(sentence, "{}.{}".format(pidx + 1, sidx))
            lines.extend(sentence.to_conllu())
            lines.append('')  # ASK newline between sentences
            sidx += 1
    if len(lines) == 0:
        raise RuntimeError("Nobenih stavkov najdenih")
    print("Zapisovanje izhodne datoteke: {}".format(out_file))
    with open(out_file, 'w') as fp:
        for line in lines:
            if sys.version_info < (3, 0):
                # Python 2 needs explicit utf-8 encoding before print
                line = line.encode('utf-8')
            print(line, file=fp)
if __name__ == "__main__":
    """
    Input: folder of TEI files, msds are encoded as msd="Z"
    Ouput: just a folder
    """
    # Hard-coded single-file smoke test; everything after sys.exit() is
    # currently unreachable.
    infile = "/home/kristjan/kres_mount/kres_parsed/tei/F0025751.xml.parsed.xml"
    outfile = "test.out"
    convert_file(infile, outfile)
    sys.exit()
    # Dead code below: intended batch mode (in_folder out_folder n_procs);
    # num_processes and in_out are collected but never used.
    in_folder = sys.argv[1]
    out_folder = sys.argv[2]
    num_processes = int(sys.argv[3])
    files = Path(in_folder).rglob("*.xml")
    in_out = []
    for filename in files:
        out_file = out_folder + "/" + filename.name[:-4] + ".txt"
        convert_file(filename, out_file)

View File

@@ -0,0 +1,19 @@
import os
# INPATH = Path(config["tools"]["giga_srl"])
# infiles = list(enumerate([x for x in iter(sorted(INPATH.iterdir())) if x.is_file()]))
# One-off repair script: for every chunk index whose SRL output
# (giga%07d.srl.tsv) is missing in INPATH, copy the source tsv chunk
# from SOURCEPATH into OUTPATH so it can be re-tagged.
SOURCEPATH = '/media/luka/Portable Disk/Datasets/gigafida_jos/gf2_files'
from shutil import copyfile
INPATH = '/media/luka/Portable Disk/Datasets/gigafida_jos/2_srl'
OUTPATH = '/home/luka/Development/srl/cjvt-srl-tagging/data/giga_out/1_tsv'
for i in range(100000):
    # print(os.path.join(INPATH, 'giga.%07d.tsv' % i))
    # if not os.path.exists(os.path.join(INPATH, 'giga.%07d.tsv' % i)):
    #     print('giga.%07d.tsv' % i)
    if not os.path.exists(os.path.join(INPATH, 'giga%07d.srl.tsv' % i)):
        # NOTE(review): the source name has a dot (giga.%07d.tsv) while the
        # destination drops it (giga%07d.tsv) -- confirm this is intended.
        copyfile(os.path.join(SOURCEPATH, 'giga.%07d.tsv' % i), os.path.join(OUTPATH, 'giga%07d.tsv' % i))
        print('giga%07d.srl.tsv' % i)
    if i % 1000 == 0:
        # progress marker every 1000 indices
        print(i)

View File

View File

@@ -51,4 +51,4 @@ if __name__ == "__main__":
print(i, df.shape)
print(ndf.head())
ndf.to_pickle(OUTFILE)
ndf.to_pickle(Path(OUTFILE))

View File

@@ -27,4 +27,6 @@ if __name__ == "__main__":
clf_full = DecisionTreeClassifier()
clf_full.fit(X, y)
pickle.dump(clf_full, open(OUTFILE, "wb"))
with open(OUTFILE, "wb") as fp:
pickle.dump(clf_full, fp)

View File

@@ -0,0 +1,192 @@
import pickle
from parser.parser import Parser
import os
from os.path import join, dirname
from pathlib import Path
import re
import sys
import cProfile
import configparser
import logging
from multiprocessing import Pool
SSJ500K_2_1 = 27829  # number of sentences
par = Parser()
# path to data: tools.cfg selects either the kres or the gigafida layout
config = configparser.ConfigParser()
config.read("tools.cfg")
analysis = ''
if 'kres_orig' in config["tools"]:
    # kres layout: one input dir, one tsv output dir
    analysis = 'kres'
    INDIR = Path(config["tools"]["kres_orig"])
    OUTDIR = Path(config["tools"]["kres_tsv"])
elif 'giga_orig' in config["tools"]:
    # analysis = 'gigafida'
    # gigafida layout: one large token file (giga_orig), the original TEI
    # files (giga), JOS annotations (giga_jos), split into giga_parts chunks
    analysis = 'giga'
    INDIR_GIGA = Path(config["tools"]["giga_orig"])
    INDIR_GIGA_ORIG = Path(config["tools"]["giga"])
    INDIR_JOS = Path(config["tools"]["giga_jos"])
    OUTDIR = Path(config["tools"]["giga_tsv"])
    GIGA_PARTS = int(config["tools"]["giga_parts"])
INTERNAL_DATA = config["tools"]["internal_data"]
CPU_CORES = int(config["tools"]["cpu_cores"])
LOGFILE = Path(config["tools"]["logfile"]).absolute()
# make sure the logfile exists before logging is configured
LOGFILE.touch(exist_ok=True)
LOGFILE.resolve()
logging.basicConfig(filename=str(LOGFILE), level=logging.INFO)
# collect every original TEI file (recursively), then index them sorted
origfiles = []
for subdir, dirs, files in os.walk(INDIR_GIGA_ORIG):
    for file in files:
        origfiles.append(Path(os.path.join(subdir, file)))
origfiles = list(enumerate(sorted(origfiles)))
def giga_orig_sentence_generator():
    """Yield one surface sentence per blank-line-delimited block of the
    big token-per-line gigafida file INDIR_GIGA (tokens joined by spaces).

    NOTE(review): a trailing block not followed by a blank line is never
    yielded -- the input is assumed to end with an empty line.
    """
    with open(INDIR_GIGA, 'r') as token_file:
        collected = []
        for raw_line in token_file:
            if raw_line != '\n':
                # first tab-separated column is the token itself
                collected.append(raw_line.split('\t')[0])
            else:
                yield ' '.join(collected)
                collected = []
sentence_generator = giga_orig_sentence_generator()
sentence_ids = []
# Walk the parsed TEI files in sorted order and align each parsed sentence
# with the next sentence from the flat token file; on a match record the
# TEI sentence id, otherwise print both sides for manual inspection.
for origfile in origfiles:
    split_file_sentences = par.parse_tei(origfile[1])
    for k, v in split_file_sentences.items():
        one_file_sentence = next(sentence_generator)
        if one_file_sentence == v['text']:
            sentence_ids.append(v['sid'])
        else:
            print('----------------')
            print('ERROR')
            print(v['sid'])
            print(one_file_sentence)
            print(v['text'])
            print(origfile[0])
# count sentences in orig (if not counted before)
# os.remove(os.path.join(INTERNAL_DATA, 'orig_counted_sentences.pkl'))
# Persist the aligned id list, replacing any previous run's pickle.
if os.path.exists(os.path.join(INTERNAL_DATA, 'sentence_ids_list.pkl')):
    os.remove(os.path.join(INTERNAL_DATA, 'sentence_ids_list.pkl'))
with open(os.path.join(INTERNAL_DATA, 'sentence_ids_list.pkl'), 'wb') as output:
    pickle.dump(sentence_ids, output)
# def giga_orig_generator():
# with open(INDIR_GIGA, 'r') as gof:
# previous_new_line = False
# for l_gof in gof:
# if l_gof == '\n':
# if previous_new_line:
# continue
# previous_new_line = True
# elif previous_new_line:
# previous_new_line = False
# yield l_gof
# import time
# def handle_giga_file(ran):
# """
# File that splits big text file into more minor files. Only split on empty lines.
# """
# # with open(INDIR_GIGA, 'r') as gof:
# # with open(INDIR_JOS, 'r') as gjf:
# # for i, (l_gof, l_gjf) in enumerate(zip(gof, gjf)):
# # pass
# # num_lines = i + 1
# # print(num_lines)
# num_lines = 1393184026
# # 1393184026
# # 1393184033
# # return
# num_lines_per_part = num_lines / GIGA_PARTS
# curr_part = 0
# gof_generator = giga_orig_generator()
#
# diff_files = set()
# # with open(INDIR_GIGA, 'r') as gof:
# with open(INDIR_GIGA_OLD, 'r') as gjf:
# # sentence = {}
# # sentence['tokens'] = []
# # sentence['links'] = {}
# # if os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % ran[0])):
# # os.remove(os.path.join(OUTDIR, 'giga%07d.tsv' % ran[0]))
#
# # wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % ran[0]), 'a')
#
# # for i, (l_gof, l_gjf) in enumerate(zip(gof, gjf)):
# for i, l_gjf in enumerate(gjf):
# l_gof = next(gof_generator)
# if curr_part < ran[0]:
# if i > num_lines_per_part * (curr_part + 1) and l_gof == '\n' :
# if curr_part < ran[0]:
# print(curr_part)
# curr_part += 1
# continue
# else:
# continue
#
# l_gof_split = l_gof.split('\t')
# l_gjf_split = l_gjf.split('\t')
#
# # if punctuation
# if l_gof != '\n':
# if l_gof_split != l_gjf_split:
# print(curr_part)
# diff_files.add(curr_part)
# l_gof = next(gof_generator)
#
#
# # if l_gof == '\n':
# else:
# # wf.flush()
# # if i > num_lines_per_part * (curr_part + 1) and l_gof == '\n':
# if i > num_lines_per_part * (curr_part + 1):
# curr_part += 1
# # if wf doesn't exist (first one)
# # wf.close()
# if curr_part >= ran[1]:
# break
# # if os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part)):
# # os.remove(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part))
#
# # wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part), 'a')
#
# curr_part += 1
# return diff_files
# # wf.close()
#
# with Pool(CPU_CORES) as p:
# final_range = [0, 100000]
# # final_range = [0, 150]
# # size_per_proc = (final_range[1] - final_range[0]) / CPU_CORES
# # # splits = [int(final_range[0] + size_per_proc) for i in range(CPU_CORES)]
# # ranges = []
# # ps = None
# # for i in range(CPU_CORES):
# # s = int(final_range[0] + size_per_proc * i)
# # ns = int(final_range[0] + size_per_proc * (i + 1))
# # ranges.append([s, ns])
# # # ranges = [[0, 1]]
# # res = p.map(handle_giga_file, ranges)
#
# res = handle_giga_file(final_range)
# res = sorted(list(res))
# if os.path.exists(os.path.join(INTERNAL_DATA, 'diffs_updated_gigafida.pkl')):
# os.remove(os.path.join(INTERNAL_DATA, 'diffs_updated_gigafida.pkl'))
# with open(os.path.join(INTERNAL_DATA, 'diffs_updated_gigafida.pkl'), 'wb') as pkl_file:
# pickle.dump(res, pkl_file)
# # with open(os.path.join(INTERNAL_DATA, 'diffs_updated_gigafida.pkl'), 'rb') as pkl_file:
# # mydict2 = pickle.load(pkl_file)
# print('test')

114
tools/gen_json.kres.py Normal file
View File

@@ -0,0 +1,114 @@
from pathlib import Path
from parser.parser import Parser
import configparser
import json
import sys
import logging
from multiprocessing import Pool
# parse config: paths, parallelism and logging all come from tools.cfg
config = configparser.ConfigParser()
config.read("tools.cfg")
# ORIGPATH = Path(config["tools"]["kres_orig"])
INPATH = Path(config["tools"]["giga_srl"])
OUTPATH = Path(config["tools"]["kres_json"])
DEBUG = config["tools"]["debug"] == "True"
CPU_CORES = int(config["tools"]["cpu_cores"])
LOGFILE = Path(config["tools"]["logfile"]).absolute()
# make sure the logfile exists before logging is configured
LOGFILE.touch(exist_ok=True)
LOGFILE.resolve()
logging.basicConfig(filename=str(LOGFILE), level=logging.INFO)
def get_origfile(filename):
    """Return the original corpus file whose basename (up to the first
    dot) matches the given srl tsv file; raise FileNotFoundError if none.

    NOTE(review): ORIGPATH is commented out in the config section above,
    so as written this raises NameError when called -- confirm which
    config key ORIGPATH should be read from.
    """
    for origfile in ORIGPATH.iterdir():
        if filename.name.split('.')[0] == origfile.name.split('.')[0]:
            return origfile
    raise FileNotFoundError
def extract_sentences(line_reader):
    """Group raw tsv byte lines into sentences.

    line_reader yields byte strings, each ending in a newline; a line
    with no tab-separated fields (an empty line) terminates a sentence.
    Yields one list of token rows (lists of column strings) per sentence.
    """
    current = []
    for raw in line_reader:
        # drop the trailing '\n', then split the row on tabs
        fields = raw.decode("utf-8")[:-1].split('\t')
        if len(fields) != 1:
            current.append(fields)
        else:
            # blank line: emit the sentence collected so far
            finished, current = current, []
            yield finished
def to_sentence(sentence_arr):
    """Join the token column (index 1) of each row into a plain sentence."""
    tokens = [row[1] for row in sentence_arr]
    return " ".join(tokens)
def match_sentence_id(sentence, orig_dict):
    """Return the key of the orig_dict entry whose token column 2 values
    join (space-separated) to the given sentence string.

    Raises KeyError when no entry matches.
    """
    for sid, entry in orig_dict.items():
        candidate = " ".join(token[2] for token in entry["tokens"])
        if candidate == sentence:
            return sid
    raise KeyError
def get_dep_rel(token):
    """Return the first SRL argument found after column 13 of a token row,
    or None when the row carries no argument.

    The dict holds the argument label ("arg"), the 0-based index of the
    predicate it belongs to ("from"), and this token's id ("dep").
    """
    logging.debug(token)
    for pred_idx, label in enumerate(token[14:]):
        if label == "_":
            continue
        return {
            "arg": label,
            "from": pred_idx,  # i-th predicate in sentence
            "dep": token[0],
        }
    return None
def handle_file(infile_tpl):
    """Convert one SRL tsv file into a json file mapping sentence ids to
    their semantic-role relations.

    infile_tpl: (index, Path) pair as produced by enumerate() below.
    Relies on module globals OUTPATH, par, DEBUG and on get_origfile().
    """
    i = infile_tpl[0]
    infile = infile_tpl[1]
    outfile = (OUTPATH / infile.name).with_suffix(".json")
    origfile = get_origfile(infile)
    orig_dict = par.parse_tei(origfile)
    with infile.open("rb") as fp:
        outdata = {}
        for sentence_arr in extract_sentences(fp.readlines()):
            # tsv dropped sentence ids, match the ID, using original data
            sid = match_sentence_id(to_sentence(sentence_arr), orig_dict)
            outdata[sid] = []
            # find all predicate indices in the sentence
            predicates = []
            for token in sentence_arr:
                if token[12] == "Y":
                    predicates += [token[0]]  # idx
                deprel = get_dep_rel(token)
                if deprel is not None:
                    outdata[sid].append(deprel)
            # deprel["from"] points to n-th predicate
            # replace with predicate's token index
            for deprel in outdata[sid]:
                deprel["from"] = predicates[deprel["from"]]
            if DEBUG:
                print(to_sentence(sentence_arr))
                print(outdata[sid])
                print(sid)
                print()
                print()
    with outfile.open("w") as fp:
        json.dump(outdata, fp)
    logging.info("SRL relations written to: {}".format(outfile))
# main: fan the tsv chunks out over a process pool
par = Parser()
OUTPATH.mkdir(exist_ok=True)
# every regular file in INPATH is one tsv chunk
infiles = list(enumerate([x for x in INPATH.iterdir() if x.is_file()]))
logging.info("Generating JSON SRL files from {} tsv files.".format(len(infiles)))
with Pool(CPU_CORES) as p:
    p.map(handle_file, infiles)
logging.info("Finished generating .json files.")

View File

@@ -1,57 +1,396 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import pickle
from pathlib import Path
from parser.parser import Parser
import configparser
# defaults
ORIGPATH = Path("../data/kres_example") # we need the IDs
INPATH = Path("../data/kres_example_srl")
OUTPATH = Path("../data/kres_example_json")
import json
import sys
import logging
from multiprocessing import Pool
# parse config
config = configparser.ConfigParser()
config.read("tools.cfg")
ORIGPATH = Path(config["tools"]["kres_orig"])
INPATH = Path(config["tools"]["kres_srl"])
OUTPATH = Path(config["tools"]["kres_json"])
ORIGPATH = Path(config["tools"]["giga"])
INPATH = Path(config["tools"]["giga_srl"])
OUTPATH = Path(config["tools"]["giga_json"])
INTERNAL_DATA = Path(config["tools"]["internal_data"])
DEBUG = config["tools"]["debug"] == "True"
CPU_CORES = int(config["tools"]["cpu_cores"])
LOGFILE = Path(config["tools"]["logfile"]).absolute()
LOGFILE.touch(exist_ok=True)
LOGFILE.resolve()
logging.basicConfig(filename=str(LOGFILE), level=logging.INFO)
def get_origfile(filename):
for origfile in ORIGPATH.iterdir():
if filename.name.split('.')[0] == origfile.name.split('.')[0]:
return origfile
raise FileNotFoundError
for origfile in ORIGPATH.iterdir():
if filename.name.split('.')[0] == origfile.name.split('.')[0]:
return origfile
raise FileNotFoundError
def extract_sentences(line_reader):
acc = []
for line in [x.decode("utf-8").split('\t') for x in line_reader]:
if line[0] == '\n':
tmp = acc
acc = []
yield tmp
else:
acc.append(line)
acc = []
# last char in line is \n, remove it
for line in [x.decode("utf-8")[:-1].split('\t') for x in line_reader]:
if len(line) == 1: # empty line
tmp = acc
acc = []
yield tmp
else:
acc.append(line)
def match_sentence_id(string, rd):
str1 = " ".join([token[1] for token in sentence_arr])
for k, e in rd.items():
str2 = " ".join(token[2] for token in dict_entry["tokens"])
if str1 == str2:
return k
raise KeyError
def to_sentence(sentence_arr):
return " ".join([token[1] for token in sentence_arr])
def match_sentence_id(sentence, orig_dict):
for k, e in orig_dict.items():
orig_sentence = " ".join(token[2] for token in e["tokens"])
if sentence == orig_sentence:
return k
raise KeyError
def match_sentence_id_giga(sentence, orig_dict):
for k, e in orig_dict.items():
# orig_sentence = " ".join(token[2] for token in e["tokens"])
if sentence == e["text"]:
return k
raise KeyError
def get_dep_rel(token):
logging.debug(token)
for i, field in enumerate(token[14:]):
if field != "_":
return {
"arg": field,
"from": i, # i-th predicate in sentence
"dep": token[0],
}
return None
def handle_file_old(infile_tpl):
i = infile_tpl[0]
infile = infile_tpl[1]
outfile = (OUTPATH / infile.name).with_suffix(".json")
origfile = get_origfile(infile)
orig_dict = par.parse_tei(origfile)
with infile.open("rb") as fp:
outdata = {}
for sentence_arr in extract_sentences(fp.readlines()):
# tsv dropped sentence ids, match the ID, using original data
sid = match_sentence_id(to_sentence(sentence_arr), orig_dict)
outdata[sid] = []
# find all predicate indices in the sentence
predicates = []
for token in sentence_arr:
if token[12] == "Y":
predicates += [token[0]] # idx
deprel = get_dep_rel(token)
if deprel is not None:
outdata[sid].append(deprel)
# deprel["from"] points to n-th predicate
# replace with predicate's token index
for deprel in outdata[sid]:
deprel["from"] = predicates[deprel["from"]]
if DEBUG:
print(to_sentence(sentence_arr))
print(outdata[sid])
print(sid)
print()
print()
with outfile.open("w") as fp:
json.dump(outdata, fp)
logging.info("SRL relations written to: {}".format(outfile))
if __name__ == "__main__":
def handle_file(whole_input):
# sentence_id = whole_input[0][3]
# orig_infile = whole_input[0][1]
sentence_id = whole_input[3]
orig_infile = whole_input[1]
par = Parser()
# origfile = origfiles[0][1]
# infile_tpl = infile_tpl[0]
for infile in [x for x in INPATH.iterdir() if x.is_file()]:
origfile = get_origfile(infile)
rd = par.parse_tei(origfile)
# i = infile_tpl[0]
# infile = infile_tpl[1]
outfile = (OUTPATH / orig_infile.name).with_suffix(".json")
fp = infile.open("rb")
for sentence_arr in extract_sentences(fp.readlines()):
sid = match_sentence_id(sentence_arr, rd)
print(sid)
# OK, we got the sentence id, now generate the predicate map!
if outfile.exists():
return
# origfile = get_origfile()
orig_dict = par.parse_tei(orig_infile)
outdata = {}
gen = srl_multiple_files_sentences_generator(sentence_id)
# gen = srl_multiple_files_sentences_generator(whole_input[1])
mismatch_sentences = 0
for sentence_i, (orig_id, orig_val) in enumerate(orig_dict.items()):
if orig_id == 'GF0014802.2685.7':
print('PAUSE')
# look at neighbouring sentences if they are correct
for i in range(100):
sentence, sentence_arr = next(gen)
# orig_sentence = " ".join(token[2] for token in e["tokens"])
if sentence == orig_val["text"]:
# if i != 10 and i != 0:
# print('OK!')
sid = orig_id
outdata[sid] = []
# find all predicate indices in the sentence
predicates = []
for token in sentence_arr:
if token[12] == "Y":
predicates += [token[0]] # idx
deprel = get_dep_rel(token)
if deprel is not None:
outdata[sid].append(deprel)
# deprel["from"] points to n-th predicate
# replace with predicate's token index
for deprel in outdata[sid]:
deprel["from"] = predicates[deprel["from"]]
if DEBUG:
print(to_sentence(sentence_arr))
print(outdata[sid])
print(sid)
print()
print()
break
else:
if i == 99:
mismatch_sentences += 1
sid = orig_id
outdata[sid] = []
gen = srl_multiple_files_sentences_generator(sentence_id + sentence_i)
if mismatch_sentences > 0:
if mismatch_sentences / len(orig_dict.items()) < 0.1:
print('Slight mismatch - %d' % sentence_id)
print(whole_input)
print('ABS mitigated %d' % mismatch_sentences)
print('------------------------------------------------')
else:
print('ERRRRRRRRRRRRRRRROOOOOOORRRRRRRRRRR')
print('Big mismatch - %d' % sentence_id)
print(whole_input)
print('ABS mitigated errors:')
print(mismatch_sentences)
print('------------------------------------------------')
outfile = (OUTPATH / infile.name).with_suffix(".json")
with outfile.open("w") as fp:
json.dump(outdata, fp)
logging.info("SRL relations written to: {}".format(outfile))
def count_orig_file_sentences(filename):
if os.path.exists(os.path.join(INTERNAL_DATA, 'orig_chunks', filename[1].name)):
return
print(filename[0])
orig_dict = par.parse_tei(filename[1])
# return filename[0], filename[1], len(orig_dict)
with open(os.path.join(INTERNAL_DATA, 'orig_chunks', filename[1].name), 'wb') as output:
pickle.dump((filename[0], filename[1], len(orig_dict)), output)
def count_srl_file_sentences(filename):
if os.path.exists(os.path.join(INTERNAL_DATA, 'srl_chunks', filename[1].name)):
return
print(filename[0])
num_sentences = 0
with filename[1].open("r") as fp:
for line in fp:
if line == '\n':
num_sentences += 1
# return filename[0], filename[1], num_sentences
with open(os.path.join(INTERNAL_DATA, 'srl_chunks', filename[1].name), 'wb') as output:
pickle.dump((filename[0], filename[1], num_sentences), output)
def srl_sentences_generator(infile, curr_index, sen_start_index):
with infile.open("rb") as fp:
outdata = {}
for sentence_arr in extract_sentences(fp.readlines()):
if curr_index < sen_start_index:
curr_index += 1
else:
yield to_sentence(sentence_arr), sentence_arr
yield None
def srl_multiple_files_sentences_generator(sentence_id): # srl_files):
sentence_id = max(0, sentence_id - 10)
for i, srl_file in enumerate(srl_file_sizes):
if sentence_id >= srl_file[3] and sentence_id < srl_file[3] + srl_file[2]:
srl_files = srl_file_sizes[i:]
break
for file_info in srl_files:
# srl_gen = srl_sentences_generator(file_info[1], file_info[3], file_info[4])
srl_gen = srl_sentences_generator(file_info[1], file_info[3], sentence_id)
el = next(srl_gen)
while el is not None:
yield el
el = next(srl_gen)
yield None
# main
par = Parser()
OUTPATH.mkdir(exist_ok=True)
infiles = list(enumerate([x for x in iter(sorted(INPATH.iterdir())) if x.is_file()]))
logging.info("Generating JSON SRL files from {} tsv files.".format(len(infiles)))
origfiles = []
for subdir, dirs, files in os.walk(ORIGPATH):
for file in files:
origfiles.append(Path(os.path.join(subdir, file)))
origfiles=list(enumerate(sorted(origfiles)))
##### REMOVE ############
# origfiles = origfiles[:3]
# count sentences in orig (if not counted before)
# os.remove(os.path.join(INTERNAL_DATA, 'orig_counted_sentences.pkl'))
if not os.path.exists(os.path.join(INTERNAL_DATA, 'orig_counted_sentences.pkl')):
# srl_file_sizes = {}
if not os.path.exists(os.path.join(INTERNAL_DATA, 'orig_chunks')):
os.makedirs(os.path.join(INTERNAL_DATA, 'orig_chunks'))
# with Pool(CPU_CORES) as p:
# # p.map(handle_file, infiles)
# p.map(count_orig_file_sentences, origfiles)
for i in range(len(origfiles)):
count_orig_file_sentences(origfiles[i])
orig_file_sizes = []
for x in iter(sorted(Path(os.path.join(INTERNAL_DATA, 'orig_chunks')).iterdir())):
print(x.name)
if x.is_file():
with x.open('rb') as pkl_small_file:
orig_file_sizes.append(pickle.load(pkl_small_file))
# orig_file_sizes = list(enumerate([x for x in iter(sorted(INPATH.iterdir())) if x.is_file()]))
print("Sorting orig files")
orig_file_sizes = sorted(orig_file_sizes)
total_size = 0
orig_file_sizes_final = []
print("Calculating orig files size")
for n, pa, si in orig_file_sizes:
orig_file_sizes_final.append((n, pa, si, total_size))
total_size += si
orig_file_sizes = orig_file_sizes_final
print("Saving orig files size")
with open(os.path.join(INTERNAL_DATA, 'orig_counted_sentences.pkl'), 'wb') as output:
pickle.dump(orig_file_sizes, output)
print("Orig files saved")
else:
with open(os.path.join(INTERNAL_DATA, 'orig_counted_sentences.pkl'), 'rb') as pkl_file:
orig_file_sizes = pickle.load(pkl_file)
# count sentences in srl (if not counted before)
# os.remove(os.path.join(INTERNAL_DATA, 'srl_counted_sentences.pkl'))
if not os.path.exists(os.path.join(INTERNAL_DATA, 'srl_counted_sentences.pkl')):
# srl_file_sizes = {}
if not os.path.exists(os.path.join(INTERNAL_DATA, 'srl_chunks')):
os.makedirs(os.path.join(INTERNAL_DATA, 'srl_chunks'))
# with Pool(CPU_CORES) as p:
# # p.map(handle_file, infiles)
# p.map(count_srl_file_sentences, infiles)
for i in range(len(infiles)):
count_srl_file_sentences(infiles[i])
srl_file_sizes = []
for x in iter(sorted(Path(os.path.join(INTERNAL_DATA, 'srl_chunks')).iterdir())):
print(x.name)
if x.is_file():
with x.open('rb') as pkl_small_file:
srl_file_sizes.append(pickle.load(pkl_small_file))
print("Sorting srl files")
srl_file_sizes = sorted(srl_file_sizes)
total_size = 0
srl_file_sizes_final = []
print("Calculating srl files size")
for n, pa, si in srl_file_sizes:
srl_file_sizes_final.append((n, pa, si, total_size))
total_size += si
srl_file_sizes = srl_file_sizes_final
print("Saving srl files size")
with open(os.path.join(INTERNAL_DATA, 'srl_counted_sentences.pkl'), 'wb') as output:
pickle.dump(srl_file_sizes, output)
print("Srl files saved")
else:
with open(os.path.join(INTERNAL_DATA, 'srl_counted_sentences.pkl'), 'rb') as pkl_file:
srl_file_sizes = pickle.load(pkl_file)
# print(len(orig_file_sizes))
# print('asd' + 2)
# inputs = []
# srl_i = 0
# srl_file = srl_file_sizes[srl_i]
# for orig_i, orig_path, orig_size, orig_first_sent_i in orig_file_sizes:
# interesting_srl_files = []
# # beginning of srl chunk in range of orig chunk or ending of srl chunk in range of orig chunk
# # while srl_file[3] >= orig_first_sent_i and srl_file[3] < orig_first_sent_i + orig_size or \
# # srl_file[3] + srl_file[2] - 1 >= orig_first_sent_i and srl_file[3] + srl_file[2] - 1 < orig_first_sent_i + orig_size:
# while srl_file[3] < orig_first_sent_i + orig_size and srl_file[3] + srl_file[2] > orig_first_sent_i:
# # if beginning of file is in
# if srl_file[3] > orig_first_sent_i:
# interesting_srl_files.append((srl_file[0], srl_file[1], srl_file[2], srl_file[3], srl_file[3]))
# # print('if %d' % srl_file[3])
# else:
# interesting_srl_files.append((srl_file[0], srl_file[1], srl_file[2], srl_file[3], orig_first_sent_i))
# # print('else %d' % orig_first_sent_i)
#
# if orig_first_sent_i + orig_size >= srl_file[3] + srl_file[2]:
# srl_i += 1
# if srl_i < len(srl_file_sizes):
# srl_file = srl_file_sizes[srl_i]
# else:
# break
# # print(srl_i)
# # print('a ' + 2)
# else:
# break
#
# inputs.append([[orig_i, orig_path, orig_size, orig_first_sent_i], interesting_srl_files])
# print(inputs[-1])
# srl_gen = srl_sentences_generator(srl_file_sizes[0][1], 0, 533)
# a = next(srl_gen)
# b = next(srl_gen)
# c = next(srl_gen)
print('beginning processing')
with Pool(CPU_CORES) as p:
# p.map(handle_file, inputs)
p.map(handle_file, orig_file_sizes)
# for of in orig_file_sizes:
# handle_file(of)
logging.info("Finished generating .json files.")

View File

@@ -0,0 +1,294 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import pickle
from pathlib import Path
from parser.parser import Parser
import configparser
import json
import sys
import logging
from multiprocessing import Pool
# parse config: paths, parallelism and logging all come from tools.cfg
config = configparser.ConfigParser()
config.read("tools.cfg")
ORIGPATH = Path(config["tools"]["giga"])
INPATH = Path(config["tools"]["giga_srl_errors"])
OUTPATH = Path(config["tools"]["giga_json"])
INTERNAL_DATA = Path(config["tools"]["internal_data"])
DEBUG = config["tools"]["debug"] == "True"
CPU_CORES = int(config["tools"]["cpu_cores"])
LOGFILE = Path(config["tools"]["logfile"]).absolute()
# make sure the logfile exists before logging is configured
LOGFILE.touch(exist_ok=True)
LOGFILE.resolve()
logging.basicConfig(filename=str(LOGFILE), level=logging.INFO)
# ids of sentences that previously failed token-count validation
# NOTE(review): the file handle opened inline here is never closed
error_sentences = [line.rstrip('\n') for line in open(os.path.join(INTERNAL_DATA, 'sentences_with_less_than_token.txt'))]
def get_origfile(filename):
    """Return the file in ORIGPATH whose basename (up to the first dot)
    matches the given srl tsv file; raise FileNotFoundError if none."""
    for origfile in ORIGPATH.iterdir():
        if filename.name.split('.')[0] == origfile.name.split('.')[0]:
            return origfile
    raise FileNotFoundError
def extract_sentences(line_reader):
    """Split decoded tsv byte lines into per-sentence lists of rows.

    An empty line (one with no tab-separated fields) closes the current
    sentence; each yielded sentence is a list of column-string lists.
    """
    sentence_rows = []
    for raw_line in line_reader:
        # strip the trailing newline, then split on tabs
        columns = raw_line.decode("utf-8")[:-1].split('\t')
        if len(columns) == 1:  # empty line -> sentence boundary
            completed = sentence_rows
            sentence_rows = []
            yield completed
        else:
            sentence_rows.append(columns)
def to_sentence(sentence_arr):
    """Reconstruct the sentence text from column 1 of each token row."""
    return " ".join(row[1] for row in sentence_arr)
def match_sentence_id(sentence, orig_dict):
    """Look up the id whose token column 2 values join to `sentence`.

    Raises KeyError when no entry matches.
    """
    for key in orig_dict:
        joined = " ".join(tok[2] for tok in orig_dict[key]["tokens"])
        if joined == sentence:
            return key
    raise KeyError
def match_sentence_id_giga(sentence, orig_dict):
    """Return the key of the entry whose precomputed "text" field equals
    the given sentence string; raise KeyError when nothing matches."""
    for key, entry in orig_dict.items():
        # orig_sentence = " ".join(token[2] for token in e["tokens"])
        if entry["text"] == sentence:
            return key
    raise KeyError
def get_dep_rel(token):
    """Return the first non-"_" SRL argument after column 13 as a dict
    ({"arg", "from", "dep"}), or None when the row has no argument."""
    logging.debug(token)
    arg_columns = token[14:]
    for position in range(len(arg_columns)):
        if arg_columns[position] != "_":
            return {
                "arg": arg_columns[position],
                "from": position,  # i-th predicate in sentence
                "dep": token[0],
            }
    return None
def handle_file_old(infile_tpl):
    """Previous-generation converter: one SRL tsv file -> one json file
    mapping sentence ids to semantic-role relations.

    infile_tpl: (index, Path) pair. Relies on module globals OUTPATH,
    par, DEBUG and on get_origfile(). Kept for reference; the error-fix
    path below uses fix_json() instead.
    """
    i = infile_tpl[0]
    infile = infile_tpl[1]
    outfile = (OUTPATH / infile.name).with_suffix(".json")
    origfile = get_origfile(infile)
    orig_dict = par.parse_tei(origfile)
    with infile.open("rb") as fp:
        outdata = {}
        for sentence_arr in extract_sentences(fp.readlines()):
            # tsv dropped sentence ids, match the ID, using original data
            sid = match_sentence_id(to_sentence(sentence_arr), orig_dict)
            outdata[sid] = []
            # find all predicate indices in the sentence
            predicates = []
            for token in sentence_arr:
                if token[12] == "Y":
                    predicates += [token[0]]  # idx
                deprel = get_dep_rel(token)
                if deprel is not None:
                    outdata[sid].append(deprel)
            # deprel["from"] points to n-th predicate
            # replace with predicate's token index
            for deprel in outdata[sid]:
                deprel["from"] = predicates[deprel["from"]]
            if DEBUG:
                print(to_sentence(sentence_arr))
                print(outdata[sid])
                print(sid)
                print()
                print()
    with outfile.open("w") as fp:
        json.dump(outdata, fp)
    logging.info("SRL relations written to: {}".format(outfile))
def fix_json(srl_gen, error_sentence, orig_json_data):
    # Re-derive the SRL relations for one known-bad sentence id and patch
    # them into orig_json_data, which is returned (and also mutated in place).
    # srl_gen yields (sentence_text, token_rows) pairs in the same order as
    # the error sentences being fixed.
    # NOTE(review): indentation below is reconstructed from a flattened
    # source; the recompute steps are assumed to run only when the stored
    # relations were non-empty -- confirm against the original file.
    sentence, sentence_arr = next(srl_gen)
    sid = error_sentence
    if orig_json_data[sid] != []:
        # existing relations are suspect -- wipe and rebuild them
        orig_json_data[sid] = []
        # find all predicate indices in the sentence
        predicates = []
        for token in sentence_arr:
            if token[12] == "Y":
                predicates += [token[0]]  # idx
            deprel = get_dep_rel(token)
            if deprel is not None:
                orig_json_data[sid].append(deprel)
        # deprel["from"] points to the n-th predicate;
        # replace it with that predicate's token index
        for deprel in orig_json_data[sid]:
            deprel["from"] = predicates[deprel["from"]]
        if DEBUG:
            print(to_sentence(sentence_arr))
            print(orig_json_data[sid])
            print(sid)
            print()
            print()
    return orig_json_data
def count_orig_file_sentences(filename):
    """Cache (index, path, sentence count) of one original TEI file as a pickle.

    filename is an (index, Path) pair; the cached chunk lives under
    INTERNAL_DATA/orig_chunks and is skipped when it already exists.
    """
    cache_path = os.path.join(INTERNAL_DATA, 'orig_chunks', filename[1].name)
    if os.path.exists(cache_path):
        return
    print(filename[0])
    orig_dict = par.parse_tei(filename[1])
    with open(cache_path, 'wb') as output:
        pickle.dump((filename[0], filename[1], len(orig_dict)), output)
def count_srl_file_sentences(filename):
    """Cache (index, path, sentence count) of one SRL .tsv file as a pickle.

    Sentences are delimited by blank lines; the cached chunk lives under
    INTERNAL_DATA/srl_chunks and is skipped when it already exists.
    """
    cache_path = os.path.join(INTERNAL_DATA, 'srl_chunks', filename[1].name)
    if os.path.exists(cache_path):
        return
    print(filename[0])
    with filename[1].open("r") as fp:
        num_sentences = sum(1 for line in fp if line == '\n')
    with open(cache_path, 'wb') as output:
        pickle.dump((filename[0], filename[1], num_sentences), output)
def srl_error_fix_generator(infile):
    """Yield (sentence_text, token_rows) for every sentence in infile.

    A final None sentinel marks exhaustion, matching the convention used by
    the other srl_* generators in this file.
    """
    with infile.open("rb") as fp:
        for rows in extract_sentences(fp.readlines()):
            yield to_sentence(rows), rows
    yield None
def srl_sentences_generator(infile, curr_index, sen_start_index):
    """Yield (sentence_text, token_rows) from infile, skipping early sentences.

    Sentences are skipped until curr_index (the running sentence counter)
    catches up with sen_start_index; a final None sentinel marks exhaustion.

    Fix: removed the unused local ``outdata`` dict left over from an earlier
    revision of this function.
    """
    with infile.open("rb") as fp:
        for sentence_arr in extract_sentences(fp.readlines()):
            if curr_index < sen_start_index:
                curr_index += 1
            else:
                yield to_sentence(sentence_arr), sentence_arr
    yield None
def srl_multiple_files_sentences_generator(sentence_id):  # srl_files):
    # Yield (text, rows) sentence pairs from the chunked SRL files, starting
    # roughly 10 sentences before sentence_id, then a final None sentinel.
    # Relies on the module-level srl_file_sizes list, whose entries appear to
    # be (index, path, sentence_count, start_offset, ...) tuples -- TODO
    # confirm against the code that builds it (not visible here).
    sentence_id = max(0, sentence_id - 10)
    for i, srl_file in enumerate(srl_file_sizes):
        if sentence_id >= srl_file[3] and sentence_id < srl_file[3] + srl_file[2]:
            srl_files = srl_file_sizes[i:]
            break
    # NOTE(review): if no entry covers sentence_id, srl_files is never bound
    # and the loop below raises NameError.
    for file_info in srl_files:
        # srl_gen = srl_sentences_generator(file_info[1], file_info[3], file_info[4])
        srl_gen = srl_sentences_generator(file_info[1], file_info[3], sentence_id)
        el = next(srl_gen)
        while el is not None:
            yield el
            el = next(srl_gen)
    yield None
# Group the flat list of error sentence ids by source file: the first nine
# characters of an id name the file it came from.  Assumes error_sentences
# (defined earlier in this file, above this excerpt) is sorted so that ids
# from the same file are adjacent -- TODO confirm.
error_sentences_grouped = []
group = False  # False until the first group is started; then a list
prev_name = ''
# group sentences by their files
for name in error_sentences:
    if name[:9] == prev_name:
        group.append(name)
    else:
        prev_name = name[:9]
        if group:
            error_sentences_grouped.append(group)
        group = [name]
# flush the final group (non-empty whenever error_sentences was non-empty)
error_sentences_grouped.append(group)
# Walk every group of bad sentence ids, re-deriving their SRL relations from
# the consolidated error file (INPATH) and rewriting the affected
# <file>-dedup.json outputs in place.  srl_gen must yield sentences in the
# same order as the grouped error ids -- TODO confirm both come from the
# same sorted listing.
srl_gen = srl_error_fix_generator(INPATH)
# (a long commented-out debugging block that located unparseable json files
# was removed from the comments here)
# iterate over all wronged sentences and fix them
for errors_in_file in error_sentences_grouped:
    outfile = Path(OUTPATH, errors_in_file[0][:9] + '-dedup.json')
    with outfile.open() as json_file:
        print(outfile.name)
        orig_json_data = json.load(json_file)
    for error_sentence in errors_in_file:
        orig_json_data = fix_json(srl_gen, error_sentence, orig_json_data)
    # write the patched relations back over the original json file
    with outfile.open('w') as json_file:
        json.dump(orig_json_data, json_file)
    logging.info("SRL relations written to: {}".format(outfile))

View File

@@ -1,3 +1,5 @@
import pickle
from parser.parser import Parser
import os
from os.path import join, dirname
@@ -6,10 +8,8 @@ import re
import sys
import cProfile
import configparser
# some defaults
INDIR = Path("../data/kres_example")
OUTDIR = Path("../data/kres_example_tsv")
import logging
from multiprocessing import Pool
SSJ500K_2_1 = 27829 # number of sentences
par = Parser()
@@ -17,8 +17,28 @@ par = Parser()
# path to data
config = configparser.ConfigParser()
config.read("tools.cfg")
INDIR = Path(config["tools"]["kres_orig"])
OUTDIR = Path(config["tools"]["kres_tsv"])
analysis = ''
if 'kres_orig' in config["tools"]:
analysis = 'kres'
INDIR = Path(config["tools"]["kres_orig"])
OUTDIR = Path(config["tools"]["kres_tsv"])
elif 'giga_orig' in config["tools"]:
# analysis = 'gigafida'
analysis = 'giga'
INDIR_GIGA_ORIG = Path(config["tools"]["giga"])
INDIR_GIGA = Path(config["tools"]["giga_orig"])
INDIR_JOS = Path(config["tools"]["giga_jos"])
OUTDIR = Path(config["tools"]["giga_tsv"])
GIGA_PARTS = int(config["tools"]["giga_parts"])
INTERNAL_DATA = config["tools"]["internal_data"]
CPU_CORES = int(config["tools"]["cpu_cores"])
LOGFILE = Path(config["tools"]["logfile"]).absolute()
LOGFILE.touch(exist_ok=True)
LOGFILE.resolve()
logging.basicConfig(filename=str(LOGFILE), level=logging.INFO)
"""
print("parsing ssj")
@@ -28,22 +48,330 @@ ssj_dict = par.parse_tei(ssj_file)
print("end parsing ssj")
"""
print("parsing kres")
# kres_file = "../data/kres_example/F0019343.xml.parsed.xml"
OUTDIR.mkdir(exist_ok=True)
for kres_file in [x for x in INDIR.iterdir() if x.is_file()]:
if analysis == 'kres':
infiles = list(enumerate([x for x in INDIR.iterdir() if x.is_file()]))
logging.info("Parsing kres: {} files.".format(len(infiles)))
print("Processing file: " + str(kres_file))
res_dict = par.parse_tei(kres_file)
longest_sent = max([len(e["tokens"]) for k, e in res_dict.items()])
print("Longest sentence: ", longest_sent)
kres_out_str = ""
def handle_file(infile):
i = infile[0]
kres_file = infile[1]
outfile = (OUTDIR / kres_file.name).with_suffix(".tsv")
for _, sentence in res_dict.items():
kres_out_str += par.to_conll_2009_SRL(sentence, longest_sent)
if outfile.is_file():
logging.info("Skipping existing file: {}.".format(str(kres_file)))
return True
with (OUTDIR / kres_file.name).with_suffix(".tsv").open("wb+") as fp:
try:
res_dict = par.parse_tei(kres_file)
kres_out_str = ""
for _, sentence in res_dict.items():
kres_out_str += par.to_conll_2009_SRL(sentence)
except Exception as exc:
logging.info("Failed processing file: {}".format(str(kres_file)))
logging.error(exc)
return False
with outfile.open("wb+") as fp:
fp.write(kres_out_str.encode("utf-8"))
fp.close()
print("end parsing kres")
logging.info("Processed file ({}/{}): {}".format(i+1, len(infiles), str(kres_file)))
return True
return False
def giga_orig_generator():
    """Yield lines of the giga original dump, collapsing blank-line runs.

    Consecutive empty lines are reduced to a single one so that the stream
    stays in lockstep with the jos dump iterated alongside it.
    """
    with open(INDIR_GIGA, 'r') as gof:
        prev_blank = False
        for line in gof:
            if line == '\n':
                if prev_blank:
                    # second (or later) blank in a row -- drop it
                    continue
                prev_blank = True
            else:
                prev_blank = False
            yield line
def handle_gigafida_file():
    """
    File that splits big text file into more minor files. Only split on empty lines.
    """
    # NOTE(review): indentation below is reconstructed from a flattened
    # source -- the resume logic is intricate; verify against the original.
    # (one-off counting pass, kept for reference)
    # with open(INDIR_GIGA, 'r') as gof:
    #     with open(INDIR_JOS, 'r') as gjf:
    #         for i, (l_gof, l_gjf) in enumerate(zip(gof, gjf)):
    #             pass
    #     num_lines = i + 1
    #     print(num_lines)
    num_lines = 1393184026  # precomputed line count of the jos dump
    num_lines_per_part = num_lines / GIGA_PARTS
    curr_part = 0
    gof_generator = giga_orig_generator()
    with open(INDIR_JOS, 'r') as gjf:
        sentence = {}
        sentence['tokens'] = []
        sentence['links'] = {}
        # resume support: if part 0 already exists we fast-forward over lines
        # already processed instead of rewriting them
        if os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % 0)):
            ignore_lines = True
            wf = False  # no output file while skipping
        else:
            wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part), 'a')
            ignore_lines = False
        for i, l_gjf in enumerate(gjf):
            l_gof = next(gof_generator)
            if ignore_lines:
                if i > num_lines_per_part * curr_part and l_gof == '\n':
                    # stop skipping at the first part whose successor file is
                    # missing (that part may be incomplete)
                    if not os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % (curr_part + 2))):
                        ignore_lines = False
                        # delete last file (probably not whole)
                        os.remove(os.path.join(OUTDIR, 'giga%07d.tsv' % (curr_part + 1)))
                    if ignore_lines:
                        print(curr_part)
                        curr_part += 1
                        continue
                else:
                    continue
            l_gof_split = l_gof.split('\t')
            l_gjf_split = l_gjf.split('\t')
            if l_gof != '\n':
                # if punctuation (orig token tag appears to end in 'u')
                if l_gof_split[1][-1] == 'u':
                    sentence['tokens'].append(('c', l_gjf_split[0], l_gjf_split[1]))
                else:
                    sentence['tokens'].append(('w', l_gjf_split[0], l_gof_split[0], l_gof_split[1][:-2], l_gof_split[3][:-1]))
                sentence['links'][l_gjf_split[0]] = (l_gjf_split[7], l_gjf_split[0], l_gjf_split[6])
            else:
                # sentence boundary: flush the accumulated sentence
                if wf:
                    wf.write(par.to_conll_2009_SRL(sentence))
                sentence['tokens'] = []
                sentence['links'] = {}
                # roll over to the next output part on the size boundary
                if i > num_lines_per_part * (curr_part + 1):
                    curr_part += 1
                    # if wf doesn't exist (first one)
                    if wf:
                        wf.close()
                    wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part), 'a')
    # NOTE(review): placement of these two trailing lines is a guess; wf may
    # still be False here if the whole input was skipped.
    curr_part += 1
    wf.close()
import time
def handle_giga_file(ran):
    """
    File that splits big text file into more minor files. Only split on empty lines.
    """
    # Worker variant of handle_gigafida_file: processes only output parts in
    # the half-open range ran = [start, end), and only parts listed in the
    # module-level file_indices set.  Indentation reconstructed -- verify.
    num_lines = 1393184026  # precomputed line count of the jos dump
    num_lines_per_part = num_lines / GIGA_PARTS
    curr_part = 0
    gof_generator = giga_orig_generator()
    with open(INDIR_JOS, 'r') as gjf:
        sentence = {}
        sentence['tokens'] = []
        sentence['links'] = {}
        wf = None
        # NOTE(review): this guards on curr_part (always 0 here) but opens the
        # file for ran[0] -- looks like it should test `ran[0] in file_indices`.
        if curr_part in file_indices:
            if os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % ran[0])):
                os.remove(os.path.join(OUTDIR, 'giga%07d.tsv' % ran[0]))
            wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % ran[0]), 'a')
        for i, l_gjf in enumerate(gjf):
            l_gof = next(gof_generator)
            # fast-forward until we reach the first part of our range
            if curr_part < ran[0]:
                if i > num_lines_per_part * (curr_part + 1) and l_gof == '\n':
                    if curr_part < ran[0]:
                        print(curr_part)
                        curr_part += 1
                        continue
                else:
                    continue
            l_gof_split = l_gof.split('\t')
            l_gjf_split = l_gjf.split('\t')
            if l_gof != '\n':
                # parts not in file_indices are read but not accumulated
                if curr_part not in file_indices:
                    continue
                # if punctuation
                if l_gof_split[1][-1] == 'u':
                    sentence['tokens'].append(('c', l_gjf_split[0], l_gjf_split[1]))
                else:
                    sentence['tokens'].append(('w', l_gjf_split[0], l_gof_split[0], l_gof_split[1][:-2], l_gof_split[3][:-1]))
                sentence['links'][l_gjf_split[0]] = (l_gjf_split[7], l_gjf_split[0], l_gjf_split[6])
            else:
                # sentence boundary: flush, then roll over parts as needed
                if curr_part in file_indices:
                    wf.write(par.to_conll_2009_SRL(sentence))
                sentence['tokens'] = []
                sentence['links'] = {}
                if i > num_lines_per_part * (curr_part + 1):
                    curr_part += 1
                    # if wf doesn't exist (first one)
                    if curr_part in file_indices and wf:
                        wf.close()
                    if curr_part >= ran[1]:
                        break
                    if curr_part in file_indices:
                        if os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part)):
                            os.remove(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part))
                        wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part), 'a')
    # NOTE(review): trailing placement is a guess; wf may be None here.
    curr_part += 1
    wf.close()
def handle_giga_file_selected_sentences(error_sentences):
    """
    File that splits big text file into more minor files. Only split on empty lines.
    """
    # Re-runs only the sentences whose ids are in error_sentences, writing
    # them all to a single OUTDIR/giga_errors file.  Sentence ids come from
    # the pickled sentence_ids_list, which must be aligned 1:1 with the
    # blank-line-delimited sentences of the jos dump -- TODO confirm.
    # Indentation reconstructed from a flattened source.
    # print('num_lines' + 3)
    # num_lines = 1393184026
    num_lines = 1393222523  # precomputed line count (unused below)
    gof_generator = giga_orig_generator()
    with open(INDIR_JOS, 'r') as gjf:
        sentence = {}
        sentence['tokens'] = []
        sentence['links'] = {}
        wf = None
        # start fresh: any previous error output is discarded
        if os.path.exists(os.path.join(OUTDIR, 'giga_errors')):
            os.remove(os.path.join(OUTDIR, 'giga_errors'))
        wf = open(os.path.join(OUTDIR, 'giga_errors'), 'a')
        with open(os.path.join(INTERNAL_DATA, 'sentence_ids_list.pkl'), 'rb') as pkl_file:
            sentence_ids_list = pickle.load(pkl_file)
        sentence_id = 0
        skip_sentence = not sentence_ids_list[sentence_id] in error_sentences
        for i, l_gjf in enumerate(gjf):
            l_gof = next(gof_generator)
            if l_gjf == '\n':
                # sentence boundary: flush if this one was selected
                if not skip_sentence:
                    wf.write(par.to_conll_2009_SRL(sentence))
                sentence['tokens'] = []
                sentence['links'] = {}
                sentence_id += 1
                # NOTE(review): indexes past the end if the dump has more
                # sentences than sentence_ids_list -- IndexError hazard.
                if sentence_ids_list[sentence_id] in error_sentences:
                    print(sentence_ids_list[sentence_id])
                    skip_sentence = False
                else:
                    skip_sentence = True
            if skip_sentence:
                continue
            l_gof_split = l_gof.split('\t')
            l_gjf_split = l_gjf.split('\t')
            # if punctuation
            if l_gof != '\n':
                if l_gof_split[1][-1] == 'u':
                    sentence['tokens'].append(('c', l_gjf_split[0], l_gjf_split[1]))
                else:
                    sentence['tokens'].append(('w', l_gjf_split[0], l_gof_split[0], l_gof_split[1][:-2], l_gof_split[3][:-1]))
                sentence['links'][l_gjf_split[0]] = (l_gjf_split[7], l_gjf_split[0], l_gjf_split[6])
        # (a large commented-out block of the per-part chunking logic from
        # handle_giga_file was removed from the comments here)
    wf.close()
# Entry point: dispatch on the `analysis` mode resolved from tools.cfg.
# file_indices restricts which giga output parts the workers may (re)write;
# it is overwritten by the pickled diff list when present.
file_indices = set(range(0, 100000))
with open(os.path.join(INTERNAL_DATA, 'diffs_updated_gigafida.pkl'), 'rb') as pkl_file:
    file_indices = set(pickle.load(pkl_file))
with Pool(CPU_CORES) as p:
    if analysis == 'kres':
        p.map(handle_file, infiles)
    elif analysis == 'gigafida':
        handle_gigafida_file()
    elif analysis == 'giga':
        # split the part range evenly across workers
        final_range = [0, 100000]
        size_per_proc = (final_range[1] - final_range[0]) / CPU_CORES
        ranges = []
        ps = None
        for i in range(CPU_CORES):
            s = int(final_range[0] + size_per_proc * i)
            ns = int(final_range[0] + size_per_proc * (i + 1))
            ranges.append([s, ns])
        # NOTE(review): the parallel run is disabled -- `ranges` is computed
        # but unused while the map call below stays commented out.
        # p.map(handle_giga_file, ranges)
        # NOTE(review): this open() leaks its file handle (never closed).
        error_sentences = [line.rstrip('\n') for line in open(os.path.join(INTERNAL_DATA, 'sentences_with_less_than_token.txt'))]
        handle_giga_file_selected_sentences(set(error_sentences))
logging.info("end parsing kres")

View File

@@ -35,7 +35,11 @@ class Msdmap:
def slo_msd_to_eng_long(self, slo_msd):
    """Map a Slovene MSD tag to its long English description.

    Looks the tag up in self.msd_table (a DataFrame with "slo_msd" and
    "eng_long" columns); returns "No-matching-msd-found" when absent.
    """
    # NOTE(review): slo_msd is interpolated into the query expression; a tag
    # containing a quote would break it.  Fine for MSD tags, not general input.
    matches = self.msd_table.query("slo_msd == '{}'".format(slo_msd))
    if matches.empty:
        return "No-matching-msd-found"
    return matches["eng_long"].values[0]
def slo_msd_to_eng_pos(self, slo_msd):
# first letter in slo_msd == slo_pos

View File

@@ -57,7 +57,10 @@ class Parser:
divs = [] # in ssj, there are divs, in Kres, there are separate files
if "id" in root.keys():
# Kres files start with <TEI id=...>
guess_corpus = "KRES"
if root.get("id")[0:2] == 'GF':
guess_corpus = "GIGA"
else:
guess_corpus = "KRES"
divs = [root]
else:
guess_corpus = "SSJ"
@@ -65,7 +68,10 @@ class Parser:
# parse divs
for div in divs:
f_id = div.get("id")
f_id = div.get("id")[:-6]
if guess_corpus == "GIGA":
div = div.findall(".//body")[0]
# parse paragraphs
for p in div.findall(".//p"):
@@ -75,60 +81,74 @@ class Parser:
for s in p.findall(".//s"):
s_id = s.get("id").split(".")[-1]
sentence_text = ""
sentence_list = []
sentence_tokens = []
# parse tokens
for el in s.iter():
if el.tag in self.W_TAGS:
el_id = el.get("id").split(".")[-1]
if el_id[0] == 't':
el_id = el_id[1:] # ssj W_TAG ids start with t
sentence_text += el.text
sentence_tokens += [(
"w",
int(el_id),
el.text,
el.get("lemma"),
(el.get("msd") if guess_corpus == "KRES"
else el.get("ana").split(":")[-1]),
)]
if guess_corpus != "GIGA":
el_id = el.get("id").split(".")[-1]
if el_id[0] == 't':
el_id = el_id[1:] # ssj W_TAG ids start with t
sentence_text += el.text
sentence_tokens += [(
"w",
int(el_id),
el.text,
el.get("lemma"),
(el.get("msd") if guess_corpus == "KRES" or guess_corpus == "GIGA"
else el.get("ana").split(":")[-1]),
)]
else:
sentence_list.append(el.text)
elif el.tag in self.C_TAGS:
# only Kres' C_TAGS have ids
el_id = el.get("id") or "none"
el_id = el_id.split(".")[-1]
sentence_text += el.text
sentence_tokens += [("c", el_id, el.text,)]
if guess_corpus != "GIGA":
el_id = el.get("id") or "none"
el_id = el_id.split(".")[-1]
sentence_text += el.text
sentence_tokens += [("c", el_id, el.text,)]
elif el.tag in self.S_TAGS:
# Kres' <S /> doesn't contain .text
sentence_text += " "
if guess_corpus == "GIGA":
sentence_list.append(el.text)
else:
sentence_text += " "
else:
# pass links and linkGroups
pass
sentence_id = "{}.{}.{}".format(f_id, p_id, s_id)
if sentence_id in res_dict:
raise KeyError("duplicated id: {}".format(sentence_id))
res_dict[sentence_id] = {
"sid": sentence_id,
"text": sentence_text,
"tokens": sentence_tokens,
"links": (
parse_links(s) if guess_corpus == "KRES" else None
)
}
if guess_corpus == "GIGA":
res_dict[sentence_id] = {
"sid": sentence_id,
"text": ' '.join(sentence_list),
"tokens": None,
"links": None
}
else:
res_dict[sentence_id] = {
"sid": sentence_id,
"text": sentence_text,
"tokens": sentence_tokens,
"links": (
parse_links(s) if guess_corpus == "KRES" else None
)
}
fp.close()
return res_dict
def to_conll_2009_SRL(self, sentence_entry, napreds=9):
def to_conll_2009_SRL(self, sentence_entry):
def fillpred(tsv_row):
mrow = build_model_row(tsv_row)
x = mrow[:-1]
x = mrow[:-1]
y = self.fillpred_model.predict([x])
return y[0] # bool
apreds_string = '\t'.join(["_" for x in range(napreds)])
# works with kres, with parsed links
out_str = ""
for token in sentence_entry["tokens"]:
@@ -141,7 +161,7 @@ class Parser:
[t_id] +
[form for x in range(7)] +
["0", "0", "modra", "modra", "_", "_"] +
[apreds_string, "\n"]
["\n"]
)
continue
@@ -170,7 +190,6 @@ class Parser:
sentence_entry["links"][t_id][0], # pdeprel
"_", # fillpred
"_", # pred
apreds_string,
"\n",
]
fprd = fillpred(row_list)

View File

@@ -34,7 +34,8 @@ JVM_ARGS="-cp $CP -Xmx$MEM"
NOPI="-nopi" #Uncomment this if you want to skip the predicate identification step. This setting is equivalent to the CoNLL 2009 ST.
CMD="$JAVA $JVM_ARGS se.lth.cs.srl.Parse $Lang $INPUT $MODEL $RERANKER $NOPI $OUTPUT"
echo "Executing: $CMD"
$JAVA $JVM_ARGS se.lth.cs.srl.Parse $Lang "$INPUT" $MODEL $RERANKER $NOPI "$OUTPUT"
# CMD="$JAVA $JVM_ARGS se.lth.cs.srl.Parse $Lang '$INPUT' $MODEL $RERANKER $NOPI '$OUTPUT'"
# echo "Executing: $CMD"
$CMD
# $CMD

View File

@@ -0,0 +1,29 @@
#!/bin/bash
# Run the mate-tools SRL tagger over every giga .tsv chunk.
# Folder locations are parsed from ../tools.cfg.

# parsing tools.cfg values
IN_FOLDER="$(sed -n -e 's/^\s*giga_tsv\s*=\s*//p' ../tools.cfg)"
echo "input folder: $IN_FOLDER"
OUT_FOLDER="$(sed -n -e 's/^\s*giga_srl\s*=\s*//p' ../tools.cfg)"
echo "output folder: $OUT_FOLDER"
SUFFIX="srl.tsv"

mkdir -p "$OUT_FOLDER"
# Fix: the glob must stay OUTSIDE the quotes -- "$OUT_FOLDER/*$SUFFIX" would
# try to delete a file literally named '*srl.tsv'.
rm "$OUT_FOLDER"/*"${SUFFIX}" &> /dev/null

# Fix: likewise the loop glob must be unquoted to expand; the folder part is
# quoted because configured paths may contain spaces.
for infile in "$IN_FOLDER"/*; do
	echo "Tagging: ${infile}"
	base=$(basename "$infile" | cut -d'.' -f1)
	outfile="${OUT_FOLDER}/${base}.${SUFFIX}"
	# mate-tools tagger
	./scripts/parse_srl_only_mod.sh "$infile" "$outfile"
	if [ $? -eq 0 ]; then
		echo "Saved as ${outfile}"
	else
		echo "ERR"
		exit 1
	fi
done

View File

@@ -0,0 +1,29 @@
#!/bin/bash
# Run the mate-tools SRL tagger over every kres .tsv chunk.
# Folder locations are parsed from ../tools.cfg (relative to its values).

# parsing tools.cfg values
IN_FOLDER="../$(sed -n -e 's/^\s*kres_tsv\s*=\s*//p' ../tools.cfg)"
echo "input folder: $IN_FOLDER"
OUT_FOLDER="../$(sed -n -e 's/^\s*kres_srl\s*=\s*//p' ../tools.cfg)"
echo "output folder: $OUT_FOLDER"
SUFFIX="srl.tsv"

# Fix: quote every expansion -- configured paths contain spaces, which break
# unquoted $VAR under word splitting.  Globs stay outside the quotes.
mkdir -p "$OUT_FOLDER"
rm "$OUT_FOLDER"/*"${SUFFIX}" &> /dev/null

for infile in "$IN_FOLDER"/*; do
	echo "Tagging: ${infile}"
	base=$(basename "$infile" | cut -d'.' -f1)
	outfile="${OUT_FOLDER}/${base}.${SUFFIX}"
	# mate-tools tagger
	./scripts/parse_srl_only_mod.sh "$infile" "$outfile"
	if [ $? -eq 0 ]; then
		echo "Saved as ${outfile}"
	else
		echo "ERR"
		exit 1
	fi
done

View File

@@ -1,15 +1,16 @@
#!/bin/bash
# parsing tools.cfg values
IN_FOLDER="../$(sed -n -e 's/^\s*kres_tsv\s*=\s*//p' ../tools.cfg)"
IN_FOLDER="../$(sed -n -e 's/^\s*giga_tsv\s*=\s*//p' ../tools.cfg.kres_new)"
IN_FOLDER=$IN_FOLDER$1
echo "input folder: $IN_FOLDER"
OUT_FOLDER="../$(sed -n -e 's/^\s*kres_srl\s*=\s*//p' ../tools.cfg)"
OUT_FOLDER="../$(sed -n -e 's/^\s*giga_srl\s*=\s*//p' ../tools.cfg.kres_new)"
echo "output folder: $OUT_FOLDER"
SUFFIX="srl.tsv"
mkdir -p $OUT_FOLDER
rm $OUT_FOLDER/*${SUFFIX} &> /dev/null
# rm $OUT_FOLDER/*${SUFFIX} &> /dev/null
for infile in $IN_FOLDER/*; do
echo "Tagging: ${infile}"

View File

@@ -1,5 +1,18 @@
[tools]
kres_orig = ../data/kres_example
kres_tsv = ../data/kres_example_tsv
kres_srl = ../data/kres_example_srl
kres_json = ../data/kres/example_json
giga = /media/luka/Portable Disk/Datasets/gigafida_jos/gf2_orig
giga_orig = /media/luka/Portable Disk/Datasets/gigafida_jos/gf2-dedup.patch0001
; giga_orig_old = /media/luka/Portable Disk/Datasets/gigafida_jos/gf2-dedup
giga_jos = /media/luka/Portable Disk/Datasets/gigafida_jos/gf2-dedup.jos.patch0001
giga_tsv = /media/luka/Portable Disk/Datasets/gigafida_jos/gf_files_part
; giga_tsv = /media/luka/Portable Disk/Datasets/gigafida_jos/TEMP
; giga_tsv = /media/luka/Portable Disk/Datasets/gigafida_jos/gf2_files_copy
; giga_srl = /media/luka/Portable Disk/Datasets/gigafida_jos/TEMP
giga_srl = /media/luka/Portable Disk/Datasets/gigafida_jos/2_srl
giga_srl_errors = /media/luka/Portable Disk/Datasets/gigafida_jos/2_srl_errors/giga_errors.srl.tsv
; giga_json = /media/luka/Portable Disk/Datasets/gigafida_jos/final_json_TEMP
giga_json = /media/luka/Portable Disk/Datasets/gigafida_jos/final_json
internal_data = /media/luka/Portable Disk/Datasets/gigafida_jos/internal_data
giga_parts = 100000
logfile = ../progress.log
cpu_cores = 16
debug = False

16
tools/tools.cfg.gigafida Normal file
View File

@@ -0,0 +1,16 @@
[tools]
giga = /media/luka/Portable Disk/Datasets/gigafida_jos/gf2_orig
giga_orig = /media/luka/Portable Disk/Datasets/gigafida_jos/gf2-dedup.patch0001
; giga_orig_old = /media/luka/Portable Disk/Datasets/gigafida_jos/gf2-dedup
giga_jos = /media/luka/Portable Disk/Datasets/gigafida_jos/gf2-dedup.jos.patch0001
giga_tsv = /media/luka/Portable Disk/Datasets/gigafida_jos/gf_files_part
; giga_tsv = /media/luka/Portable Disk/Datasets/gigafida_jos/TEMP
; giga_tsv = /media/luka/Portable Disk/Datasets/gigafida_jos/gf2_files_copy
; giga_srl = /media/luka/Portable Disk/Datasets/gigafida_jos/TEMP
giga_srl = /media/luka/Portable Disk/Datasets/gigafida_jos/2_srl
giga_json = /media/luka/Portable Disk/Datasets/gigafida_jos/final_json
internal_data = /media/luka/Portable Disk/Datasets/gigafida_jos/internal_data
giga_parts = 100000
logfile = ../progress.log
cpu_cores = 1
debug = False

8
tools/tools.cfg.kres Normal file
View File

@@ -0,0 +1,8 @@
[tools]
kres_orig = /home/luka/Development/srl/data/kres_parsed/tei
kres_tsv = ../data/kres_out/1_tsv
kres_srl = ../data/kres_out/2_srl
kres_json = ../data/kres_out/final_json
logfile = ../progress.log
cpu_cores = 5
debug = False

8
tools/tools.cfg.kres_new Normal file
View File

@@ -0,0 +1,8 @@
[tools]
kres_orig = /home/luka/Development/srl/data/kres_parsed/tei
giga_tsv = ../data/giga_out/1_tsv
giga_srl = ../data/giga_out/2_srl
kres_json = ../data/giga_out/final_json
logfile = ../progress.log
cpu_cores = 5
debug = False

8
tools/tools.cfg.local Normal file
View File

@@ -0,0 +1,8 @@
[tools]
kres_orig = ../data/kres_example
kres_tsv = ../data/kres_out/1_tsv
kres_srl = ../data/kres_out/2_srl
kres_json = ../data/kres_out/final_json
logfile = ../progress.log
cpu_cores = 1
debug = False