#!/usr/bin/env python
# -*- coding: utf-8 -*-

import os
import pickle
from pathlib import Path
from parser.parser import Parser
import configparser
import json
import sys
import logging
from multiprocessing import Pool

# parse config
config = configparser.ConfigParser()
config.read("tools.cfg.ssj500k2.3")
ORIGPATH = Path(config["tools"]["ssj500k_orig_folder"])
INPATH = Path(config["tools"]["ssj500k_srl"])
OUTPATH = Path(config["tools"]["ssj500k_json"])
INTERNAL_DATA = Path(config["tools"]["internal_data"])
DEBUG = config["tools"]["debug"] == "True"
CPU_CORES = int(config["tools"]["cpu_cores"])

LOGFILE = Path(config["tools"]["logfile"]).absolute()
LOGFILE.touch(exist_ok=True)
LOGFILE.resolve()

logging.basicConfig(filename=str(LOGFILE), level=logging.INFO)
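
# For reference, a minimal sketch of what "tools.cfg.ssj500k2.3" is expected to
# contain, inferred only from the keys read above; the values are hypothetical
# placeholders, not the project's actual paths:
#
#   [tools]
#   ssj500k_orig_folder = /data/ssj500k/orig
#   ssj500k_srl = /data/ssj500k/srl_tsv
#   ssj500k_json = /data/ssj500k/json
#   internal_data = /data/internal
#   debug = False
#   cpu_cores = 4
#   logfile = tools.log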


def get_origfile(filename):
    # Match an SRL tsv file to its original file in ORIGPATH by the shared
    # basename (the part of the filename before the first dot).
    for origfile in ORIGPATH.iterdir():
        if filename.name.split('.')[0] == origfile.name.split('.')[0]:
            return origfile
    raise FileNotFoundError(filename.name)


def extract_sentences(line_reader):
    # Group tab-split token rows into sentences; sentences are separated by
    # empty lines, and the input is assumed to end with an empty line.
    acc = []
    # last char in line is \n, remove it
    for line in [x.decode("utf-8")[:-1].split('\t') for x in line_reader]:
        if len(line) == 1:  # empty line
            tmp = acc
            acc = []
            yield tmp
        else:
            acc.append(line)
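
# Usage sketch for extract_sentences (hypothetical byte rows; two one-token
# sentences separated by empty lines, as in the tsv input):
#
#   rows = [b"1\tPrvi\n", b"\n", b"2\tDrugi\n", b"\n"]
#   list(extract_sentences(rows))
#   # -> [[["1", "Prvi"]], [["2", "Drugi"]]]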


def to_sentence(sentence_arr):
    # Column 1 of each token row holds the surface form.
    return " ".join([token[1] for token in sentence_arr])


def match_sentence_id(sentence, orig_dict):
    # The tsv files carry no sentence ids, so recover the id by comparing the
    # detokenized sentence against the original tokens (column 2).
    for k, e in orig_dict.items():
        orig_sentence = " ".join(token[2] for token in e["tokens"])
        if sentence == orig_sentence:
            return k
    raise KeyError(sentence)


def match_sentence_id_giga(sentence, orig_dict):
    # Variant of match_sentence_id for inputs whose entries store the raw
    # sentence text directly under "text".
    for k, e in orig_dict.items():
        # orig_sentence = " ".join(token[2] for token in e["tokens"])
        if sentence == e["text"]:
            return k
    raise KeyError(sentence)


def get_dep_rel(token):
    logging.debug(token)
    for i, field in enumerate(token[14:]):
        if field != "_":
            return {
                "arg": field,
                "from": i,  # i-th predicate in sentence
                "dep": token[0],
            }
    return None
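
# Column layout the code above relies on (only these indices are used; the
# remaining tsv columns are ignored):
#
#   token[0]   token index within the sentence
#   token[1]   surface form (used by to_sentence)
#   token[12]  "Y" if the token is a predicate
#   token[14:] one argument-label column per predicate, "_" if none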


def handle_file_old(infile_tpl):
    i = infile_tpl[0]
    infile = infile_tpl[1]
    outfile = (OUTPATH / infile.name).with_suffix(".json")
    origfile = get_origfile(infile)
    orig_dict = par.parse_tei(origfile)

    with infile.open("rb") as fp:
        outdata = {}
        for sentence_arr in extract_sentences(fp.readlines()):
            # tsv dropped sentence ids, match the ID using original data
            sid = match_sentence_id(to_sentence(sentence_arr), orig_dict)

            outdata[sid] = []

            # find all predicate indices in the sentence
            predicates = []
            for token in sentence_arr:
                if token[12] == "Y":
                    predicates += [token[0]]  # idx

                deprel = get_dep_rel(token)
                if deprel is not None:
                    outdata[sid].append(deprel)

            # deprel["from"] points to the n-th predicate,
            # replace it with the predicate's token index
            for deprel in outdata[sid]:
                deprel["from"] = predicates[deprel["from"]]

            if DEBUG:
                print(to_sentence(sentence_arr))
                print(outdata[sid])
                print(sid)
                print()
                print()

    with outfile.open("w") as fp:
        json.dump(outdata, fp)
    logging.info("SRL relations written to: {}".format(outfile))


def handle_file(whole_input):
    # sentence_id = whole_input[0][3]
    # orig_infile = whole_input[0][1]
    sentence_id = whole_input[3]
    orig_infile = whole_input[1]

    # origfile = origfiles[0][1]
    # infile_tpl = infile_tpl[0]
    # i = infile_tpl[0]
    # infile = infile_tpl[1]
    outfile = (OUTPATH / orig_infile.name).with_suffix(".json")

    if outfile.exists():
        return
    # origfile = get_origfile()
    orig_dict = par.parse_tei(orig_infile)
    outdata = {}

    gen = srl_multiple_files_sentences_generator(sentence_id)
    # gen = srl_multiple_files_sentences_generator(whole_input[1])

    # Note: nothing below increments this counter yet; the mismatch report at
    # the end only triggers if that changes.
    mismatch_sentences = 0

    for sentence_i, (orig_id, orig_val) in enumerate(orig_dict.items()):
        if orig_id == 'GF0014802.2685.7':
            print('PAUSE')

        # look at neighbouring sentences if they are correct
        sentence, sentence_arr = next(gen)
        # orig_sentence = " ".join(token[2] for token in e["tokens"])
        assert sentence.replace(' ', '') == orig_val['text']
        # if i != 10 and i != 0:
        # print('OK!')
        sid = orig_id

        outdata[sid] = []

        # find all predicate indices in the sentence
        predicates = []
        for token in sentence_arr:
            if token[12] == "Y":
                predicates += [token[0]]  # idx

            deprel = get_dep_rel(token)
            if deprel is not None:
                outdata[sid].append(deprel)

        # deprel["from"] points to the n-th predicate,
        # replace it with the predicate's token index
        for deprel in outdata[sid]:
            deprel["from"] = predicates[deprel["from"]]

        if DEBUG:
            print(to_sentence(sentence_arr))
            print(outdata[sid])
            print(sid)
            print()
            print()

    if mismatch_sentences > 0:
        if mismatch_sentences / len(orig_dict.items()) < 0.1:
            print('Slight mismatch - %d' % sentence_id)
            print(whole_input)
            print('ABS mitigated %d' % mismatch_sentences)
            print('------------------------------------------------')
        else:
            print('ERROR')
            print('Big mismatch - %d' % sentence_id)
            print(whole_input)
            print('ABS mitigated errors:')
            print(mismatch_sentences)
            print('------------------------------------------------')

    with outfile.open("w") as fp:
        json.dump(outdata, fp)
    logging.info("SRL relations written to: {}".format(outfile))


def count_orig_file_sentences(filename):
    # filename is a (index, Path) tuple; cache the sentence count per file in
    # INTERNAL_DATA/orig_chunks so reruns can skip the expensive TEI parse.
    if os.path.exists(os.path.join(INTERNAL_DATA, 'orig_chunks', filename[1].name)):
        return
    print(filename[0])
    orig_dict = par.parse_tei(filename[1])
    # return filename[0], filename[1], len(orig_dict)
    with open(os.path.join(INTERNAL_DATA, 'orig_chunks', filename[1].name), 'wb') as output:
        pickle.dump((filename[0], filename[1], len(orig_dict)), output)


def count_srl_file_sentences(filename):
    # filename is a (index, Path) tuple; count sentences by counting the empty
    # separator lines and cache the result in INTERNAL_DATA/srl_chunks.
    if os.path.exists(os.path.join(INTERNAL_DATA, 'srl_chunks', filename[1].name)):
        return

    print(filename[0])
    num_sentences = 0
    with filename[1].open("r") as fp:
        for line in fp:
            if line == '\n':
                num_sentences += 1

    # return filename[0], filename[1], num_sentences
    with open(os.path.join(INTERNAL_DATA, 'srl_chunks', filename[1].name), 'wb') as output:
        pickle.dump((filename[0], filename[1], num_sentences), output)


def srl_sentences_generator(infile, curr_index, sen_start_index):
    # Yield (detokenized sentence, token rows) pairs from one SRL tsv file,
    # skipping sentences until sen_start_index is reached; a trailing None
    # signals that the file is exhausted.
    with infile.open("rb") as fp:
        for sentence_arr in extract_sentences(fp.readlines()):
            if curr_index < sen_start_index:
                curr_index += 1
            else:
                yield to_sentence(sentence_arr), sentence_arr
    yield None


def srl_multiple_files_sentences_generator(sentence_id):  # srl_files):
    # Walk the module-level srl_file_sizes list of (index, path, n_sentences,
    # cumulative_offset) tuples, starting a little before sentence_id, and
    # chain the per-file generators; a final None signals exhaustion.
    sentence_id = max(0, sentence_id - 10)
    for i, srl_file in enumerate(srl_file_sizes):
        if sentence_id >= srl_file[3] and sentence_id < srl_file[3] + srl_file[2]:
            srl_files = srl_file_sizes[i:]
            break

    for file_info in srl_files:
        # srl_gen = srl_sentences_generator(file_info[1], file_info[3], file_info[4])
        srl_gen = srl_sentences_generator(file_info[1], file_info[3], sentence_id)
        el = next(srl_gen)
        while el is not None:
            yield el
            el = next(srl_gen)

    yield None
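
# Consumption sketch: handle_file() pulls sentences with next() and stops once
# the original file's sentences are exhausted, e.g. (variable name illustrative):
#
#   gen = srl_multiple_files_sentences_generator(first_sentence_offset)
#   sentence, sentence_arr = next(gen)   # None only after all files are read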


# main
par = Parser()
OUTPATH.mkdir(exist_ok=True)

infiles = list(enumerate([x for x in iter(sorted(INPATH.iterdir())) if x.is_file()]))
logging.info("Generating JSON SRL files from {} tsv files.".format(len(infiles)))

origfiles = []
for subdir, dirs, files in os.walk(ORIGPATH):
    for file in files:
        origfiles.append(Path(os.path.join(subdir, file)))
origfiles = list(enumerate(sorted(origfiles)))

##### REMOVE ############
# origfiles = origfiles[:3]

# count sentences in orig (if not counted before)
# os.remove(os.path.join(INTERNAL_DATA, 'orig_counted_sentences.pkl'))
if not os.path.exists(os.path.join(INTERNAL_DATA, 'orig_counted_sentences.pkl')):
    # srl_file_sizes = {}
    if not os.path.exists(os.path.join(INTERNAL_DATA, 'orig_chunks')):
        os.makedirs(os.path.join(INTERNAL_DATA, 'orig_chunks'))
    # with Pool(CPU_CORES) as p:
    #     # p.map(handle_file, infiles)
    #     p.map(count_orig_file_sentences, origfiles)
    for i in range(len(origfiles)):
        count_orig_file_sentences(origfiles[i])

    orig_file_sizes = []
    for x in iter(sorted(Path(os.path.join(INTERNAL_DATA, 'orig_chunks')).iterdir())):
        print(x.name)
        if x.is_file():
            with x.open('rb') as pkl_small_file:
                orig_file_sizes.append(pickle.load(pkl_small_file))
    # orig_file_sizes = list(enumerate([x for x in iter(sorted(INPATH.iterdir())) if x.is_file()]))

    print("Sorting orig files")
    orig_file_sizes = sorted(orig_file_sizes)

    total_size = 0
    orig_file_sizes_final = []
    print("Calculating orig files size")
    for n, pa, si in orig_file_sizes:
        orig_file_sizes_final.append((n, pa, si, total_size))
        total_size += si
    orig_file_sizes = orig_file_sizes_final

    print("Saving orig files size")
    with open(os.path.join(INTERNAL_DATA, 'orig_counted_sentences.pkl'), 'wb') as output:
        pickle.dump(orig_file_sizes, output)
    print("Orig files saved")
else:
    with open(os.path.join(INTERNAL_DATA, 'orig_counted_sentences.pkl'), 'rb') as pkl_file:
        orig_file_sizes = pickle.load(pkl_file)
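
# At this point orig_file_sizes is a list of
# (file_index, path, sentence_count, cumulative_sentence_offset) tuples; the
# offset is what handle_file receives as whole_input[3] and what the srl
# generators use to line the two sentence streams up.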

# count sentences in srl (if not counted before)
# os.remove(os.path.join(INTERNAL_DATA, 'srl_counted_sentences.pkl'))
if not os.path.exists(os.path.join(INTERNAL_DATA, 'srl_counted_sentences.pkl')):
    # srl_file_sizes = {}
    if not os.path.exists(os.path.join(INTERNAL_DATA, 'srl_chunks')):
        os.makedirs(os.path.join(INTERNAL_DATA, 'srl_chunks'))
    # with Pool(CPU_CORES) as p:
    #     # p.map(handle_file, infiles)
    #     p.map(count_srl_file_sentences, infiles)
    for i in range(len(infiles)):
        count_srl_file_sentences(infiles[i])

    srl_file_sizes = []
    for x in iter(sorted(Path(os.path.join(INTERNAL_DATA, 'srl_chunks')).iterdir())):
        print(x.name)
        if x.is_file():
            with x.open('rb') as pkl_small_file:
                srl_file_sizes.append(pickle.load(pkl_small_file))

    print("Sorting srl files")
    srl_file_sizes = sorted(srl_file_sizes)

    total_size = 0
    srl_file_sizes_final = []
    print("Calculating srl files size")
    for n, pa, si in srl_file_sizes:
        srl_file_sizes_final.append((n, pa, si, total_size))
        total_size += si
    srl_file_sizes = srl_file_sizes_final

    print("Saving srl files size")
    with open(os.path.join(INTERNAL_DATA, 'srl_counted_sentences.pkl'), 'wb') as output:
        pickle.dump(srl_file_sizes, output)
    print("Srl files saved")
else:
    with open(os.path.join(INTERNAL_DATA, 'srl_counted_sentences.pkl'), 'rb') as pkl_file:
        srl_file_sizes = pickle.load(pkl_file)

# print(len(orig_file_sizes))
# print('asd' + 2)

# inputs = []
# srl_i = 0
# srl_file = srl_file_sizes[srl_i]
# for orig_i, orig_path, orig_size, orig_first_sent_i in orig_file_sizes:
#     interesting_srl_files = []
#     # beginning of srl chunk in range of orig chunk or ending of srl chunk in range of orig chunk
#     # while srl_file[3] >= orig_first_sent_i and srl_file[3] < orig_first_sent_i + orig_size or \
#     #         srl_file[3] + srl_file[2] - 1 >= orig_first_sent_i and srl_file[3] + srl_file[2] - 1 < orig_first_sent_i + orig_size:
#     while srl_file[3] < orig_first_sent_i + orig_size and srl_file[3] + srl_file[2] > orig_first_sent_i:
#         # if beginning of file is in
#         if srl_file[3] > orig_first_sent_i:
#             interesting_srl_files.append((srl_file[0], srl_file[1], srl_file[2], srl_file[3], srl_file[3]))
#             # print('if %d' % srl_file[3])
#         else:
#             interesting_srl_files.append((srl_file[0], srl_file[1], srl_file[2], srl_file[3], orig_first_sent_i))
#             # print('else %d' % orig_first_sent_i)
#
#         if orig_first_sent_i + orig_size >= srl_file[3] + srl_file[2]:
#             srl_i += 1
#             if srl_i < len(srl_file_sizes):
#                 srl_file = srl_file_sizes[srl_i]
#             else:
#                 break
#             # print(srl_i)
#             # print('a ' + 2)
#         else:
#             break
#
#     inputs.append([[orig_i, orig_path, orig_size, orig_first_sent_i], interesting_srl_files])
#     print(inputs[-1])


# srl_gen = srl_sentences_generator(srl_file_sizes[0][1], 0, 533)
# a = next(srl_gen)
# b = next(srl_gen)
# c = next(srl_gen)

print('beginning processing')
with Pool(CPU_CORES) as p:
    # p.map(handle_file, inputs)
    p.map(handle_file, orig_file_sizes)

# for of in orig_file_sizes:
#     handle_file(of)

logging.info("Finished generating .json files.")