import pickle
from parser.parser import Parser
import os
from os.path import join, dirname
from pathlib import Path
import re
import sys
import cProfile
import configparser
import logging
from multiprocessing import Pool


SSJ500K_2_1 = 27829  # number of sentences

par = Parser()

# path to data
config = configparser.ConfigParser()
# config.read("tools.cfg")
config.read("tools.cfg.ssj500k2.3")
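
# For reference, a minimal sketch of what tools.cfg.ssj500k2.3 is expected to
# contain, inferred from the keys read below; the paths and values here are
# placeholders, not the ones used in the original setup. A kres or giga config
# would instead provide the kres_* / giga_* keys handled in the branches below.
#
#   [tools]
#   ssj500k = /path/to/ssj500k.orig
#   ssj500k_orig = /path/to/ssj500k.text.tsv
#   ssj500k_jos = /path/to/ssj500k.jos.tsv
#   ssj500k_tsv = /path/to/output.tsv
#   internal_data = /path/to/internal_data
#   cpu_cores = 4
#   logfile = /path/to/parser.log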

analysis = ''
if 'kres_orig' in config["tools"]:
    analysis = 'kres'
    INDIR = Path(config["tools"]["kres_orig"])
    OUTDIR = Path(config["tools"]["kres_tsv"])
elif 'giga_orig' in config["tools"]:
    # analysis = 'gigafida'
    analysis = 'giga'
    INDIR_GIGA_ORIG = Path(config["tools"]["giga"])
    INDIR_GIGA = Path(config["tools"]["giga_orig"])
    INDIR_JOS = Path(config["tools"]["giga_jos"])
    OUTDIR = Path(config["tools"]["giga_tsv"])
    GIGA_PARTS = int(config["tools"]["giga_parts"])
    INTERNAL_DATA = config["tools"]["internal_data"]
elif 'ssj500k_orig' in config["tools"]:
    # analysis = 'gigafida'
    analysis = 'ssj500k'
    INDIR_SSJ500K_ORIG = Path(config["tools"]["ssj500k"])
    INDIR_SSJ500K = Path(config["tools"]["ssj500k_orig"])
    INDIR_JOS = Path(config["tools"]["ssj500k_jos"])
    OUTDIR = Path(config["tools"]["ssj500k_tsv"])
    INTERNAL_DATA = config["tools"]["internal_data"]

CPU_CORES = int(config["tools"]["cpu_cores"])

LOGFILE = Path(config["tools"]["logfile"]).absolute()
LOGFILE.touch(exist_ok=True)
LOGFILE.resolve()

logging.basicConfig(filename=str(LOGFILE), level=logging.INFO)

"""
print("parsing ssj")
ssj_file = "../data/ssj500k-sl.sample.xml"
ssj_dict = par.parse_tei(ssj_file)
# assert (len(ssj_dict) == 27829), "Parsed wrong number of sentences."
print("end parsing ssj")
"""

# kres_file = "../data/kres_example/F0019343.xml.parsed.xml"
# OUTDIR.mkdir(exist_ok=True)

if analysis == 'kres':
    infiles = list(enumerate([x for x in INDIR.iterdir() if x.is_file()]))
    logging.info("Parsing kres: {} files.".format(len(infiles)))


def handle_ssj500k_file():
    """Parse the ssj500k TEI file and write it out in CoNLL 2009 SRL format."""
    kres_file = INDIR_SSJ500K_ORIG
    outfile = OUTDIR

    if outfile.is_file():
        logging.info("Skipping existing file: {}.".format(str(kres_file)))
        return True

    # try:
    res_dict = par.parse_tei(kres_file)
    kres_out_str = ""
    for _, sentence in res_dict.items():
        kres_out_str += par.to_conll_2009_SRL(sentence)
    # except Exception as exc:
    #     logging.info("Failed processing file: {}".format(str(kres_file)))
    #     logging.error(exc)
    #     return False

    with outfile.open("wb+") as fp:
        fp.write(kres_out_str.encode("utf-8"))
        # logging.info("Processed file ({}/{}): {}".format(i+1, len(infiles), str(kres_file)))
        return True
    return False


def ssj500k_orig_generator():
    """Yield lines of the original ssj500k text file, collapsing runs of empty lines into one."""
    with open(INDIR_SSJ500K, 'r') as gof:
        previous_new_line = False
        for l_gof in gof:
            if l_gof == '\n':
                if previous_new_line:
                    continue
                previous_new_line = True
            elif previous_new_line:
                previous_new_line = False
            yield l_gof


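# The giga handlers below call giga_orig_generator(), which does not appear in
# this snapshot. A minimal sketch of it, assuming it mirrors
# ssj500k_orig_generator() above but reads INDIR_GIGA instead:

def giga_orig_generator():
    """Hypothetical counterpart of ssj500k_orig_generator() for the GigaFida text file."""
    with open(INDIR_GIGA, 'r') as gof:
        previous_new_line = False
        for l_gof in gof:
            if l_gof == '\n':
                if previous_new_line:
                    continue
                previous_new_line = True
            elif previous_new_line:
                previous_new_line = False
            yield l_gof

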
def handle_gigafida_file():
    """
    Split the big text file into smaller per-part files. Splits only on empty lines.
    """
    # with open(INDIR_GIGA, 'r') as gof:
    #     with open(INDIR_JOS, 'r') as gjf:
    #         for i, (l_gof, l_gjf) in enumerate(zip(gof, gjf)):
    #             pass
    #         num_lines = i + 1
    #         print(num_lines)
    # NOTE: num_lines_per_part and curr_part are used below but never initialized
    # in this snapshot; they are presumably set up as in handle_giga_file() below
    # (a hard-coded total line count divided by GIGA_PARTS, and a part counter
    # starting at 0).
    gof_generator = giga_orig_generator()
    # with open(INDIR_GIGA, 'r') as gof:
    with open(INDIR_JOS, 'r') as gjf:
        sentence = {}
        sentence['tokens'] = []
        sentence['links'] = {}
        if os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % 0)):
            ignore_lines = True
            wf = False
        else:
            wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part), 'a')
            ignore_lines = False
        # for i, (l_gof, l_gjf) in enumerate(zip(gof, gjf)):
        for i, l_gjf in enumerate(gjf):
            l_gof = next(gof_generator)
            if ignore_lines:
                if i > num_lines_per_part * curr_part and l_gof == '\n':
                    if not os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % (curr_part + 2))):
                        ignore_lines = False
                        # delete last file (probably not whole)
                        os.remove(os.path.join(OUTDIR, 'giga%07d.tsv' % (curr_part + 1)))
                    if ignore_lines:
                        print(curr_part)
                        curr_part += 1
                        continue
                else:
                    continue
            l_gof_split = l_gof.split('\t')
            l_gjf_split = l_gjf.split('\t')

            # if punctuation
            if l_gof != '\n':
                if l_gof_split[1][-1] == 'u':
                    # print(l_gjf_split)
                    sentence['tokens'].append(('c', l_gjf_split[0], l_gjf_split[1]))
                else:
                    sentence['tokens'].append(('w', l_gjf_split[0], l_gof_split[0], l_gof_split[1][:-2], l_gof_split[3][:-1]))

                sentence['links'][l_gjf_split[0]] = (l_gjf_split[7], l_gjf_split[0], l_gjf_split[6])

            # if l_gof == '\n':
            else:
                if wf:
                    # print(i)
                    wf.write(par.to_conll_2009_SRL(sentence))
                sentence['tokens'] = []
                sentence['links'] = {}
                # wf.flush()
                # if i > num_lines_per_part * (curr_part + 1) and l_gof == '\n':
                if i > num_lines_per_part * (curr_part + 1):
                    curr_part += 1
                    # if wf doesn't exist (first one)
                    if wf:
                        wf.close()
                    wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part), 'a')
        curr_part += 1
        wf.close()


def handle_ssj500k_file2():
    """
    Split the big text file into smaller per-part files. Splits only on empty lines.
    Variant of handle_gigafida_file() that reads the text via ssj500k_orig_generator().
    """
    # NOTE: num_lines_per_part and curr_part are used below but never initialized
    # in this snapshot (see handle_giga_file() below for the analogous setup).
    gof_generator = ssj500k_orig_generator()
    # with open(INDIR_GIGA, 'r') as gof:
    with open(INDIR_JOS, 'r') as gjf:
        sentence = {}
        sentence['tokens'] = []
        sentence['links'] = {}
        if os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % 0)):
            ignore_lines = True
            wf = False
        else:
            wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part), 'a')
            ignore_lines = False
        # for i, (l_gof, l_gjf) in enumerate(zip(gof, gjf)):
        for i, l_gjf in enumerate(gjf):
            l_gof = next(gof_generator)
            if ignore_lines:
                if i > num_lines_per_part * curr_part and l_gof == '\n':
                    if not os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % (curr_part + 2))):
                        ignore_lines = False
                        # delete last file (probably not whole)
                        os.remove(os.path.join(OUTDIR, 'giga%07d.tsv' % (curr_part + 1)))
                    if ignore_lines:
                        print(curr_part)
                        curr_part += 1
                        continue
                else:
                    continue
            l_gof_split = l_gof.split('\t')
            l_gjf_split = l_gjf.split('\t')

            # if punctuation
            if l_gof != '\n':
                if l_gof_split[1][-1] == 'u':
                    # print(l_gjf_split)
                    sentence['tokens'].append(('c', l_gjf_split[0], l_gjf_split[1]))
                else:
                    sentence['tokens'].append(('w', l_gjf_split[0], l_gof_split[0], l_gof_split[1][:-2], l_gof_split[3][:-1]))

                sentence['links'][l_gjf_split[0]] = (l_gjf_split[7], l_gjf_split[0], l_gjf_split[6])

            # if l_gof == '\n':
            else:
                if wf:
                    # print(i)
                    wf.write(par.to_conll_2009_SRL(sentence))
                sentence['tokens'] = []
                sentence['links'] = {}
                # wf.flush()
                # if i > num_lines_per_part * (curr_part + 1) and l_gof == '\n':
                if i > num_lines_per_part * (curr_part + 1):
                    curr_part += 1
                    # if wf doesn't exist (first one)
                    if wf:
                        wf.close()
                    wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part), 'a')
        curr_part += 1
        wf.close()


import time


def handle_giga_file(ran):
    """
    Split the big text file into smaller per-part files. Splits only on empty lines.
    Intended to write out only the parts whose index falls in the half-open range
    ran = (first, last).
    """
    # with open(INDIR_GIGA, 'r') as gof:
    #     with open(INDIR_JOS, 'r') as gjf:
    #         for i, (l_gof, l_gjf) in enumerate(zip(gof, gjf)):
    #             pass
    #         num_lines = i + 1
    #         print(num_lines)
    num_lines = 1393184026
    # 1393184026
    # 1393184033
    # return
    num_lines_per_part = num_lines / GIGA_PARTS
    curr_part = 0
    # NOTE: file_indices is used below but never defined in this snapshot; it
    # presumably holds the set of part indices this worker should write
    # (e.g. something like range(ran[0], ran[1])).
    gof_generator = giga_orig_generator()
    # with open(INDIR_GIGA, 'r') as gof:
    with open(INDIR_JOS, 'r') as gjf:
        sentence = {}
        sentence['tokens'] = []
        sentence['links'] = {}
        wf = None
        if curr_part in file_indices:
            if os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % ran[0])):
                os.remove(os.path.join(OUTDIR, 'giga%07d.tsv' % ran[0]))

            wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % ran[0]), 'a')

        # for i, (l_gof, l_gjf) in enumerate(zip(gof, gjf)):
        for i, l_gjf in enumerate(gjf):
            l_gof = next(gof_generator)
            if curr_part < ran[0]:
                if i > num_lines_per_part * (curr_part + 1) and l_gof == '\n':
                    if curr_part < ran[0]:
                        print(curr_part)
                        curr_part += 1
                        continue
                else:
                    continue

            l_gof_split = l_gof.split('\t')
            l_gjf_split = l_gjf.split('\t')

            # if punctuation
            if l_gof != '\n':
                if curr_part not in file_indices:
                    continue
                if l_gof_split[1][-1] == 'u':
                    # print(l_gjf_split)
                    sentence['tokens'].append(('c', l_gjf_split[0], l_gjf_split[1]))
                else:
                    sentence['tokens'].append(('w', l_gjf_split[0], l_gof_split[0], l_gof_split[1][:-2], l_gof_split[3][:-1]))

                sentence['links'][l_gjf_split[0]] = (l_gjf_split[7], l_gjf_split[0], l_gjf_split[6])

            # if l_gof == '\n':
            else:
                if curr_part in file_indices:
                    wf.write(par.to_conll_2009_SRL(sentence))
                sentence['tokens'] = []
                sentence['links'] = {}
                # wf.flush()
                # if i > num_lines_per_part * (curr_part + 1) and l_gof == '\n':
                if i > num_lines_per_part * (curr_part + 1):
                    curr_part += 1
                    # if wf doesn't exist (first one)
                    if curr_part in file_indices and wf:
                        wf.close()
                    if curr_part >= ran[1]:
                        break
                    if curr_part in file_indices:
                        if os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part)):
                            os.remove(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part))

                        wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part), 'a')

        curr_part += 1
        wf.close()


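# The module imports Pool and reads CPU_CORES and GIGA_PARTS but never uses them
# in this snapshot. A minimal sketch of how handle_giga_file() could be fanned
# out over a process pool, assuming each worker takes a contiguous (start, end)
# range of part indices; run_giga_parts() is not part of the original script.

def run_giga_parts():
    """Hypothetical driver: split GIGA_PARTS part indices across CPU_CORES workers."""
    step = GIGA_PARTS // CPU_CORES + 1
    ranges = [(start, min(start + step, GIGA_PARTS)) for start in range(0, GIGA_PARTS, step)]
    with Pool(CPU_CORES) as pool:
        pool.map(handle_giga_file, ranges)

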
def handle_giga_file_selected_sentences(error_sentences):
    """
    Reprocess only the sentences whose ids appear in error_sentences and write
    them to a single 'giga_errors' file. Sentences are delimited by empty lines.
    """
    # with open(INDIR_GIGA, 'r') as gof:
    #     with open(INDIR_JOS, 'r') as gjf:
    #         for i, (l_gof, l_gjf) in enumerate(zip(gof, gjf)):
    #             pass
    #         num_lines = i + 1
    #         print(num_lines)
    # print('num_lines' + 3)
    # num_lines = 1393184026
    num_lines = 1393222523
    # 1393184026
    # 1393184033
    # return
    # num_lines_per_part = num_lines / GIGA_PARTS
    # curr_part = 0
    gof_generator = giga_orig_generator()
    # with open(INDIR_GIGA, 'r') as gof:
    with open(INDIR_JOS, 'r') as gjf:
        sentence = {}
        sentence['tokens'] = []
        sentence['links'] = {}
        wf = None
        if os.path.exists(os.path.join(OUTDIR, 'giga_errors')):
            os.remove(os.path.join(OUTDIR, 'giga_errors'))

        wf = open(os.path.join(OUTDIR, 'giga_errors'), 'a')

        with open(os.path.join(INTERNAL_DATA, 'sentence_ids_list.pkl'), 'rb') as pkl_file:
            sentence_ids_list = pickle.load(pkl_file)

        sentence_id = 0
        skip_sentence = sentence_ids_list[sentence_id] not in error_sentences

        # for i, (l_gof, l_gjf) in enumerate(zip(gof, gjf)):
        for i, l_gjf in enumerate(gjf):
            l_gof = next(gof_generator)

            if l_gjf == '\n':
                if not skip_sentence:
                    wf.write(par.to_conll_2009_SRL(sentence))
                sentence['tokens'] = []
                sentence['links'] = {}
                sentence_id += 1
                if sentence_ids_list[sentence_id] in error_sentences:
                    print(sentence_ids_list[sentence_id])
                    skip_sentence = False
                else:
                    skip_sentence = True

            if skip_sentence:
                continue

            # if curr_part < ran[0]:
            #     if i > num_lines_per_part * (curr_part + 1) and l_gof == '\n':
            #         if curr_part < ran[0]:
            #             print(curr_part)
            #             curr_part += 1
            #             continue
            #     else:
            #         continue

            l_gof_split = l_gof.split('\t')
            l_gjf_split = l_gjf.split('\t')

            # if punctuation
            if l_gof != '\n':
                if l_gof_split[1][-1] == 'u':
                    # print(l_gjf_split)
                    sentence['tokens'].append(('c', l_gjf_split[0], l_gjf_split[1]))
                else:
                    sentence['tokens'].append(('w', l_gjf_split[0], l_gof_split[0], l_gof_split[1][:-2], l_gof_split[3][:-1]))

                sentence['links'][l_gjf_split[0]] = (l_gjf_split[7], l_gjf_split[0], l_gjf_split[6])

            # if l_gof == '\n':
            #     wf.flush()
            #     if i > num_lines_per_part * (curr_part + 1):
            #         curr_part += 1
            #         # if wf doesn't exist (first one)
            #         if curr_part in file_indices and wf:
            #             wf.close()
            #         if curr_part >= ran[1]:
            #             break
            #         if curr_part in file_indices:
            #             if os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part)):
            #                 os.remove(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part))
            #
            #             wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part), 'a')
            #
            #     curr_part += 1
        wf.close()


handle_ssj500k_file()

logging.info("end parsing kres")