# cjvt-srl-tagging/tools/parse_all.py

import pickle

from parser.parser import Parser
import os
from os.path import join, dirname
from pathlib import Path
import re
import sys
import cProfile
import configparser
import logging
from multiprocessing import Pool

SSJ500K_2_1 = 27829  # number of sentences
par = Parser()

# All paths and tuning knobs come from the [tools] section of tools.cfg,
# which is looked up relative to the current working directory.
config = configparser.ConfigParser()
config.read("tools.cfg")
tools_cfg = config["tools"]

# The corpus to process is inferred from which input key is configured;
# 'kres_orig' takes precedence when both are present.
analysis = ''
if 'kres_orig' in tools_cfg:
    analysis = 'kres'
    INDIR = Path(tools_cfg["kres_orig"])
    OUTDIR = Path(tools_cfg["kres_tsv"])
elif 'giga_orig' in tools_cfg:
    # analysis = 'gigafida'  (selects the older handle_gigafida_file path)
    analysis = 'giga'
    INDIR_GIGA_ORIG = Path(tools_cfg["giga"])
    INDIR_GIGA = Path(tools_cfg["giga_orig"])
    INDIR_JOS = Path(tools_cfg["giga_jos"])
    OUTDIR = Path(tools_cfg["giga_tsv"])
    GIGA_PARTS = int(tools_cfg["giga_parts"])
    INTERNAL_DATA = tools_cfg["internal_data"]
else:
    # Without this the script only failed later with a NameError on OUTDIR.
    raise ValueError(
        "tools.cfg: section [tools] must define either 'kres_orig' or 'giga_orig'"
    )

CPU_CORES = int(tools_cfg["cpu_cores"])

LOGFILE = Path(tools_cfg["logfile"]).absolute()
LOGFILE.touch(exist_ok=True)
# Path.resolve() returns a new path instead of modifying in place, so the
# result has to be rebound to take effect.
LOGFILE = LOGFILE.resolve()

logging.basicConfig(filename=str(LOGFILE), level=logging.INFO)

# Disabled ssj500k smoke run, kept as an inert string literal.
"""
print("parsing ssj")
ssj_file = "../data/ssj500k-sl.sample.xml"
ssj_dict = par.parse_tei(ssj_file)
# assert (len(ssj_dict) == 27829), "Parsed wrong number of sentences."
print("end parsing ssj")
"""

# Example of a single kres input: "../data/kres_example/F0019343.xml.parsed.xml"
OUTDIR.mkdir(exist_ok=True)

if analysis == 'kres':
    # Pair every regular file in INDIR with its position so workers can
    # report progress as "i/total".
    infiles = list(
        enumerate(entry for entry in INDIR.iterdir() if entry.is_file())
    )
    kres_file_count = len(infiles)
    logging.info("Parsing kres: {} files.".format(kres_file_count))

def handle_file(infile):
    """
    Convert one kres TEI file into a ``.tsv`` file in ``OUTDIR``.

    The file is parsed with ``par.parse_tei`` and every parsed sentence is
    rendered with ``par.to_conll_2009_SRL``; the concatenated result is
    written UTF-8 encoded. Inputs whose output file already exists are
    skipped, which makes interrupted runs restartable.

    Args:
        infile: ``(index, path)`` pair as produced by ``enumerate`` over the
            input files; ``path`` is a ``pathlib.Path``.

    Returns:
        True if the output already existed or was written, False if
        parsing or conversion raised an exception (the error is logged).
    """
    i, kres_file = infile[0], infile[1]
    outfile = (OUTDIR / kres_file.name).with_suffix(".tsv")

    if outfile.is_file():
        logging.info("Skipping existing file: {}.".format(str(kres_file)))
        return True

    try:
        res_dict = par.parse_tei(kres_file)
        # join() avoids the quadratic cost of repeated string concatenation.
        kres_out_str = "".join(
            par.to_conll_2009_SRL(sentence) for sentence in res_dict.values()
        )
    except Exception as exc:
        # Deliberately broad: one bad file must not abort the whole pool run.
        logging.info("Failed processing file: {}".format(str(kres_file)))
        # exception() records the traceback in addition to the message.
        logging.exception(exc)
        return False

    with outfile.open("wb") as fp:
        fp.write(kres_out_str.encode("utf-8"))
    logging.info("Processed file ({}/{}): {}".format(i+1, len(infiles), str(kres_file)))
    return True

def giga_orig_generator(path=None):
    """
    Yield the lines of the original GigaFida file, squeezing blank runs.

    Every run of consecutive empty lines is reduced to a single empty
    line; all other lines are passed through unchanged, including their
    trailing newline.

    Args:
        path: File to read. When omitted, the module-level ``INDIR_GIGA``
            is used, which is what existing callers rely on.

    Yields:
        str: The next line of the file.
    """
    source = INDIR_GIGA if path is None else path
    with open(source, 'r') as gof:
        previous_new_line = False
        for l_gof in gof:
            is_blank = l_gof == '\n'
            if is_blank and previous_new_line:
                # Second or later blank line in a row: drop it.
                continue
            previous_new_line = is_blank
            yield l_gof


def handle_gigafida_file():
    """
    Split the GigaFida corpus into numbered ``giga%07d.tsv`` part files.

    ``INDIR_JOS`` is read line by line in lockstep with
    ``giga_orig_generator()``. Tokens and links are collected per sentence
    and, on every empty line of the original file, the sentence is rendered
    with ``par.to_conll_2009_SRL`` and appended to the current part file in
    ``OUTDIR``. A new part is only started on an empty line.

    If ``giga0000000.tsv`` already exists the function starts in resume
    mode: input is skipped until the part file two ahead of the current
    part is missing, at which point part ``curr_part + 1`` is deleted and
    skipping stops.

    Takes no arguments and returns nothing.
    """
    # Hard-coded number of input lines. The commented-out code below shows it
    # was once obtained by iterating over both input files in parallel.
    # with open(INDIR_GIGA, 'r') as gof:
    #     with open(INDIR_JOS, 'r') as gjf:
    #         for i, (l_gof, l_gjf) in enumerate(zip(gof, gjf)):
    #             pass
    #     num_lines = i + 1
    # print(num_lines)
    num_lines = 1393184026
    # Other values noted by the author (meaning not recorded -- TODO confirm):
    # 1393184026
    # 1393184033
    # return
    # Float division: part boundaries are compared against a float threshold.
    num_lines_per_part = num_lines / GIGA_PARTS
    curr_part = 0
    gof_generator = giga_orig_generator()
    # with open(INDIR_GIGA, 'r') as gof:
    with open(INDIR_JOS, 'r') as gjf:
        sentence = {}
        sentence['tokens'] = []
        sentence['links'] = {}
        # An existing first part switches on resume mode; wf stays False
        # (no open output file) until a part boundary is crossed.
        if os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % 0)):
            ignore_lines = True
            wf = False
        else:
            wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part), 'a')
            ignore_lines = False
        # for i, (l_gof, l_gjf) in enumerate(zip(gof, gjf)):
        for i, l_gjf in enumerate(gjf):
            # NOTE(review): raises StopIteration if the original file runs out
            # of lines before INDIR_JOS does.
            l_gof = next(gof_generator)
            if ignore_lines:
                # Only act on an empty line past the start of the current part.
                if i > num_lines_per_part * curr_part and l_gof == '\n':
                    if not os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % (curr_part + 2))):
                        ignore_lines = False
                        # delete last file (probably not whole)
                        # NOTE(review): os.remove raises FileNotFoundError when
                        # part curr_part + 1 does not exist either.
                        os.remove(os.path.join(OUTDIR, 'giga%07d.tsv' % (curr_part + 1)))
                    if ignore_lines:
                        print(curr_part)
                        curr_part += 1
                        continue
                else:
                    continue
            l_gof_split = l_gof.split('\t')
            l_gjf_split = l_gjf.split('\t')

            # Non-empty line: one token. A second column ending in 'u' is
            # stored as a 'c' token (punctuation, per the original comment),
            # anything else as a 'w' token.
            if l_gof != '\n':
                if l_gof_split[1][-1] == 'u':
                    # print(l_gjf_split)
                    sentence['tokens'].append(('c', l_gjf_split[0], l_gjf_split[1]))
                else:
                    sentence['tokens'].append(('w', l_gjf_split[0], l_gof_split[0], l_gof_split[1][:-2], l_gof_split[3][:-1]))

                # Link keyed by JOS column 0; stores JOS columns 7, 0 and 6.
                sentence['links'][l_gjf_split[0]] = (l_gjf_split[7], l_gjf_split[0], l_gjf_split[6])

            # Empty line in the original file: the sentence is complete.
            else:
                if wf:
                    # print(i)
                    wf.write(par.to_conll_2009_SRL(sentence))
                sentence['tokens'] = []
                sentence['links'] = {}
            # wf.flush()
            # if i > num_lines_per_part * (curr_part + 1) and l_gof == '\n':
                # Past the end of the current part: switch to the next file.
                if i > num_lines_per_part * (curr_part + 1):
                    curr_part += 1
                    # if wf doesn't exist (first one)
                    if wf:
                        wf.close()
                    wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part), 'a')
        curr_part += 1
        # NOTE(review): wf is still False here if resume mode never reached a
        # part boundary, in which case this call raises AttributeError.
        wf.close()

import  time
def handle_giga_file(ran):
    """
    Write the GigaFida part files whose index lies in ``[ran[0], ran[1])``.

    ``INDIR_JOS`` is read line by line in lockstep with
    ``giga_orig_generator()``. Input belonging to parts before ``ran[0]`` is
    skipped. From there on, sentences are collected and written with
    ``par.to_conll_2009_SRL`` to ``giga%07d.tsv`` in ``OUTDIR``, but only for
    parts listed in the module-level ``file_indices``; any existing file for
    such a part is replaced. Parts only change on an empty line. Processing
    stops once part ``ran[1]`` is reached or the input ends.

    Args:
        ran: Two-element sequence ``[first_part, end_part]`` (end exclusive).

    Returns:
        None.
    """
    # Hard-coded number of input lines in the corpus files.
    num_lines = 1393184026
    # Float division: boundaries are compared against a float threshold.
    num_lines_per_part = num_lines / GIGA_PARTS
    curr_part = 0
    gof_generator = giga_orig_generator()

    def open_part(part):
        """Open the output file of *part*, discarding any previous content."""
        return open(os.path.join(OUTDIR, 'giga%07d.tsv' % part), 'w')

    wf = None
    with open(INDIR_JOS, 'r') as gjf:
        sentence = {'tokens': [], 'links': {}}
        try:
            # The first file written is the one for part ran[0], so that is
            # the index whose membership decides whether to open it.
            if ran[0] in file_indices:
                wf = open_part(ran[0])

            for i, l_gjf in enumerate(gjf):
                l_gof = next(gof_generator)

                # Fast-forward through the parts in front of this range;
                # parts only advance on an empty line past the boundary.
                if curr_part < ran[0]:
                    if i > num_lines_per_part * (curr_part + 1) and l_gof == '\n':
                        print(curr_part)
                        curr_part += 1
                    continue

                if l_gof != '\n':
                    # Token line; ignored entirely for unselected parts.
                    if curr_part not in file_indices:
                        continue
                    l_gof_split = l_gof.split('\t')
                    l_gjf_split = l_gjf.split('\t')
                    # A second column ending in 'u' is stored as a 'c' token
                    # (punctuation), anything else as a 'w' token.
                    if l_gof_split[1][-1] == 'u':
                        sentence['tokens'].append(('c', l_gjf_split[0], l_gjf_split[1]))
                    else:
                        sentence['tokens'].append(('w', l_gjf_split[0], l_gof_split[0], l_gof_split[1][:-2], l_gof_split[3][:-1]))

                    sentence['links'][l_gjf_split[0]] = (l_gjf_split[7], l_gjf_split[0], l_gjf_split[6])
                    continue

                # Empty line: the current sentence is complete.
                if curr_part in file_indices:
                    wf.write(par.to_conll_2009_SRL(sentence))
                    sentence['tokens'] = []
                    sentence['links'] = {}

                if i > num_lines_per_part * (curr_part + 1):
                    curr_part += 1
                    # Close the finished part regardless of whether the next
                    # one is selected, so no handle is left dangling.
                    if wf is not None:
                        wf.close()
                        wf = None
                    if curr_part >= ran[1]:
                        break
                    if curr_part in file_indices:
                        wf = open_part(curr_part)
        finally:
            # wf is None when the last part handled was not selected.
            if wf is not None:
                wf.close()

def handle_giga_file_selected_sentences(error_sentences):
    """
    Re-export only the selected GigaFida sentences to ``OUTDIR/giga_errors``.

    ``INDIR_JOS`` is read line by line in lockstep with
    ``giga_orig_generator()``. Every empty line in ``INDIR_JOS`` ends a
    sentence; the n-th sentence is identified by the n-th entry of
    ``sentence_ids_list.pkl`` (loaded from ``INTERNAL_DATA``). Sentences whose
    id is in ``error_sentences`` are rendered with ``par.to_conll_2009_SRL``
    and written to the output file, which is recreated on every call. The id
    of each selected sentence after the first is printed.

    Args:
        error_sentences: Container of sentence ids to export. Membership is
            tested once per sentence, so a ``set`` is the sensible choice.

    Returns:
        None.
    """
    # Load the id list before touching the output so a missing pickle does
    # not leave a freshly truncated giga_errors behind.
    # NOTE(review): pickle.load can execute arbitrary code; acceptable only
    # because this file is an internal artefact, never external input.
    with open(os.path.join(INTERNAL_DATA, 'sentence_ids_list.pkl'), 'rb') as pkl_file:
        sentence_ids_list = pickle.load(pkl_file)
    sentence_count = len(sentence_ids_list)

    sentence = {'tokens': [], 'links': {}}
    sentence_id = 0
    skip_sentence = (
        sentence_count == 0 or sentence_ids_list[sentence_id] not in error_sentences
    )

    gof_generator = giga_orig_generator()
    out_path = os.path.join(OUTDIR, 'giga_errors')
    # Mode 'w' replaces any previous output; the context managers close both
    # files even if a malformed line raises.
    with open(INDIR_JOS, 'r') as gjf, open(out_path, 'w') as wf:
        for l_gjf in gjf:
            l_gof = next(gof_generator)

            if l_gjf == '\n':
                # Sentence boundary: flush the finished sentence if selected.
                if not skip_sentence:
                    wf.write(par.to_conll_2009_SRL(sentence))
                    sentence['tokens'] = []
                    sentence['links'] = {}
                sentence_id += 1
                if sentence_id >= sentence_count:
                    # The blank line after the last known sentence used to
                    # index past the end of the id list. Stop instead, but
                    # report input that has no id to go with it.
                    if any(rest != '\n' for rest in gjf):
                        logging.warning(
                            "Input continues after the %d known sentence ids; "
                            "remaining sentences were not checked.",
                            sentence_count,
                        )
                    break
                skip_sentence = sentence_ids_list[sentence_id] not in error_sentences
                if not skip_sentence:
                    print(sentence_ids_list[sentence_id])

            if skip_sentence:
                continue

            if l_gof != '\n':
                l_gof_split = l_gof.split('\t')
                l_gjf_split = l_gjf.split('\t')
                # A second column ending in 'u' is stored as a 'c' token
                # (punctuation), anything else as a 'w' token.
                if l_gof_split[1][-1] == 'u':
                    sentence['tokens'].append(('c', l_gjf_split[0], l_gjf_split[1]))
                else:
                    sentence['tokens'].append(('w', l_gjf_split[0], l_gof_split[0], l_gof_split[1][:-2], l_gof_split[3][:-1]))

                sentence['links'][l_gjf_split[0]] = (l_gjf_split[7], l_gjf_split[0], l_gjf_split[6])

# Part indices that handle_giga_file is allowed to (re)generate. The
# catch-all default is narrowed to the pickled list for GigaFida runs.
file_indices = set(range(0, 100000))
if analysis in ('giga', 'gigafida'):
    # INTERNAL_DATA is only configured for GigaFida; loading this
    # unconditionally raised a NameError in kres mode.
    # NOTE(review): pickle.load can execute arbitrary code; acceptable only
    # because this file is an internal artefact, never external input.
    with open(os.path.join(INTERNAL_DATA, 'diffs_updated_gigafida.pkl'), 'rb') as pkl_file:
        file_indices = set(pickle.load(pkl_file))

with Pool(CPU_CORES) as p:
    if analysis == 'kres':
        # One task per input file, spread over the worker pool.
        p.map(handle_file, infiles)
    elif analysis == 'gigafida':
        handle_gigafida_file()
    elif analysis == 'giga':
        # Equal slices of the part-index space, one per worker; only needed
        # by the (currently disabled) parallel handle_giga_file run below.
        final_range = [0, 100000]
        size_per_proc = (final_range[1] - final_range[0]) / CPU_CORES
        ranges = [
            [
                int(final_range[0] + size_per_proc * i),
                int(final_range[0] + size_per_proc * (i + 1)),
            ]
            for i in range(CPU_CORES)
        ]
        # ranges = [[0, 1]]

        # p.map(handle_giga_file, ranges)
        error_sentences_path = os.path.join(INTERNAL_DATA, 'sentences_with_less_than_token.txt')
        # Read inside a context manager so the file handle is closed.
        with open(error_sentences_path) as error_sentences_file:
            error_sentences = [line.rstrip('\n') for line in error_sentences_file]
        handle_giga_file_selected_sentences(set(error_sentences))


# Name the corpus that was actually processed; for kres the message is
# unchanged ("end parsing kres").
logging.info("end parsing {}".format(analysis))
Big changes 2022-02-04 10:24:47 +00:00			`import pickle`

finished parse + tag toolchain -> TODO: tagger error 2019-02-18 07:49:04 +00:00			`from parser.parser import Parser`
			`import os`
			`from os.path import join, dirname`
			`from pathlib import Path`
			`import re`
			`import sys`
			`import cProfile`
added tools.cfg for configurable paths 2019-02-27 08:15:40 +00:00			`import configparser`
added logger 2019-02-28 09:15:14 +00:00			`import logging`
added multiprocessing to parse_all.py 2019-02-28 09:53:27 +00:00			`from multiprocessing import Pool`
added tools.cfg for configurable paths 2019-02-27 08:15:40 +00:00
			`SSJ500K_2_1 = 27829 # number of sentences`
			`par = Parser()`

			`# path to data`
			`config = configparser.ConfigParser()`
			`config.read("tools.cfg")`
Big changes 2022-02-04 10:24:47 +00:00			`analysis = ''`
			`if 'kres_orig' in config["tools"]:`
			`analysis = 'kres'`
			`INDIR = Path(config["tools"]["kres_orig"])`
			`OUTDIR = Path(config["tools"]["kres_tsv"])`
			`elif 'giga_orig' in config["tools"]:`
			`# analysis = 'gigafida'`
			`analysis = 'giga'`
			`INDIR_GIGA_ORIG = Path(config["tools"]["giga"])`
			`INDIR_GIGA = Path(config["tools"]["giga_orig"])`
			`INDIR_JOS = Path(config["tools"]["giga_jos"])`
			`OUTDIR = Path(config["tools"]["giga_tsv"])`
			`GIGA_PARTS = int(config["tools"]["giga_parts"])`
			`INTERNAL_DATA = config["tools"]["internal_data"]`

added number of cores to config 2019-02-28 12:57:27 +00:00			`CPU_CORES = int(config["tools"]["cpu_cores"])`
added multiprocessing to parse_all.py 2019-02-28 09:53:27 +00:00
			`LOGFILE = Path(config["tools"]["logfile"]).absolute()`
			`LOGFILE.touch(exist_ok=True)`
			`LOGFILE.resolve()`
added logger 2019-02-28 09:15:14 +00:00
added logging; paralelize the first part now 2019-02-28 09:34:12 +00:00			`logging.basicConfig(filename=str(LOGFILE), level=logging.INFO)`
added tools.cfg for configurable paths 2019-02-27 08:15:40 +00:00
			`"""`
			`print("parsing ssj")`
			`ssj_file = "../data/ssj500k-sl.sample.xml"`
			`ssj_dict = par.parse_tei(ssj_file)`
			`# assert (len(ssj_dict) == 27829), "Parsed wrong number of sentences."`
			`print("end parsing ssj")`
			`"""`

			`# kres_file = "../data/kres_example/F0019343.xml.parsed.xml"`
			`OUTDIR.mkdir(exist_ok=True)`

Big changes 2022-02-04 10:24:47 +00:00			`if analysis == 'kres':`
			`infiles = list(enumerate([x for x in INDIR.iterdir() if x.is_file()]))`
			`logging.info("Parsing kres: {} files.".format(len(infiles)))`
added multiprocessing to parse_all.py 2019-02-28 09:53:27 +00:00
			`def handle_file(infile):`
			`i = infile[0]`
			`kres_file = infile[1]`
added number of cores to config 2019-02-28 12:57:27 +00:00			`outfile = (OUTDIR / kres_file.name).with_suffix(".tsv")`
added tools.cfg for configurable paths 2019-02-27 08:15:40 +00:00
added number of cores to config 2019-02-28 12:57:27 +00:00			`if outfile.is_file():`
			`logging.info("Skipping existing file: {}.".format(str(kres_file)))`
			`return True`
added tools.cfg for configurable paths 2019-02-27 08:15:40 +00:00
added number of cores to config 2019-02-28 12:57:27 +00:00			`try:`
			`res_dict = par.parse_tei(kres_file)`
			`kres_out_str = ""`
			`for _, sentence in res_dict.items():`
			`kres_out_str += par.to_conll_2009_SRL(sentence)`
added msd-not-found exception 2019-02-28 20:49:49 +00:00			`except Exception as exc:`
added number of cores to config 2019-02-28 12:57:27 +00:00			`logging.info("Failed processing file: {}".format(str(kres_file)))`
added msd-not-found exception 2019-02-28 20:49:49 +00:00			`logging.error(exc)`
added number of cores to config 2019-02-28 12:57:27 +00:00			`return False`
added tools.cfg for configurable paths 2019-02-27 08:15:40 +00:00
added number of cores to config 2019-02-28 12:57:27 +00:00
			`with outfile.open("wb+") as fp:`
added tools.cfg for configurable paths 2019-02-27 08:15:40 +00:00			`fp.write(kres_out_str.encode("utf-8"))`
added number of cores to config 2019-02-28 12:57:27 +00:00			`logging.info("Processed file ({}/{}): {}".format(i+1, len(infiles), str(kres_file)))`
			`return True`
			`return False`
added multiprocessing to parse_all.py 2019-02-28 09:53:27 +00:00
Big changes 2022-02-04 10:24:47 +00:00			`def giga_orig_generator():`
			`with open(INDIR_GIGA, 'r') as gof:`
			`previous_new_line = False`
			`for l_gof in gof:`
			`if l_gof == '\n':`
			`if previous_new_line:`
			`continue`
			`previous_new_line = True`
			`elif previous_new_line:`
			`previous_new_line = False`
			`yield l_gof`
added multiprocessing to parse_all.py 2019-02-28 09:53:27 +00:00

Big changes 2022-02-04 10:24:47 +00:00			`def handle_gigafida_file():`
			`"""`
			`File that splits big text file into more minor files. Only split on empty lines.`
			`"""`
			`# with open(INDIR_GIGA, 'r') as gof:`
			`# with open(INDIR_JOS, 'r') as gjf:`
			`# for i, (l_gof, l_gjf) in enumerate(zip(gof, gjf)):`
			`# pass`
			`# num_lines = i + 1`
			`# print(num_lines)`
			`num_lines = 1393184026`
			`# 1393184026`
			`# 1393184033`
			`# return`
			`num_lines_per_part = num_lines / GIGA_PARTS`
			`curr_part = 0`
			`gof_generator = giga_orig_generator()`
			`# with open(INDIR_GIGA, 'r') as gof:`
			`with open(INDIR_JOS, 'r') as gjf:`
			`sentence = {}`
			`sentence['tokens'] = []`
			`sentence['links'] = {}`
			`if os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % 0)):`
			`ignore_lines = True`
			`wf = False`
			`else:`
			`wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part), 'a')`
			`ignore_lines = False`
			`# for i, (l_gof, l_gjf) in enumerate(zip(gof, gjf)):`
			`for i, l_gjf in enumerate(gjf):`
			`l_gof = next(gof_generator)`
			`if ignore_lines:`
			`if i > num_lines_per_part * curr_part and l_gof == '\n':`
			`if not os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % (curr_part + 2))):`
			`ignore_lines = False`
			`# delete last file (probably not whole)`
			`os.remove(os.path.join(OUTDIR, 'giga%07d.tsv' % (curr_part + 1)))`
			`if ignore_lines:`
			`print(curr_part)`
			`curr_part += 1`
			`continue`
			`else:`
			`continue`
			`l_gof_split = l_gof.split('\t')`
			`l_gjf_split = l_gjf.split('\t')`

			`# if punctuation`
			`if l_gof != '\n':`
			`if l_gof_split[1][-1] == 'u':`
			`# print(l_gjf_split)`
			`sentence['tokens'].append(('c', l_gjf_split[0], l_gjf_split[1]))`
			`else:`
			`sentence['tokens'].append(('w', l_gjf_split[0], l_gof_split[0], l_gof_split[1][:-2], l_gof_split[3][:-1]))`

			`sentence['links'][l_gjf_split[0]] = (l_gjf_split[7], l_gjf_split[0], l_gjf_split[6])`

			`# if l_gof == '\n':`
			`else:`
			`if wf:`
			`# print(i)`
			`wf.write(par.to_conll_2009_SRL(sentence))`
			`sentence['tokens'] = []`
			`sentence['links'] = {}`
			`# wf.flush()`
			`# if i > num_lines_per_part * (curr_part + 1) and l_gof == '\n':`
			`if i > num_lines_per_part * (curr_part + 1):`
			`curr_part += 1`
			`# if wf doesn't exist (first one)`
			`if wf:`
			`wf.close()`
			`wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part), 'a')`
			`curr_part += 1`
			`wf.close()`

			`import time`
			`def handle_giga_file(ran):`
			`"""`
			`File that splits big text file into more minor files. Only split on empty lines.`
			`"""`
			`# with open(INDIR_GIGA, 'r') as gof:`
			`# with open(INDIR_JOS, 'r') as gjf:`
			`# for i, (l_gof, l_gjf) in enumerate(zip(gof, gjf)):`
			`# pass`
			`# num_lines = i + 1`
			`# print(num_lines)`
			`num_lines = 1393184026`
			`# 1393184026`
			`# 1393184033`
			`# return`
			`num_lines_per_part = num_lines / GIGA_PARTS`
			`curr_part = 0`
			`gof_generator = giga_orig_generator()`
			`# with open(INDIR_GIGA, 'r') as gof:`
			`with open(INDIR_JOS, 'r') as gjf:`
			`sentence = {}`
			`sentence['tokens'] = []`
			`sentence['links'] = {}`
			`wf = None`
			`if curr_part in file_indices:`
			`if os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % ran[0])):`
			`os.remove(os.path.join(OUTDIR, 'giga%07d.tsv' % ran[0]))`

			`wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % ran[0]), 'a')`

			`# for i, (l_gof, l_gjf) in enumerate(zip(gof, gjf)):`
			`for i, l_gjf in enumerate(gjf):`
			`l_gof = next(gof_generator)`
			`if curr_part < ran[0]:`
			`if i > num_lines_per_part * (curr_part + 1) and l_gof == '\n' :`
			`if curr_part < ran[0]:`
			`print(curr_part)`
			`curr_part += 1`
			`continue`
			`else:`
			`continue`

			`l_gof_split = l_gof.split('\t')`
			`l_gjf_split = l_gjf.split('\t')`

			`# if punctuation`
			`if l_gof != '\n':`
			`if curr_part not in file_indices:`
			`continue`
			`if l_gof_split[1][-1] == 'u':`
			`# print(l_gjf_split)`
			`sentence['tokens'].append(('c', l_gjf_split[0], l_gjf_split[1]))`
			`else:`
			`sentence['tokens'].append(('w', l_gjf_split[0], l_gof_split[0], l_gof_split[1][:-2], l_gof_split[3][:-1]))`

			`sentence['links'][l_gjf_split[0]] = (l_gjf_split[7], l_gjf_split[0], l_gjf_split[6])`

			`# if l_gof == '\n':`
			`else:`
			`if curr_part in file_indices:`
			`wf.write(par.to_conll_2009_SRL(sentence))`
			`sentence['tokens'] = []`
			`sentence['links'] = {}`
			`# wf.flush()`
			`# if i > num_lines_per_part * (curr_part + 1) and l_gof == '\n':`
			`if i > num_lines_per_part * (curr_part + 1):`
			`curr_part += 1`
			`# if wf doesn't exist (first one)`
			`if curr_part in file_indices and wf:`
			`wf.close()`
			`if curr_part >= ran[1]:`
			`break`
			`if curr_part in file_indices:`
			`if os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part)):`
			`os.remove(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part))`

			`wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part), 'a')`

			`curr_part += 1`
			`wf.close()`

			`def handle_giga_file_selected_sentences(error_sentences):`
			`"""`
			`File that splits big text file into more minor files. Only split on empty lines.`
			`"""`
			`# with open(INDIR_GIGA, 'r') as gof:`
			`# with open(INDIR_JOS, 'r') as gjf:`
			`# for i, (l_gof, l_gjf) in enumerate(zip(gof, gjf)):`
			`# pass`
			`# num_lines = i + 1`
			`# print(num_lines)`
			`# print('num_lines' + 3)`
			`# num_lines = 1393184026`
			`num_lines = 1393222523`
			`# 1393184026`
			`# 1393184033`
			`# return`
			`# num_lines_per_part = num_lines / GIGA_PARTS`
			`# curr_part = 0`
			`gof_generator = giga_orig_generator()`
			`# with open(INDIR_GIGA, 'r') as gof:`
			`with open(INDIR_JOS, 'r') as gjf:`
			`sentence = {}`
			`sentence['tokens'] = []`
			`sentence['links'] = {}`
			`wf = None`
			`if os.path.exists(os.path.join(OUTDIR, 'giga_errors')):`
			`os.remove(os.path.join(OUTDIR, 'giga_errors'))`

			`wf = open(os.path.join(OUTDIR, 'giga_errors'), 'a')`
added multiprocessing to parse_all.py 2019-02-28 09:53:27 +00:00
Big changes 2022-02-04 10:24:47 +00:00			`with open(os.path.join(INTERNAL_DATA, 'sentence_ids_list.pkl'), 'rb') as pkl_file:`
			`sentence_ids_list = pickle.load(pkl_file)`
added multiprocessing to parse_all.py 2019-02-28 09:53:27 +00:00
Big changes 2022-02-04 10:24:47 +00:00			`sentence_id = 0`
			`skip_sentence = not sentence_ids_list[sentence_id] in error_sentences`

			`# for i, (l_gof, l_gjf) in enumerate(zip(gof, gjf)):`
			`for i, l_gjf in enumerate(gjf):`
			`l_gof = next(gof_generator)`


			`if l_gjf == '\n':`
			`if not skip_sentence:`
			`wf.write(par.to_conll_2009_SRL(sentence))`
			`sentence['tokens'] = []`
			`sentence['links'] = {}`
			`sentence_id += 1`
			`if sentence_ids_list[sentence_id] in error_sentences:`
			`print(sentence_ids_list[sentence_id])`
			`skip_sentence = False`
			`else:`
			`skip_sentence = True`

			`if skip_sentence:`
			`continue`


			`# if curr_part < ran[0]:`
			`# if i > num_lines_per_part * (curr_part + 1) and l_gof == '\n' :`
			`# if curr_part < ran[0]:`
			`# print(curr_part)`
			`# curr_part += 1`
			`# continue`
			`# else:`
			`# continue`

			`l_gof_split = l_gof.split('\t')`
			`l_gjf_split = l_gjf.split('\t')`

			`# if punctuation`
			`if l_gof != '\n':`
			`if l_gof_split[1][-1] == 'u':`
			`# print(l_gjf_split)`
			`sentence['tokens'].append(('c', l_gjf_split[0], l_gjf_split[1]))`
			`else:`
			`sentence['tokens'].append(('w', l_gjf_split[0], l_gof_split[0], l_gof_split[1][:-2], l_gof_split[3][:-1]))`

			`sentence['links'][l_gjf_split[0]] = (l_gjf_split[7], l_gjf_split[0], l_gjf_split[6])`

			`# if l_gof == '\n':`
			`# wf.flush()`
			`# if i > num_lines_per_part * (curr_part + 1) and l_gof == '\n':`
			`# if i > num_lines_per_part * (curr_part + 1):`
			`# curr_part += 1`
			`# # if wf doesn't exist (first one)`
			`# if curr_part in file_indices and wf:`
			`# wf.close()`
			`# if curr_part >= ran[1]:`
			`# break`
			`# if curr_part in file_indices:`
			`# if os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part)):`
			`# os.remove(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part))`
			`#`
			`# wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part), 'a')`

			`# curr_part += 1`
			`wf.close()`

			`file_indices = set(range(0, 100000))`
			`with open(os.path.join(INTERNAL_DATA, 'diffs_updated_gigafida.pkl'), 'rb') as pkl_file:`
			`file_indices = set(pickle.load(pkl_file))`

			`with Pool(CPU_CORES) as p:`
			`if analysis == 'kres':`
			`p.map(handle_file, infiles)`
			`elif analysis == 'gigafida':`
			`handle_gigafida_file()`
			`elif analysis == 'giga':`
			`final_range = [0, 100000]`
			`size_per_proc = (final_range[1] - final_range[0]) / CPU_CORES`
			`# splits = [int(final_range[0] + size_per_proc) for i in range(CPU_CORES)]`
			`ranges = []`
			`ps = None`
			`for i in range(CPU_CORES):`
			`s = int(final_range[0] + size_per_proc * i)`
			`ns = int(final_range[0] + size_per_proc * (i + 1))`
			`ranges.append([s, ns])`
			`# ranges = [[0, 1]]`

			`# p.map(handle_giga_file, ranges)`
			`# p.map(handle_giga_file, ranges)`
			`error_sentences = [line.rstrip('\n') for line in open(os.path.join(INTERNAL_DATA, 'sentences_with_less_than_token.txt'))]`
			`handle_giga_file_selected_sentences(set(error_sentences))`


			`logging.info("end parsing kres")`