# cjvt-srl-tagging/tools/parse_all.py

import pickle

from parser.parser import Parser
import os
from os.path import join, dirname
from pathlib import Path
import re
import sys
import cProfile
import configparser
import logging
from multiprocessing import Pool

# Expected sentence count; only referenced by the disabled assert in the
# ssj snippet kept in the string literal further down.
SSJ500K_2_1 = 27829  # number of sentences
# Single Parser instance shared by all handler functions in this module.
par = Parser()

# path to data
# "tools.cfg" is a relative path, so it is looked up in the current working
# directory. ConfigParser.read() silently ignores a missing file; in that case
# the config["tools"] lookups below raise KeyError.
config = configparser.ConfigParser()
config.read("tools.cfg")
# Processing mode: 'kres' or 'giga'. Stays '' when neither key is configured.
# kres wins when both 'kres_orig' and 'giga_orig' are present.
analysis = ''
if 'kres_orig' in config["tools"]:
    analysis = 'kres'
    INDIR = Path(config["tools"]["kres_orig"])
    OUTDIR = Path(config["tools"]["kres_tsv"])
elif 'giga_orig' in config["tools"]:
    # analysis = 'gigafida'
    analysis = 'giga'
    # NOTE(review): the names are crossed relative to the keys --
    # INDIR_GIGA_ORIG reads "giga" while INDIR_GIGA reads "giga_orig".
    INDIR_GIGA_ORIG = Path(config["tools"]["giga"])
    INDIR_GIGA = Path(config["tools"]["giga_orig"])
    INDIR_JOS = Path(config["tools"]["giga_jos"])
    OUTDIR = Path(config["tools"]["giga_tsv"])
    GIGA_PARTS = int(config["tools"]["giga_parts"])
    # NOTE(review): INTERNAL_DATA is defined only in this branch; any code
    # that uses it in kres mode fails with NameError.
    INTERNAL_DATA = config["tools"]["internal_data"]

# Size of the multiprocessing pool created at the bottom of the module.
CPU_CORES = int(config["tools"]["cpu_cores"])

LOGFILE = Path(config["tools"]["logfile"]).absolute()
LOGFILE.touch(exist_ok=True)
# NOTE(review): resolve() returns a new path and the result is discarded here,
# so this statement does not change LOGFILE.
LOGFILE.resolve()

logging.basicConfig(filename=str(LOGFILE), level=logging.INFO)

"""
print("parsing ssj")
ssj_file = "../data/ssj500k-sl.sample.xml"
ssj_dict = par.parse_tei(ssj_file)
# assert (len(ssj_dict) == 27829), "Parsed wrong number of sentences."
print("end parsing ssj")
"""

# kres_file = "../data/kres_example/F0019343.xml.parsed.xml"
# Only the last path component is created (parents are not); raises NameError
# when neither mode was selected above because OUTDIR is then undefined.
OUTDIR.mkdir(exist_ok=True)

if analysis == 'kres':
    # (index, path) pairs for every regular file directly inside INDIR; the
    # index is used only for progress logging in handle_file.
    infiles = list(enumerate([x for x in INDIR.iterdir() if x.is_file()]))
    logging.info("Parsing kres: {} files.".format(len(infiles)))

def handle_file(infile):
    """
    Convert one kres TEI file into a CoNLL-2009 SRL ``.tsv`` file in OUTDIR.

    :param infile: ``(index, path)`` pair as produced by ``enumerate`` over the
        input files; the index is used only for progress logging.
    :return: True when the output already existed or was written, False when
        parsing or conversion failed (the failure is logged with traceback).
    """
    i, kres_file = infile
    outfile = (OUTDIR / kres_file.name).with_suffix(".tsv")

    # Existing outputs are treated as done so an interrupted run can resume.
    if outfile.is_file():
        logging.info("Skipping existing file: {}.".format(str(kres_file)))
        return True

    try:
        res_dict = par.parse_tei(kres_file)
        # join() instead of repeated += keeps large files linear in time.
        kres_out_str = "".join(
            par.to_conll_2009_SRL(sentence) for sentence in res_dict.values()
        )
    except Exception as exc:
        # Best effort: one broken file must not abort the whole pool run.
        logging.info("Failed processing file: {}".format(str(kres_file)))
        # logging.exception keeps the traceback, not just the message.
        logging.exception(exc)
        return False

    outfile.write_bytes(kres_out_str.encode("utf-8"))
    logging.info("Processed file ({}/{}): {}".format(i+1, len(infiles), str(kres_file)))
    return True

def giga_orig_generator():
    """
    Yield the lines of INDIR_GIGA, collapsing each run of consecutive blank
    lines (lines equal to a bare newline) into a single blank line.
    """
    with open(INDIR_GIGA, 'r') as source:
        last_was_blank = False
        for line in source:
            is_blank = line == '\n'
            # Second and later blank lines of a run are dropped.
            if is_blank and last_was_blank:
                continue
            last_was_blank = is_blank
            yield line


def handle_gigafida_file():
    """
    Split the Gigafida input into numbered ``giga%07d.tsv`` part files.

    INDIR_JOS is read line by line in lockstep with ``giga_orig_generator()``.
    Lines between blank lines are collected into one sentence dict, which is
    serialised with ``par.to_conll_2009_SRL`` and appended to the current part.
    A new part is started only on a blank line, once the line index has passed
    ``num_lines_per_part * (curr_part + 1)``, so a sentence is never split
    across two parts.

    Resuming: when part 0 already exists, input is skipped until a boundary is
    reached where part ``curr_part + 2`` is missing. Part ``curr_part + 1`` is
    then deleted (it is probably incomplete) and written again once its
    boundary has been passed.

    NOTE(review): when only part 0 exists, the ``os.remove`` in the resume path
    targets part 1, which does not exist, and raises FileNotFoundError.
    NOTE(review): ``next(gof_generator)`` raises StopIteration if the
    generator runs out before INDIR_JOS does.
    """
    # Hard-coded line count; the counting pass that produced it was disabled.
    # Presumably measured on the input files -- confirm before reuse.
    num_lines = 1393184026
    num_lines_per_part = num_lines / GIGA_PARTS
    curr_part = 0
    gof_generator = giga_orig_generator()
    wf = None
    with open(INDIR_JOS, 'r') as gjf:
        sentence = {'tokens': [], 'links': {}}
        if os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % 0)):
            # A previous run left output behind: skip ahead before writing.
            ignore_lines = True
        else:
            wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part), 'a')
            ignore_lines = False
        try:
            for i, l_gjf in enumerate(gjf):
                l_gof = next(gof_generator)
                if ignore_lines:
                    if i > num_lines_per_part * curr_part and l_gof == '\n':
                        if not os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % (curr_part + 2))):
                            ignore_lines = False
                            # delete last file (probably not whole)
                            os.remove(os.path.join(OUTDIR, 'giga%07d.tsv' % (curr_part + 1)))
                        if ignore_lines:
                            print(curr_part)
                            curr_part += 1
                            continue
                    else:
                        continue
                l_gof_split = l_gof.split('\t')
                l_gjf_split = l_gjf.split('\t')

                if l_gof != '\n':
                    # A second column ending in 'u' is handled as punctuation
                    # (per the original comment); everything else as a word.
                    if l_gof_split[1][-1] == 'u':
                        sentence['tokens'].append(('c', l_gjf_split[0], l_gjf_split[1]))
                    else:
                        sentence['tokens'].append(('w', l_gjf_split[0], l_gof_split[0], l_gof_split[1][:-2], l_gof_split[3][:-1]))

                    sentence['links'][l_gjf_split[0]] = (l_gjf_split[7], l_gjf_split[0], l_gjf_split[6])
                else:
                    # Blank line: the sentence is complete. wf is still None
                    # while the part being resumed has not started yet.
                    if wf is not None:
                        wf.write(par.to_conll_2009_SRL(sentence))
                    sentence['tokens'] = []
                    sentence['links'] = {}
                    if i > num_lines_per_part * (curr_part + 1):
                        curr_part += 1
                        if wf is not None:
                            wf.close()
                        wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part), 'a')
        finally:
            # wf stays None when every part already existed, so guard the
            # close; the finally also releases the handle on errors.
            if wf is not None:
                wf.close()

import  time
def _open_giga_part(part):
    """Open a fresh ``giga%07d.tsv`` for *part*, deleting any older version."""
    path = os.path.join(OUTDIR, 'giga%07d.tsv' % part)
    if os.path.exists(path):
        os.remove(path)
    return open(path, 'a')


def handle_giga_file(ran):
    """
    Regenerate the selected Gigafida parts whose number lies in ``ran``.

    INDIR_JOS is read in lockstep with ``giga_orig_generator()``. Input that
    belongs to parts before ``ran[0]`` is skipped. From then on, parts listed
    in the module-level ``file_indices`` set are rebuilt from scratch as
    ``giga%07d.tsv`` in OUTDIR; other parts are read past without parsing.
    Part boundaries fall on the first blank line after the line index exceeds
    ``num_lines_per_part * (curr_part + 1)``.

    :param ran: ``[first_part, end_part]``; processing stops as soon as
        ``curr_part`` reaches ``end_part`` (exclusive upper bound).

    NOTE(review): ``next(gof_generator)`` raises StopIteration if the
    generator runs out before INDIR_JOS does.
    """
    # Hard-coded line count; the counting pass that produced it was disabled.
    # Presumably measured on the input files -- confirm before reuse.
    num_lines = 1393184026
    num_lines_per_part = num_lines / GIGA_PARTS
    curr_part = 0
    gof_generator = giga_orig_generator()
    wf = None
    with open(INDIR_JOS, 'r') as gjf:
        sentence = {'tokens': [], 'links': {}}
        try:
            # The first file written is part ran[0], so that is the part whose
            # selection decides whether to open it. Testing curr_part (always
            # 0 here) truncated unselected parts or left wf unset.
            if ran[0] in file_indices:
                wf = _open_giga_part(ran[0])

            for i, l_gjf in enumerate(gjf):
                l_gof = next(gof_generator)

                # Fast-forward over parts that belong to other workers.
                if curr_part < ran[0]:
                    if i > num_lines_per_part * (curr_part + 1) and l_gof == '\n':
                        print(curr_part)
                        curr_part += 1
                    continue

                if l_gof != '\n':
                    if curr_part not in file_indices:
                        continue
                    l_gof_split = l_gof.split('\t')
                    l_gjf_split = l_gjf.split('\t')
                    # A second column ending in 'u' is handled as punctuation
                    # (per the original comment); everything else as a word.
                    if l_gof_split[1][-1] == 'u':
                        sentence['tokens'].append(('c', l_gjf_split[0], l_gjf_split[1]))
                    else:
                        sentence['tokens'].append(('w', l_gjf_split[0], l_gof_split[0], l_gof_split[1][:-2], l_gof_split[3][:-1]))

                    sentence['links'][l_gjf_split[0]] = (l_gjf_split[7], l_gjf_split[0], l_gjf_split[6])
                    continue

                # Blank line: the current sentence is complete.
                if curr_part in file_indices:
                    wf.write(par.to_conll_2009_SRL(sentence))
                    sentence['tokens'] = []
                    sentence['links'] = {}

                if i > num_lines_per_part * (curr_part + 1):
                    # Always release the finished part's handle, regardless of
                    # whether the next part is selected.
                    if wf is not None:
                        wf.close()
                        wf = None
                    curr_part += 1
                    if curr_part >= ran[1]:
                        break
                    if curr_part in file_indices:
                        wf = _open_giga_part(curr_part)
        finally:
            # wf is None when the last part handled was not selected.
            if wf is not None:
                wf.close()

def handle_giga_file_selected_sentences(error_sentences):
    """
    Re-export only the selected sentences into ``OUTDIR/giga_errors``.

    INDIR_JOS is read in lockstep with ``giga_orig_generator()``. Sentences are
    counted by blank lines in INDIR_JOS; the n-th sentence is looked up in
    ``sentence_ids_list.pkl`` (read from INTERNAL_DATA) and is written, via
    ``par.to_conll_2009_SRL``, only when its id is in ``error_sentences``.
    Any existing ``giga_errors`` file is replaced.

    :param error_sentences: container of sentence ids to export; a set is
        recommended because membership is tested once per sentence.

    NOTE(review): ``next(gof_generator)`` raises StopIteration if the
    generator runs out before INDIR_JOS does.
    """
    gof_generator = giga_orig_generator()

    # Load the id list before touching the output so that a missing or broken
    # pickle does not destroy the previous result.
    # NOTE: pickle.load can execute arbitrary code; only use trusted files.
    with open(os.path.join(INTERNAL_DATA, 'sentence_ids_list.pkl'), 'rb') as pkl_file:
        sentence_ids_list = pickle.load(pkl_file)

    def is_selected(index):
        # Indices past the end of the id list are never selected; the blank
        # line that closes the final sentence would otherwise raise IndexError.
        return index < len(sentence_ids_list) and sentence_ids_list[index] in error_sentences

    out_path = os.path.join(OUTDIR, 'giga_errors')
    if os.path.exists(out_path):
        os.remove(out_path)

    sentence = {'tokens': [], 'links': {}}
    sentence_id = 0
    skip_sentence = not is_selected(sentence_id)

    with open(INDIR_JOS, 'r') as gjf, open(out_path, 'a') as wf:
        for l_gjf in gjf:
            l_gof = next(gof_generator)

            if l_gjf == '\n':
                # Sentence boundary: flush the finished sentence if it was
                # selected, then decide about the next one.
                if not skip_sentence:
                    wf.write(par.to_conll_2009_SRL(sentence))
                    sentence['tokens'] = []
                    sentence['links'] = {}
                sentence_id += 1
                if is_selected(sentence_id):
                    print(sentence_ids_list[sentence_id])
                    skip_sentence = False
                else:
                    skip_sentence = True

            if skip_sentence:
                continue

            l_gof_split = l_gof.split('\t')
            l_gjf_split = l_gjf.split('\t')

            if l_gof != '\n':
                # A second column ending in 'u' is handled as punctuation
                # (per the original comment); everything else as a word.
                if l_gof_split[1][-1] == 'u':
                    sentence['tokens'].append(('c', l_gjf_split[0], l_gjf_split[1]))
                else:
                    sentence['tokens'].append(('w', l_gjf_split[0], l_gof_split[0], l_gof_split[1][:-2], l_gof_split[3][:-1]))

                sentence['links'][l_gjf_split[0]] = (l_gjf_split[7], l_gjf_split[0], l_gjf_split[6])

# Part numbers that handle_giga_file is allowed to (re)generate. Defaults to
# every part; in giga mode it is narrowed to the list stored in the pickle.
file_indices = set(range(0, 100000))
if analysis == 'giga':
    # INTERNAL_DATA is configured only in giga mode, so loading the pickle
    # unconditionally raised NameError before any kres file was processed.
    # NOTE: pickle.load can execute arbitrary code; only use trusted files.
    with open(os.path.join(INTERNAL_DATA, 'diffs_updated_gigafida.pkl'), 'rb') as pkl_file:
        file_indices = set(pickle.load(pkl_file))

with Pool(CPU_CORES) as p:
    if analysis == 'kres':
        p.map(handle_file, infiles)
    elif analysis == 'gigafida':
        # NOTE(review): the configuration code never sets analysis to
        # 'gigafida', so this branch is currently not reachable.
        handle_gigafida_file()
    elif analysis == 'giga':
        # Split the part numbers into one contiguous [start, end) range per
        # worker. The ranges feed the parallel handle_giga_file run, which is
        # currently disabled in favour of the selected-sentence export below.
        final_range = [0, 100000]
        size_per_proc = (final_range[1] - final_range[0]) / CPU_CORES
        ranges = []
        for i in range(CPU_CORES):
            s = int(final_range[0] + size_per_proc * i)
            ns = int(final_range[0] + size_per_proc * (i + 1))
            ranges.append([s, ns])
        # p.map(handle_giga_file, ranges)

        # One sentence id per line; the context manager closes the list file.
        with open(os.path.join(INTERNAL_DATA, 'sentences_with_less_than_token.txt')) as error_file:
            error_sentences = [line.rstrip('\n') for line in error_file]
        handle_giga_file_selected_sentences(set(error_sentences))


logging.info("end parsing kres")