import configparser
import cProfile
import logging
import os
import pickle
import re
import sys
import time
from multiprocessing import Pool
from os.path import join, dirname
from pathlib import Path

from parser.parser import Parser

SSJ500K_2_1 = 27829 # number of sentences

par = Parser()

# path to data
config = configparser.ConfigParser()
# config.read("tools.cfg")
config.read("tools.cfg.ssj500k2.3")

analysis = ''
if 'kres_orig' in config["tools"]:
    analysis = 'kres'
    INDIR = Path(config["tools"]["kres_orig"])
    OUTDIR = Path(config["tools"]["kres_tsv"])
elif 'giga_orig' in config["tools"]:
    # analysis = 'gigafida'
    analysis = 'giga'
    INDIR_GIGA_ORIG = Path(config["tools"]["giga"])
    INDIR_GIGA = Path(config["tools"]["giga_orig"])
    INDIR_JOS = Path(config["tools"]["giga_jos"])
    OUTDIR = Path(config["tools"]["giga_tsv"])
    GIGA_PARTS = int(config["tools"]["giga_parts"])
    INTERNAL_DATA = config["tools"]["internal_data"]
elif 'ssj500k_orig' in config["tools"]:
    analysis = 'ssj500k'
    INDIR_SSJ500K_ORIG = Path(config["tools"]["ssj500k"])
    INDIR_SSJ500K = Path(config["tools"]["ssj500k_orig"])
    INDIR_JOS = Path(config["tools"]["ssj500k_jos"])
    OUTDIR = Path(config["tools"]["ssj500k_tsv"])
    INTERNAL_DATA = config["tools"]["internal_data"]

CPU_CORES = int(config["tools"]["cpu_cores"])

LOGFILE = Path(config["tools"]["logfile"]).absolute()
LOGFILE.touch(exist_ok=True)
LOGFILE.resolve()

logging.basicConfig(filename=str(LOGFILE), level=logging.INFO)
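
# For reference, a minimal sketch of the tools.cfg file this script expects.
# Section and key names are taken from the reads above; the paths are
# placeholders, not the project's actual layout:
#
#   [tools]
#   ssj500k       = /data/ssj500k-sl.TEI.xml
#   ssj500k_orig  = /data/ssj500k.orig.txt
#   ssj500k_jos   = /data/ssj500k.jos.tsv
#   ssj500k_tsv   = /data/ssj500k_out.tsv
#   internal_data = /data/internal
#   cpu_cores     = 4
#   logfile       = parse_all.log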

"""
print("parsing ssj")
ssj_file = "../data/ssj500k-sl.sample.xml"
ssj_dict = par.parse_tei(ssj_file)
# assert (len(ssj_dict) == 27829), "Parsed wrong number of sentences."
print("end parsing ssj")
"""

# kres_file = "../data/kres_example/F0019343.xml.parsed.xml"
# OUTDIR.mkdir(exist_ok=True)

if analysis == 'kres':
    infiles = list(enumerate([x for x in INDIR.iterdir() if x.is_file()]))
    logging.info("Parsing kres: {} files.".format(len(infiles)))


def handle_ssj500k_file():
    """Parse the ssj500k TEI file and dump it into a single CoNLL 2009 SRL file."""
    kres_file = INDIR_SSJ500K_ORIG  # local names kept from the kres variant
    outfile = OUTDIR

    if outfile.is_file():
        logging.info("Skipping existing file: {}.".format(str(kres_file)))
        return True

    try:
        res_dict = par.parse_tei(kres_file)
        kres_out_str = ""
        for _, sentence in res_dict.items():
            kres_out_str += par.to_conll_2009_SRL(sentence)
    except Exception as exc:
        logging.info("Failed processing file: {}".format(str(kres_file)))
        logging.error(exc)
        return False

    with outfile.open("wb+") as fp:
        fp.write(kres_out_str.encode("utf-8"))
    # logging.info("Processed file ({}/{}): {}".format(i+1, len(infiles), str(kres_file)))
    return True


def ssj500k_orig_generator():
    """Yield lines of the original ssj500k file, collapsing runs of empty lines into one."""
    with open(INDIR_SSJ500K, 'r') as gof:
        previous_new_line = False
        for l_gof in gof:
            if l_gof == '\n':
                if previous_new_line:
                    continue
                previous_new_line = True
            elif previous_new_line:
                previous_new_line = False
            yield l_gof


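# giga_orig_generator is called by the gigafida handlers below but is not
# defined anywhere in this file; the sketch here mirrors
# ssj500k_orig_generator and assumes INDIR_GIGA points at the original
# gigafida text file.
def giga_orig_generator():
    """Yield lines of the original gigafida file, collapsing runs of empty lines into one."""
    with open(INDIR_GIGA, 'r') as gof:
        previous_new_line = False
        for l_gof in gof:
            if l_gof == '\n':
                if previous_new_line:
                    continue
                previous_new_line = True
            elif previous_new_line:
                previous_new_line = False
            yield l_gof

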
def handle_gigafida_file():
    """
    Split the big gigafida text file into smaller part files. Splits happen
    only on empty lines, i.e. on sentence boundaries.
    """
    # One-off snippet that was used to obtain the total line count:
    # with open(INDIR_GIGA, 'r') as gof:
    #     with open(INDIR_JOS, 'r') as gjf:
    #         for i, (l_gof, l_gjf) in enumerate(zip(gof, gjf)):
    #             pass
    #         num_lines = i + 1
    #         print(num_lines)

    # curr_part and num_lines_per_part were undefined in the original; the
    # initialization below mirrors handle_giga_file and is an assumption.
    num_lines = 1393184026
    num_lines_per_part = num_lines / GIGA_PARTS
    curr_part = 0

    gof_generator = giga_orig_generator()
    with open(INDIR_JOS, 'r') as gjf:
        sentence = {}
        sentence['tokens'] = []
        sentence['links'] = {}

        # If part files already exist on disk, skip ahead instead of
        # rewriting them.
        if os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % 0)):
            ignore_lines = True
            wf = False
        else:
            wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part), 'a')
            ignore_lines = False

        for i, l_gjf in enumerate(gjf):
            l_gof = next(gof_generator)
            if ignore_lines:
                # Fast-forward over parts that are already on disk. The resume
                # is approximate: the last (probably partial) file is deleted
                # and writing restarts at the next part boundary.
                if i > num_lines_per_part * curr_part and l_gof == '\n':
                    if not os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % (curr_part + 2))):
                        ignore_lines = False
                        # delete the last file (probably not whole)
                        os.remove(os.path.join(OUTDIR, 'giga%07d.tsv' % (curr_part + 1)))
                    if ignore_lines:
                        print(curr_part)
                        curr_part += 1
                    continue
                else:
                    continue

            l_gof_split = l_gof.split('\t')
            l_gjf_split = l_gjf.split('\t')

            if l_gof != '\n':
                # a trailing 'u' in the second column marks punctuation
                if l_gof_split[1][-1] == 'u':
                    sentence['tokens'].append(('c', l_gjf_split[0], l_gjf_split[1]))
                else:
                    sentence['tokens'].append(('w', l_gjf_split[0], l_gof_split[0], l_gof_split[1][:-2], l_gof_split[3][:-1]))

                sentence['links'][l_gjf_split[0]] = (l_gjf_split[7], l_gjf_split[0], l_gjf_split[6])

            # empty line = end of sentence: write it out and start a new one
            else:
                if wf:
                    wf.write(par.to_conll_2009_SRL(sentence))
                sentence['tokens'] = []
                sentence['links'] = {}

                # roll over to the next part file at the part boundary
                if i > num_lines_per_part * (curr_part + 1):
                    curr_part += 1
                    if wf:  # wf doesn't exist yet on the first resume
                        wf.close()
                    wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part), 'a')

        curr_part += 1
        if wf:
            wf.close()


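# For reference, the shape of the sentence dict consumed by
# par.to_conll_2009_SRL, as built by the handlers in this file. The concrete
# values, and the tag/lemma reading of the sliced columns, are illustrative
# assumptions rather than documented facts:
#
#   sentence = {
#       'tokens': [
#           ('w', '1', 'hiša', 'Ncfsn', 'hiša'),  # word: (type, id, form, tag, lemma)
#           ('c', '2', ','),                      # punctuation: (type, id, form)
#       ],
#       'links': {
#           '1': ('<deprel>', '1', '0'),          # id -> (relation, id, head id)
#       },
#   }
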
def handle_ssj500k_file2():
    """
    Split the big ssj500k text file into smaller part files. Splits happen
    only on empty lines, i.e. on sentence boundaries. Near-duplicate of
    handle_gigafida_file, reading the ssj500k inputs instead.
    """
    # curr_part and num_lines_per_part were undefined in the original; the
    # initialization below is an assumption so the splitting logic can run.
    # SSJ500K_PARTS is hypothetical: the ssj500k config section defines no
    # part count of its own.
    SSJ500K_PARTS = 100
    with open(INDIR_JOS, 'r') as f:
        num_lines = sum(1 for _ in f)
    num_lines_per_part = num_lines / SSJ500K_PARTS
    curr_part = 0

    gof_generator = ssj500k_orig_generator()
    with open(INDIR_JOS, 'r') as gjf:
        sentence = {}
        sentence['tokens'] = []
        sentence['links'] = {}

        # If part files already exist on disk, skip ahead instead of
        # rewriting them. The 'giga' file name pattern is kept from the giga
        # variant of this function.
        if os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % 0)):
            ignore_lines = True
            wf = False
        else:
            wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part), 'a')
            ignore_lines = False

        for i, l_gjf in enumerate(gjf):
            l_gof = next(gof_generator)
            if ignore_lines:
                # Approximate resume, as in handle_gigafida_file.
                if i > num_lines_per_part * curr_part and l_gof == '\n':
                    if not os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % (curr_part + 2))):
                        ignore_lines = False
                        # delete the last file (probably not whole)
                        os.remove(os.path.join(OUTDIR, 'giga%07d.tsv' % (curr_part + 1)))
                    if ignore_lines:
                        print(curr_part)
                        curr_part += 1
                    continue
                else:
                    continue

            l_gof_split = l_gof.split('\t')
            l_gjf_split = l_gjf.split('\t')

            if l_gof != '\n':
                # a trailing 'u' in the second column marks punctuation
                if l_gof_split[1][-1] == 'u':
                    sentence['tokens'].append(('c', l_gjf_split[0], l_gjf_split[1]))
                else:
                    sentence['tokens'].append(('w', l_gjf_split[0], l_gof_split[0], l_gof_split[1][:-2], l_gof_split[3][:-1]))

                sentence['links'][l_gjf_split[0]] = (l_gjf_split[7], l_gjf_split[0], l_gjf_split[6])

            # empty line = end of sentence: write it out and start a new one
            else:
                if wf:
                    wf.write(par.to_conll_2009_SRL(sentence))
                sentence['tokens'] = []
                sentence['links'] = {}

                # roll over to the next part file at the part boundary
                if i > num_lines_per_part * (curr_part + 1):
                    curr_part += 1
                    if wf:  # wf doesn't exist yet on the first resume
                        wf.close()
                    wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part), 'a')

        curr_part += 1
        if wf:
            wf.close()


def handle_giga_file(ran):
    """
    Split the big gigafida text file into smaller part files, writing only the
    parts whose indices fall in the half-open range ran = (start, end).
    Splits happen only on empty lines, i.e. on sentence boundaries.
    """
    # One-off snippet that was used to obtain the total line count:
    # with open(INDIR_GIGA, 'r') as gof:
    #     with open(INDIR_JOS, 'r') as gjf:
    #         for i, (l_gof, l_gjf) in enumerate(zip(gof, gjf)):
    #             pass
    #         num_lines = i + 1
    #         print(num_lines)
    num_lines = 1393184026
    # 1393184026
    # 1393184033
    num_lines_per_part = num_lines / GIGA_PARTS
    curr_part = 0

    # file_indices was undefined in the original; deriving it from ran is an
    # assumption, but it is consistent with how it is used below.
    file_indices = set(range(ran[0], ran[1]))

    gof_generator = giga_orig_generator()
    with open(INDIR_JOS, 'r') as gjf:
        sentence = {}
        sentence['tokens'] = []
        sentence['links'] = {}
        wf = None
        if curr_part in file_indices:
            # start fresh: drop a possibly partial file from an earlier run
            if os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % ran[0])):
                os.remove(os.path.join(OUTDIR, 'giga%07d.tsv' % ran[0]))
            wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % ran[0]), 'a')

        for i, l_gjf in enumerate(gjf):
            l_gof = next(gof_generator)
            # fast-forward to the first part in our range
            if curr_part < ran[0]:
                if i > num_lines_per_part * (curr_part + 1) and l_gof == '\n':
                    if curr_part < ran[0]:
                        print(curr_part)
                        curr_part += 1
                    continue
                else:
                    continue

            l_gof_split = l_gof.split('\t')
            l_gjf_split = l_gjf.split('\t')

            if l_gof != '\n':
                if curr_part not in file_indices:
                    continue
                # a trailing 'u' in the second column marks punctuation
                if l_gof_split[1][-1] == 'u':
                    sentence['tokens'].append(('c', l_gjf_split[0], l_gjf_split[1]))
                else:
                    sentence['tokens'].append(('w', l_gjf_split[0], l_gof_split[0], l_gof_split[1][:-2], l_gof_split[3][:-1]))

                sentence['links'][l_gjf_split[0]] = (l_gjf_split[7], l_gjf_split[0], l_gjf_split[6])

            # empty line = end of sentence: write it out and start a new one
            else:
                if curr_part in file_indices:
                    wf.write(par.to_conll_2009_SRL(sentence))
                sentence['tokens'] = []
                sentence['links'] = {}

                # roll over to the next part file at the part boundary
                if i > num_lines_per_part * (curr_part + 1):
                    curr_part += 1
                    # close the previous file (wf doesn't exist yet on the first one)
                    if curr_part in file_indices and wf:
                        wf.close()
                    if curr_part >= ran[1]:
                        break
                    if curr_part in file_indices:
                        if os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part)):
                            os.remove(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part))
                        wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part), 'a')

        curr_part += 1
        if wf:
            wf.close()


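# Pool and CPU_CORES are set up above but never used in this file. A minimal
# sketch of how handle_giga_file was presumably driven in parallel; the range
# chunking and the name run_giga_parallel are assumptions, not part of the
# original:
def run_giga_parallel():
    step = max(1, GIGA_PARTS // CPU_CORES)
    ranges = [(start, min(start + step, GIGA_PARTS))
              for start in range(0, GIGA_PARTS, step)]
    with Pool(CPU_CORES) as pool:
        pool.map(handle_giga_file, ranges)

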
def handle_giga_file_selected_sentences(error_sentences):
    """
    Re-parse only the sentences whose ids are in error_sentences and write
    them to a single 'giga_errors' file. Sentences are delimited by empty
    lines.
    """
    # One-off snippet that was used to obtain the total line count:
    # with open(INDIR_GIGA, 'r') as gof:
    #     with open(INDIR_JOS, 'r') as gjf:
    #         for i, (l_gof, l_gjf) in enumerate(zip(gof, gjf)):
    #             pass
    #         num_lines = i + 1
    #         print(num_lines)
    # num_lines = 1393184026
    num_lines = 1393222523  # unused below; kept for reference

    gof_generator = giga_orig_generator()
    with open(INDIR_JOS, 'r') as gjf:
        sentence = {}
        sentence['tokens'] = []
        sentence['links'] = {}

        # always start the error file from scratch
        if os.path.exists(os.path.join(OUTDIR, 'giga_errors')):
            os.remove(os.path.join(OUTDIR, 'giga_errors'))
        wf = open(os.path.join(OUTDIR, 'giga_errors'), 'a')

        # ordered sentence ids, aligned with the sentence order in the input
        with open(os.path.join(INTERNAL_DATA, 'sentence_ids_list.pkl'), 'rb') as pkl_file:
            sentence_ids_list = pickle.load(pkl_file)

        sentence_id = 0
        skip_sentence = sentence_ids_list[sentence_id] not in error_sentences

        for i, l_gjf in enumerate(gjf):
            l_gof = next(gof_generator)

            # empty line = end of sentence: flush it if it was selected, then
            # decide whether the next sentence is selected
            if l_gjf == '\n':
                if not skip_sentence:
                    wf.write(par.to_conll_2009_SRL(sentence))
                sentence['tokens'] = []
                sentence['links'] = {}
                sentence_id += 1
                if sentence_id < len(sentence_ids_list) and sentence_ids_list[sentence_id] in error_sentences:
                    print(sentence_ids_list[sentence_id])
                    skip_sentence = False
                else:
                    skip_sentence = True

            if skip_sentence:
                continue

            l_gof_split = l_gof.split('\t')
            l_gjf_split = l_gjf.split('\t')

            if l_gof != '\n':
                # a trailing 'u' in the second column marks punctuation
                if l_gof_split[1][-1] == 'u':
                    sentence['tokens'].append(('c', l_gjf_split[0], l_gjf_split[1]))
                else:
                    sentence['tokens'].append(('w', l_gjf_split[0], l_gof_split[0], l_gof_split[1][:-2], l_gof_split[3][:-1]))

                sentence['links'][l_gjf_split[0]] = (l_gjf_split[7], l_gjf_split[0], l_gjf_split[6])

        wf.close()


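# A hypothetical driver for the error-reparse path above. The id file name is
# an assumption, not part of the original project:
#
#   with open(os.path.join(INTERNAL_DATA, 'error_sentence_ids.txt')) as f:
#       handle_giga_file_selected_sentences({line.strip() for line in f})
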
handle_ssj500k_file()

logging.info("end parsing ssj500k")