import pickle from parser.parser import Parser import os from os.path import join, dirname from pathlib import Path import re import sys import cProfile import configparser import logging from multiprocessing import Pool SSJ500K_2_1 = 27829 # number of sentences par = Parser() # path to data config = configparser.ConfigParser() config.read("tools.cfg") analysis = '' if 'kres_orig' in config["tools"]: analysis = 'kres' INDIR = Path(config["tools"]["kres_orig"]) OUTDIR = Path(config["tools"]["kres_tsv"]) elif 'giga_orig' in config["tools"]: # analysis = 'gigafida' analysis = 'giga' INDIR_GIGA_ORIG = Path(config["tools"]["giga"]) INDIR_GIGA = Path(config["tools"]["giga_orig"]) INDIR_JOS = Path(config["tools"]["giga_jos"]) OUTDIR = Path(config["tools"]["giga_tsv"]) GIGA_PARTS = int(config["tools"]["giga_parts"]) INTERNAL_DATA = config["tools"]["internal_data"] CPU_CORES = int(config["tools"]["cpu_cores"]) LOGFILE = Path(config["tools"]["logfile"]).absolute() LOGFILE.touch(exist_ok=True) LOGFILE.resolve() logging.basicConfig(filename=str(LOGFILE), level=logging.INFO) """ print("parsing ssj") ssj_file = "../data/ssj500k-sl.sample.xml" ssj_dict = par.parse_tei(ssj_file) # assert (len(ssj_dict) == 27829), "Parsed wrong number of sentences." print("end parsing ssj") """ # kres_file = "../data/kres_example/F0019343.xml.parsed.xml" OUTDIR.mkdir(exist_ok=True) if analysis == 'kres': infiles = list(enumerate([x for x in INDIR.iterdir() if x.is_file()])) logging.info("Parsing kres: {} files.".format(len(infiles))) def handle_file(infile): i = infile[0] kres_file = infile[1] outfile = (OUTDIR / kres_file.name).with_suffix(".tsv") if outfile.is_file(): logging.info("Skipping existing file: {}.".format(str(kres_file))) return True try: res_dict = par.parse_tei(kres_file) kres_out_str = "" for _, sentence in res_dict.items(): kres_out_str += par.to_conll_2009_SRL(sentence) except Exception as exc: logging.info("Failed processing file: {}".format(str(kres_file))) logging.error(exc) return False with outfile.open("wb+") as fp: fp.write(kres_out_str.encode("utf-8")) logging.info("Processed file ({}/{}): {}".format(i+1, len(infiles), str(kres_file))) return True return False def giga_orig_generator(): with open(INDIR_GIGA, 'r') as gof: previous_new_line = False for l_gof in gof: if l_gof == '\n': if previous_new_line: continue previous_new_line = True elif previous_new_line: previous_new_line = False yield l_gof def handle_gigafida_file(): """ File that splits big text file into more minor files. Only split on empty lines. """ # with open(INDIR_GIGA, 'r') as gof: # with open(INDIR_JOS, 'r') as gjf: # for i, (l_gof, l_gjf) in enumerate(zip(gof, gjf)): # pass # num_lines = i + 1 # print(num_lines) num_lines = 1393184026 # 1393184026 # 1393184033 # return num_lines_per_part = num_lines / GIGA_PARTS curr_part = 0 gof_generator = giga_orig_generator() # with open(INDIR_GIGA, 'r') as gof: with open(INDIR_JOS, 'r') as gjf: sentence = {} sentence['tokens'] = [] sentence['links'] = {} if os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % 0)): ignore_lines = True wf = False else: wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part), 'a') ignore_lines = False # for i, (l_gof, l_gjf) in enumerate(zip(gof, gjf)): for i, l_gjf in enumerate(gjf): l_gof = next(gof_generator) if ignore_lines: if i > num_lines_per_part * curr_part and l_gof == '\n': if not os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % (curr_part + 2))): ignore_lines = False # delete last file (probably not whole) os.remove(os.path.join(OUTDIR, 'giga%07d.tsv' % (curr_part + 1))) if ignore_lines: print(curr_part) curr_part += 1 continue else: continue l_gof_split = l_gof.split('\t') l_gjf_split = l_gjf.split('\t') # if punctuation if l_gof != '\n': if l_gof_split[1][-1] == 'u': # print(l_gjf_split) sentence['tokens'].append(('c', l_gjf_split[0], l_gjf_split[1])) else: sentence['tokens'].append(('w', l_gjf_split[0], l_gof_split[0], l_gof_split[1][:-2], l_gof_split[3][:-1])) sentence['links'][l_gjf_split[0]] = (l_gjf_split[7], l_gjf_split[0], l_gjf_split[6]) # if l_gof == '\n': else: if wf: # print(i) wf.write(par.to_conll_2009_SRL(sentence)) sentence['tokens'] = [] sentence['links'] = {} # wf.flush() # if i > num_lines_per_part * (curr_part + 1) and l_gof == '\n': if i > num_lines_per_part * (curr_part + 1): curr_part += 1 # if wf doesn't exist (first one) if wf: wf.close() wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part), 'a') curr_part += 1 wf.close() import time def handle_giga_file(ran): """ File that splits big text file into more minor files. Only split on empty lines. """ # with open(INDIR_GIGA, 'r') as gof: # with open(INDIR_JOS, 'r') as gjf: # for i, (l_gof, l_gjf) in enumerate(zip(gof, gjf)): # pass # num_lines = i + 1 # print(num_lines) num_lines = 1393184026 # 1393184026 # 1393184033 # return num_lines_per_part = num_lines / GIGA_PARTS curr_part = 0 gof_generator = giga_orig_generator() # with open(INDIR_GIGA, 'r') as gof: with open(INDIR_JOS, 'r') as gjf: sentence = {} sentence['tokens'] = [] sentence['links'] = {} wf = None if curr_part in file_indices: if os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % ran[0])): os.remove(os.path.join(OUTDIR, 'giga%07d.tsv' % ran[0])) wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % ran[0]), 'a') # for i, (l_gof, l_gjf) in enumerate(zip(gof, gjf)): for i, l_gjf in enumerate(gjf): l_gof = next(gof_generator) if curr_part < ran[0]: if i > num_lines_per_part * (curr_part + 1) and l_gof == '\n' : if curr_part < ran[0]: print(curr_part) curr_part += 1 continue else: continue l_gof_split = l_gof.split('\t') l_gjf_split = l_gjf.split('\t') # if punctuation if l_gof != '\n': if curr_part not in file_indices: continue if l_gof_split[1][-1] == 'u': # print(l_gjf_split) sentence['tokens'].append(('c', l_gjf_split[0], l_gjf_split[1])) else: sentence['tokens'].append(('w', l_gjf_split[0], l_gof_split[0], l_gof_split[1][:-2], l_gof_split[3][:-1])) sentence['links'][l_gjf_split[0]] = (l_gjf_split[7], l_gjf_split[0], l_gjf_split[6]) # if l_gof == '\n': else: if curr_part in file_indices: wf.write(par.to_conll_2009_SRL(sentence)) sentence['tokens'] = [] sentence['links'] = {} # wf.flush() # if i > num_lines_per_part * (curr_part + 1) and l_gof == '\n': if i > num_lines_per_part * (curr_part + 1): curr_part += 1 # if wf doesn't exist (first one) if curr_part in file_indices and wf: wf.close() if curr_part >= ran[1]: break if curr_part in file_indices: if os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part)): os.remove(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part)) wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part), 'a') curr_part += 1 wf.close() def handle_giga_file_selected_sentences(error_sentences): """ File that splits big text file into more minor files. Only split on empty lines. """ # with open(INDIR_GIGA, 'r') as gof: # with open(INDIR_JOS, 'r') as gjf: # for i, (l_gof, l_gjf) in enumerate(zip(gof, gjf)): # pass # num_lines = i + 1 # print(num_lines) # print('num_lines' + 3) # num_lines = 1393184026 num_lines = 1393222523 # 1393184026 # 1393184033 # return # num_lines_per_part = num_lines / GIGA_PARTS # curr_part = 0 gof_generator = giga_orig_generator() # with open(INDIR_GIGA, 'r') as gof: with open(INDIR_JOS, 'r') as gjf: sentence = {} sentence['tokens'] = [] sentence['links'] = {} wf = None if os.path.exists(os.path.join(OUTDIR, 'giga_errors')): os.remove(os.path.join(OUTDIR, 'giga_errors')) wf = open(os.path.join(OUTDIR, 'giga_errors'), 'a') with open(os.path.join(INTERNAL_DATA, 'sentence_ids_list.pkl'), 'rb') as pkl_file: sentence_ids_list = pickle.load(pkl_file) sentence_id = 0 skip_sentence = not sentence_ids_list[sentence_id] in error_sentences # for i, (l_gof, l_gjf) in enumerate(zip(gof, gjf)): for i, l_gjf in enumerate(gjf): l_gof = next(gof_generator) if l_gjf == '\n': if not skip_sentence: wf.write(par.to_conll_2009_SRL(sentence)) sentence['tokens'] = [] sentence['links'] = {} sentence_id += 1 if sentence_ids_list[sentence_id] in error_sentences: print(sentence_ids_list[sentence_id]) skip_sentence = False else: skip_sentence = True if skip_sentence: continue # if curr_part < ran[0]: # if i > num_lines_per_part * (curr_part + 1) and l_gof == '\n' : # if curr_part < ran[0]: # print(curr_part) # curr_part += 1 # continue # else: # continue l_gof_split = l_gof.split('\t') l_gjf_split = l_gjf.split('\t') # if punctuation if l_gof != '\n': if l_gof_split[1][-1] == 'u': # print(l_gjf_split) sentence['tokens'].append(('c', l_gjf_split[0], l_gjf_split[1])) else: sentence['tokens'].append(('w', l_gjf_split[0], l_gof_split[0], l_gof_split[1][:-2], l_gof_split[3][:-1])) sentence['links'][l_gjf_split[0]] = (l_gjf_split[7], l_gjf_split[0], l_gjf_split[6]) # if l_gof == '\n': # wf.flush() # if i > num_lines_per_part * (curr_part + 1) and l_gof == '\n': # if i > num_lines_per_part * (curr_part + 1): # curr_part += 1 # # if wf doesn't exist (first one) # if curr_part in file_indices and wf: # wf.close() # if curr_part >= ran[1]: # break # if curr_part in file_indices: # if os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part)): # os.remove(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part)) # # wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part), 'a') # curr_part += 1 wf.close() file_indices = set(range(0, 100000)) with open(os.path.join(INTERNAL_DATA, 'diffs_updated_gigafida.pkl'), 'rb') as pkl_file: file_indices = set(pickle.load(pkl_file)) with Pool(CPU_CORES) as p: if analysis == 'kres': p.map(handle_file, infiles) elif analysis == 'gigafida': handle_gigafida_file() elif analysis == 'giga': final_range = [0, 100000] size_per_proc = (final_range[1] - final_range[0]) / CPU_CORES # splits = [int(final_range[0] + size_per_proc) for i in range(CPU_CORES)] ranges = [] ps = None for i in range(CPU_CORES): s = int(final_range[0] + size_per_proc * i) ns = int(final_range[0] + size_per_proc * (i + 1)) ranges.append([s, ns]) # ranges = [[0, 1]] # p.map(handle_giga_file, ranges) # p.map(handle_giga_file, ranges) error_sentences = [line.rstrip('\n') for line in open(os.path.join(INTERNAL_DATA, 'sentences_with_less_than_token.txt'))] handle_giga_file_selected_sentences(set(error_sentences)) logging.info("end parsing kres")