378 lines
13 KiB
Python
378 lines
13 KiB
Python
import pickle
|
|
|
|
from parser.parser import Parser
|
|
import os
|
|
from os.path import join, dirname
|
|
from pathlib import Path
|
|
import re
|
|
import sys
|
|
import cProfile
|
|
import configparser
|
|
import logging
|
|
from multiprocessing import Pool
|
|
|
|
SSJ500K_2_1 = 27829  # number of sentences

# Parser instance shared by all handlers in this script.
par = Parser()

# Paths and run parameters come from the [tools] section of tools.cfg, which
# is looked up relative to the current working directory.
config = configparser.ConfigParser()
config.read("tools.cfg")
tools_cfg = config["tools"]

# The presence of 'kres_orig' or 'giga_orig' selects what gets processed;
# 'kres_orig' wins when both keys are configured.
analysis = ''
if 'kres_orig' in tools_cfg:
    analysis = 'kres'
    INDIR = Path(tools_cfg["kres_orig"])
    OUTDIR = Path(tools_cfg["kres_tsv"])
elif 'giga_orig' in tools_cfg:
    # analysis = 'gigafida'
    analysis = 'giga'
    INDIR_GIGA_ORIG = Path(tools_cfg["giga"])
    INDIR_GIGA = Path(tools_cfg["giga_orig"])
    INDIR_JOS = Path(tools_cfg["giga_jos"])
    OUTDIR = Path(tools_cfg["giga_tsv"])
    GIGA_PARTS = int(tools_cfg["giga_parts"])
    INTERNAL_DATA = tools_cfg["internal_data"]
else:
    # Previously this case only surfaced further down as a NameError on
    # OUTDIR; fail here with a message that names the actual problem.
    raise ValueError(
        "tools.cfg [tools] must define either 'kres_orig' or 'giga_orig'"
    )

CPU_CORES = int(tools_cfg["cpu_cores"])

# Make sure the log file exists, then keep its fully resolved path (the
# original called resolve() but threw the result away).
LOGFILE = Path(tools_cfg["logfile"]).absolute()
LOGFILE.touch(exist_ok=True)
LOGFILE = LOGFILE.resolve()

logging.basicConfig(filename=str(LOGFILE), level=logging.INFO)

# Disabled ssj500k sample run, kept for reference:
# print("parsing ssj")
# ssj_file = "../data/ssj500k-sl.sample.xml"
# ssj_dict = par.parse_tei(ssj_file)
# # assert (len(ssj_dict) == 27829), "Parsed wrong number of sentences."
# print("end parsing ssj")

# kres_file = "../data/kres_example/F0019343.xml.parsed.xml"
OUTDIR.mkdir(exist_ok=True)

if analysis == 'kres':
    # (index, path) pairs; the index is only used for progress logging.
    infiles = list(enumerate([x for x in INDIR.iterdir() if x.is_file()]))
    logging.info("Parsing kres: %s files.", len(infiles))
|
|
|
|
def handle_file(infile):
    """Convert one TEI input file into a CoNLL-2009 SRL ``.tsv`` file in OUTDIR.

    Args:
        infile: ``(index, path)`` pair as produced by ``enumerate`` over the
            input files; ``index`` is only used for the progress log line.

    Returns:
        True when the output file already existed or was written; False when
        parsing or conversion raised (the failure is logged, not propagated).
    """
    i, kres_file = infile
    outfile = (OUTDIR / kres_file.name).with_suffix(".tsv")

    if outfile.is_file():
        logging.info("Skipping existing file: %s.", kres_file)
        return True

    try:
        res_dict = par.parse_tei(kres_file)
        # join() instead of repeated += keeps this linear in the output size.
        kres_out_str = "".join(
            par.to_conll_2009_SRL(sentence) for sentence in res_dict.values()
        )
    except Exception:
        # Deliberate best effort: one bad file must not abort the whole run.
        # logging.exception records the traceback along with the message.
        logging.exception("Failed processing file: %s", kres_file)
        return False

    # Written as bytes so newlines are not translated by the platform.
    outfile.write_bytes(kres_out_str.encode("utf-8"))
    logging.info("Processed file (%s/%s): %s", i + 1, len(infiles), kres_file)
    return True
|
|
|
|
def giga_orig_generator(path=None):
    """Yield the lines of a text file, collapsing runs of blank lines into one.

    Args:
        path: File to read. When omitted, the module-level ``INDIR_GIGA`` is
            used (looked up when iteration starts), which is what the
            existing zero-argument callers rely on.

    Yields:
        Each line with its trailing newline. A line counts as blank only if
        it is exactly ``'\\n'``; of several consecutive blank lines only the
        first one is yielded.
    """
    if path is None:
        path = INDIR_GIGA
    with open(path, 'r') as gof:
        previous_blank = False
        for line in gof:
            is_blank = line == '\n'
            if is_blank and previous_blank:
                # Second or later blank line in a run: drop it.
                continue
            previous_blank = is_blank
            yield line
|
|
|
|
|
|
def handle_gigafida_file():
    """Convert the aligned giga inputs into numbered ``giga%07d.tsv`` part files.

    ``INDIR_JOS`` is read line by line in lockstep with
    ``giga_orig_generator()``. Non-blank lines add a token and a link to the
    current sentence; a blank line in the original file ends the sentence,
    which is then written with ``par.to_conll_2009_SRL``. A new part file is
    started only at such a sentence boundary, once the line index has passed
    the next multiple of ``num_lines / GIGA_PARTS``.

    Resuming: if ``giga0000000.tsv`` already exists, input is skipped while
    the file two parts ahead exists. At the first boundary where it does not,
    the next part file is deleted (it is probably incomplete) and gets
    rewritten once its first line is reached.

    Raises:
        StopIteration: if the original file runs out of lines before the
            JOS file does.
        FileNotFoundError: while resuming, if the part file that should be
            deleted does not exist (e.g. only part 0 is present).
    """
    # Hard-coded total line count of the input; presumably produced once by
    # the counting loop that used to live here - TODO confirm it still
    # matches the current input files.
    num_lines = 1393184026
    num_lines_per_part = num_lines / GIGA_PARTS
    curr_part = 0
    gof_generator = giga_orig_generator()

    def part_path(part):
        # Path of the output file for one part.
        return os.path.join(OUTDIR, 'giga%07d.tsv' % part)

    with open(INDIR_JOS, 'r') as gjf:
        sentence = {'tokens': [], 'links': {}}
        if os.path.exists(part_path(0)):
            # Earlier run detected: skip ahead, nothing is open for writing.
            ignore_lines = True
            wf = None
        else:
            ignore_lines = False
            wf = open(part_path(curr_part), 'a')

        try:
            for i, l_gjf in enumerate(gjf):
                l_gof = next(gof_generator)

                if ignore_lines:
                    at_boundary = (
                        i > num_lines_per_part * curr_part and l_gof == '\n'
                    )
                    if not at_boundary:
                        continue
                    if os.path.exists(part_path(curr_part + 2)):
                        # Still inside finished output: advance and keep skipping.
                        print(curr_part)
                        curr_part += 1
                        continue
                    ignore_lines = False
                    # delete last file (probably not whole)
                    os.remove(part_path(curr_part + 1))

                l_gof_split = l_gof.split('\t')
                l_gjf_split = l_gjf.split('\t')

                if l_gof != '\n':
                    # Second column ending in 'u' marks punctuation
                    # (per the original comment).
                    if l_gof_split[1][-1] == 'u':
                        sentence['tokens'].append(
                            ('c', l_gjf_split[0], l_gjf_split[1])
                        )
                    else:
                        sentence['tokens'].append((
                            'w',
                            l_gjf_split[0],
                            l_gof_split[0],
                            l_gof_split[1][:-2],
                            l_gof_split[3][:-1],
                        ))
                    sentence['links'][l_gjf_split[0]] = (
                        l_gjf_split[7], l_gjf_split[0], l_gjf_split[6]
                    )
                    continue

                # Blank original line: the sentence is complete.
                if wf is not None:
                    wf.write(par.to_conll_2009_SRL(sentence))
                sentence['tokens'] = []
                sentence['links'] = {}

                if i > num_lines_per_part * (curr_part + 1):
                    curr_part += 1
                    # No file is open yet when a resumed run reaches its
                    # first part to rewrite.
                    if wf is not None:
                        wf.close()
                    wf = open(part_path(curr_part), 'a')
        finally:
            # Close on every exit path; wf is None if a resumed run never
            # reached a part it had to rewrite.
            if wf is not None:
                wf.close()
|
|
|
|
import time
|
|
def handle_giga_file(ran):
    """Write the ``giga%07d.tsv`` part files in ``[ran[0], ran[1])`` that are selected.

    ``INDIR_JOS`` is read line by line in lockstep with
    ``giga_orig_generator()``. Input belonging to parts before ``ran[0]`` is
    skipped. From there on, a part is written only if its index is in the
    module-level ``file_indices``; an existing file for such a part is
    replaced. Part changes happen only at a blank line of the original file,
    once the line index has passed the next multiple of
    ``num_lines / GIGA_PARTS``.

    Args:
        ran: Two-item sequence ``(first_part, end_part)``. Processing stops
            as soon as the part index reaches ``end_part``.

    Raises:
        StopIteration: if the original file runs out of lines before the
            JOS file does.
    """
    # Hard-coded total line count of the input; presumably produced once by
    # the counting loop that used to live here - TODO confirm it still
    # matches the current input files.
    num_lines = 1393184026
    num_lines_per_part = num_lines / GIGA_PARTS
    curr_part = 0
    gof_generator = giga_orig_generator()

    def open_part(part):
        # 'w' truncates, replacing the former remove-then-append sequence.
        return open(os.path.join(OUTDIR, 'giga%07d.tsv' % part), 'w')

    with open(INDIR_JOS, 'r') as gjf:
        sentence = {'tokens': [], 'links': {}}
        # The first file to write is the one for ran[0], so that is the index
        # that has to be selected. (The check used to test curr_part, which
        # is always 0 here, leaving wf unset for a selected first part.)
        wf = open_part(ran[0]) if ran[0] in file_indices else None

        try:
            for i, l_gjf in enumerate(gjf):
                l_gof = next(gof_generator)

                if curr_part < ran[0]:
                    # Skip phase: only count part boundaries.
                    if i > num_lines_per_part * (curr_part + 1) and l_gof == '\n':
                        print(curr_part)
                        curr_part += 1
                    continue

                if l_gof != '\n':
                    if curr_part not in file_indices:
                        continue
                    l_gof_split = l_gof.split('\t')
                    l_gjf_split = l_gjf.split('\t')
                    # Second column ending in 'u' marks punctuation
                    # (per the original comment).
                    if l_gof_split[1][-1] == 'u':
                        sentence['tokens'].append(
                            ('c', l_gjf_split[0], l_gjf_split[1])
                        )
                    else:
                        sentence['tokens'].append((
                            'w',
                            l_gjf_split[0],
                            l_gof_split[0],
                            l_gof_split[1][:-2],
                            l_gof_split[3][:-1],
                        ))
                    sentence['links'][l_gjf_split[0]] = (
                        l_gjf_split[7], l_gjf_split[0], l_gjf_split[6]
                    )
                    continue

                # Blank original line: the sentence is complete.
                if curr_part in file_indices:
                    wf.write(par.to_conll_2009_SRL(sentence))
                sentence['tokens'] = []
                sentence['links'] = {}

                if i > num_lines_per_part * (curr_part + 1):
                    curr_part += 1
                    # Always release the finished part, whether or not the
                    # next one is selected.
                    if wf is not None:
                        wf.close()
                        wf = None
                    if curr_part >= ran[1]:
                        break
                    if curr_part in file_indices:
                        wf = open_part(curr_part)
        finally:
            # wf is None when the last part handled was not selected.
            if wf is not None:
                wf.close()
|
|
|
|
def handle_giga_file_selected_sentences(error_sentences):
    """Write only the sentences whose id is in ``error_sentences`` to ``giga_errors``.

    ``INDIR_JOS`` is read line by line in lockstep with
    ``giga_orig_generator()``. Sentences are counted by the blank lines of
    the JOS file; the n-th sentence has the id ``sentence_ids_list[n]``,
    loaded from ``sentence_ids_list.pkl`` in ``INTERNAL_DATA``. Selected
    sentences are converted with ``par.to_conll_2009_SRL`` and written to
    ``OUTDIR/giga_errors``, replacing any previous file. Each selected id is
    printed when its sentence starts.

    Args:
        error_sentences: Container of sentence ids to export; membership is
            tested once per sentence, so a set is the sensible choice.

    Raises:
        StopIteration: if the original file runs out of lines before the
            JOS file does.
    """
    gof_generator = giga_orig_generator()

    # NOTE(security): pickle.load can execute arbitrary code from the file;
    # INTERNAL_DATA must only contain trusted data.
    with open(os.path.join(INTERNAL_DATA, 'sentence_ids_list.pkl'), 'rb') as pkl_file:
        sentence_ids_list = pickle.load(pkl_file)

    def is_selected(index):
        # A trailing blank line pushes the counter one past the last id; such
        # an index (or an empty id list) selects nothing instead of raising.
        return (
            index < len(sentence_ids_list)
            and sentence_ids_list[index] in error_sentences
        )

    # 'w' truncates, replacing the former remove-then-append sequence; the
    # with-statement closes both files on every exit path.
    with open(INDIR_JOS, 'r') as gjf, \
            open(os.path.join(OUTDIR, 'giga_errors'), 'w') as wf:
        sentence = {'tokens': [], 'links': {}}
        sentence_id = 0
        skip_sentence = not is_selected(sentence_id)

        for l_gjf in gjf:
            l_gof = next(gof_generator)

            if l_gjf == '\n':
                # End of the current sentence: flush it if it was selected,
                # then decide about the next one.
                if not skip_sentence:
                    wf.write(par.to_conll_2009_SRL(sentence))
                    sentence['tokens'] = []
                    sentence['links'] = {}
                sentence_id += 1
                skip_sentence = not is_selected(sentence_id)
                if not skip_sentence:
                    print(sentence_ids_list[sentence_id])

            if skip_sentence:
                continue

            if l_gof != '\n':
                l_gof_split = l_gof.split('\t')
                l_gjf_split = l_gjf.split('\t')
                # Second column ending in 'u' marks punctuation
                # (per the original comment).
                if l_gof_split[1][-1] == 'u':
                    sentence['tokens'].append(
                        ('c', l_gjf_split[0], l_gjf_split[1])
                    )
                else:
                    sentence['tokens'].append((
                        'w',
                        l_gjf_split[0],
                        l_gof_split[0],
                        l_gof_split[1][:-2],
                        l_gof_split[3][:-1],
                    ))
                sentence['links'][l_gjf_split[0]] = (
                    l_gjf_split[7], l_gjf_split[0], l_gjf_split[6]
                )
|
|
|
|
# Part indices that handle_giga_file may (re)write. Defaults to every index
# in range(0, 100000); the giga configurations narrow it to the pickled list.
file_indices = set(range(0, 100000))
if analysis in ('giga', 'gigafida'):
    # INTERNAL_DATA is only defined for the giga configuration; loading this
    # unconditionally made every kres run fail with a NameError.
    # NOTE(security): pickle.load can execute arbitrary code from the file;
    # INTERNAL_DATA must only contain trusted data.
    with open(os.path.join(INTERNAL_DATA, 'diffs_updated_gigafida.pkl'), 'rb') as pkl_file:
        file_indices = set(pickle.load(pkl_file))

# NOTE(review): this runs at import time without an
# `if __name__ == "__main__":` guard, which multiprocessing requires under
# the "spawn" start method - confirm the script is only run where "fork"
# is used.
if analysis == 'kres':
    # Only this branch uses worker processes, so the pool is created here.
    with Pool(CPU_CORES) as p:
        p.map(handle_file, infiles)
elif analysis == 'gigafida':
    handle_gigafida_file()
elif analysis == 'giga':
    # Even split of the part indices over the cores, for the currently
    # disabled parallel handle_giga_file run below.
    final_range = [0, 100000]
    size_per_proc = (final_range[1] - final_range[0]) / CPU_CORES
    ranges = [
        [
            int(final_range[0] + size_per_proc * i),
            int(final_range[0] + size_per_proc * (i + 1)),
        ]
        for i in range(CPU_CORES)
    ]
    # ranges = [[0, 1]]

    # with Pool(CPU_CORES) as p:
    #     p.map(handle_giga_file, ranges)

    # One sentence id per line; read through a with-statement so the file is
    # closed, and into a set for fast membership tests.
    with open(os.path.join(INTERNAL_DATA, 'sentences_with_less_than_token.txt')) as sentences_file:
        error_sentences = {line.rstrip('\n') for line in sentences_file}
    handle_giga_file_selected_sentences(error_sentences)

# Name the analysis that actually ran instead of always claiming "kres".
logging.info("end parsing %s", analysis)
|