# cjvt-srl-tagging/tools/parse_all.py

import pickle
from parser.parser import Parser
import os
from os.path import join, dirname
from pathlib import Path
import re
import sys
import cProfile
import configparser
import logging
from multiprocessing import Pool
SSJ500K_2_1 = 27829 # number of sentences
par = Parser()
# path to data
config = configparser.ConfigParser()
config.read("tools.cfg")
analysis = ''
if 'kres_orig' in config["tools"]:
    analysis = 'kres'
    INDIR = Path(config["tools"]["kres_orig"])
    OUTDIR = Path(config["tools"]["kres_tsv"])
elif 'giga_orig' in config["tools"]:
    # analysis = 'gigafida'
    analysis = 'giga'
    INDIR_GIGA_ORIG = Path(config["tools"]["giga"])
    INDIR_GIGA = Path(config["tools"]["giga_orig"])
    INDIR_JOS = Path(config["tools"]["giga_jos"])
    OUTDIR = Path(config["tools"]["giga_tsv"])
    GIGA_PARTS = int(config["tools"]["giga_parts"])
    INTERNAL_DATA = config["tools"]["internal_data"]
CPU_CORES = int(config["tools"]["cpu_cores"])
LOGFILE = Path(config["tools"]["logfile"]).absolute()
LOGFILE.touch(exist_ok=True)
LOGFILE.resolve()
logging.basicConfig(filename=str(LOGFILE), level=logging.INFO)
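
# A tools.cfg roughly along these lines is assumed by the config lookups in this script.
# The [tools] section name and the keys are taken from the reads above and below; the
# values are placeholders, not the project's actual paths:
#
#   [tools]
#   giga = /path/to/gigafida/plain
#   giga_orig = /path/to/gigafida/tokenised
#   giga_jos = /path/to/gigafida/jos_parses
#   giga_tsv = /path/to/output/giga_tsv
#   giga_parts = 100000
#   internal_data = /path/to/internal_data
#   cpu_cores = 8
#   logfile = parse_all.log
#
# For Kres input, provide kres_orig and kres_tsv instead of the giga_* keys; the
# if/elif block above picks the analysis mode based on which keys are present.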
"""
print("parsing ssj")
ssj_file = "../data/ssj500k-sl.sample.xml"
ssj_dict = par.parse_tei(ssj_file)
# assert (len(ssj_dict) == 27829), "Parsed wrong number of sentences."
print("end parsing ssj")
"""
# kres_file = "../data/kres_example/F0019343.xml.parsed.xml"
OUTDIR.mkdir(exist_ok=True)
if analysis == 'kres':
    infiles = list(enumerate([x for x in INDIR.iterdir() if x.is_file()]))
    logging.info("Parsing kres: {} files.".format(len(infiles)))
def handle_file(infile):
    # each work item is an (index, path) pair produced by enumerate() above
    i = infile[0]
    kres_file = infile[1]
    outfile = (OUTDIR / kres_file.name).with_suffix(".tsv")

    if outfile.is_file():
        logging.info("Skipping existing file: {}.".format(str(kres_file)))
        return True

    try:
        res_dict = par.parse_tei(kres_file)
        kres_out_str = ""
        for _, sentence in res_dict.items():
            kres_out_str += par.to_conll_2009_SRL(sentence)
    except Exception as exc:
        logging.info("Failed processing file: {}".format(str(kres_file)))
        logging.error(exc)
        return False

    with outfile.open("wb+") as fp:
        fp.write(kres_out_str.encode("utf-8"))
        logging.info("Processed file ({}/{}): {}".format(i+1, len(infiles), str(kres_file)))
        return True

    return False
def giga_orig_generator():
    with open(INDIR_GIGA, 'r') as gof:
        previous_new_line = False
        for l_gof in gof:
            if l_gof == '\n':
                if previous_new_line:
                    continue
                previous_new_line = True
            elif previous_new_line:
                previous_new_line = False
            yield l_gof
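
# Illustrative behaviour of giga_orig_generator() on a hypothetical mini input (not
# actual corpus data): for the lines
#   "tok1\t...\n", "\n", "\n", "tok2\t...\n"
# it yields
#   "tok1\t...\n", "\n", "tok2\t...\n"
# i.e. lines pass through unchanged, but runs of consecutive blank lines are collapsed
# into a single blank line, so a blank line can be treated as one sentence boundary below.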
def handle_gigafida_file():
    """
    Splits the big gigafida text file into smaller part files. Splits only on empty
    lines (sentence boundaries).
    """
    # with open(INDIR_GIGA, 'r') as gof:
    #     with open(INDIR_JOS, 'r') as gjf:
    #         for i, (l_gof, l_gjf) in enumerate(zip(gof, gjf)):
    #             pass
    # num_lines = i + 1
    # print(num_lines)
    num_lines = 1393184026  # precomputed line count of the input
    # 1393184026
    # 1393184033
    # return
    num_lines_per_part = num_lines / GIGA_PARTS
    curr_part = 0
    gof_generator = giga_orig_generator()
    # with open(INDIR_GIGA, 'r') as gof:
    with open(INDIR_JOS, 'r') as gjf:
        sentence = {}
        sentence['tokens'] = []
        sentence['links'] = {}
        # resume mode: if part 0 already exists, fast-forward; the last existing part is
        # assumed incomplete, so it gets deleted and rewritten
        if os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % 0)):
            ignore_lines = True
            wf = False
        else:
            wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part), 'a')
            ignore_lines = False
        # for i, (l_gof, l_gjf) in enumerate(zip(gof, gjf)):
        for i, l_gjf in enumerate(gjf):
            l_gof = next(gof_generator)
            if ignore_lines:
                if i > num_lines_per_part * curr_part and l_gof == '\n':
                    if not os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % (curr_part + 2))):
                        ignore_lines = False
                        # delete last file (probably not whole)
                        os.remove(os.path.join(OUTDIR, 'giga%07d.tsv' % (curr_part + 1)))
                    if ignore_lines:
                        print(curr_part)
                        curr_part += 1
                        continue
                else:
                    continue
            l_gof_split = l_gof.split('\t')
            l_gjf_split = l_gjf.split('\t')
            # if punctuation
            if l_gof != '\n':
                if l_gof_split[1][-1] == 'u':
                    # print(l_gjf_split)
                    sentence['tokens'].append(('c', l_gjf_split[0], l_gjf_split[1]))
                else:
                    sentence['tokens'].append(('w', l_gjf_split[0], l_gof_split[0], l_gof_split[1][:-2], l_gof_split[3][:-1]))
                sentence['links'][l_gjf_split[0]] = (l_gjf_split[7], l_gjf_split[0], l_gjf_split[6])
            # if l_gof == '\n':
            else:
                if wf:
                    # print(i)
                    wf.write(par.to_conll_2009_SRL(sentence))
                sentence['tokens'] = []
                sentence['links'] = {}
                # wf.flush()
                # if i > num_lines_per_part * (curr_part + 1) and l_gof == '\n':
                if i > num_lines_per_part * (curr_part + 1):
                    curr_part += 1
                    # if wf doesn't exist (first one)
                    if wf:
                        wf.close()
                    wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part), 'a')
        curr_part += 1
        wf.close()
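
# Summary of what the splitter above builds (read off the code, not a format spec):
#   - part files are named 'giga%07d.tsv' % curr_part, i.e. giga0000000.tsv, giga0000001.tsv, ...
#   - each sentence dict accumulates 'tokens' as ('c', id, token) tuples for punctuation
#     and ('w', id, ...) tuples for words, built from columns 0, 1 and 3 of the original
#     gigafida line, plus 'links' entries from columns 0, 6 and 7 of the JOS line
#     (presumably the dependency head/relation information);
#   - par.to_conll_2009_SRL() then serialises one sentence dict as a block of
#     CoNLL-2009-style rows.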
import time
def handle_giga_file(ran):
    """
    Splits the big gigafida text file into smaller part files, processing only the
    part-index range ran = [start, end). Splits only on empty lines.
    """
    # with open(INDIR_GIGA, 'r') as gof:
    #     with open(INDIR_JOS, 'r') as gjf:
    #         for i, (l_gof, l_gjf) in enumerate(zip(gof, gjf)):
    #             pass
    # num_lines = i + 1
    # print(num_lines)
    num_lines = 1393184026
    # 1393184026
    # 1393184033
    # return
    num_lines_per_part = num_lines / GIGA_PARTS
    curr_part = 0
    gof_generator = giga_orig_generator()
    # with open(INDIR_GIGA, 'r') as gof:
    with open(INDIR_JOS, 'r') as gjf:
        sentence = {}
        sentence['tokens'] = []
        sentence['links'] = {}
        wf = None
        if curr_part in file_indices:
            if os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % ran[0])):
                os.remove(os.path.join(OUTDIR, 'giga%07d.tsv' % ran[0]))
            wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % ran[0]), 'a')
        # for i, (l_gof, l_gjf) in enumerate(zip(gof, gjf)):
        for i, l_gjf in enumerate(gjf):
            l_gof = next(gof_generator)
            # fast-forward over parts below the start of this worker's range
            if curr_part < ran[0]:
                if i > num_lines_per_part * (curr_part + 1) and l_gof == '\n':
                    if curr_part < ran[0]:
                        print(curr_part)
                        curr_part += 1
                        continue
                else:
                    continue
            l_gof_split = l_gof.split('\t')
            l_gjf_split = l_gjf.split('\t')
            # if punctuation
            if l_gof != '\n':
                if curr_part not in file_indices:
                    continue
                if l_gof_split[1][-1] == 'u':
                    # print(l_gjf_split)
                    sentence['tokens'].append(('c', l_gjf_split[0], l_gjf_split[1]))
                else:
                    sentence['tokens'].append(('w', l_gjf_split[0], l_gof_split[0], l_gof_split[1][:-2], l_gof_split[3][:-1]))
                sentence['links'][l_gjf_split[0]] = (l_gjf_split[7], l_gjf_split[0], l_gjf_split[6])
            # if l_gof == '\n':
            else:
                if curr_part in file_indices:
                    wf.write(par.to_conll_2009_SRL(sentence))
                sentence['tokens'] = []
                sentence['links'] = {}
                # wf.flush()
                # if i > num_lines_per_part * (curr_part + 1) and l_gof == '\n':
                if i > num_lines_per_part * (curr_part + 1):
                    curr_part += 1
                    # if wf doesn't exist (first one)
                    if curr_part in file_indices and wf:
                        wf.close()
                    if curr_part >= ran[1]:
                        break
                    if curr_part in file_indices:
                        if os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part)):
                            os.remove(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part))
                        wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part), 'a')
        curr_part += 1
        wf.close()
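
# Illustrative call with a hypothetical range: handle_giga_file([0, 10]) rebuilds parts
# giga0000000.tsv .. giga0000009.tsv, writing only those whose index also appears in
# file_indices (loaded near the bottom of the script); the commented-out
# p.map(handle_giga_file, ranges) call below shows the intended parallel invocation.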
def handle_giga_file_selected_sentences(error_sentences):
    """
    Re-parses only the sentences whose ids appear in error_sentences and appends them
    to a single OUTDIR/giga_errors file. Sentences are delimited by empty lines.
    """
    # with open(INDIR_GIGA, 'r') as gof:
    #     with open(INDIR_JOS, 'r') as gjf:
    #         for i, (l_gof, l_gjf) in enumerate(zip(gof, gjf)):
    #             pass
    # num_lines = i + 1
    # print(num_lines)
    # print('num_lines' + 3)
    # num_lines = 1393184026
    num_lines = 1393222523
    # 1393184026
    # 1393184033
    # return
    # num_lines_per_part = num_lines / GIGA_PARTS
    # curr_part = 0
    gof_generator = giga_orig_generator()
    # with open(INDIR_GIGA, 'r') as gof:
    with open(INDIR_JOS, 'r') as gjf:
        sentence = {}
        sentence['tokens'] = []
        sentence['links'] = {}
        wf = None
        if os.path.exists(os.path.join(OUTDIR, 'giga_errors')):
            os.remove(os.path.join(OUTDIR, 'giga_errors'))
        wf = open(os.path.join(OUTDIR, 'giga_errors'), 'a')

        with open(os.path.join(INTERNAL_DATA, 'sentence_ids_list.pkl'), 'rb') as pkl_file:
            sentence_ids_list = pickle.load(pkl_file)

        sentence_id = 0
        skip_sentence = not sentence_ids_list[sentence_id] in error_sentences
        # for i, (l_gof, l_gjf) in enumerate(zip(gof, gjf)):
        for i, l_gjf in enumerate(gjf):
            l_gof = next(gof_generator)
            if l_gjf == '\n':
                if not skip_sentence:
                    wf.write(par.to_conll_2009_SRL(sentence))
                sentence['tokens'] = []
                sentence['links'] = {}
                sentence_id += 1
                if sentence_ids_list[sentence_id] in error_sentences:
                    print(sentence_ids_list[sentence_id])
                    skip_sentence = False
                else:
                    skip_sentence = True
            if skip_sentence:
                continue
            # if curr_part < ran[0]:
            #     if i > num_lines_per_part * (curr_part + 1) and l_gof == '\n':
            #         if curr_part < ran[0]:
            #             print(curr_part)
            #             curr_part += 1
            #             continue
            #     else:
            #         continue
            l_gof_split = l_gof.split('\t')
            l_gjf_split = l_gjf.split('\t')
            # if punctuation
            if l_gof != '\n':
                if l_gof_split[1][-1] == 'u':
                    # print(l_gjf_split)
                    sentence['tokens'].append(('c', l_gjf_split[0], l_gjf_split[1]))
                else:
                    sentence['tokens'].append(('w', l_gjf_split[0], l_gof_split[0], l_gof_split[1][:-2], l_gof_split[3][:-1]))
                sentence['links'][l_gjf_split[0]] = (l_gjf_split[7], l_gjf_split[0], l_gjf_split[6])
            # if l_gof == '\n':
            # wf.flush()
            # if i > num_lines_per_part * (curr_part + 1) and l_gof == '\n':
            # if i > num_lines_per_part * (curr_part + 1):
            #     curr_part += 1
            #     # if wf doesn't exist (first one)
            #     if curr_part in file_indices and wf:
            #         wf.close()
            #     if curr_part >= ran[1]:
            #         break
            #     if curr_part in file_indices:
            #         if os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part)):
            #             os.remove(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part))
            #
            #         wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part), 'a')
            #         curr_part += 1
        wf.close()
file_indices = set(range(0, 100000))
with open(os.path.join(INTERNAL_DATA, 'diffs_updated_gigafida.pkl'), 'rb') as pkl_file:
    file_indices = set(pickle.load(pkl_file))

with Pool(CPU_CORES) as p:
    if analysis == 'kres':
        p.map(handle_file, infiles)
    elif analysis == 'gigafida':
        handle_gigafida_file()
    elif analysis == 'giga':
        # split the [0, 100000) part-index range into CPU_CORES contiguous chunks
        final_range = [0, 100000]
        size_per_proc = (final_range[1] - final_range[0]) / CPU_CORES
        # splits = [int(final_range[0] + size_per_proc) for i in range(CPU_CORES)]
        ranges = []
        ps = None
        for i in range(CPU_CORES):
            s = int(final_range[0] + size_per_proc * i)
            ns = int(final_range[0] + size_per_proc * (i + 1))
            ranges.append([s, ns])
        # ranges = [[0, 1]]
        # p.map(handle_giga_file, ranges)
        # p.map(handle_giga_file, ranges)
        error_sentences = [line.rstrip('\n') for line in open(os.path.join(INTERNAL_DATA, 'sentences_with_less_than_token.txt'))]
        handle_giga_file_selected_sentences(set(error_sentences))
logging.info("end parsing kres")
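
# Assumed invocation (not stated anywhere in the script itself): run from the directory
# that contains tools.cfg, e.g. python3 parse_all.py, since config.read("tools.cfg")
# resolves the file relative to the current working directory.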