# cjvt-srl-tagging/tools/parse_all.py
# (web-view residue: 423 lines, 15 KiB, Python; last change 2022-02-04 10:24:47 +00:00)
import pickle
from parser.parser import Parser
import os
from os.path import join, dirname
from pathlib import Path
import re
import sys
import cProfile
2019-02-27 08:15:40 +00:00
import configparser
2019-02-28 09:15:14 +00:00
import logging
2019-02-28 09:53:27 +00:00
from multiprocessing import Pool
2019-02-27 08:15:40 +00:00
SSJ500K_2_1 = 27829  # number of sentences
par = Parser()

# Paths and settings come from the [tools] section of the config file.
# Alternative config file used previously: "tools.cfg".
config = configparser.ConfigParser()
config.read("tools.cfg.ssj500k2.3")

# The corpus to process is chosen by which "*_orig" key the config provides;
# the first match wins. `analysis` stays '' when none of the keys is present.
analysis = ''
if 'kres_orig' in config["tools"]:
    analysis = 'kres'
    INDIR = Path(config["tools"]["kres_orig"])
    OUTDIR = Path(config["tools"]["kres_tsv"])
elif 'giga_orig' in config["tools"]:
    analysis = 'giga'
    # NOTE(review): INDIR_GIGA_ORIG reads key "giga" while INDIR_GIGA reads
    # key "giga_orig" - the names look swapped; confirm before relying on them.
    INDIR_GIGA_ORIG = Path(config["tools"]["giga"])
    INDIR_GIGA = Path(config["tools"]["giga_orig"])
    INDIR_JOS = Path(config["tools"]["giga_jos"])
    OUTDIR = Path(config["tools"]["giga_tsv"])
    GIGA_PARTS = int(config["tools"]["giga_parts"])
    INTERNAL_DATA = config["tools"]["internal_data"]
elif 'ssj500k_orig' in config["tools"]:
    analysis = 'ssj500k'
    # NOTE(review): same naming pattern as the giga branch - INDIR_SSJ500K_ORIG
    # reads key "ssj500k", INDIR_SSJ500K reads key "ssj500k_orig".
    INDIR_SSJ500K_ORIG = Path(config["tools"]["ssj500k"])
    INDIR_SSJ500K = Path(config["tools"]["ssj500k_orig"])
    INDIR_JOS = Path(config["tools"]["ssj500k_jos"])
    OUTDIR = Path(config["tools"]["ssj500k_tsv"])
    INTERNAL_DATA = config["tools"]["internal_data"]

CPU_CORES = int(config["tools"]["cpu_cores"])

# Make sure the log file exists before handing it to logging.
LOGFILE = Path(config["tools"]["logfile"]).absolute()
LOGFILE.touch(exist_ok=True)
# Path.resolve() returns a new path; the result has to be kept to have any effect.
LOGFILE = LOGFILE.resolve()

logging.basicConfig(filename=str(LOGFILE), level=logging.INFO)
2019-02-27 08:15:40 +00:00
"""
print("parsing ssj")
ssj_file = "../data/ssj500k-sl.sample.xml"
ssj_dict = par.parse_tei(ssj_file)
# assert (len(ssj_dict) == 27829), "Parsed wrong number of sentences."
print("end parsing ssj")
"""
# kres_file = "../data/kres_example/F0019343.xml.parsed.xml"
# OUTDIR.mkdir(exist_ok=True)
2019-02-27 08:15:40 +00:00
2022-02-04 10:24:47 +00:00
if analysis == 'kres':
    # Number every regular file that sits directly inside the kres input directory.
    infiles = list(enumerate(filter(lambda entry: entry.is_file(), INDIR.iterdir())))
    logging.info("Parsing kres: {} files.".format(len(infiles)))
2019-02-28 09:53:27 +00:00
def handle_ssj500k_file():
    """Convert the ssj500k TEI file into one CoNLL-2009 SRL output file.

    The file at ``INDIR_SSJ500K_ORIG`` is parsed with ``par.parse_tei``; every
    parsed sentence is rendered with ``par.to_conll_2009_SRL`` and the
    concatenated text is written, UTF-8 encoded, to ``OUTDIR`` (used here as
    the path of the output file).

    Parsing or conversion errors are not caught and propagate to the caller.

    Returns:
        True, both when the output file already exists (nothing is done) and
        after the file has been written.
    """
    tei_file = INDIR_SSJ500K_ORIG
    outfile = OUTDIR

    # Never overwrite a previous result.
    if outfile.is_file():
        logging.info("Skipping existing file: {}.".format(str(tei_file)))
        return True

    sentences = par.parse_tei(tei_file)
    # Join once instead of growing a string with += for every sentence.
    conll_text = "".join(
        par.to_conll_2009_SRL(sentence) for _, sentence in sentences.items()
    )

    with outfile.open("wb+") as fp:
        fp.write(conll_text.encode("utf-8"))
    return True
2019-02-28 09:53:27 +00:00
def ssj500k_orig_generator(path=None):
    """Yield the lines of a text file, collapsing runs of blank lines.

    A line counts as blank only when it is exactly ``'\\n'``. The first blank
    line of a run is yielded; the blank lines that directly follow it are
    dropped. All other lines are yielded unchanged, newline included.

    Args:
        path: File to read. Defaults to the module-level ``INDIR_SSJ500K``
            when omitted, which is what the original zero-argument call did.

    Yields:
        The lines of the file, with consecutive blank lines reduced to one.
    """
    if path is None:
        path = INDIR_SSJ500K
    with open(path, 'r') as gof:
        previous_was_blank = False
        for line in gof:
            is_blank = line == '\n'
            if is_blank and previous_was_blank:
                # Second or later blank line of a run: skip it.
                continue
            previous_was_blank = is_blank
            yield line
2019-02-28 09:53:27 +00:00
2022-02-04 10:24:47 +00:00
def handle_gigafida_file():
    """
    Write the corpus as numbered ``giga%07d.tsv`` part files inside OUTDIR.

    Lines of INDIR_JOS are consumed in lockstep with the lines produced by
    ``giga_orig_generator()``. Non-blank lines add a token and a link to the
    current sentence; on a blank generator line the sentence is written with
    ``par.to_conll_2009_SRL`` and, once the line counter has passed the
    boundary of the current part, the next part file is opened. Part files are
    therefore only switched on blank lines.

    When part 0 already exists the function starts in a skipping mode
    (``ignore_lines``) and walks forward over the existing parts before it
    writes anything.

    NOTE(review): ``curr_part`` is only ever modified with ``+=`` in this
    function and is never initialised, so the first read raises
    UnboundLocalError. ``num_lines_per_part`` is not assigned here either, and
    ``giga_orig_generator`` is not defined in the visible part of the file;
    presumably both are expected at module level - confirm.
    NOTE(review): indentation was reconstructed from a flattened listing;
    confirm the nesting against version control.
    """
    # Disabled first pass that counted the number of input lines:
    # with open(INDIR_GIGA, 'r') as gof:
    #     with open(INDIR_JOS, 'r') as gjf:
    #         for i, (l_gof, l_gjf) in enumerate(zip(gof, gjf)):
    #             pass
    # num_lines = i + 1
    # print(num_lines)
    gof_generator = giga_orig_generator()
    # with open(INDIR_GIGA, 'r') as gof:
    with open(INDIR_JOS, 'r') as gjf:
        sentence = {}
        sentence['tokens'] = []
        sentence['links'] = {}
        if os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % 0)):
            # Part 0 exists: skip input until the existing parts have been passed.
            ignore_lines = True
            wf = False  # False marks "no output file open yet"
        else:
            wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part), 'a')
            ignore_lines = False
        # for i, (l_gof, l_gjf) in enumerate(zip(gof, gjf)):
        for i, l_gjf in enumerate(gjf):
            l_gof = next(gof_generator)
            if ignore_lines:
                # Only react on a blank line that lies past the start of the current part.
                if i > num_lines_per_part * curr_part and l_gof == '\n':
                    if not os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % (curr_part + 2))):
                        ignore_lines = False
                        # delete last file (probably not whole)
                        os.remove(os.path.join(OUTDIR, 'giga%07d.tsv' % (curr_part + 1)))
                    if ignore_lines:
                        print(curr_part)
                        curr_part += 1
                        continue
                else:
                    continue
            l_gof_split = l_gof.split('\t')
            l_gjf_split = l_gjf.split('\t')
            # Non-blank generator line: one token.
            if l_gof != '\n':
                # Second generator column ending in 'u' produces a 'c' token
                # (presumably punctuation, per the original "if punctuation" comment).
                if l_gof_split[1][-1] == 'u':
                    sentence['tokens'].append(('c', l_gjf_split[0], l_gjf_split[1]))
                else:
                    sentence['tokens'].append(('w', l_gjf_split[0], l_gof_split[0], l_gof_split[1][:-2], l_gof_split[3][:-1]))
                # Link is keyed by the first INDIR_JOS column and built from columns 7, 0 and 6.
                sentence['links'][l_gjf_split[0]] = (l_gjf_split[7], l_gjf_split[0], l_gjf_split[6])
            # Blank generator line (l_gof == '\n'): end of sentence.
            else:
                if wf:
                    wf.write(par.to_conll_2009_SRL(sentence))
                sentence['tokens'] = []
                sentence['links'] = {}
                # wf.flush()
                # Past the boundary of the current part: switch to the next part file.
                if i > num_lines_per_part * (curr_part + 1):
                    curr_part += 1
                    # wf is False when no file has been opened yet (resumed run)
                    if wf:
                        wf.close()
                    wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part), 'a')
        curr_part += 1
        # NOTE(review): wf can still be False here when a resumed run never
        # reaches a part switch; False has no close() method.
        wf.close()
def handle_ssj500k_file2():
    """
    Write the corpus as numbered ``giga%07d.tsv`` part files inside OUTDIR.

    Same procedure as ``handle_gigafida_file`` but the companion lines come
    from ``ssj500k_orig_generator()``: lines of INDIR_JOS are consumed in
    lockstep with the generator, non-blank lines add a token and a link to the
    current sentence, and on a blank generator line the sentence is written
    with ``par.to_conll_2009_SRL``. Part files are only switched on blank
    lines, once the line counter has passed the current part's boundary.

    NOTE(review): ``curr_part`` is only ever modified with ``+=`` in this
    function and is never initialised, so the first read raises
    UnboundLocalError. ``num_lines_per_part`` is not assigned here either;
    presumably it is expected at module level - confirm.
    NOTE(review): the output files keep the 'giga' prefix although the input
    is ssj500k - confirm that this is intended.
    NOTE(review): indentation was reconstructed from a flattened listing;
    confirm the nesting against version control.
    """
    gof_generator = ssj500k_orig_generator()
    # with open(INDIR_GIGA, 'r') as gof:
    with open(INDIR_JOS, 'r') as gjf:
        sentence = {}
        sentence['tokens'] = []
        sentence['links'] = {}
        if os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % 0)):
            # Part 0 exists: skip input until the existing parts have been passed.
            ignore_lines = True
            wf = False  # False marks "no output file open yet"
        else:
            wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part), 'a')
            ignore_lines = False
        # for i, (l_gof, l_gjf) in enumerate(zip(gof, gjf)):
        for i, l_gjf in enumerate(gjf):
            l_gof = next(gof_generator)
            if ignore_lines:
                # Only react on a blank line that lies past the start of the current part.
                if i > num_lines_per_part * curr_part and l_gof == '\n':
                    if not os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % (curr_part + 2))):
                        ignore_lines = False
                        # delete last file (probably not whole)
                        os.remove(os.path.join(OUTDIR, 'giga%07d.tsv' % (curr_part + 1)))
                    if ignore_lines:
                        print(curr_part)
                        curr_part += 1
                        continue
                else:
                    continue
            l_gof_split = l_gof.split('\t')
            l_gjf_split = l_gjf.split('\t')
            # Non-blank generator line: one token.
            if l_gof != '\n':
                # Second generator column ending in 'u' produces a 'c' token
                # (presumably punctuation, per the original "if punctuation" comment).
                if l_gof_split[1][-1] == 'u':
                    sentence['tokens'].append(('c', l_gjf_split[0], l_gjf_split[1]))
                else:
                    sentence['tokens'].append(('w', l_gjf_split[0], l_gof_split[0], l_gof_split[1][:-2], l_gof_split[3][:-1]))
                # Link is keyed by the first INDIR_JOS column and built from columns 7, 0 and 6.
                sentence['links'][l_gjf_split[0]] = (l_gjf_split[7], l_gjf_split[0], l_gjf_split[6])
            # Blank generator line (l_gof == '\n'): end of sentence.
            else:
                if wf:
                    wf.write(par.to_conll_2009_SRL(sentence))
                sentence['tokens'] = []
                sentence['links'] = {}
                # wf.flush()
                # Past the boundary of the current part: switch to the next part file.
                if i > num_lines_per_part * (curr_part + 1):
                    curr_part += 1
                    # wf is False when no file has been opened yet (resumed run)
                    if wf:
                        wf.close()
                    wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part), 'a')
        curr_part += 1
        # NOTE(review): wf can still be False here when a resumed run never
        # reaches a part switch; False has no close() method.
        wf.close()
2022-02-04 10:24:47 +00:00
import time
def handle_giga_file(ran):
    """
    Write selected parts of the corpus as ``giga%07d.tsv`` files inside OUTDIR.

    The input is divided into GIGA_PARTS parts using a hard-coded total line
    count. Lines of INDIR_JOS are consumed in lockstep with the lines produced
    by ``giga_orig_generator()``. Parts below ``ran[0]`` are skipped, and the
    loop stops as soon as the part counter reaches ``ran[1]``. Within that
    range only parts contained in ``file_indices`` are written; an existing
    file for such a part is removed before it is reopened. Parts are only
    switched on blank generator lines.

    Args:
        ran: indexable pair; ``ran[0]`` is the first part to process and
            ``ran[1]`` the part number at which processing stops.

    NOTE(review): ``file_indices`` and ``giga_orig_generator`` are not defined
    in this function or in the visible part of the file; presumably they are
    module-level names - confirm.
    NOTE(review): indentation was reconstructed from a flattened listing;
    confirm the nesting against version control.
    """
    # Disabled first pass that counted the number of input lines:
    # with open(INDIR_GIGA, 'r') as gof:
    #     with open(INDIR_JOS, 'r') as gjf:
    #         for i, (l_gof, l_gjf) in enumerate(zip(gof, gjf)):
    #             pass
    # num_lines = i + 1
    # print(num_lines)
    num_lines = 1393184026
    # 1393184026
    # 1393184033
    # return
    # True division: the part size is a float.
    num_lines_per_part = num_lines / GIGA_PARTS
    curr_part = 0
    gof_generator = giga_orig_generator()
    # with open(INDIR_GIGA, 'r') as gof:
    with open(INDIR_JOS, 'r') as gjf:
        sentence = {}
        sentence['tokens'] = []
        sentence['links'] = {}
        wf = None
        # NOTE(review): curr_part is 0 here, so the first file (named after
        # ran[0]) is only opened when part 0 is in file_indices; with
        # ran[0] > 0 this looks like it can leave wf as None - confirm.
        if curr_part in file_indices:
            if os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % ran[0])):
                os.remove(os.path.join(OUTDIR, 'giga%07d.tsv' % ran[0]))
            wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % ran[0]), 'a')
        # for i, (l_gof, l_gjf) in enumerate(zip(gof, gjf)):
        for i, l_gjf in enumerate(gjf):
            l_gof = next(gof_generator)
            # Fast-forward over the parts that precede the requested range.
            if curr_part < ran[0]:
                if i > num_lines_per_part * (curr_part + 1) and l_gof == '\n' :
                    if curr_part < ran[0]:
                        print(curr_part)
                        curr_part += 1
                        continue
                else:
                    continue
            l_gof_split = l_gof.split('\t')
            l_gjf_split = l_gjf.split('\t')
            # Non-blank generator line: one token.
            if l_gof != '\n':
                # Tokens of parts that are not selected are not collected.
                if curr_part not in file_indices:
                    continue
                # Second generator column ending in 'u' produces a 'c' token
                # (presumably punctuation, per the original "if punctuation" comment).
                if l_gof_split[1][-1] == 'u':
                    sentence['tokens'].append(('c', l_gjf_split[0], l_gjf_split[1]))
                else:
                    sentence['tokens'].append(('w', l_gjf_split[0], l_gof_split[0], l_gof_split[1][:-2], l_gof_split[3][:-1]))
                # Link is keyed by the first INDIR_JOS column and built from columns 7, 0 and 6.
                sentence['links'][l_gjf_split[0]] = (l_gjf_split[7], l_gjf_split[0], l_gjf_split[6])
            # Blank generator line (l_gof == '\n'): end of sentence.
            else:
                if curr_part in file_indices:
                    wf.write(par.to_conll_2009_SRL(sentence))
                sentence['tokens'] = []
                sentence['links'] = {}
                # wf.flush()
                # Past the boundary of the current part: move on to the next part.
                if i > num_lines_per_part * (curr_part + 1):
                    curr_part += 1
                    # Note: the membership test uses the already incremented part number.
                    if curr_part in file_indices and wf:
                        wf.close()
                    if curr_part >= ran[1]:
                        break
                    if curr_part in file_indices:
                        # Start the part from scratch if a file is left over from an earlier run.
                        if os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part)):
                            os.remove(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part))
                        wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part), 'a')
        curr_part += 1
        # NOTE(review): wf is None here if no part file was ever opened.
        wf.close()
def handle_giga_file_selected_sentences(error_sentences):
    """Write only the selected sentences of the corpus to ``OUTDIR/giga_errors``.

    Lines of INDIR_JOS are consumed in lockstep with the lines produced by
    ``giga_orig_generator()``. A blank INDIR_JOS line ends a sentence. The id
    of the n-th sentence is looked up in the list unpickled from
    ``INTERNAL_DATA/sentence_ids_list.pkl``; a sentence is converted with
    ``par.to_conll_2009_SRL`` and written only when its id is contained in
    ``error_sentences``. Each selected id is printed when its sentence starts.
    An existing ``giga_errors`` file is removed first.

    Args:
        error_sentences: container of sentence ids to extract; only
            membership tests (``in``) are performed on it.

    Raises:
        IndexError: when the input holds more sentence boundaries than the id
            list has entries (unchanged from the original behaviour).

    NOTE(review): indentation was reconstructed from a flattened listing;
    confirm the nesting against version control.
    """
    gof_generator = giga_orig_generator()
    errors_path = os.path.join(OUTDIR, 'giga_errors')
    sentence = {'tokens': [], 'links': {}}

    with open(INDIR_JOS, 'r') as gjf:
        # Start from an empty output file.
        if os.path.exists(errors_path):
            os.remove(errors_path)

        # The with-block closes the output file even if processing fails.
        with open(errors_path, 'a') as wf:
            # pickle.load executes code embedded in the file: the pickle must
            # come from a trusted location.
            with open(os.path.join(INTERNAL_DATA, 'sentence_ids_list.pkl'), 'rb') as pkl_file:
                sentence_ids_list = pickle.load(pkl_file)

            sentence_id = 0
            skip_sentence = sentence_ids_list[sentence_id] not in error_sentences

            for l_gjf in gjf:
                # Keep the generator aligned with gjf, even for skipped lines.
                l_gof = next(gof_generator)

                if l_gjf == '\n':
                    # Sentence boundary: flush a selected sentence, then
                    # decide whether the following one is wanted.
                    if not skip_sentence:
                        wf.write(par.to_conll_2009_SRL(sentence))
                        sentence['tokens'] = []
                        sentence['links'] = {}
                    sentence_id += 1
                    if sentence_ids_list[sentence_id] in error_sentences:
                        print(sentence_ids_list[sentence_id])
                        skip_sentence = False
                    else:
                        skip_sentence = True

                if skip_sentence:
                    continue

                # A blank generator line carries no token.
                if l_gof == '\n':
                    continue

                l_gof_split = l_gof.split('\t')
                l_gjf_split = l_gjf.split('\t')

                # Second generator column ending in 'u' produces a 'c' token
                # (presumably punctuation, per the original "if punctuation" comment).
                if l_gof_split[1][-1] == 'u':
                    sentence['tokens'].append(('c', l_gjf_split[0], l_gjf_split[1]))
                else:
                    sentence['tokens'].append(('w', l_gjf_split[0], l_gof_split[0], l_gof_split[1][:-2], l_gof_split[3][:-1]))
                # Link is keyed by the first INDIR_JOS column and built from columns 7, 0 and 6.
                sentence['links'][l_gjf_split[0]] = (l_gjf_split[7], l_gjf_split[0], l_gjf_split[6])
# Script entry: convert the ssj500k corpus when the module is executed.
handle_ssj500k_file()
# Report the corpus that was selected from the config instead of a fixed
# "kres"; for a kres configuration the message is unchanged.
logging.info("end parsing %s", analysis)