import pickle
from parser.parser import Parser
import os
from os.path import join, dirname
from pathlib import Path
import re
import sys
import cProfile
import configparser
import logging
from multiprocessing import Pool


SSJ500K_2_1 = 27829  # number of sentences

par = Parser()

# path to data
config = configparser.ConfigParser()
# config.read("tools.cfg")
config.read("tools.cfg.ssj500k2.3")
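
# For reference, a minimal sketch of what tools.cfg.ssj500k2.3 is expected to
# contain, inferred from the keys read below; the paths and values here are
# placeholders, not the ones used in the original setup. A kres or giga config
# would instead provide the kres_* / giga_* keys handled in the branches below.
#
#   [tools]
#   ssj500k = /path/to/ssj500k.orig
#   ssj500k_orig = /path/to/ssj500k.text.tsv
#   ssj500k_jos = /path/to/ssj500k.jos.tsv
#   ssj500k_tsv = /path/to/output.tsv
#   internal_data = /path/to/internal_data
#   cpu_cores = 4
#   logfile = /path/to/parser.log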

analysis = ''
if 'kres_orig' in config["tools"]:
    analysis = 'kres'
    INDIR = Path(config["tools"]["kres_orig"])
    OUTDIR = Path(config["tools"]["kres_tsv"])
elif 'giga_orig' in config["tools"]:
    # analysis = 'gigafida'
    analysis = 'giga'
    INDIR_GIGA_ORIG = Path(config["tools"]["giga"])
    INDIR_GIGA = Path(config["tools"]["giga_orig"])
    INDIR_JOS = Path(config["tools"]["giga_jos"])
    OUTDIR = Path(config["tools"]["giga_tsv"])
    GIGA_PARTS = int(config["tools"]["giga_parts"])
    INTERNAL_DATA = config["tools"]["internal_data"]
elif 'ssj500k_orig' in config["tools"]:
    # analysis = 'gigafida'
    analysis = 'ssj500k'
    INDIR_SSJ500K_ORIG = Path(config["tools"]["ssj500k"])
    INDIR_SSJ500K = Path(config["tools"]["ssj500k_orig"])
    INDIR_JOS = Path(config["tools"]["ssj500k_jos"])
    OUTDIR = Path(config["tools"]["ssj500k_tsv"])
    INTERNAL_DATA = config["tools"]["internal_data"]

CPU_CORES = int(config["tools"]["cpu_cores"])

LOGFILE = Path(config["tools"]["logfile"]).absolute()
LOGFILE.touch(exist_ok=True)
LOGFILE.resolve()

logging.basicConfig(filename=str(LOGFILE), level=logging.INFO)

"""
print("parsing ssj")
ssj_file = "../data/ssj500k-sl.sample.xml"
ssj_dict = par.parse_tei(ssj_file)
# assert (len(ssj_dict) == 27829), "Parsed wrong number of sentences."
print("end parsing ssj")
"""

# kres_file = "../data/kres_example/F0019343.xml.parsed.xml"
# OUTDIR.mkdir(exist_ok=True)

if analysis == 'kres':
    infiles = list(enumerate([x for x in INDIR.iterdir() if x.is_file()]))
    logging.info("Parsing kres: {} files.".format(len(infiles)))


def handle_ssj500k_file():
    """Parse the ssj500k TEI file and write it out in CoNLL 2009 SRL format."""
    kres_file = INDIR_SSJ500K_ORIG
    outfile = OUTDIR

    if outfile.is_file():
        logging.info("Skipping existing file: {}.".format(str(kres_file)))
        return True

    # try:
    res_dict = par.parse_tei(kres_file)
    kres_out_str = ""
    for _, sentence in res_dict.items():
        kres_out_str += par.to_conll_2009_SRL(sentence)
    # except Exception as exc:
    #     logging.info("Failed processing file: {}".format(str(kres_file)))
    #     logging.error(exc)
    #     return False

    with outfile.open("wb+") as fp:
        fp.write(kres_out_str.encode("utf-8"))
        # logging.info("Processed file ({}/{}): {}".format(i+1, len(infiles), str(kres_file)))
        return True
    return False


def ssj500k_orig_generator():
    """Yield lines of the original ssj500k text file, collapsing runs of empty lines into one."""
    with open(INDIR_SSJ500K, 'r') as gof:
        previous_new_line = False
        for l_gof in gof:
            if l_gof == '\n':
                if previous_new_line:
                    continue
                previous_new_line = True
            elif previous_new_line:
                previous_new_line = False
            yield l_gof


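# The giga handlers below call giga_orig_generator(), which does not appear in
# this snapshot. A minimal sketch of it, assuming it mirrors
# ssj500k_orig_generator() above but reads INDIR_GIGA instead:

def giga_orig_generator():
    """Hypothetical counterpart of ssj500k_orig_generator() for the GigaFida text file."""
    with open(INDIR_GIGA, 'r') as gof:
        previous_new_line = False
        for l_gof in gof:
            if l_gof == '\n':
                if previous_new_line:
                    continue
                previous_new_line = True
            elif previous_new_line:
                previous_new_line = False
            yield l_gof

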
def handle_gigafida_file():
    """
    Split the big text file into smaller per-part files. Splits only on empty lines.
    """
    # with open(INDIR_GIGA, 'r') as gof:
    #     with open(INDIR_JOS, 'r') as gjf:
    #         for i, (l_gof, l_gjf) in enumerate(zip(gof, gjf)):
    #             pass
    #         num_lines = i + 1
    #         print(num_lines)
    # NOTE: num_lines_per_part and curr_part are used below but never initialized
    # in this snapshot; they are presumably set up as in handle_giga_file() below
    # (a hard-coded total line count divided by GIGA_PARTS, and a part counter
    # starting at 0).
    gof_generator = giga_orig_generator()
    # with open(INDIR_GIGA, 'r') as gof:
    with open(INDIR_JOS, 'r') as gjf:
        sentence = {}
        sentence['tokens'] = []
        sentence['links'] = {}
        if os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % 0)):
            ignore_lines = True
            wf = False
        else:
            wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part), 'a')
            ignore_lines = False
        # for i, (l_gof, l_gjf) in enumerate(zip(gof, gjf)):
        for i, l_gjf in enumerate(gjf):
            l_gof = next(gof_generator)
            if ignore_lines:
                if i > num_lines_per_part * curr_part and l_gof == '\n':
                    if not os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % (curr_part + 2))):
                        ignore_lines = False
                        # delete last file (probably not whole)
                        os.remove(os.path.join(OUTDIR, 'giga%07d.tsv' % (curr_part + 1)))
                    if ignore_lines:
                        print(curr_part)
                        curr_part += 1
                        continue
                else:
                    continue
            l_gof_split = l_gof.split('\t')
            l_gjf_split = l_gjf.split('\t')

            # if punctuation
            if l_gof != '\n':
                if l_gof_split[1][-1] == 'u':
                    # print(l_gjf_split)
                    sentence['tokens'].append(('c', l_gjf_split[0], l_gjf_split[1]))
                else:
                    sentence['tokens'].append(('w', l_gjf_split[0], l_gof_split[0], l_gof_split[1][:-2], l_gof_split[3][:-1]))

                sentence['links'][l_gjf_split[0]] = (l_gjf_split[7], l_gjf_split[0], l_gjf_split[6])

            # if l_gof == '\n':
            else:
                if wf:
                    # print(i)
                    wf.write(par.to_conll_2009_SRL(sentence))
                sentence['tokens'] = []
                sentence['links'] = {}
                # wf.flush()
                # if i > num_lines_per_part * (curr_part + 1) and l_gof == '\n':
                if i > num_lines_per_part * (curr_part + 1):
                    curr_part += 1
                    # if wf doesn't exist (first one)
                    if wf:
                        wf.close()
                    wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part), 'a')
        curr_part += 1
        wf.close()


def handle_ssj500k_file2():
    """
    Split the big text file into smaller per-part files. Splits only on empty lines.
    Variant of handle_gigafida_file() that reads the text via ssj500k_orig_generator().
    """
    # NOTE: num_lines_per_part and curr_part are used below but never initialized
    # in this snapshot (see handle_giga_file() below for the analogous setup).
    gof_generator = ssj500k_orig_generator()
    # with open(INDIR_GIGA, 'r') as gof:
    with open(INDIR_JOS, 'r') as gjf:
        sentence = {}
        sentence['tokens'] = []
        sentence['links'] = {}
        if os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % 0)):
            ignore_lines = True
            wf = False
        else:
            wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part), 'a')
            ignore_lines = False
        # for i, (l_gof, l_gjf) in enumerate(zip(gof, gjf)):
        for i, l_gjf in enumerate(gjf):
            l_gof = next(gof_generator)
            if ignore_lines:
                if i > num_lines_per_part * curr_part and l_gof == '\n':
                    if not os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % (curr_part + 2))):
                        ignore_lines = False
                        # delete last file (probably not whole)
                        os.remove(os.path.join(OUTDIR, 'giga%07d.tsv' % (curr_part + 1)))
                    if ignore_lines:
                        print(curr_part)
                        curr_part += 1
                        continue
                else:
                    continue
            l_gof_split = l_gof.split('\t')
            l_gjf_split = l_gjf.split('\t')

            # if punctuation
            if l_gof != '\n':
                if l_gof_split[1][-1] == 'u':
                    # print(l_gjf_split)
                    sentence['tokens'].append(('c', l_gjf_split[0], l_gjf_split[1]))
                else:
                    sentence['tokens'].append(('w', l_gjf_split[0], l_gof_split[0], l_gof_split[1][:-2], l_gof_split[3][:-1]))

                sentence['links'][l_gjf_split[0]] = (l_gjf_split[7], l_gjf_split[0], l_gjf_split[6])

            # if l_gof == '\n':
            else:
                if wf:
                    # print(i)
                    wf.write(par.to_conll_2009_SRL(sentence))
                sentence['tokens'] = []
                sentence['links'] = {}
                # wf.flush()
                # if i > num_lines_per_part * (curr_part + 1) and l_gof == '\n':
                if i > num_lines_per_part * (curr_part + 1):
                    curr_part += 1
                    # if wf doesn't exist (first one)
                    if wf:
                        wf.close()
                    wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part), 'a')
        curr_part += 1
        wf.close()


import time


def handle_giga_file(ran):
    """
    Split the big text file into smaller per-part files. Splits only on empty lines.
    Intended to write out only the parts whose index falls in the half-open range
    ran = (first, last).
    """
    # with open(INDIR_GIGA, 'r') as gof:
    #     with open(INDIR_JOS, 'r') as gjf:
    #         for i, (l_gof, l_gjf) in enumerate(zip(gof, gjf)):
    #             pass
    #         num_lines = i + 1
    #         print(num_lines)
    num_lines = 1393184026
    # 1393184026
    # 1393184033
    # return
    num_lines_per_part = num_lines / GIGA_PARTS
    curr_part = 0
    # NOTE: file_indices is used below but never defined in this snapshot; it
    # presumably holds the set of part indices this worker should write
    # (e.g. something like range(ran[0], ran[1])).
    gof_generator = giga_orig_generator()
    # with open(INDIR_GIGA, 'r') as gof:
    with open(INDIR_JOS, 'r') as gjf:
        sentence = {}
        sentence['tokens'] = []
        sentence['links'] = {}
        wf = None
        if curr_part in file_indices:
            if os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % ran[0])):
                os.remove(os.path.join(OUTDIR, 'giga%07d.tsv' % ran[0]))

            wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % ran[0]), 'a')

        # for i, (l_gof, l_gjf) in enumerate(zip(gof, gjf)):
        for i, l_gjf in enumerate(gjf):
            l_gof = next(gof_generator)
            if curr_part < ran[0]:
                if i > num_lines_per_part * (curr_part + 1) and l_gof == '\n':
                    if curr_part < ran[0]:
                        print(curr_part)
                        curr_part += 1
                        continue
                else:
                    continue

            l_gof_split = l_gof.split('\t')
            l_gjf_split = l_gjf.split('\t')

            # if punctuation
            if l_gof != '\n':
                if curr_part not in file_indices:
                    continue
                if l_gof_split[1][-1] == 'u':
                    # print(l_gjf_split)
                    sentence['tokens'].append(('c', l_gjf_split[0], l_gjf_split[1]))
                else:
                    sentence['tokens'].append(('w', l_gjf_split[0], l_gof_split[0], l_gof_split[1][:-2], l_gof_split[3][:-1]))

                sentence['links'][l_gjf_split[0]] = (l_gjf_split[7], l_gjf_split[0], l_gjf_split[6])

            # if l_gof == '\n':
            else:
                if curr_part in file_indices:
                    wf.write(par.to_conll_2009_SRL(sentence))
                sentence['tokens'] = []
                sentence['links'] = {}
                # wf.flush()
                # if i > num_lines_per_part * (curr_part + 1) and l_gof == '\n':
                if i > num_lines_per_part * (curr_part + 1):
                    curr_part += 1
                    # if wf doesn't exist (first one)
                    if curr_part in file_indices and wf:
                        wf.close()
                    if curr_part >= ran[1]:
                        break
                    if curr_part in file_indices:
                        if os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part)):
                            os.remove(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part))

                        wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part), 'a')

        curr_part += 1
        wf.close()


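# The module imports Pool and reads CPU_CORES and GIGA_PARTS but never uses them
# in this snapshot. A minimal sketch of how handle_giga_file() could be fanned
# out over a process pool, assuming each worker takes a contiguous (start, end)
# range of part indices; run_giga_parts() is not part of the original script.

def run_giga_parts():
    """Hypothetical driver: split GIGA_PARTS part indices across CPU_CORES workers."""
    step = GIGA_PARTS // CPU_CORES + 1
    ranges = [(start, min(start + step, GIGA_PARTS)) for start in range(0, GIGA_PARTS, step)]
    with Pool(CPU_CORES) as pool:
        pool.map(handle_giga_file, ranges)

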
def handle_giga_file_selected_sentences(error_sentences):
    """
    Reprocess only the sentences whose ids appear in error_sentences and write
    them to a single 'giga_errors' file. Sentences are delimited by empty lines.
    """
    # with open(INDIR_GIGA, 'r') as gof:
    #     with open(INDIR_JOS, 'r') as gjf:
    #         for i, (l_gof, l_gjf) in enumerate(zip(gof, gjf)):
    #             pass
    #         num_lines = i + 1
    #         print(num_lines)
    # print('num_lines' + 3)
    # num_lines = 1393184026
    num_lines = 1393222523
    # 1393184026
    # 1393184033
    # return
    # num_lines_per_part = num_lines / GIGA_PARTS
    # curr_part = 0
    gof_generator = giga_orig_generator()
    # with open(INDIR_GIGA, 'r') as gof:
    with open(INDIR_JOS, 'r') as gjf:
        sentence = {}
        sentence['tokens'] = []
        sentence['links'] = {}
        wf = None
        if os.path.exists(os.path.join(OUTDIR, 'giga_errors')):
            os.remove(os.path.join(OUTDIR, 'giga_errors'))

        wf = open(os.path.join(OUTDIR, 'giga_errors'), 'a')

        with open(os.path.join(INTERNAL_DATA, 'sentence_ids_list.pkl'), 'rb') as pkl_file:
            sentence_ids_list = pickle.load(pkl_file)

        sentence_id = 0
        skip_sentence = sentence_ids_list[sentence_id] not in error_sentences

        # for i, (l_gof, l_gjf) in enumerate(zip(gof, gjf)):
        for i, l_gjf in enumerate(gjf):
            l_gof = next(gof_generator)

            if l_gjf == '\n':
                if not skip_sentence:
                    wf.write(par.to_conll_2009_SRL(sentence))
                sentence['tokens'] = []
                sentence['links'] = {}
                sentence_id += 1
                if sentence_ids_list[sentence_id] in error_sentences:
                    print(sentence_ids_list[sentence_id])
                    skip_sentence = False
                else:
                    skip_sentence = True

            if skip_sentence:
                continue

            # if curr_part < ran[0]:
            #     if i > num_lines_per_part * (curr_part + 1) and l_gof == '\n':
            #         if curr_part < ran[0]:
            #             print(curr_part)
            #             curr_part += 1
            #             continue
            #     else:
            #         continue

            l_gof_split = l_gof.split('\t')
            l_gjf_split = l_gjf.split('\t')

            # if punctuation
            if l_gof != '\n':
                if l_gof_split[1][-1] == 'u':
                    # print(l_gjf_split)
                    sentence['tokens'].append(('c', l_gjf_split[0], l_gjf_split[1]))
                else:
                    sentence['tokens'].append(('w', l_gjf_split[0], l_gof_split[0], l_gof_split[1][:-2], l_gof_split[3][:-1]))

                sentence['links'][l_gjf_split[0]] = (l_gjf_split[7], l_gjf_split[0], l_gjf_split[6])

            # if l_gof == '\n':
            #     wf.flush()
            #     if i > num_lines_per_part * (curr_part + 1):
            #         curr_part += 1
            #         # if wf doesn't exist (first one)
            #         if curr_part in file_indices and wf:
            #             wf.close()
            #         if curr_part >= ran[1]:
            #             break
            #         if curr_part in file_indices:
            #             if os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part)):
            #                 os.remove(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part))
            #
            #             wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part), 'a')
            #
            #     curr_part += 1
        wf.close()


handle_ssj500k_file()

logging.info("end parsing kres")