Big changes

This commit is contained in:
Luka 2022-02-04 11:24:47 +01:00
parent a6cee3d459
commit c1ecc4cdbc
18 changed files with 1384 additions and 53 deletions

3
.gitignore vendored
View File

@ -6,3 +6,6 @@ nohup.out
data/kres_out/*
data/kres_example/
venv/
.idea/
data/

View File

@ -15,6 +15,6 @@ run:
-v /etc/group:/etc/group \
-v $(shell pwd)/../../:/cjvt-srl-tagging \
-w /cjvt-srl-tagging \
-v /home/kristjan/kres_mount:/kres_mount:ro \
-v /home/luka/Development/srl/data:/kres_mount:ro \
python-java \
/bin/bash

View File

@ -0,0 +1,19 @@
"""Copy every numbered TSV part that has no ``.srl.tsv`` counterpart yet."""
import os
from shutil import copyfile

# Directory the per-part input files (``giga.NNNNNNN.tsv``) are copied from.
SOURCEPATH = '/media/luka/Portable Disk/Datasets/gigafida_jos/gf2_files'
# Directory that is checked for already existing ``gigaNNNNNNN.srl.tsv`` files.
INPATH = '/media/luka/Portable Disk/Datasets/gigafida_jos/2_srl'
# Directory that receives the parts whose ``.srl.tsv`` file is missing.
OUTPATH = '/home/luka/Development/srl/cjvt-srl-tagging/data/giga_out/1_tsv'

for i in range(100000):
    srl_name = 'giga%07d.srl.tsv' % i
    if not os.path.exists(os.path.join(INPATH, srl_name)):
        # Note the differing names: the source has a dot after "giga",
        # the copy does not.
        source_file = os.path.join(SOURCEPATH, 'giga.%07d.tsv' % i)
        target_file = os.path.join(OUTPATH, 'giga%07d.tsv' % i)
        copyfile(source_file, target_file)
        print(srl_name)
    if i % 1000 == 0:
        # progress indicator
        print(i)

View File

View File

@ -0,0 +1,192 @@
import pickle
from parser.parser import Parser
import os
from os.path import join, dirname
from pathlib import Path
import re
import sys
import cProfile
import configparser
import logging
from multiprocessing import Pool
# Module-level setup: every setting below comes from the [tools] section of
# "tools.cfg", which is looked up relative to the current working directory.
# NOTE(review): the indentation of this listing was lost; the extent of the
# if/elif bodies below is an assumption - confirm against the repository.
SSJ500K_2_1 = 27829 # number of sentences
par = Parser()
# path to data
config = configparser.ConfigParser()
config.read("tools.cfg")
# `analysis` records which corpus the configuration describes
# ('kres' or 'giga'); it stays '' when neither key is present.
analysis = ''
if 'kres_orig' in config["tools"]:
analysis = 'kres'
INDIR = Path(config["tools"]["kres_orig"])
OUTDIR = Path(config["tools"]["kres_tsv"])
elif 'giga_orig' in config["tools"]:
# analysis = 'gigafida'
analysis = 'giga'
INDIR_GIGA = Path(config["tools"]["giga_orig"])
INDIR_GIGA_ORIG = Path(config["tools"]["giga"])
INDIR_JOS = Path(config["tools"]["giga_jos"])
OUTDIR = Path(config["tools"]["giga_tsv"])
GIGA_PARTS = int(config["tools"]["giga_parts"])
INTERNAL_DATA = config["tools"]["internal_data"]
CPU_CORES = int(config["tools"]["cpu_cores"])
# Make sure the log file exists before logging is pointed at it.
LOGFILE = Path(config["tools"]["logfile"]).absolute()
LOGFILE.touch(exist_ok=True)
# The result of resolve() is discarded, so this call has no effect on LOGFILE.
LOGFILE.resolve()
logging.basicConfig(filename=str(LOGFILE), level=logging.INFO)
# Collect every file below INDIR_GIGA_ORIG (recursively), sort the paths and
# pair each one with its position: origfiles becomes a list of (index, Path).
# NOTE(review): INDIR_GIGA_ORIG is only assigned in the 'giga_orig' branch
# above, so with a 'kres_orig' configuration this presumably raises NameError.
origfiles = []
for subdir, dirs, files in os.walk(INDIR_GIGA_ORIG):
for file in files:
origfiles.append(Path(os.path.join(subdir, file)))
origfiles=list(enumerate(sorted(origfiles)))
def giga_orig_sentence_generator(path=None):
    """Yield the sentences of a token-per-line TSV file as space-joined text.

    Every non-blank line contributes its first tab-separated field; a line
    consisting only of a newline closes the current sentence.

    Args:
        path: file to read. Defaults to the module-level ``INDIR_GIGA``.

    Yields:
        str: the fields collected since the previous blank line, joined by
        single spaces (an empty string when two blank lines follow each
        other). Tokens after the last blank line are yielded as a final
        sentence instead of being silently dropped.
    """
    if path is None:
        path = INDIR_GIGA
    sentence_words = []
    with open(path, 'r') as gof:
        for l_gof in gof:
            if l_gof == '\n':
                yield ' '.join(sentence_words)
                sentence_words = []
            else:
                sentence_words.append(l_gof.split('\t')[0])
    # A file that does not end with a blank line still has one pending sentence.
    if sentence_words:
        yield ' '.join(sentence_words)
# Walk the original TEI files and the big TSV file in lockstep: for every
# sentence returned by par.parse_tei() the next sentence of the TSV stream is
# pulled and the two texts are compared.
# NOTE(review): indentation of this listing was lost; whether the final
# print(origfile[0]) belongs to the error branch or runs once per file cannot
# be told from here - confirm against the repository.
sentence_generator = giga_orig_sentence_generator()
sentence_ids = []
for origfile in origfiles:
# origfile is an (index, Path) pair
split_file_sentences = par.parse_tei(origfile[1])
for k, v in split_file_sentences.items():
# next() raises StopIteration if the TSV stream ends before the TEI files do
one_file_sentence = next(sentence_generator)
if one_file_sentence == v['text']:
sentence_ids.append(v['sid'])
else:
# Texts differ: report the sentence id and both versions of the text.
print('----------------')
print('ERROR')
print(v['sid'])
print(one_file_sentence)
print(v['text'])
print(origfile[0])
# Store the list of matched sentence ids as a pickle in INTERNAL_DATA.
# os.remove(os.path.join(INTERNAL_DATA, 'orig_counted_sentences.pkl'))
# Removing an existing file first is redundant: mode 'wb' truncates it anyway.
if os.path.exists(os.path.join(INTERNAL_DATA, 'sentence_ids_list.pkl')):
os.remove(os.path.join(INTERNAL_DATA, 'sentence_ids_list.pkl'))
with open(os.path.join(INTERNAL_DATA, 'sentence_ids_list.pkl'), 'wb') as output:
pickle.dump(sentence_ids, output)
# def giga_orig_generator():
# with open(INDIR_GIGA, 'r') as gof:
# previous_new_line = False
# for l_gof in gof:
# if l_gof == '\n':
# if previous_new_line:
# continue
# previous_new_line = True
# elif previous_new_line:
# previous_new_line = False
# yield l_gof
# import time
# def handle_giga_file(ran):
# """
# File that splits big text file into more minor files. Only split on empty lines.
# """
# # with open(INDIR_GIGA, 'r') as gof:
# # with open(INDIR_JOS, 'r') as gjf:
# # for i, (l_gof, l_gjf) in enumerate(zip(gof, gjf)):
# # pass
# # num_lines = i + 1
# # print(num_lines)
# num_lines = 1393184026
# # 1393184026
# # 1393184033
# # return
# num_lines_per_part = num_lines / GIGA_PARTS
# curr_part = 0
# gof_generator = giga_orig_generator()
#
# diff_files = set()
# # with open(INDIR_GIGA, 'r') as gof:
# with open(INDIR_GIGA_OLD, 'r') as gjf:
# # sentence = {}
# # sentence['tokens'] = []
# # sentence['links'] = {}
# # if os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % ran[0])):
# # os.remove(os.path.join(OUTDIR, 'giga%07d.tsv' % ran[0]))
#
# # wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % ran[0]), 'a')
#
# # for i, (l_gof, l_gjf) in enumerate(zip(gof, gjf)):
# for i, l_gjf in enumerate(gjf):
# l_gof = next(gof_generator)
# if curr_part < ran[0]:
# if i > num_lines_per_part * (curr_part + 1) and l_gof == '\n' :
# if curr_part < ran[0]:
# print(curr_part)
# curr_part += 1
# continue
# else:
# continue
#
# l_gof_split = l_gof.split('\t')
# l_gjf_split = l_gjf.split('\t')
#
# # if punctuation
# if l_gof != '\n':
# if l_gof_split != l_gjf_split:
# print(curr_part)
# diff_files.add(curr_part)
# l_gof = next(gof_generator)
#
#
# # if l_gof == '\n':
# else:
# # wf.flush()
# # if i > num_lines_per_part * (curr_part + 1) and l_gof == '\n':
# if i > num_lines_per_part * (curr_part + 1):
# curr_part += 1
# # if wf doesn't exist (first one)
# # wf.close()
# if curr_part >= ran[1]:
# break
# # if os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part)):
# # os.remove(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part))
#
# # wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part), 'a')
#
# curr_part += 1
# return diff_files
# # wf.close()
#
# with Pool(CPU_CORES) as p:
# final_range = [0, 100000]
# # final_range = [0, 150]
# # size_per_proc = (final_range[1] - final_range[0]) / CPU_CORES
# # # splits = [int(final_range[0] + size_per_proc) for i in range(CPU_CORES)]
# # ranges = []
# # ps = None
# # for i in range(CPU_CORES):
# # s = int(final_range[0] + size_per_proc * i)
# # ns = int(final_range[0] + size_per_proc * (i + 1))
# # ranges.append([s, ns])
# # # ranges = [[0, 1]]
# # res = p.map(handle_giga_file, ranges)
#
# res = handle_giga_file(final_range)
# res = sorted(list(res))
# if os.path.exists(os.path.join(INTERNAL_DATA, 'diffs_updated_gigafida.pkl')):
# os.remove(os.path.join(INTERNAL_DATA, 'diffs_updated_gigafida.pkl'))
# with open(os.path.join(INTERNAL_DATA, 'diffs_updated_gigafida.pkl'), 'wb') as pkl_file:
# pickle.dump(res, pkl_file)
# # with open(os.path.join(INTERNAL_DATA, 'diffs_updated_gigafida.pkl'), 'rb') as pkl_file:
# # mydict2 = pickle.load(pkl_file)
# print('test')

114
tools/gen_json.kres.py Normal file
View File

@ -0,0 +1,114 @@
from pathlib import Path
from parser.parser import Parser
import configparser
import json
import sys
import logging
from multiprocessing import Pool
# parse config
# All settings come from the [tools] section of "tools.cfg", looked up
# relative to the current working directory.
config = configparser.ConfigParser()
config.read("tools.cfg")
# NOTE(review): ORIGPATH is commented out here, but get_origfile() below still
# reads it - as shown, calling get_origfile() presumably raises NameError.
# ORIGPATH = Path(config["tools"]["kres_orig"])
# NOTE(review): input is taken from the "giga_srl" key while output goes to
# the "kres_json" key - confirm that this mix is intended.
INPATH = Path(config["tools"]["giga_srl"])
OUTPATH = Path(config["tools"]["kres_json"])
# DEBUG is True only for the exact string "True" in the config file.
DEBUG = config["tools"]["debug"] == "True"
CPU_CORES = int(config["tools"]["cpu_cores"])
# Make sure the log file exists before logging is pointed at it.
LOGFILE = Path(config["tools"]["logfile"]).absolute()
LOGFILE.touch(exist_ok=True)
# The result of resolve() is discarded, so this call has no effect on LOGFILE.
LOGFILE.resolve()
logging.basicConfig(filename=str(LOGFILE), level=logging.INFO)
def get_origfile(filename):
    """Find the original file that belongs to an SRL output file.

    Files are paired by the part of their name before the first dot.

    NOTE(review): this function reads the module-level ``ORIGPATH``, whose
    assignment is commented out in this file's config section - confirm it
    is defined before this function is called.

    Args:
        filename: ``pathlib.Path`` of the SRL file.

    Returns:
        The first entry of ``ORIGPATH`` whose name has the same prefix.

    Raises:
        FileNotFoundError: if no entry of ``ORIGPATH`` matches; the message
            names the prefix and the directory that was searched.
    """
    wanted = filename.name.split('.')[0]  # loop-invariant, computed once
    for origfile in ORIGPATH.iterdir():
        if origfile.name.split('.')[0] == wanted:
            return origfile
    raise FileNotFoundError(
        "no original file with prefix '{}' in {}".format(wanted, ORIGPATH))
def extract_sentences(line_reader):
    """Group the raw lines of a tab-separated token file into sentences.

    Args:
        line_reader: iterable of UTF-8 encoded ``bytes`` lines, e.g. the
            result of ``readlines()`` on a file opened in binary mode.

    Yields:
        list: one list per sentence, holding the list of tab-separated
        fields of every token line. A sentence is emitted whenever a line
        without any tab (normally a blank line) is reached, so two such
        lines in a row yield an empty list, and token lines after the last
        blank line are not emitted.
    """
    acc = []
    # Lines are decoded one at a time instead of building a list of all of them.
    for raw_line in line_reader:
        # Strip only the line terminator. Slicing off the last character
        # unconditionally left a stray "\r" in the last field of CRLF files
        # and ate a real character of a final line without a newline.
        fields = raw_line.decode("utf-8").rstrip("\r\n").split('\t')
        if len(fields) == 1:  # empty line
            yield acc
            acc = []
        else:
            acc.append(fields)
def to_sentence(sentence_arr):
    """Return the text of a sentence.

    The text is the field at index 1 of every token row of *sentence_arr*,
    joined by single spaces.
    """
    second_fields = [fields[1] for fields in sentence_arr]
    return " ".join(second_fields)
def match_sentence_id(sentence, orig_dict):
    """Return the id of the original sentence whose text equals *sentence*.

    Args:
        sentence: space-joined text of the sentence to look up.
        orig_dict: mapping of sentence id to an entry with a ``"tokens"``
            sequence; the item at index 2 of every token is joined by
            spaces to rebuild the original text.

    Returns:
        The key of the first entry whose rebuilt text equals *sentence*.

    Raises:
        KeyError: if no entry matches; the unmatched sentence is attached
            so the failure can be traced.
    """
    for sid, entry in orig_dict.items():
        if sentence == " ".join(token[2] for token in entry["tokens"]):
            return sid
    raise KeyError(sentence)
def get_dep_rel(token):
    """Build the relation described by one token row, if there is one.

    The fields from index 14 onwards are scanned; the first one that is not
    ``"_"`` defines the relation.

    Args:
        token: list of the fields of one token row.

    Returns:
        ``None`` when every scanned field is ``"_"`` (or there are none),
        otherwise a dict with the field value (``"arg"``), its position
        among the scanned fields (``"from"``) and ``token[0]`` (``"dep"``).
    """
    logging.debug(token)
    for position, label in enumerate(token[14:]):
        if label == "_":
            continue
        # position is the number of the predicate within the sentence
        return {"arg": label, "from": position, "dep": token[0]}
    return None
def handle_file(infile_tpl):
    """Convert one SRL ``.tsv`` file into a ``.json`` file of relations.

    Args:
        infile_tpl: ``(index, path)`` pair; only the path is used.

    The result, a mapping of sentence id to a list of relation dicts, is
    written to ``OUTPATH`` under the input's name with a ``.json`` suffix.
    """
    infile = infile_tpl[1]
    outfile = (OUTPATH / infile.name).with_suffix(".json")
    orig_dict = par.parse_tei(get_origfile(infile))

    outdata = {}
    with infile.open("rb") as fp:
        for sentence_arr in extract_sentences(fp.readlines()):
            sentence_text = to_sentence(sentence_arr)
            # the tsv carries no sentence ids; recover the id by matching
            # the text against the original data
            sid = match_sentence_id(sentence_text, orig_dict)
            relations = outdata[sid] = []

            # token indices of all predicates of the sentence, in order
            predicates = []
            for token in sentence_arr:
                if token[12] == "Y":
                    predicates.append(token[0])
                deprel = get_dep_rel(token)
                if deprel is not None:
                    relations.append(deprel)

            # "from" still holds the number of the predicate;
            # swap it for that predicate's token index
            for deprel in relations:
                deprel["from"] = predicates[deprel["from"]]

            if DEBUG:
                print(sentence_text)
                print(relations)
                print(sid)
                print()
                print()

    with outfile.open("w") as fp:
        json.dump(outdata, fp)
        logging.info("SRL relations written to: {}".format(outfile))
# main
# The parser is created at module level because handle_file() reads it as a global.
par = Parser()
OUTPATH.mkdir(exist_ok=True)
# (index, path) pairs of all regular files directly inside INPATH
infiles = list(enumerate(entry for entry in INPATH.iterdir() if entry.is_file()))
logging.info("Generating JSON SRL files from {} tsv files.".format(len(infiles)))

# NOTE(review): the pool is started at module level without an
# `if __name__ == "__main__":` guard - confirm the script is only run on
# platforms that fork worker processes.
with Pool(CPU_CORES) as p:
    p.map(handle_file, infiles)

logging.info("Finished generating .json files.")

View File

@ -1,3 +1,8 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import pickle
from pathlib import Path
from parser.parser import Parser
import configparser
@ -9,9 +14,10 @@ from multiprocessing import Pool
# parse config
config = configparser.ConfigParser()
config.read("tools.cfg")
ORIGPATH = Path(config["tools"]["kres_orig"])
INPATH = Path(config["tools"]["kres_srl"])
OUTPATH = Path(config["tools"]["kres_json"])
ORIGPATH = Path(config["tools"]["giga"])
INPATH = Path(config["tools"]["giga_srl"])
OUTPATH = Path(config["tools"]["giga_json"])
INTERNAL_DATA = Path(config["tools"]["internal_data"])
DEBUG = config["tools"]["debug"] == "True"
CPU_CORES = int(config["tools"]["cpu_cores"])
@ -48,6 +54,13 @@ def match_sentence_id(sentence, orig_dict):
return k
raise KeyError
def match_sentence_id_giga(sentence, orig_dict):
    """Return the id of the original sentence whose ``"text"`` equals *sentence*.

    Args:
        sentence: text of the sentence to look up.
        orig_dict: mapping of sentence id to an entry with a ``"text"`` item.

    Returns:
        The key of the first entry whose text equals *sentence*.

    Raises:
        KeyError: if no entry matches; the unmatched sentence is attached
            so the failure can be traced.
    """
    for sid, entry in orig_dict.items():
        if sentence == entry["text"]:
            return sid
    raise KeyError(sentence)
def get_dep_rel(token):
logging.debug(token)
for i, field in enumerate(token[14:]):
@ -59,7 +72,7 @@ def get_dep_rel(token):
}
return None
def handle_file(infile_tpl):
def handle_file_old(infile_tpl):
i = infile_tpl[0]
infile = infile_tpl[1]
outfile = (OUTPATH / infile.name).with_suffix(".json")
@ -101,14 +114,283 @@ def handle_file(infile_tpl):
logging.info("SRL relations written to: {}".format(outfile))
def handle_file(whole_input):
    """Write the SRL relations of one original file to a ``.json`` file.

    Args:
        whole_input: sequence describing one original file; index 1 is its
            path and index 3 the running index of its first sentence.

    Nothing happens when the output file already exists. For every sentence
    of the original file up to 100 sentences are pulled from the SRL stream
    until one with the same text is found. If none is found the sentence
    gets an empty relation list and the stream is restarted.
    """
    sentence_id = whole_input[3]
    orig_infile = whole_input[1]

    outfile = (OUTPATH / orig_infile.name).with_suffix(".json")
    if outfile.exists():
        return

    orig_dict = par.parse_tei(orig_infile)
    outdata = {}
    gen = srl_multiple_files_sentences_generator(sentence_id)
    mismatch_sentences = 0

    for sentence_i, (orig_id, orig_val) in enumerate(orig_dict.items()):
        if orig_id == 'GF0014802.2685.7':
            print('PAUSE')

        # also look at neighbouring sentences of the stream
        for attempt in range(100):
            sentence, sentence_arr = next(gen)
            if sentence != orig_val["text"]:
                if attempt == 99:
                    # give up on this sentence and restart the stream
                    mismatch_sentences += 1
                    outdata[orig_id] = []
                    gen = srl_multiple_files_sentences_generator(sentence_id + sentence_i)
                continue

            relations = outdata[orig_id] = []

            # token indices of all predicates of the sentence, in order
            predicates = []
            for token in sentence_arr:
                if token[12] == "Y":
                    predicates.append(token[0])
                deprel = get_dep_rel(token)
                if deprel is not None:
                    relations.append(deprel)

            # "from" still holds the number of the predicate;
            # swap it for that predicate's token index
            for deprel in relations:
                deprel["from"] = predicates[deprel["from"]]

            if DEBUG:
                print(to_sentence(sentence_arr))
                print(relations)
                print(orig_id)
                print()
                print()
            break

    if mismatch_sentences > 0:
        if mismatch_sentences / len(orig_dict) < 0.1:
            print('Slight mismatch - %d' % sentence_id)
            print(whole_input)
            print('ABS mitigated %d' % mismatch_sentences)
            print('------------------------------------------------')
        else:
            print('ERRRRRRRRRRRRRRRROOOOOOORRRRRRRRRRR')
            print('Big mismatch - %d' % sentence_id)
            print(whole_input)
            print('ABS mitigated errors:')
            print(mismatch_sentences)
            print('------------------------------------------------')

    with outfile.open("w") as fp:
        json.dump(outdata, fp)
        logging.info("SRL relations written to: {}".format(outfile))
def count_orig_file_sentences(filename):
    """Cache the number of sentences of one original file as a pickle.

    Args:
        filename: ``(index, path)`` pair of the original file.

    The pickle holds ``(index, path, count)``, where count is the length of
    what ``par.parse_tei`` returns for the file. It is stored in the
    ``orig_chunks`` folder of ``INTERNAL_DATA`` under the file's own name;
    an existing pickle is left untouched.
    """
    file_path = filename[1]
    chunk_path = os.path.join(INTERNAL_DATA, 'orig_chunks', file_path.name)
    if os.path.exists(chunk_path):
        return
    print(filename[0])
    sentence_count = len(par.parse_tei(file_path))
    with open(chunk_path, 'wb') as output:
        pickle.dump((filename[0], file_path, sentence_count), output)
def count_srl_file_sentences(filename):
    """Cache the number of sentences of one SRL file as a pickle.

    Args:
        filename: ``(index, path)`` pair of the SRL file.

    The number of sentences is the number of lines that consist of a
    newline only. The pickle holds ``(index, path, count)`` and is stored
    in the ``srl_chunks`` folder of ``INTERNAL_DATA`` under the file's own
    name; an existing pickle is left untouched.
    """
    file_path = filename[1]
    chunk_path = os.path.join(INTERNAL_DATA, 'srl_chunks', file_path.name)
    if os.path.exists(chunk_path):
        return
    print(filename[0])
    with file_path.open("r") as fp:
        num_sentences = sum(1 for line in fp if line == '\n')
    with open(chunk_path, 'wb') as output:
        pickle.dump((filename[0], file_path, num_sentences), output)
def srl_sentences_generator(infile, curr_index, sen_start_index):
    """Yield the sentences of one SRL file from a given running index on.

    Args:
        infile: path object of the SRL file (opened in binary mode).
        curr_index: running index of the first sentence of *infile*.
        sen_start_index: running index of the first sentence to yield;
            earlier sentences are skipped.

    Yields:
        ``(text, rows)`` for every remaining sentence, then a single
        ``None`` marking the end of the file.
    """
    with infile.open("rb") as fp:
        for sentence_arr in extract_sentences(fp.readlines()):
            if curr_index >= sen_start_index:
                yield to_sentence(sentence_arr), sentence_arr
            else:
                # still before the requested start: skip and count
                curr_index += 1
    yield None
def srl_multiple_files_sentences_generator(sentence_id):  # srl_files):
    """Yield SRL sentences across files, starting shortly before *sentence_id*.

    The stream starts ten sentences before *sentence_id* (never below 0)
    and continues through all following entries of the module-level
    ``srl_file_sizes``. Each entry is read as ``[1]`` = file path,
    ``[2]`` = number of sentences, ``[3]`` = running index of its first
    sentence.

    Args:
        sentence_id: running index of the sentence of interest.

    Yields:
        ``(text, rows)`` for every sentence, then a single ``None`` once
        all files are exhausted. When no file contains the start index,
        only the ``None`` is yielded.
    """
    sentence_id = max(0, sentence_id - 10)
    # Default for a start index that lies in no file; without it the loop
    # below failed with UnboundLocalError.
    srl_files = []
    for i, srl_file in enumerate(srl_file_sizes):
        if srl_file[3] <= sentence_id < srl_file[3] + srl_file[2]:
            srl_files = srl_file_sizes[i:]
            break
    for file_info in srl_files:
        # srl_gen = srl_sentences_generator(file_info[1], file_info[3], file_info[4])
        srl_gen = srl_sentences_generator(file_info[1], file_info[3], sentence_id)
        # every per-file generator ends with None
        el = next(srl_gen)
        while el is not None:
            yield el
            el = next(srl_gen)
    yield None
# main
# The parser is created at module level because the worker functions read it
# as a global.
par = Parser()
OUTPATH.mkdir(exist_ok=True)
# NOTE(review): infiles is assigned twice in a row; the first value is
# overwritten immediately by the sorted variant below (this looks like the
# old and the new line of a diff shown together - confirm).
infiles = list(enumerate([x for x in INPATH.iterdir() if x.is_file()]))
infiles = list(enumerate([x for x in iter(sorted(INPATH.iterdir())) if x.is_file()]))
logging.info("Generating JSON SRL files from {} tsv files.".format(len(infiles)))
# Collect every file below ORIGPATH (recursively), sort the paths and pair
# each one with its position: origfiles becomes a list of (index, Path).
origfiles = []
for subdir, dirs, files in os.walk(ORIGPATH):
for file in files:
origfiles.append(Path(os.path.join(subdir, file)))
origfiles=list(enumerate(sorted(origfiles)))
##### REMOVE ############
# origfiles = origfiles[:3]
# count sentences in orig (if not counted before)
# os.remove(os.path.join(INTERNAL_DATA, 'orig_counted_sentences.pkl'))
# Load the table of original files from its pickle, or build it first.
# Every row is (index, path, number of sentences, running index of the
# file's first sentence).
if os.path.exists(os.path.join(INTERNAL_DATA, 'orig_counted_sentences.pkl')):
    with open(os.path.join(INTERNAL_DATA, 'orig_counted_sentences.pkl'), 'rb') as pkl_file:
        orig_file_sizes = pickle.load(pkl_file)
else:
    if not os.path.exists(os.path.join(INTERNAL_DATA, 'orig_chunks')):
        os.makedirs(os.path.join(INTERNAL_DATA, 'orig_chunks'))

    # one small pickle per original file (files counted earlier are skipped)
    for origfile in origfiles:
        count_orig_file_sentences(origfile)

    # read all the small pickles back
    orig_file_sizes = []
    for x in sorted(Path(os.path.join(INTERNAL_DATA, 'orig_chunks')).iterdir()):
        print(x.name)
        if not x.is_file():
            continue
        with x.open('rb') as pkl_small_file:
            orig_file_sizes.append(pickle.load(pkl_small_file))

    print("Sorting orig files")
    orig_file_sizes = sorted(orig_file_sizes)

    # append the running total of sentences that precede each file
    total_size = 0
    orig_file_sizes_final = []
    print("Calculating orig files size")
    for n, pa, si in orig_file_sizes:
        orig_file_sizes_final.append((n, pa, si, total_size))
        total_size += si
    orig_file_sizes = orig_file_sizes_final

    print("Saving orig files size")
    with open(os.path.join(INTERNAL_DATA, 'orig_counted_sentences.pkl'), 'wb') as output:
        pickle.dump(orig_file_sizes, output)
    print("Orig files saved")
# count sentences in srl (if not counted before)
# os.remove(os.path.join(INTERNAL_DATA, 'srl_counted_sentences.pkl'))
# Load the table of SRL files from its pickle, or build it first.
# Every row is (index, path, number of sentences, running index of the
# file's first sentence).
if os.path.exists(os.path.join(INTERNAL_DATA, 'srl_counted_sentences.pkl')):
    with open(os.path.join(INTERNAL_DATA, 'srl_counted_sentences.pkl'), 'rb') as pkl_file:
        srl_file_sizes = pickle.load(pkl_file)
else:
    if not os.path.exists(os.path.join(INTERNAL_DATA, 'srl_chunks')):
        os.makedirs(os.path.join(INTERNAL_DATA, 'srl_chunks'))

    # one small pickle per SRL file (files counted earlier are skipped)
    for infile in infiles:
        count_srl_file_sentences(infile)

    # read all the small pickles back
    srl_file_sizes = []
    for x in sorted(Path(os.path.join(INTERNAL_DATA, 'srl_chunks')).iterdir()):
        print(x.name)
        if not x.is_file():
            continue
        with x.open('rb') as pkl_small_file:
            srl_file_sizes.append(pickle.load(pkl_small_file))

    print("Sorting srl files")
    srl_file_sizes = sorted(srl_file_sizes)

    # append the running total of sentences that precede each file
    total_size = 0
    srl_file_sizes_final = []
    print("Calculating srl files size")
    for n, pa, si in srl_file_sizes:
        srl_file_sizes_final.append((n, pa, si, total_size))
        total_size += si
    srl_file_sizes = srl_file_sizes_final

    print("Saving srl files size")
    with open(os.path.join(INTERNAL_DATA, 'srl_counted_sentences.pkl'), 'wb') as output:
        pickle.dump(srl_file_sizes, output)
    print("Srl files saved")
# print(len(orig_file_sizes))
# print('asd' + 2)
# inputs = []
# srl_i = 0
# srl_file = srl_file_sizes[srl_i]
# for orig_i, orig_path, orig_size, orig_first_sent_i in orig_file_sizes:
# interesting_srl_files = []
# # beginning of srl chunk in range of orig chunk or ending of srl chunk in range of orig chunk
# # while srl_file[3] >= orig_first_sent_i and srl_file[3] < orig_first_sent_i + orig_size or \
# # srl_file[3] + srl_file[2] - 1 >= orig_first_sent_i and srl_file[3] + srl_file[2] - 1 < orig_first_sent_i + orig_size:
# while srl_file[3] < orig_first_sent_i + orig_size and srl_file[3] + srl_file[2] > orig_first_sent_i:
# # if beginning of file is in
# if srl_file[3] > orig_first_sent_i:
# interesting_srl_files.append((srl_file[0], srl_file[1], srl_file[2], srl_file[3], srl_file[3]))
# # print('if %d' % srl_file[3])
# else:
# interesting_srl_files.append((srl_file[0], srl_file[1], srl_file[2], srl_file[3], orig_first_sent_i))
# # print('else %d' % orig_first_sent_i)
#
# if orig_first_sent_i + orig_size >= srl_file[3] + srl_file[2]:
# srl_i += 1
# if srl_i < len(srl_file_sizes):
# srl_file = srl_file_sizes[srl_i]
# else:
# break
# # print(srl_i)
# # print('a ' + 2)
# else:
# break
#
# inputs.append([[orig_i, orig_path, orig_size, orig_first_sent_i], interesting_srl_files])
# print(inputs[-1])
# srl_gen = srl_sentences_generator(srl_file_sizes[0][1], 0, 533)
# a = next(srl_gen)
# b = next(srl_gen)
# c = next(srl_gen)
print('beginning processing')
# handle_file() reads index 1 (path) and index 3 (running index of the first
# sentence) of its argument, so it has to be fed the 4-item rows of
# orig_file_sizes. The former extra call with `infiles` passed (index, path)
# pairs and could only fail with IndexError, so it is gone.
with Pool(CPU_CORES) as p:
    # p.map(handle_file, inputs)
    p.map(handle_file, orig_file_sizes)

# for of in orig_file_sizes:
# handle_file(of)
logging.info("Finished generating .json files.")

View File

@ -0,0 +1,294 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import pickle
from pathlib import Path
from parser.parser import Parser
import configparser
import json
import sys
import logging
from multiprocessing import Pool
# parse config
# All settings come from the [tools] section of "tools.cfg", looked up
# relative to the current working directory.
config = configparser.ConfigParser()
config.read("tools.cfg")
ORIGPATH = Path(config["tools"]["giga"])
# INPATH is handed to srl_error_fix_generator() further down, which opens it
# as a single file.
INPATH = Path(config["tools"]["giga_srl_errors"])
OUTPATH = Path(config["tools"]["giga_json"])
INTERNAL_DATA = Path(config["tools"]["internal_data"])
# DEBUG is True only for the exact string "True" in the config file.
DEBUG = config["tools"]["debug"] == "True"
CPU_CORES = int(config["tools"]["cpu_cores"])
# Make sure the log file exists before logging is pointed at it.
LOGFILE = Path(config["tools"]["logfile"]).absolute()
LOGFILE.touch(exist_ok=True)
# The result of resolve() is discarded, so this call has no effect on LOGFILE.
LOGFILE.resolve()
logging.basicConfig(filename=str(LOGFILE), level=logging.INFO)
# Entries to repair, one per line of the text file. The file is read inside
# a context manager so the handle is closed again (it used to stay open).
with open(os.path.join(INTERNAL_DATA, 'sentences_with_less_than_token.txt')) as error_sentences_file:
    error_sentences = [line.rstrip('\n') for line in error_sentences_file]
def get_origfile(filename):
    """Find the original file that belongs to an SRL output file.

    Files are paired by the part of their name before the first dot.

    Args:
        filename: ``pathlib.Path`` of the SRL file.

    Returns:
        The first entry of ``ORIGPATH`` whose name has the same prefix.

    Raises:
        FileNotFoundError: if no entry of ``ORIGPATH`` matches; the message
            names the prefix and the directory that was searched.
    """
    wanted = filename.name.split('.')[0]  # loop-invariant, computed once
    for origfile in ORIGPATH.iterdir():
        if origfile.name.split('.')[0] == wanted:
            return origfile
    raise FileNotFoundError(
        "no original file with prefix '{}' in {}".format(wanted, ORIGPATH))
def extract_sentences(line_reader):
    """Group the raw lines of a tab-separated token file into sentences.

    Args:
        line_reader: iterable of UTF-8 encoded ``bytes`` lines, e.g. the
            result of ``readlines()`` on a file opened in binary mode.

    Yields:
        list: one list per sentence, holding the list of tab-separated
        fields of every token line. A sentence is emitted whenever a line
        without any tab (normally a blank line) is reached, so two such
        lines in a row yield an empty list, and token lines after the last
        blank line are not emitted.
    """
    acc = []
    # Lines are decoded one at a time instead of building a list of all of them.
    for raw_line in line_reader:
        # Strip only the line terminator. Slicing off the last character
        # unconditionally left a stray "\r" in the last field of CRLF files
        # and ate a real character of a final line without a newline.
        fields = raw_line.decode("utf-8").rstrip("\r\n").split('\t')
        if len(fields) == 1:  # empty line
            yield acc
            acc = []
        else:
            acc.append(fields)
def to_sentence(sentence_arr):
    """Return the text of a sentence.

    The text is the field at index 1 of every token row of *sentence_arr*,
    joined by single spaces.
    """
    second_fields = [fields[1] for fields in sentence_arr]
    return " ".join(second_fields)
def match_sentence_id(sentence, orig_dict):
    """Return the id of the original sentence whose text equals *sentence*.

    Args:
        sentence: space-joined text of the sentence to look up.
        orig_dict: mapping of sentence id to an entry with a ``"tokens"``
            sequence; the item at index 2 of every token is joined by
            spaces to rebuild the original text.

    Returns:
        The key of the first entry whose rebuilt text equals *sentence*.

    Raises:
        KeyError: if no entry matches; the unmatched sentence is attached
            so the failure can be traced.
    """
    for sid, entry in orig_dict.items():
        if sentence == " ".join(token[2] for token in entry["tokens"]):
            return sid
    raise KeyError(sentence)
def match_sentence_id_giga(sentence, orig_dict):
    """Return the id of the original sentence whose ``"text"`` equals *sentence*.

    Args:
        sentence: text of the sentence to look up.
        orig_dict: mapping of sentence id to an entry with a ``"text"`` item.

    Returns:
        The key of the first entry whose text equals *sentence*.

    Raises:
        KeyError: if no entry matches; the unmatched sentence is attached
            so the failure can be traced.
    """
    for sid, entry in orig_dict.items():
        if sentence == entry["text"]:
            return sid
    raise KeyError(sentence)
def get_dep_rel(token):
    """Build the relation described by one token row, if there is one.

    The fields from index 14 onwards are scanned; the first one that is not
    ``"_"`` defines the relation.

    Args:
        token: list of the fields of one token row.

    Returns:
        ``None`` when every scanned field is ``"_"`` (or there are none),
        otherwise a dict with the field value (``"arg"``), its position
        among the scanned fields (``"from"``) and ``token[0]`` (``"dep"``).
    """
    logging.debug(token)
    for position, label in enumerate(token[14:]):
        if label == "_":
            continue
        # position is the number of the predicate within the sentence
        return {"arg": label, "from": position, "dep": token[0]}
    return None
def handle_file_old(infile_tpl):
    """Convert one SRL ``.tsv`` file into a ``.json`` file of relations.

    Args:
        infile_tpl: ``(index, path)`` pair; only the path is used.

    The result, a mapping of sentence id to a list of relation dicts, is
    written to ``OUTPATH`` under the input's name with a ``.json`` suffix.
    """
    infile = infile_tpl[1]
    outfile = (OUTPATH / infile.name).with_suffix(".json")
    orig_dict = par.parse_tei(get_origfile(infile))

    outdata = {}
    with infile.open("rb") as fp:
        for sentence_arr in extract_sentences(fp.readlines()):
            sentence_text = to_sentence(sentence_arr)
            # the tsv carries no sentence ids; recover the id by matching
            # the text against the original data
            sid = match_sentence_id(sentence_text, orig_dict)
            relations = outdata[sid] = []

            # token indices of all predicates of the sentence, in order
            predicates = []
            for token in sentence_arr:
                if token[12] == "Y":
                    predicates.append(token[0])
                deprel = get_dep_rel(token)
                if deprel is not None:
                    relations.append(deprel)

            # "from" still holds the number of the predicate;
            # swap it for that predicate's token index
            for deprel in relations:
                deprel["from"] = predicates[deprel["from"]]

            if DEBUG:
                print(sentence_text)
                print(relations)
                print(sid)
                print()
                print()

    with outfile.open("w") as fp:
        json.dump(outdata, fp)
        logging.info("SRL relations written to: {}".format(outfile))
def fix_json(srl_gen, error_sentence, orig_json_data):
    """Rebuild the relations of one sentence from the next SRL sentence.

    Args:
        srl_gen: generator whose next item is a ``(text, rows)`` pair.
        error_sentence: key of the sentence to repair in *orig_json_data*.
        orig_json_data: mapping of sentence id to a list of relation dicts;
            it is modified in place.

    Returns:
        *orig_json_data*, with the list stored under *error_sentence*
        replaced by the relations found in the rows taken from *srl_gen*.
    """
    sentence, sentence_arr = next(srl_gen)

    sid = error_sentence
    # start from an empty list, whatever was stored before
    if orig_json_data[sid] != []:
        orig_json_data[sid] = []
    relations = orig_json_data[sid]

    # token indices of all predicates of the sentence, in order
    predicates = []
    for token in sentence_arr:
        if token[12] == "Y":
            predicates.append(token[0])
        deprel = get_dep_rel(token)
        if deprel is not None:
            relations.append(deprel)

    # "from" still holds the number of the predicate;
    # swap it for that predicate's token index
    for deprel in relations:
        deprel["from"] = predicates[deprel["from"]]

    if DEBUG:
        print(to_sentence(sentence_arr))
        print(relations)
        print(sid)
        print()
        print()
    return orig_json_data
def count_orig_file_sentences(filename):
    """Cache the number of sentences of one original file as a pickle.

    Args:
        filename: ``(index, path)`` pair of the original file.

    The pickle holds ``(index, path, count)``, where count is the length of
    what ``par.parse_tei`` returns for the file. It is stored in the
    ``orig_chunks`` folder of ``INTERNAL_DATA`` under the file's own name;
    an existing pickle is left untouched.
    """
    file_path = filename[1]
    chunk_path = os.path.join(INTERNAL_DATA, 'orig_chunks', file_path.name)
    if os.path.exists(chunk_path):
        return
    print(filename[0])
    sentence_count = len(par.parse_tei(file_path))
    with open(chunk_path, 'wb') as output:
        pickle.dump((filename[0], file_path, sentence_count), output)
def count_srl_file_sentences(filename):
    """Cache the number of sentences of one SRL file as a pickle.

    Args:
        filename: ``(index, path)`` pair of the SRL file.

    The number of sentences is the number of lines that consist of a
    newline only. The pickle holds ``(index, path, count)`` and is stored
    in the ``srl_chunks`` folder of ``INTERNAL_DATA`` under the file's own
    name; an existing pickle is left untouched.
    """
    file_path = filename[1]
    chunk_path = os.path.join(INTERNAL_DATA, 'srl_chunks', file_path.name)
    if os.path.exists(chunk_path):
        return
    print(filename[0])
    with file_path.open("r") as fp:
        num_sentences = sum(1 for line in fp if line == '\n')
    with open(chunk_path, 'wb') as output:
        pickle.dump((filename[0], file_path, num_sentences), output)
def srl_error_fix_generator(infile):
    """Yield every sentence of one SRL file, followed by ``None``.

    Args:
        infile: path object of the SRL file (opened in binary mode).

    Yields:
        ``(text, rows)`` for every sentence, then a single ``None`` marking
        the end of the file.
    """
    with infile.open("rb") as fp:
        raw_lines = fp.readlines()
        for rows in extract_sentences(raw_lines):
            text = to_sentence(rows)
            yield text, rows
    yield None
def srl_sentences_generator(infile, curr_index, sen_start_index):
    """Yield the sentences of one SRL file from a given running index on.

    Args:
        infile: path object of the SRL file (opened in binary mode).
        curr_index: running index of the first sentence of *infile*.
        sen_start_index: running index of the first sentence to yield;
            earlier sentences are skipped.

    Yields:
        ``(text, rows)`` for every remaining sentence, then a single
        ``None`` marking the end of the file.
    """
    with infile.open("rb") as fp:
        for sentence_arr in extract_sentences(fp.readlines()):
            if curr_index >= sen_start_index:
                yield to_sentence(sentence_arr), sentence_arr
            else:
                # still before the requested start: skip and count
                curr_index += 1
    yield None
def srl_multiple_files_sentences_generator(sentence_id):  # srl_files):
    """Yield SRL sentences across files, starting shortly before *sentence_id*.

    The stream starts ten sentences before *sentence_id* (never below 0)
    and continues through all following entries of the module-level
    ``srl_file_sizes``. Each entry is read as ``[1]`` = file path,
    ``[2]`` = number of sentences, ``[3]`` = running index of its first
    sentence.

    NOTE(review): ``srl_file_sizes`` does not appear to be assigned
    anywhere in this module - confirm before calling this function.

    Args:
        sentence_id: running index of the sentence of interest.

    Yields:
        ``(text, rows)`` for every sentence, then a single ``None`` once
        all files are exhausted. When no file contains the start index,
        only the ``None`` is yielded.
    """
    sentence_id = max(0, sentence_id - 10)
    # Default for a start index that lies in no file; without it the loop
    # below failed with UnboundLocalError.
    srl_files = []
    for i, srl_file in enumerate(srl_file_sizes):
        if srl_file[3] <= sentence_id < srl_file[3] + srl_file[2]:
            srl_files = srl_file_sizes[i:]
            break
    for file_info in srl_files:
        # srl_gen = srl_sentences_generator(file_info[1], file_info[3], file_info[4])
        srl_gen = srl_sentences_generator(file_info[1], file_info[3], sentence_id)
        # every per-file generator ends with None
        el = next(srl_gen)
        while el is not None:
            yield el
            el = next(srl_gen)
    yield None
# Group consecutive entries that share their first nine characters, which
# the fix loop further down uses as the name stem of the JSON file.
# `group` starts as an empty list: the former `False` start value broke on
# `.append` when the first entry had an empty prefix, and was appended to
# the result as-is when there were no entries at all.
error_sentences_grouped = []
group = []
prev_name = ''
for name in error_sentences:
    if group and name[:9] == prev_name:
        group.append(name)
    else:
        if group:
            error_sentences_grouped.append(group)
        prev_name = name[:9]
        group = [name]
# flush the last group, if any
if group:
    error_sentences_grouped.append(group)

# One shared stream over the sentences of INPATH; fix_json() takes the next
# sentence from it for every entry, in order.
srl_gen = srl_error_fix_generator(INPATH)
# find errors in json files:
# with open(os.path.join(INTERNAL_DATA, 'sentence_ids_list.pkl'), 'rb') as output:
# sentence_ids = pickle.load(output)
#
#
#
# origfiles = []
# for subdir, dirs, files in os.walk(OUTPATH):
# for file in files:
# origfiles.append(Path(os.path.join(subdir, file)))
# origfiles=sorted(origfiles)
#
#
#
# for sent in origfiles:
# # for sent in sentence_ids:
# # outfile = Path(OUTPATH, sent[:9] + '-dedup.json')
# outfile = sent
#
# try:
# with outfile.open() as json_file:
# json.load(json_file)
# pass
# except:
# print(outfile.name)
#
#
# raise Exception('test')
# iterate over all wronged sentences and fix them
for errors_in_file in error_sentences_grouped:
    # all entries of a group share their first nine characters, which name
    # the JSON file to patch
    outfile = Path(OUTPATH, errors_in_file[0][:9] + '-dedup.json')

    # load the existing relations of that file
    with outfile.open() as json_file:
        print(outfile.name)
        orig_json_data = json.load(json_file)

    # replace the relations of every listed sentence
    for error_sentence in errors_in_file:
        orig_json_data = fix_json(srl_gen, error_sentence, orig_json_data)

    # write the patched data back over the same file
    with outfile.open('w') as json_file:
        json.dump(orig_json_data, json_file)
        logging.info("SRL relations written to: {}".format(outfile))

View File

@ -1,3 +1,5 @@
import pickle
from parser.parser import Parser
import os
from os.path import join, dirname
@ -15,8 +17,21 @@ par = Parser()
# path to data
config = configparser.ConfigParser()
config.read("tools.cfg")
INDIR = Path(config["tools"]["kres_orig"])
OUTDIR = Path(config["tools"]["kres_tsv"])
analysis = ''
if 'kres_orig' in config["tools"]:
analysis = 'kres'
INDIR = Path(config["tools"]["kres_orig"])
OUTDIR = Path(config["tools"]["kres_tsv"])
elif 'giga_orig' in config["tools"]:
# analysis = 'gigafida'
analysis = 'giga'
INDIR_GIGA_ORIG = Path(config["tools"]["giga"])
INDIR_GIGA = Path(config["tools"]["giga_orig"])
INDIR_JOS = Path(config["tools"]["giga_jos"])
OUTDIR = Path(config["tools"]["giga_tsv"])
GIGA_PARTS = int(config["tools"]["giga_parts"])
INTERNAL_DATA = config["tools"]["internal_data"]
CPU_CORES = int(config["tools"]["cpu_cores"])
LOGFILE = Path(config["tools"]["logfile"]).absolute()
@ -36,8 +51,9 @@ print("end parsing ssj")
# kres_file = "../data/kres_example/F0019343.xml.parsed.xml"
OUTDIR.mkdir(exist_ok=True)
infiles = list(enumerate([x for x in INDIR.iterdir() if x.is_file()]))
logging.info("Parsing kres: {} files.".format(len(infiles)))
if analysis == 'kres':
infiles = list(enumerate([x for x in INDIR.iterdir() if x.is_file()]))
logging.info("Parsing kres: {} files.".format(len(infiles)))
def handle_file(infile):
i = infile[0]
@ -65,10 +81,297 @@ def handle_file(infile):
return True
return False
def giga_orig_generator():
    """Yield the lines of ``INDIR_GIGA``, collapsing runs of blank lines.

    A line consisting of a newline only is yielded when it follows a
    non-blank line (or starts the file); further blank lines directly
    after it are skipped. All other lines are yielded unchanged.
    """
    after_blank = False
    with open(INDIR_GIGA, 'r') as gof:
        for l_gof in gof:
            is_blank = l_gof == '\n'
            if is_blank and after_blank:
                # second or later blank line of a run
                continue
            after_blank = is_blank
            yield l_gof
def handle_gigafida_file():
"""
Split the big input into numbered part files ('giga%07d.tsv' in OUTDIR).

Lines of INDIR_JOS and of giga_orig_generator() are read in lockstep; every
sentence is converted with par.to_conll_2009_SRL() and appended to the
current part file. Parts only change on empty lines.

NOTE(review): the indentation of this listing was lost, so the nesting of
the resume logic below is an assumption - confirm against the repository.
"""
# with open(INDIR_GIGA, 'r') as gof:
# with open(INDIR_JOS, 'r') as gjf:
# for i, (l_gof, l_gjf) in enumerate(zip(gof, gjf)):
# pass
# num_lines = i + 1
# print(num_lines)
# Hard-coded total number of lines; only used to derive the part size.
num_lines = 1393184026
# 1393184026
# 1393184033
# return
# True division: the part size is a float and only used in comparisons.
num_lines_per_part = num_lines / GIGA_PARTS
curr_part = 0
gof_generator = giga_orig_generator()
# with open(INDIR_GIGA, 'r') as gof:
with open(INDIR_JOS, 'r') as gjf:
# Sentence under construction: its tokens and its links keyed by token id.
sentence = {}
sentence['tokens'] = []
sentence['links'] = {}
# If part 0 already exists, start in "skip" mode (presumably resuming an
# earlier run); otherwise open part 0 for appending.
if os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % 0)):
ignore_lines = True
wf = False
else:
wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part), 'a')
ignore_lines = False
# for i, (l_gof, l_gjf) in enumerate(zip(gof, gjf)):
for i, l_gjf in enumerate(gjf):
# next() raises StopIteration if the generator runs out before INDIR_JOS does
l_gof = next(gof_generator)
if ignore_lines:
# Skip mode: advance curr_part at part boundaries (an empty line past the
# part's line budget) until a part file two ahead is missing.
if i > num_lines_per_part * curr_part and l_gof == '\n':
if not os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % (curr_part + 2))):
ignore_lines = False
# delete last file (probably not whole)
os.remove(os.path.join(OUTDIR, 'giga%07d.tsv' % (curr_part + 1)))
if ignore_lines:
print(curr_part)
curr_part += 1
continue
else:
continue
l_gof_split = l_gof.split('\t')
l_gjf_split = l_gjf.split('\t')
# Non-empty line: add one token. A second field ending in 'u' is stored as
# a 'c' token (punctuation, per the original comment), anything else as 'w'.
if l_gof != '\n':
if l_gof_split[1][-1] == 'u':
# print(l_gjf_split)
sentence['tokens'].append(('c', l_gjf_split[0], l_gjf_split[1]))
else:
sentence['tokens'].append(('w', l_gjf_split[0], l_gof_split[0], l_gof_split[1][:-2], l_gof_split[3][:-1]))
sentence['links'][l_gjf_split[0]] = (l_gjf_split[7], l_gjf_split[0], l_gjf_split[6])
# Empty line: the sentence is complete.
else:
# wf is False while no part file is open
if wf:
# print(i)
wf.write(par.to_conll_2009_SRL(sentence))
sentence['tokens'] = []
sentence['links'] = {}
# wf.flush()
# if i > num_lines_per_part * (curr_part + 1) and l_gof == '\n':
# Past the line budget of the current part: switch to the next part file.
if i > num_lines_per_part * (curr_part + 1):
curr_part += 1
# if wf doesn't exist (first one)
if wf:
wf.close()
wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part), 'a')
curr_part += 1
# NOTE(review): wf can still be False here if skip mode never ended, in
# which case close() would fail - confirm.
wf.close()
import time
def handle_giga_file(ran):
"""
Write CoNLL-2009 SRL files for the giga parts selected by ``ran``.

Walks INDIR_JOS line by line and pulls one line per step from
giga_orig_generator(), so both inputs are consumed in lockstep. Lines that
belong to parts before ran[0] are skipped. For later parts that are listed
in the module-level file_indices, token rows are collected into a sentence
and, on each blank giga line, written with par.to_conll_2009_SRL() to
OUTDIR/giga%07d.tsv (an existing file of that name is removed first).

NOTE(review): this view of the file carries no indentation, so the nesting
described in the comments below is inferred from statement order - confirm
against the real file.

:param ran: indexable pair; ran[0] is the first part to process and the
    loop stops once the part counter reaches ran[1].
"""
# Total line count of the input, hard-coded rather than recounted on each run.
num_lines = 1393184026
# 1393184026
# 1393184033
# Line quota per output part (true division, so this is a float).
num_lines_per_part = num_lines / GIGA_PARTS
curr_part = 0
gof_generator = giga_orig_generator()
with open(INDIR_JOS, 'r') as gjf:
# Sentence buffer handed to par.to_conll_2009_SRL(): collected token tuples
# plus a links dict keyed by the first JOS column.
sentence = {}
sentence['tokens'] = []
sentence['links'] = {}
wf = None
# NOTE(review): curr_part is still 0 here, so this asks whether part 0 is in
# file_indices while the file being recreated is the one for ran[0] -
# confirm this is intended (the test looks like it should be on ran[0]).
if curr_part in file_indices:
if os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % ran[0])):
os.remove(os.path.join(OUTDIR, 'giga%07d.tsv' % ran[0]))
wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % ran[0]), 'a')
for i, l_gjf in enumerate(gjf):
# Advance the giga stream by one line per JOS line to keep both inputs aligned.
l_gof = next(gof_generator)
# Fast-forward: nothing is parsed until the part counter reaches ran[0].
if curr_part < ran[0]:
# A part ends at the first blank giga line past its line quota.
if i > num_lines_per_part * (curr_part + 1) and l_gof == '\n' :
if curr_part < ran[0]:
print(curr_part)
curr_part += 1
continue
else:
continue
l_gof_split = l_gof.split('\t')
l_gjf_split = l_gjf.split('\t')
# Non-blank giga line: one token row.
if l_gof != '\n':
# Parts that are not selected in file_indices are read but not converted.
if curr_part not in file_indices:
continue
# A second giga column ending in 'u' is treated as punctuation and stored
# as a 'c' token (presumably the column is an MSD tag - confirm).
if l_gof_split[1][-1] == 'u':
sentence['tokens'].append(('c', l_gjf_split[0], l_gjf_split[1]))
else:
# Word token: JOS column 0, giga column 0, giga column 1 without its last
# two characters, giga column 3 without its last character.
sentence['tokens'].append(('w', l_gjf_split[0], l_gof_split[0], l_gof_split[1][:-2], l_gof_split[3][:-1]))
# Link tuple built from JOS columns 7, 0 and 6, keyed by JOS column 0.
sentence['links'][l_gjf_split[0]] = (l_gjf_split[7], l_gjf_split[0], l_gjf_split[6])
# Blank giga line: sentence boundary.
else:
if curr_part in file_indices:
wf.write(par.to_conll_2009_SRL(sentence))
sentence['tokens'] = []
sentence['links'] = {}
# Past the line quota of the current part: move on to the next part. This
# test sits after a sentence has been flushed, so parts change between
# sentences.
if i > num_lines_per_part * (curr_part + 1):
curr_part += 1
# NOTE(review): this runs after the increment, so the previous file is
# closed only when the NEW part is in file_indices - confirm no handle is
# left open when the new part is not selected.
if curr_part in file_indices and wf:
wf.close()
if curr_part >= ran[1]:
break
if curr_part in file_indices:
if os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part)):
os.remove(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part))
wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part), 'a')
curr_part += 1
# NOTE(review): wf is still None if no output file was ever opened in this
# call - confirm this close cannot raise AttributeError.
wf.close()
def handle_giga_file_selected_sentences(error_sentences):
"""
Re-export only the sentences whose ids are in ``error_sentences``.

Reads INDIR_JOS in lockstep with giga_orig_generator() and counts blank JOS
lines to keep a running sentence number. That number indexes the list
unpickled from INTERNAL_DATA/sentence_ids_list.pkl to obtain the sentence
id. Sentences whose id is in error_sentences are converted with
par.to_conll_2009_SRL() and appended to OUTDIR/giga_errors, which is
removed and recreated on every call.

NOTE(review): this view of the file carries no indentation, so the nesting
described in the comments below is inferred from statement order - confirm
against the real file.

:param error_sentences: container of sentence ids; only membership tests
    (``in``) are applied to it.
"""
# NOTE(review): num_lines is not read by any active statement in this
# function - confirm it can be dropped.
num_lines = 1393222523
# 1393184026
# 1393184033
gof_generator = giga_orig_generator()
with open(INDIR_JOS, 'r') as gjf:
# Sentence buffer handed to par.to_conll_2009_SRL(): collected token tuples
# plus a links dict keyed by the first JOS column.
sentence = {}
sentence['tokens'] = []
sentence['links'] = {}
wf = None
# Start from an empty output file on every call.
if os.path.exists(os.path.join(OUTDIR, 'giga_errors')):
os.remove(os.path.join(OUTDIR, 'giga_errors'))
wf = open(os.path.join(OUTDIR, 'giga_errors'), 'a')
# NOTE(review): pickle.load can execute arbitrary code from the file; only
# use this with a trusted sentence_ids_list.pkl.
with open(os.path.join(INTERNAL_DATA, 'sentence_ids_list.pkl'), 'rb') as pkl_file:
sentence_ids_list = pickle.load(pkl_file)
# Running index into sentence_ids_list; the first sentence is skipped unless
# its id was requested.
sentence_id = 0
skip_sentence = not sentence_ids_list[sentence_id] in error_sentences
for i, l_gjf in enumerate(gjf):
# Advance the giga stream by one line per JOS line to keep both inputs aligned.
l_gof = next(gof_generator)
# Blank JOS line: sentence boundary.
if l_gjf == '\n':
if not skip_sentence:
wf.write(par.to_conll_2009_SRL(sentence))
sentence['tokens'] = []
sentence['links'] = {}
sentence_id += 1
# NOTE(review): raises IndexError if the input has more blank lines than
# sentence_ids_list has entries (for example a blank line after the last
# sentence) - confirm against the data.
if sentence_ids_list[sentence_id] in error_sentences:
print(sentence_ids_list[sentence_id])
skip_sentence = False
else:
skip_sentence = True
# Rows of sentences that were not requested are read but not parsed.
if skip_sentence:
continue
l_gof_split = l_gof.split('\t')
l_gjf_split = l_gjf.split('\t')
# Non-blank giga line: one token row. A boundary row of a requested sentence
# also reaches this point; it is ignored only if the giga line is blank as
# well (assumes both inputs are blank on the same lines - confirm).
if l_gof != '\n':
# A second giga column ending in 'u' is treated as punctuation and stored
# as a 'c' token (presumably the column is an MSD tag - confirm).
if l_gof_split[1][-1] == 'u':
sentence['tokens'].append(('c', l_gjf_split[0], l_gjf_split[1]))
else:
# Word token: JOS column 0, giga column 0, giga column 1 without its last
# two characters, giga column 3 without its last character.
sentence['tokens'].append(('w', l_gjf_split[0], l_gof_split[0], l_gof_split[1][:-2], l_gof_split[3][:-1]))
# Link tuple built from JOS columns 7, 0 and 6, keyed by JOS column 0.
sentence['links'][l_gjf_split[0]] = (l_gjf_split[7], l_gjf_split[0], l_gjf_split[6])
wf.close()
# Part indices that the giga handlers are allowed to (re)write. The full range
# assigned first is replaced right away by the indices unpickled from
# diffs_updated_gigafida.pkl.
# NOTE(review): pickle.load can execute arbitrary code from the file; only use
# this with a trusted pickle.
file_indices = set(range(0, 100000))
with open(os.path.join(INTERNAL_DATA, 'diffs_updated_gigafida.pkl'), 'rb') as pkl_file:
file_indices = set(pickle.load(pkl_file))
with Pool(CPU_CORES) as p:
# NOTE(review): this text comes from a diff view; the unconditional p.map
# below and the one inside the 'kres' branch may be the old and new version
# of the same line - confirm only one of them exists in the real file.
p.map(handle_file, infiles)
if analysis == 'kres':
p.map(handle_file, infiles)
# NOTE(review): in the visible part of the file `analysis` is only ever set to
# '', 'kres' or 'giga', so this branch looks unreachable - confirm.
elif analysis == 'gigafida':
handle_gigafida_file()
elif analysis == 'giga':
# Split the part range [0, 100000) into CPU_CORES consecutive [start, end]
# pairs, one per worker.
final_range = [0, 100000]
size_per_proc = (final_range[1] - final_range[0]) / CPU_CORES
ranges = []
ps = None
for i in range(CPU_CORES):
s = int(final_range[0] + size_per_proc * i)
ns = int(final_range[0] + size_per_proc * (i + 1))
ranges.append([s, ns])
# The per-range fan-out is currently disabled, so `ranges` is built but not
# consumed; only the selected-sentences export below runs, in this process.
# ranges = [[0, 1]]
# p.map(handle_giga_file, ranges)
# One sentence id per line; the trailing newline is stripped before the ids
# are passed on as a set.
error_sentences = [line.rstrip('\n') for line in open(os.path.join(INTERNAL_DATA, 'sentences_with_less_than_token.txt'))]
handle_giga_file_selected_sentences(set(error_sentences))
logging.info("end parsing kres")

View File

@ -57,7 +57,10 @@ class Parser:
divs = [] # in ssj, there are divs, in Kres, there are separate files
if "id" in root.keys():
# Kres files start with <TEI id=...>
guess_corpus = "KRES"
if root.get("id")[0:2] == 'GF':
guess_corpus = "GIGA"
else:
guess_corpus = "KRES"
divs = [root]
else:
guess_corpus = "SSJ"
@ -65,7 +68,10 @@ class Parser:
# parse divs
for div in divs:
f_id = div.get("id")
f_id = div.get("id")[:-6]
if guess_corpus == "GIGA":
div = div.findall(".//body")[0]
# parse paragraphs
for p in div.findall(".//p"):
@ -75,46 +81,62 @@ class Parser:
for s in p.findall(".//s"):
s_id = s.get("id").split(".")[-1]
sentence_text = ""
sentence_list = []
sentence_tokens = []
# parse tokens
for el in s.iter():
if el.tag in self.W_TAGS:
el_id = el.get("id").split(".")[-1]
if el_id[0] == 't':
el_id = el_id[1:] # ssj W_TAG ids start with t
sentence_text += el.text
sentence_tokens += [(
"w",
int(el_id),
el.text,
el.get("lemma"),
(el.get("msd") if guess_corpus == "KRES"
else el.get("ana").split(":")[-1]),
)]
if guess_corpus != "GIGA":
el_id = el.get("id").split(".")[-1]
if el_id[0] == 't':
el_id = el_id[1:] # ssj W_TAG ids start with t
sentence_text += el.text
sentence_tokens += [(
"w",
int(el_id),
el.text,
el.get("lemma"),
(el.get("msd") if guess_corpus == "KRES" or guess_corpus == "GIGA"
else el.get("ana").split(":")[-1]),
)]
else:
sentence_list.append(el.text)
elif el.tag in self.C_TAGS:
# only Kres' C_TAGS have ids
el_id = el.get("id") or "none"
el_id = el_id.split(".")[-1]
sentence_text += el.text
sentence_tokens += [("c", el_id, el.text,)]
if guess_corpus != "GIGA":
el_id = el.get("id") or "none"
el_id = el_id.split(".")[-1]
sentence_text += el.text
sentence_tokens += [("c", el_id, el.text,)]
elif el.tag in self.S_TAGS:
# Kres' <S /> doesn't contain .text
sentence_text += " "
if guess_corpus == "GIGA":
sentence_list.append(el.text)
else:
sentence_text += " "
else:
# pass links and linkGroups
pass
sentence_id = "{}.{}.{}".format(f_id, p_id, s_id)
if sentence_id in res_dict:
raise KeyError("duplicated id: {}".format(sentence_id))
res_dict[sentence_id] = {
"sid": sentence_id,
"text": sentence_text,
"tokens": sentence_tokens,
"links": (
parse_links(s) if guess_corpus == "KRES" else None
)
}
if guess_corpus == "GIGA":
res_dict[sentence_id] = {
"sid": sentence_id,
"text": ' '.join(sentence_list),
"tokens": None,
"links": None
}
else:
res_dict[sentence_id] = {
"sid": sentence_id,
"