Compare commits
1 Commits
Author | SHA1 | Date |
---|---|---|
Luka | c1ecc4cdbc | 2 years ago |
@ -0,0 +1,19 @@
|
|||||||
|
# Copy every Gigafida TSV chunk that has no SRL-tagged counterpart yet into
# OUTPATH so that the tagger can be re-run on just those chunks.
import os
from shutil import copyfile

SOURCEPATH = '/media/luka/Portable Disk/Datasets/gigafida_jos/gf2_files'
INPATH = '/media/luka/Portable Disk/Datasets/gigafida_jos/2_srl'
OUTPATH = '/home/luka/Development/srl/cjvt-srl-tagging/data/giga_out/1_tsv'

# Number of chunk indices that are probed (giga0000000 .. giga0099999).
NUM_CHUNKS = 100000

# copyfile() does not create missing directories.
os.makedirs(OUTPATH, exist_ok=True)

for i in range(NUM_CHUNKS):
    # A chunk counts as "done" when its tagged file exists in INPATH.
    if not os.path.exists(os.path.join(INPATH, 'giga%07d.srl.tsv' % i)):
        # Source files are named 'giga.NNNNNNN.tsv'; the copy drops the dot
        # after 'giga'.
        copyfile(os.path.join(SOURCEPATH, 'giga.%07d.tsv' % i),
                 os.path.join(OUTPATH, 'giga%07d.tsv' % i))
        print('giga%07d.srl.tsv' % i)

    # progress indicator
    if i % 1000 == 0:
        print(i)
|
@ -0,0 +1,192 @@
|
|||||||
|
import pickle
|
||||||
|
|
||||||
|
from parser.parser import Parser
|
||||||
|
import os
|
||||||
|
from os.path import join, dirname
|
||||||
|
from pathlib import Path
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
import cProfile
|
||||||
|
import configparser
|
||||||
|
import logging
|
||||||
|
from multiprocessing import Pool
|
||||||
|
|
||||||
|
SSJ500K_2_1 = 27829  # number of sentences
par = Parser()

# path to data
config = configparser.ConfigParser()
config.read("tools.cfg")
analysis = ''
if 'kres_orig' in config["tools"]:
    analysis = 'kres'
    INDIR = Path(config["tools"]["kres_orig"])
    OUTDIR = Path(config["tools"]["kres_tsv"])
elif 'giga_orig' in config["tools"]:
    # analysis = 'gigafida'
    analysis = 'giga'
    INDIR_GIGA = Path(config["tools"]["giga_orig"])
    INDIR_GIGA_ORIG = Path(config["tools"]["giga"])
    INDIR_JOS = Path(config["tools"]["giga_jos"])
    OUTDIR = Path(config["tools"]["giga_tsv"])
    GIGA_PARTS = int(config["tools"]["giga_parts"])
    INTERNAL_DATA = config["tools"]["internal_data"]

CPU_CORES = int(config["tools"]["cpu_cores"])

LOGFILE = Path(config["tools"]["logfile"]).absolute()
LOGFILE.touch(exist_ok=True)
# resolve() returns a new path; keep it instead of discarding the result
LOGFILE = LOGFILE.resolve()

logging.basicConfig(filename=str(LOGFILE), level=logging.INFO)

# Everything from here on reads the giga-only settings (INDIR_GIGA_ORIG,
# INDIR_GIGA, INTERNAL_DATA); stop with a clear message instead of running
# into a NameError further down.
if analysis != 'giga':
    raise SystemExit("tools.cfg has no usable giga_* settings for this script")

# Collect every file below the original corpus directory, in sorted order,
# paired with its running index.
origfiles = []
for subdir, dirs, files in os.walk(INDIR_GIGA_ORIG):
    for file in files:
        origfiles.append(Path(os.path.join(subdir, file)))
origfiles = list(enumerate(sorted(origfiles)))
|
||||||
|
|
||||||
|
def giga_orig_sentence_generator(path=None):
    """Yield one space-joined sentence per blank-line-delimited group of rows.

    Each non-blank line contributes the text before its first tab; a line
    that is exactly a newline closes the current sentence.

    Args:
        path: file to read. Defaults to the module-level ``INDIR_GIGA``.

    Yields:
        The words of one sentence joined by single spaces.
    """
    if path is None:
        path = INDIR_GIGA
    # NOTE(review): explicit UTF-8 instead of the locale default - confirm
    # the corpus encoding.
    with open(path, 'r', encoding='utf-8') as gof:
        sentence_words = []
        for l_gof in gof:
            if l_gof == '\n':
                yield ' '.join(sentence_words)
                sentence_words = []
            else:
                sentence_words.append(l_gof.split('\t')[0])
        # A file that does not end with a blank line still has a pending
        # sentence at this point; do not drop it.
        if sentence_words:
            yield ' '.join(sentence_words)
|
||||||
|
|
||||||
|
sentence_generator = giga_orig_sentence_generator()

# Walk the original files and the flat sentence stream in lockstep. When the
# texts agree, remember the sentence id; otherwise print the mismatch.
sentence_ids = []
for file_index, origfile_path in origfiles:
    split_file_sentences = par.parse_tei(origfile_path)
    for v in split_file_sentences.values():
        one_file_sentence = next(sentence_generator)
        if one_file_sentence == v['text']:
            sentence_ids.append(v['sid'])
        else:
            print('----------------')
            print('ERROR')
            print(v['sid'])
            print(one_file_sentence)
            print(v['text'])
            print(file_index)

# Persist the matched ids. Opening with 'wb' truncates an existing file, so
# no separate exists()/remove() step is needed.
with open(os.path.join(INTERNAL_DATA, 'sentence_ids_list.pkl'), 'wb') as output:
    pickle.dump(sentence_ids, output)
|
||||||
|
|
||||||
|
# def giga_orig_generator():
|
||||||
|
# with open(INDIR_GIGA, 'r') as gof:
|
||||||
|
# previous_new_line = False
|
||||||
|
# for l_gof in gof:
|
||||||
|
# if l_gof == '\n':
|
||||||
|
# if previous_new_line:
|
||||||
|
# continue
|
||||||
|
# previous_new_line = True
|
||||||
|
# elif previous_new_line:
|
||||||
|
# previous_new_line = False
|
||||||
|
# yield l_gof
|
||||||
|
|
||||||
|
# import time
|
||||||
|
# def handle_giga_file(ran):
|
||||||
|
# """
|
||||||
|
# File that splits big text file into more minor files. Only split on empty lines.
|
||||||
|
# """
|
||||||
|
# # with open(INDIR_GIGA, 'r') as gof:
|
||||||
|
# # with open(INDIR_JOS, 'r') as gjf:
|
||||||
|
# # for i, (l_gof, l_gjf) in enumerate(zip(gof, gjf)):
|
||||||
|
# # pass
|
||||||
|
# # num_lines = i + 1
|
||||||
|
# # print(num_lines)
|
||||||
|
# num_lines = 1393184026
|
||||||
|
# # 1393184026
|
||||||
|
# # 1393184033
|
||||||
|
# # return
|
||||||
|
# num_lines_per_part = num_lines / GIGA_PARTS
|
||||||
|
# curr_part = 0
|
||||||
|
# gof_generator = giga_orig_generator()
|
||||||
|
#
|
||||||
|
# diff_files = set()
|
||||||
|
# # with open(INDIR_GIGA, 'r') as gof:
|
||||||
|
# with open(INDIR_GIGA_OLD, 'r') as gjf:
|
||||||
|
# # sentence = {}
|
||||||
|
# # sentence['tokens'] = []
|
||||||
|
# # sentence['links'] = {}
|
||||||
|
# # if os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % ran[0])):
|
||||||
|
# # os.remove(os.path.join(OUTDIR, 'giga%07d.tsv' % ran[0]))
|
||||||
|
#
|
||||||
|
# # wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % ran[0]), 'a')
|
||||||
|
#
|
||||||
|
# # for i, (l_gof, l_gjf) in enumerate(zip(gof, gjf)):
|
||||||
|
# for i, l_gjf in enumerate(gjf):
|
||||||
|
# l_gof = next(gof_generator)
|
||||||
|
# if curr_part < ran[0]:
|
||||||
|
# if i > num_lines_per_part * (curr_part + 1) and l_gof == '\n' :
|
||||||
|
# if curr_part < ran[0]:
|
||||||
|
# print(curr_part)
|
||||||
|
# curr_part += 1
|
||||||
|
# continue
|
||||||
|
# else:
|
||||||
|
# continue
|
||||||
|
#
|
||||||
|
# l_gof_split = l_gof.split('\t')
|
||||||
|
# l_gjf_split = l_gjf.split('\t')
|
||||||
|
#
|
||||||
|
# # if punctuation
|
||||||
|
# if l_gof != '\n':
|
||||||
|
# if l_gof_split != l_gjf_split:
|
||||||
|
# print(curr_part)
|
||||||
|
# diff_files.add(curr_part)
|
||||||
|
# l_gof = next(gof_generator)
|
||||||
|
#
|
||||||
|
#
|
||||||
|
# # if l_gof == '\n':
|
||||||
|
# else:
|
||||||
|
# # wf.flush()
|
||||||
|
# # if i > num_lines_per_part * (curr_part + 1) and l_gof == '\n':
|
||||||
|
# if i > num_lines_per_part * (curr_part + 1):
|
||||||
|
# curr_part += 1
|
||||||
|
# # if wf doesn't exist (first one)
|
||||||
|
# # wf.close()
|
||||||
|
# if curr_part >= ran[1]:
|
||||||
|
# break
|
||||||
|
# # if os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part)):
|
||||||
|
# # os.remove(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part))
|
||||||
|
#
|
||||||
|
# # wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part), 'a')
|
||||||
|
#
|
||||||
|
# curr_part += 1
|
||||||
|
# return diff_files
|
||||||
|
# # wf.close()
|
||||||
|
#
|
||||||
|
# with Pool(CPU_CORES) as p:
|
||||||
|
# final_range = [0, 100000]
|
||||||
|
# # final_range = [0, 150]
|
||||||
|
# # size_per_proc = (final_range[1] - final_range[0]) / CPU_CORES
|
||||||
|
# # # splits = [int(final_range[0] + size_per_proc) for i in range(CPU_CORES)]
|
||||||
|
# # ranges = []
|
||||||
|
# # ps = None
|
||||||
|
# # for i in range(CPU_CORES):
|
||||||
|
# # s = int(final_range[0] + size_per_proc * i)
|
||||||
|
# # ns = int(final_range[0] + size_per_proc * (i + 1))
|
||||||
|
# # ranges.append([s, ns])
|
||||||
|
# # # ranges = [[0, 1]]
|
||||||
|
# # res = p.map(handle_giga_file, ranges)
|
||||||
|
#
|
||||||
|
# res = handle_giga_file(final_range)
|
||||||
|
# res = sorted(list(res))
|
||||||
|
# if os.path.exists(os.path.join(INTERNAL_DATA, 'diffs_updated_gigafida.pkl')):
|
||||||
|
# os.remove(os.path.join(INTERNAL_DATA, 'diffs_updated_gigafida.pkl'))
|
||||||
|
# with open(os.path.join(INTERNAL_DATA, 'diffs_updated_gigafida.pkl'), 'wb') as pkl_file:
|
||||||
|
# pickle.dump(res, pkl_file)
|
||||||
|
# # with open(os.path.join(INTERNAL_DATA, 'diffs_updated_gigafida.pkl'), 'rb') as pkl_file:
|
||||||
|
# # mydict2 = pickle.load(pkl_file)
|
||||||
|
# print('test')
|
@ -0,0 +1,114 @@
|
|||||||
|
from pathlib import Path
|
||||||
|
from parser.parser import Parser
|
||||||
|
import configparser
|
||||||
|
import json
|
||||||
|
import sys
|
||||||
|
import logging
|
||||||
|
from multiprocessing import Pool
|
||||||
|
|
||||||
|
# parse config
config = configparser.ConfigParser()
config.read("tools.cfg")
# get_origfile() iterates ORIGPATH, so the name has to exist at module level.
# NOTE(review): restored from the commented-out line; confirm "kres_orig" is
# the intended key for the config this script is run with.
ORIGPATH = Path(config["tools"]["kres_orig"])
INPATH = Path(config["tools"]["giga_srl"])
OUTPATH = Path(config["tools"]["kres_json"])
DEBUG = config["tools"]["debug"] == "True"
CPU_CORES = int(config["tools"]["cpu_cores"])

LOGFILE = Path(config["tools"]["logfile"]).absolute()
LOGFILE.touch(exist_ok=True)
# resolve() returns a new path; keep it instead of discarding the result
LOGFILE = LOGFILE.resolve()

logging.basicConfig(filename=str(LOGFILE), level=logging.INFO)
|
||||||
|
|
||||||
|
def get_origfile(filename):
    """Return the entry of ORIGPATH whose name matches *filename* up to the first dot.

    Args:
        filename: path-like object with a ``name`` attribute.

    Raises:
        FileNotFoundError: no entry in ORIGPATH shares the leading name part.
    """
    wanted = filename.name.split('.')[0]
    for origfile in ORIGPATH.iterdir():
        if origfile.name.split('.')[0] == wanted:
            return origfile
    raise FileNotFoundError(
        "no original file matching '{}' in {}".format(wanted, ORIGPATH))
|
||||||
|
|
||||||
|
def extract_sentences(line_reader):
    """Group tab-separated byte lines into sentences.

    Args:
        line_reader: iterable of ``bytes`` lines (UTF-8 encoded).

    Yields:
        One list per sentence; each element is the list of tab-separated
        fields of one line. A line with a single field (e.g. an empty line)
        ends the current sentence.
    """
    acc = []
    for raw_line in line_reader:
        # Strip only the line terminator; slicing off the last character
        # would eat real data on a final line without a newline.
        line = raw_line.decode("utf-8").rstrip("\n").split('\t')
        if len(line) == 1:  # empty line
            yield acc
            acc = []
        else:
            acc.append(line)
    # Input that does not end with an empty line still has a pending sentence.
    if acc:
        yield acc
|
||||||
|
|
||||||
|
def to_sentence(sentence_arr):
    """Return the second field of every token row, joined by single spaces."""
    return " ".join(token[1] for token in sentence_arr)
|
||||||
|
|
||||||
|
def match_sentence_id(sentence, orig_dict):
    """Return the key of the first entry whose tokens spell out *sentence*.

    The candidate text of an entry is the third element of every item in
    ``entry["tokens"]``, joined by single spaces.

    Raises:
        KeyError: no entry matches; the sentence is the exception argument.
    """
    for k, e in orig_dict.items():
        orig_sentence = " ".join(token[2] for token in e["tokens"])
        if sentence == orig_sentence:
            return k
    raise KeyError(sentence)
|
||||||
|
|
||||||
|
def get_dep_rel(token):
    """Return the first filled argument column of a token row, or None.

    Columns from index 14 onwards are scanned; the first one that is not
    "_" is reported.

    Returns:
        ``{"arg": <column value>, "from": <column offset>, "dep": token[0]}``
        or ``None`` when every scanned column is "_".
    """
    logging.debug(token)
    for i, field in enumerate(token[14:]):
        if field != "_":
            return {
                "arg": field,
                "from": i,  # i-th predicate in sentence
                "dep": token[0],
            }
    return None
|
||||||
|
|
||||||
|
def handle_file(infile_tpl):
    """Convert one SRL-tagged tsv file into a JSON file of relations per sentence.

    Args:
        infile_tpl: ``(index, path)`` pair; only the path is used.

    The result maps each sentence id to a list of
    ``{"arg", "from", "dep"}`` dicts and is written to OUTPATH under the
    input file's name with a ".json" suffix.
    """
    infile = infile_tpl[1]
    outfile = (OUTPATH / infile.name).with_suffix(".json")
    origfile = get_origfile(infile)
    orig_dict = par.parse_tei(origfile)

    outdata = {}
    with infile.open("rb") as fp:
        for sentence_arr in extract_sentences(fp.readlines()):
            # tsv dropped sentence ids, match the ID, using original data
            sid = match_sentence_id(to_sentence(sentence_arr), orig_dict)

            outdata[sid] = []

            # find all predicate indices in the sentence
            predicates = []
            for token in sentence_arr:
                if token[12] == "Y":
                    predicates.append(token[0])  # idx

                deprel = get_dep_rel(token)
                if deprel is not None:
                    outdata[sid].append(deprel)

            # deprel["from"] points to n-th predicate
            # replace with predicate's token index
            for deprel in outdata[sid]:
                deprel["from"] = predicates[deprel["from"]]

            if DEBUG:
                print(to_sentence(sentence_arr))
                print(outdata[sid])
                print(sid)
                print()
                print()

    with outfile.open("w") as fp:
        json.dump(outdata, fp)
        logging.info("SRL relations written to: {}".format(outfile))
|
||||||
|
|
||||||
|
|
||||||
|
# main
# The parser stays at module level: handle_file() reads it as a global inside
# the pool workers.
par = Parser()

# Only start the pool when run as a script, so that worker processes that
# re-import this module do not start pools of their own.
if __name__ == "__main__":
    OUTPATH.mkdir(parents=True, exist_ok=True)

    infiles = list(enumerate([x for x in INPATH.iterdir() if x.is_file()]))
    logging.info("Generating JSON SRL files from {} tsv files.".format(len(infiles)))

    with Pool(CPU_CORES) as p:
        p.map(handle_file, infiles)

    logging.info("Finished generating .json files.")
|
@ -0,0 +1,294 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
import os
|
||||||
|
import pickle
|
||||||
|
from pathlib import Path
|
||||||
|
from parser.parser import Parser
|
||||||
|
import configparser
|
||||||
|
import json
|
||||||
|
import sys
|
||||||
|
import logging
|
||||||
|
from multiprocessing import Pool
|
||||||
|
|
||||||
|
# parse config
config = configparser.ConfigParser()
config.read("tools.cfg")
ORIGPATH = Path(config["tools"]["giga"])
INPATH = Path(config["tools"]["giga_srl_errors"])
OUTPATH = Path(config["tools"]["giga_json"])
INTERNAL_DATA = Path(config["tools"]["internal_data"])
DEBUG = config["tools"]["debug"] == "True"
CPU_CORES = int(config["tools"]["cpu_cores"])

LOGFILE = Path(config["tools"]["logfile"]).absolute()
LOGFILE.touch(exist_ok=True)
# resolve() returns a new path; keep it instead of discarding the result
LOGFILE = LOGFILE.resolve()

logging.basicConfig(filename=str(LOGFILE), level=logging.INFO)

# One entry per line of the error list; the context manager closes the file.
with open(os.path.join(INTERNAL_DATA, 'sentences_with_less_than_token.txt')) as error_file:
    error_sentences = [line.rstrip('\n') for line in error_file]
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def get_origfile(filename):
    """Return the entry of ORIGPATH whose name matches *filename* up to the first dot.

    Args:
        filename: path-like object with a ``name`` attribute.

    Raises:
        FileNotFoundError: no entry in ORIGPATH shares the leading name part.
    """
    wanted = filename.name.split('.')[0]
    for origfile in ORIGPATH.iterdir():
        if origfile.name.split('.')[0] == wanted:
            return origfile
    raise FileNotFoundError(
        "no original file matching '{}' in {}".format(wanted, ORIGPATH))
|
||||||
|
|
||||||
|
def extract_sentences(line_reader):
    """Group tab-separated byte lines into sentences.

    Args:
        line_reader: iterable of ``bytes`` lines (UTF-8 encoded).

    Yields:
        One list per sentence; each element is the list of tab-separated
        fields of one line. A line with a single field (e.g. an empty line)
        ends the current sentence.
    """
    acc = []
    for raw_line in line_reader:
        # Strip only the line terminator; slicing off the last character
        # would eat real data on a final line without a newline.
        line = raw_line.decode("utf-8").rstrip("\n").split('\t')
        if len(line) == 1:  # empty line
            yield acc
            acc = []
        else:
            acc.append(line)
    # Input that does not end with an empty line still has a pending sentence.
    if acc:
        yield acc
|
||||||
|
|
||||||
|
def to_sentence(sentence_arr):
    """Return the second field of every token row, joined by single spaces."""
    return " ".join(token[1] for token in sentence_arr)
|
||||||
|
|
||||||
|
def match_sentence_id(sentence, orig_dict):
    """Return the key of the first entry whose tokens spell out *sentence*.

    The candidate text of an entry is the third element of every item in
    ``entry["tokens"]``, joined by single spaces.

    Raises:
        KeyError: no entry matches; the sentence is the exception argument.
    """
    for k, e in orig_dict.items():
        orig_sentence = " ".join(token[2] for token in e["tokens"])
        if sentence == orig_sentence:
            return k
    raise KeyError(sentence)
|
||||||
|
|
||||||
|
def match_sentence_id_giga(sentence, orig_dict):
    """Return the key of the first entry whose ``"text"`` equals *sentence*.

    Raises:
        KeyError: no entry matches; the sentence is the exception argument.
    """
    for k, e in orig_dict.items():
        if sentence == e["text"]:
            return k
    raise KeyError(sentence)
|
||||||
|
|
||||||
|
def get_dep_rel(token):
    """Return the first filled argument column of a token row, or None.

    Columns from index 14 onwards are scanned; the first one that is not
    "_" is reported.

    Returns:
        ``{"arg": <column value>, "from": <column offset>, "dep": token[0]}``
        or ``None`` when every scanned column is "_".
    """
    logging.debug(token)
    for i, field in enumerate(token[14:]):
        if field != "_":
            return {
                "arg": field,
                "from": i,  # i-th predicate in sentence
                "dep": token[0],
            }
    return None
|
||||||
|
|
||||||
|
def handle_file_old(infile_tpl):
    """Convert one SRL-tagged tsv file into a JSON file of relations per sentence.

    Args:
        infile_tpl: ``(index, path)`` pair; only the path is used.

    The result maps each sentence id to a list of
    ``{"arg", "from", "dep"}`` dicts and is written to OUTPATH under the
    input file's name with a ".json" suffix.
    """
    infile = infile_tpl[1]
    outfile = (OUTPATH / infile.name).with_suffix(".json")
    origfile = get_origfile(infile)
    orig_dict = par.parse_tei(origfile)

    outdata = {}
    with infile.open("rb") as fp:
        for sentence_arr in extract_sentences(fp.readlines()):
            # tsv dropped sentence ids, match the ID, using original data
            sid = match_sentence_id(to_sentence(sentence_arr), orig_dict)

            outdata[sid] = []

            # find all predicate indices in the sentence
            predicates = []
            for token in sentence_arr:
                if token[12] == "Y":
                    predicates.append(token[0])  # idx

                deprel = get_dep_rel(token)
                if deprel is not None:
                    outdata[sid].append(deprel)

            # deprel["from"] points to n-th predicate
            # replace with predicate's token index
            for deprel in outdata[sid]:
                deprel["from"] = predicates[deprel["from"]]

            if DEBUG:
                print(to_sentence(sentence_arr))
                print(outdata[sid])
                print(sid)
                print()
                print()

    with outfile.open("w") as fp:
        json.dump(outdata, fp)
        logging.info("SRL relations written to: {}".format(outfile))
|
||||||
|
|
||||||
|
|
||||||
|
def fix_json(srl_gen, error_sentence, orig_json_data):
    """Rebuild the relation list of one sentence from the next re-tagged sentence.

    Args:
        srl_gen: generator yielding ``(sentence_text, sentence_arr)`` pairs
            and ``None`` once exhausted.
        error_sentence: sentence id whose entry is rebuilt.
        orig_json_data: dict mapping sentence ids to relation lists; it is
            modified in place.

    Returns:
        ``orig_json_data`` (the same object that was passed in).

    Raises:
        ValueError: ``srl_gen`` has no sentence left.
        KeyError: ``error_sentence`` is not a key of ``orig_json_data``.
    """
    item = next(srl_gen, None)
    if item is None:
        # Without this check the tuple unpacking below fails on the
        # generator's None sentinel with an opaque TypeError.
        raise ValueError(
            "no re-tagged sentence left for {!r}".format(error_sentence))
    _, sentence_arr = item

    sid = error_sentence
    # Start from an empty relation list for this sentence.
    if orig_json_data[sid] != []:
        orig_json_data[sid] = []

    # find all predicate indices in the sentence
    predicates = []
    for token in sentence_arr:
        if token[12] == "Y":
            predicates.append(token[0])  # idx

        deprel = get_dep_rel(token)
        if deprel is not None:
            orig_json_data[sid].append(deprel)

    # deprel["from"] points to n-th predicate
    # replace with predicate's token index
    for deprel in orig_json_data[sid]:
        deprel["from"] = predicates[deprel["from"]]

    if DEBUG:
        print(to_sentence(sentence_arr))
        print(orig_json_data[sid])
        print(sid)
        print()
        print()
    return orig_json_data
|
||||||
|
|
||||||
|
def count_orig_file_sentences(filename):
    """Parse one original file and cache its sentence count as a pickle.

    Args:
        filename: ``(index, path)`` pair.

    The pickle holds ``(index, path, number_of_parsed_entries)`` and is
    stored under ``INTERNAL_DATA/orig_chunks/<path.name>``. Nothing is done
    when that file already exists.
    """
    chunk_path = os.path.join(INTERNAL_DATA, 'orig_chunks', filename[1].name)
    if os.path.exists(chunk_path):
        return

    print(filename[0])
    orig_dict = par.parse_tei(filename[1])
    with open(chunk_path, 'wb') as output:
        pickle.dump((filename[0], filename[1], len(orig_dict)), output)
|
||||||
|
|
||||||
|
|
||||||
|
def count_srl_file_sentences(filename):
    """Count the empty lines of one SRL file and cache the count as a pickle.

    Args:
        filename: ``(index, path)`` pair.

    The pickle holds ``(index, path, number_of_empty_lines)`` and is stored
    under ``INTERNAL_DATA/srl_chunks/<path.name>``. Nothing is done when
    that file already exists.
    """
    chunk_path = os.path.join(INTERNAL_DATA, 'srl_chunks', filename[1].name)
    if os.path.exists(chunk_path):
        return

    print(filename[0])
    with filename[1].open("r") as fp:
        # every empty line terminates one sentence
        num_sentences = sum(1 for line in fp if line == '\n')

    with open(chunk_path, 'wb') as output:
        pickle.dump((filename[0], filename[1], num_sentences), output)
|
||||||
|
|
||||||
|
def srl_error_fix_generator(infile):
    """Yield ``(sentence_text, sentence_arr)`` for every sentence in *infile*.

    After the last sentence a single ``None`` is yielded as an end marker.
    """
    with infile.open("rb") as fp:
        for sentence_arr in extract_sentences(fp):
            yield to_sentence(sentence_arr), sentence_arr
    yield None
|
||||||
|
|
||||||
|
def srl_sentences_generator(infile, curr_index, sen_start_index):
    """Yield the sentences of *infile*, skipping those before a start index.

    Args:
        infile: path object of the SRL tsv file.
        curr_index: running index assigned to the first sentence of the file.
        sen_start_index: sentences are skipped while the running index is
            below this value.

    Yields:
        ``(sentence_text, sentence_arr)`` pairs, then a single ``None`` as
        an end marker.
    """
    with infile.open("rb") as fp:
        for sentence_arr in extract_sentences(fp.readlines()):
            if curr_index < sen_start_index:
                curr_index += 1
            else:
                yield to_sentence(sentence_arr), sentence_arr
    yield None
|
||||||
|
|
||||||
|
|
||||||
|
def srl_multiple_files_sentences_generator(sentence_id):  # srl_files):
    """Yield sentences from the SRL files, starting 10 sentences before *sentence_id*.

    Yields ``(sentence_text, sentence_arr)`` pairs followed by a single
    ``None`` end marker.

    NOTE(review): ``srl_file_sizes`` is not defined anywhere in the visible
    file. Presumably each entry is (index, path, sentence count, index of
    first sentence) - confirm before using this function.
    """
    sentence_id = max(0, sentence_id - 10)

    # Stays empty when no file contains the sentence, so the second loop is
    # simply skipped instead of failing on an unbound name.
    srl_files = []
    for i, srl_file in enumerate(srl_file_sizes):
        if srl_file[3] <= sentence_id < srl_file[3] + srl_file[2]:
            srl_files = srl_file_sizes[i:]
            break

    for file_info in srl_files:
        # srl_gen = srl_sentences_generator(file_info[1], file_info[3], file_info[4])
        for el in srl_sentences_generator(file_info[1], file_info[3], sentence_id):
            if el is None:  # end marker of the per-file generator
                break
            yield el

    yield None
|
||||||
|
|
||||||
|
# group sentences by their files
# Consecutive ids sharing the same first 9 characters end up in one group.
# An empty error list yields no groups at all.
error_sentences_grouped = []
prev_name = None
for name in error_sentences:
    if error_sentences_grouped and name[:9] == prev_name:
        error_sentences_grouped[-1].append(name)
    else:
        prev_name = name[:9]
        error_sentences_grouped.append([name])

srl_gen = srl_error_fix_generator(INPATH)
|
||||||
|
|
||||||
|
# find errors in json files:
|
||||||
|
# with open(os.path.join(INTERNAL_DATA, 'sentence_ids_list.pkl'), 'rb') as output:
|
||||||
|
# sentence_ids = pickle.load(output)
|
||||||
|
#
|
||||||
|
#
|
||||||
|
#
|
||||||
|
# origfiles = []
|
||||||
|
# for subdir, dirs, files in os.walk(OUTPATH):
|
||||||
|
# for file in files:
|
||||||
|
# origfiles.append(Path(os.path.join(subdir, file)))
|
||||||
|
# origfiles=sorted(origfiles)
|
||||||
|
#
|
||||||
|
#
|
||||||
|
#
|
||||||
|
# for sent in origfiles:
|
||||||
|
# # for sent in sentence_ids:
|
||||||
|
# # outfile = Path(OUTPATH, sent[:9] + '-dedup.json')
|
||||||
|
# outfile = sent
|
||||||
|
#
|
||||||
|
# try:
|
||||||
|
# with outfile.open() as json_file:
|
||||||
|
# json.load(json_file)
|
||||||
|
# pass
|
||||||
|
# except:
|
||||||
|
# print(outfile.name)
|
||||||
|
#
|
||||||
|
#
|
||||||
|
# raise Exception('test')
|
||||||
|
# iterate over all wronged sentences and fix them
|
||||||
|
# iterate over all wronged sentences and fix them
for errors_in_file in error_sentences_grouped:
    # all ids in a group share their first 9 characters, which name the file
    outfile = Path(OUTPATH, errors_in_file[0][:9] + '-dedup.json')
    with outfile.open() as json_file:
        print(outfile.name)
        orig_json_data = json.load(json_file)

    for error_sentence in errors_in_file:
        orig_json_data = fix_json(srl_gen, error_sentence, orig_json_data)

    # Write next to the target and swap it in afterwards, so that a failure
    # during json.dump() cannot leave a truncated JSON file behind.
    tmpfile = outfile.with_name(outfile.name + '.tmp')
    with tmpfile.open('w') as json_file:
        json.dump(orig_json_data, json_file)
    os.replace(str(tmpfile), str(outfile))
    logging.info("SRL relations written to: {}".format(outfile))
|
@ -0,0 +1,29 @@
|
|||||||
|
#!/bin/bash

# parsing tools.cfg values
IN_FOLDER="$(sed -n -e 's/^\s*giga_tsv\s*=\s*//p' ../tools.cfg)"
echo "input folder: $IN_FOLDER"
OUT_FOLDER="$(sed -n -e 's/^\s*giga_srl\s*=\s*//p' ../tools.cfg)"
echo "output folder: $OUT_FOLDER"

SUFFIX="srl.tsv"

mkdir -p "$OUT_FOLDER"
# The glob has to stay outside the quotes, otherwise it is never expanded.
# NOTE(review): with the glob working, this now really deletes earlier
# *srl.tsv output in OUT_FOLDER - confirm that is wanted.
rm -f "$OUT_FOLDER"/*"${SUFFIX}" &> /dev/null

# Quote the folder (it may contain spaces) but not the glob.
for infile in "$IN_FOLDER"/*; do
    echo "Tagging: ${infile}"
    base=$(basename "$infile" | cut -d'.' -f1)
    outfile="${OUT_FOLDER}/${base}.${SUFFIX}"

    # mate-tools tagger
    ./scripts/parse_srl_only_mod.sh "$infile" "$outfile"

    if [ $? -eq 0 ]; then
        echo "Saved as ${outfile}"
    else
        echo "ERR"
        exit 1
    fi
done
|
||||||
|
|
@ -0,0 +1,29 @@
|
|||||||
|
#!/bin/bash

# parsing tools.cfg values
IN_FOLDER="../$(sed -n -e 's/^\s*kres_tsv\s*=\s*//p' ../tools.cfg)"
echo "input folder: $IN_FOLDER"
OUT_FOLDER="../$(sed -n -e 's/^\s*kres_srl\s*=\s*//p' ../tools.cfg)"
echo "output folder: $OUT_FOLDER"

SUFFIX="srl.tsv"

# Expansions are quoted so that paths containing spaces stay one word;
# the glob itself stays outside the quotes so that it still expands.
mkdir -p "$OUT_FOLDER"
rm -f "$OUT_FOLDER"/*"${SUFFIX}" &> /dev/null

for infile in "$IN_FOLDER"/*; do
    echo "Tagging: ${infile}"
    base=$(basename "$infile" | cut -d'.' -f1)
    outfile="${OUT_FOLDER}/${base}.${SUFFIX}"

    # mate-tools tagger
    ./scripts/parse_srl_only_mod.sh "$infile" "$outfile"

    if [ $? -eq 0 ]; then
        echo "Saved as ${outfile}"
    else
        echo "ERR"
        exit 1
    fi
done
|
||||||
|
|
@ -1,8 +1,18 @@
|
|||||||
[tools]
|
[tools]
|
||||||
kres_orig = /kres_mount/kres_parsed/tei
|
giga = /media/luka/Portable Disk/Datasets/gigafida_jos/gf2_orig
|
||||||
kres_tsv = ../data/kres_out/1_tsv
|
giga_orig = /media/luka/Portable Disk/Datasets/gigafida_jos/gf2-dedup.patch0001
|
||||||
kres_srl = ../data/kres_out/2_srl
|
; giga_orig_old = /media/luka/Portable Disk/Datasets/gigafida_jos/gf2-dedup
|
||||||
kres_json = ../data/kres_out/final_json
|
giga_jos = /media/luka/Portable Disk/Datasets/gigafida_jos/gf2-dedup.jos.patch0001
|
||||||
|
giga_tsv = /media/luka/Portable Disk/Datasets/gigafida_jos/gf_files_part
|
||||||
|
; giga_tsv = /media/luka/Portable Disk/Datasets/gigafida_jos/TEMP
|
||||||
|
; giga_tsv = /media/luka/Portable Disk/Datasets/gigafida_jos/gf2_files_copy
|
||||||
|
; giga_srl = /media/luka/Portable Disk/Datasets/gigafida_jos/TEMP
|
||||||
|
giga_srl = /media/luka/Portable Disk/Datasets/gigafida_jos/2_srl
|
||||||
|
giga_srl_errors = /media/luka/Portable Disk/Datasets/gigafida_jos/2_srl_errors/giga_errors.srl.tsv
|
||||||
|
; giga_json = /media/luka/Portable Disk/Datasets/gigafida_jos/final_json_TEMP
|
||||||
|
giga_json = /media/luka/Portable Disk/Datasets/gigafida_jos/final_json
|
||||||
|
internal_data = /media/luka/Portable Disk/Datasets/gigafida_jos/internal_data
|
||||||
|
giga_parts = 100000
|
||||||
logfile = ../progress.log
|
logfile = ../progress.log
|
||||||
cpu_cores = 5
|
cpu_cores = 16
|
||||||
debug = False
|
debug = False
|
||||||
|
@ -0,0 +1,16 @@
|
|||||||
|
[tools]
|
||||||
|
giga = /media/luka/Portable Disk/Datasets/gigafida_jos/gf2_orig
|
||||||
|
giga_orig = /media/luka/Portable Disk/Datasets/gigafida_jos/gf2-dedup.patch0001
|
||||||
|
; giga_orig_old = /media/luka/Portable Disk/Datasets/gigafida_jos/gf2-dedup
|
||||||
|
giga_jos = /media/luka/Portable Disk/Datasets/gigafida_jos/gf2-dedup.jos.patch0001
|
||||||
|
giga_tsv = /media/luka/Portable Disk/Datasets/gigafida_jos/gf_files_part
|
||||||
|
; giga_tsv = /media/luka/Portable Disk/Datasets/gigafida_jos/TEMP
|
||||||
|
; giga_tsv = /media/luka/Portable Disk/Datasets/gigafida_jos/gf2_files_copy
|
||||||
|
; giga_srl = /media/luka/Portable Disk/Datasets/gigafida_jos/TEMP
|
||||||
|
giga_srl = /media/luka/Portable Disk/Datasets/gigafida_jos/2_srl
|
||||||
|
giga_json = /media/luka/Portable Disk/Datasets/gigafida_jos/final_json
|
||||||
|
internal_data = /media/luka/Portable Disk/Datasets/gigafida_jos/internal_data
|
||||||
|
giga_parts = 100000
|
||||||
|
logfile = ../progress.log
|
||||||
|
cpu_cores = 1
|
||||||
|
debug = False
|
@ -0,0 +1,8 @@
|
|||||||
|
[tools]
|
||||||
|
kres_orig = /home/luka/Development/srl/data/kres_parsed/tei
|
||||||
|
kres_tsv = ../data/kres_out/1_tsv
|
||||||
|
kres_srl = ../data/kres_out/2_srl
|
||||||
|
kres_json = ../data/kres_out/final_json
|
||||||
|
logfile = ../progress.log
|
||||||
|
cpu_cores = 5
|
||||||
|
debug = False
|
@ -0,0 +1,8 @@
|
|||||||
|
[tools]
|
||||||
|
kres_orig = /home/luka/Development/srl/data/kres_parsed/tei
|
||||||
|
giga_tsv = ../data/giga_out/1_tsv
|
||||||
|
giga_srl = ../data/giga_out/2_srl
|
||||||
|
kres_json = ../data/giga_out/final_json
|
||||||
|
logfile = ../progress.log
|
||||||
|
cpu_cores = 5
|
||||||
|
debug = False
|
Loading…
Reference in new issue