Compare commits

...

2 Commits

Author SHA1 Message Date
fd20295017 Adapted code to ssj500k and added its branch 2022-03-14 11:01:53 +01:00
c1ecc4cdbc Big changes 2022-02-04 11:24:47 +01:00
23 changed files with 1765 additions and 73 deletions

3
.gitignore vendored
View File

@ -6,3 +6,6 @@ nohup.out
data/kres_out/*
data/kres_example/
venv/
.idea/
data/

View File

@ -6,8 +6,9 @@ json_files: # srl_tagged_files
cd tools; python3 gen_json.py
srl_tagged_files: # tsv_files
# cd tools/srl-20131216; ./scripts/parse_srl_only_mod.sh; cd -
# # cd tools/srl-20131216; ./scripts/parse_srl_only_mod.sh; cd -
cd tools/srl-20131216; ./tag_all.sh
# cd tools/srl-20131216; ./tag_ssj500k2.3.sh
tsv_files: # tools/fillpred_model/model.pickle
cd tools; python3 parse_all.py

View File

@ -1,3 +1,11 @@
# Instructions
For mining ssj500k, <b>check out the ssj500k branch</b>.
For the running order, see the Makefile (a minimal Python driver sketch follows below this file). In general it works like this:
- tools/parse_all.py - Creates the mate input files needed to run the Java-based srl.jar
- tools/srl-20131216/tag_all.sh - Tags ssj500k
- tools/gen_json.py - Mines the SRL annotations into JSON
- tools/gen_tei.py - Mines the SRL annotations into TEI
# cjvt-srl-tagging
We'll be using mate-tools to perform SRL on Kres.
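
A minimal driver sketch of the running order above (the Makefile targets tsv_files, srl_tagged_files and json_files wrap the same commands); the Python wrapper itself is only an illustration and not part of the repository:

# Sketch only: run the pipeline steps in the order listed in the README.
import subprocess

steps = [
    ("tools", ["python3", "parse_all.py"]),    # make tsv_files
    ("tools/srl-20131216", ["./tag_all.sh"]),  # make srl_tagged_files
    ("tools", ["python3", "gen_json.py"]),     # make json_files
    ("tools", ["python3", "gen_tei.py"]),      # write the mined SRL back into TEI
]

for cwd, cmd in steps:
    # each tool resolves its config and paths relative to its own directory
    subprocess.run(cmd, cwd=cwd, check=True)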

View File

@ -15,6 +15,6 @@ run:
-v /etc/group:/etc/group \
-v $(shell pwd)/../../:/cjvt-srl-tagging \
-w /cjvt-srl-tagging \
-v /home/kristjan/kres_mount:/kres_mount:ro \
-v /home/luka/Development/srl/data:/kres_mount:ro \
python-java \
/bin/bash

View File

@ -0,0 +1,19 @@
import os
from shutil import copyfile

# INPATH = Path(config["tools"]["giga_srl"])
# infiles = list(enumerate([x for x in iter(sorted(INPATH.iterdir())) if x.is_file()]))
SOURCEPATH = '/media/luka/Portable Disk/Datasets/gigafida_jos/gf2_files'
INPATH = '/media/luka/Portable Disk/Datasets/gigafida_jos/2_srl'
OUTPATH = '/home/luka/Development/srl/cjvt-srl-tagging/data/giga_out/1_tsv'

# Copy every gigafida chunk that has no .srl.tsv output yet back into the tsv
# folder, so only the missing parts get re-tagged.
for i in range(100000):
# print(os.path.join(INPATH, 'giga.%07d.tsv' % i))
# if not os.path.exists(os.path.join(INPATH, 'giga.%07d.tsv' % i)):
# print('giga.%07d.tsv' % i)
if not os.path.exists(os.path.join(INPATH, 'giga%07d.srl.tsv' % i)):
copyfile(os.path.join(SOURCEPATH, 'giga.%07d.tsv' % i), os.path.join(OUTPATH, 'giga%07d.tsv' % i))
print('giga%07d.srl.tsv' % i)
if i % 1000 == 0:
print(i)

View File

@ -0,0 +1,192 @@
import pickle
from parser.parser import Parser
import os
from os.path import join, dirname
from pathlib import Path
import re
import sys
import cProfile
import configparser
import logging
from multiprocessing import Pool
SSJ500K_2_1 = 27829 # number of sentences
par = Parser()
# path to data
config = configparser.ConfigParser()
config.read("tools.cfg")
analysis = ''
if 'kres_orig' in config["tools"]:
analysis = 'kres'
INDIR = Path(config["tools"]["kres_orig"])
OUTDIR = Path(config["tools"]["kres_tsv"])
elif 'giga_orig' in config["tools"]:
# analysis = 'gigafida'
analysis = 'giga'
INDIR_GIGA = Path(config["tools"]["giga_orig"])
INDIR_GIGA_ORIG = Path(config["tools"]["giga"])
INDIR_JOS = Path(config["tools"]["giga_jos"])
OUTDIR = Path(config["tools"]["giga_tsv"])
GIGA_PARTS = int(config["tools"]["giga_parts"])
INTERNAL_DATA = config["tools"]["internal_data"]
CPU_CORES = int(config["tools"]["cpu_cores"])
LOGFILE = Path(config["tools"]["logfile"]).absolute()
LOGFILE.touch(exist_ok=True)
LOGFILE.resolve()
logging.basicConfig(filename=str(LOGFILE), level=logging.INFO)
origfiles = []
for subdir, dirs, files in os.walk(INDIR_GIGA_ORIG):
for file in files:
origfiles.append(Path(os.path.join(subdir, file)))
origfiles=list(enumerate(sorted(origfiles)))
def giga_orig_sentence_generator():
with open(INDIR_GIGA, 'r') as gof:
previous_new_line = False
sentence_words = []
for l_gof in gof:
if l_gof == '\n':
yield ' '.join(sentence_words)
sentence_words = []
else:
sentence_words.append(l_gof.split('\t')[0])
# yield l_gof
sentence_generator = giga_orig_sentence_generator()
sentence_ids = []
for origfile in origfiles:
split_file_sentences = par.parse_tei(origfile[1])
for k, v in split_file_sentences.items():
one_file_sentence = next(sentence_generator)
if one_file_sentence == v['text']:
sentence_ids.append(v['sid'])
else:
print('----------------')
print('ERROR')
print(v['sid'])
print(one_file_sentence)
print(v['text'])
print(origfile[0])
# count sentences in orig (if not counted before)
# os.remove(os.path.join(INTERNAL_DATA, 'orig_counted_sentences.pkl'))
if os.path.exists(os.path.join(INTERNAL_DATA, 'sentence_ids_list.pkl')):
os.remove(os.path.join(INTERNAL_DATA, 'sentence_ids_list.pkl'))
with open(os.path.join(INTERNAL_DATA, 'sentence_ids_list.pkl'), 'wb') as output:
pickle.dump(sentence_ids, output)
# def giga_orig_generator():
# with open(INDIR_GIGA, 'r') as gof:
# previous_new_line = False
# for l_gof in gof:
# if l_gof == '\n':
# if previous_new_line:
# continue
# previous_new_line = True
# elif previous_new_line:
# previous_new_line = False
# yield l_gof
# import time
# def handle_giga_file(ran):
# """
# File that splits big text file into more minor files. Only split on empty lines.
# """
# # with open(INDIR_GIGA, 'r') as gof:
# # with open(INDIR_JOS, 'r') as gjf:
# # for i, (l_gof, l_gjf) in enumerate(zip(gof, gjf)):
# # pass
# # num_lines = i + 1
# # print(num_lines)
# num_lines = 1393184026
# # 1393184026
# # 1393184033
# # return
# num_lines_per_part = num_lines / GIGA_PARTS
# curr_part = 0
# gof_generator = giga_orig_generator()
#
# diff_files = set()
# # with open(INDIR_GIGA, 'r') as gof:
# with open(INDIR_GIGA_OLD, 'r') as gjf:
# # sentence = {}
# # sentence['tokens'] = []
# # sentence['links'] = {}
# # if os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % ran[0])):
# # os.remove(os.path.join(OUTDIR, 'giga%07d.tsv' % ran[0]))
#
# # wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % ran[0]), 'a')
#
# # for i, (l_gof, l_gjf) in enumerate(zip(gof, gjf)):
# for i, l_gjf in enumerate(gjf):
# l_gof = next(gof_generator)
# if curr_part < ran[0]:
# if i > num_lines_per_part * (curr_part + 1) and l_gof == '\n' :
# if curr_part < ran[0]:
# print(curr_part)
# curr_part += 1
# continue
# else:
# continue
#
# l_gof_split = l_gof.split('\t')
# l_gjf_split = l_gjf.split('\t')
#
# # if punctuation
# if l_gof != '\n':
# if l_gof_split != l_gjf_split:
# print(curr_part)
# diff_files.add(curr_part)
# l_gof = next(gof_generator)
#
#
# # if l_gof == '\n':
# else:
# # wf.flush()
# # if i > num_lines_per_part * (curr_part + 1) and l_gof == '\n':
# if i > num_lines_per_part * (curr_part + 1):
# curr_part += 1
# # if wf doesn't exist (first one)
# # wf.close()
# if curr_part >= ran[1]:
# break
# # if os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part)):
# # os.remove(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part))
#
# # wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part), 'a')
#
# curr_part += 1
# return diff_files
# # wf.close()
#
# with Pool(CPU_CORES) as p:
# final_range = [0, 100000]
# # final_range = [0, 150]
# # size_per_proc = (final_range[1] - final_range[0]) / CPU_CORES
# # # splits = [int(final_range[0] + size_per_proc) for i in range(CPU_CORES)]
# # ranges = []
# # ps = None
# # for i in range(CPU_CORES):
# # s = int(final_range[0] + size_per_proc * i)
# # ns = int(final_range[0] + size_per_proc * (i + 1))
# # ranges.append([s, ns])
# # # ranges = [[0, 1]]
# # res = p.map(handle_giga_file, ranges)
#
# res = handle_giga_file(final_range)
# res = sorted(list(res))
# if os.path.exists(os.path.join(INTERNAL_DATA, 'diffs_updated_gigafida.pkl')):
# os.remove(os.path.join(INTERNAL_DATA, 'diffs_updated_gigafida.pkl'))
# with open(os.path.join(INTERNAL_DATA, 'diffs_updated_gigafida.pkl'), 'wb') as pkl_file:
# pickle.dump(res, pkl_file)
# # with open(os.path.join(INTERNAL_DATA, 'diffs_updated_gigafida.pkl'), 'rb') as pkl_file:
# # mydict2 = pickle.load(pkl_file)
# print('test')

114
tools/gen_json.kres.py Normal file
View File

@ -0,0 +1,114 @@
from pathlib import Path
from parser.parser import Parser
import configparser
import json
import sys
import logging
from multiprocessing import Pool
# parse config
config = configparser.ConfigParser()
config.read("tools.cfg")
ORIGPATH = Path(config["tools"]["kres_orig"])  # needed by get_origfile() below
INPATH = Path(config["tools"]["giga_srl"])
OUTPATH = Path(config["tools"]["kres_json"])
DEBUG = config["tools"]["debug"] == "True"
CPU_CORES = int(config["tools"]["cpu_cores"])
LOGFILE = Path(config["tools"]["logfile"]).absolute()
LOGFILE.touch(exist_ok=True)
LOGFILE.resolve()
logging.basicConfig(filename=str(LOGFILE), level=logging.INFO)
def get_origfile(filename):
for origfile in ORIGPATH.iterdir():
if filename.name.split('.')[0] == origfile.name.split('.')[0]:
return origfile
raise FileNotFoundError
def extract_sentences(line_reader):
acc = []
# last char in line is \n, remove it
for line in [x.decode("utf-8")[:-1].split('\t') for x in line_reader]:
if len(line) == 1: # empty line
tmp = acc
acc = []
yield tmp
else:
acc.append(line)
def to_sentence(sentence_arr):
return " ".join([token[1] for token in sentence_arr])
def match_sentence_id(sentence, orig_dict):
for k, e in orig_dict.items():
orig_sentence = " ".join(token[2] for token in e["tokens"])
if sentence == orig_sentence:
return k
raise KeyError
def get_dep_rel(token):
logging.debug(token)
for i, field in enumerate(token[14:]):
if field != "_":
return {
"arg": field,
"from": i, # i-th predicate in sentence
"dep": token[0],
}
return None
def handle_file(infile_tpl):
i = infile_tpl[0]
infile = infile_tpl[1]
outfile = (OUTPATH / infile.name).with_suffix(".json")
origfile = get_origfile(infile)
orig_dict = par.parse_tei(origfile)
with infile.open("rb") as fp:
outdata = {}
for sentence_arr in extract_sentences(fp.readlines()):
# the tsv files dropped the sentence ids; recover the ID by matching against the original data
sid = match_sentence_id(to_sentence(sentence_arr), orig_dict)
outdata[sid] = []
# find all predicate indices in the sentence
predicates = []
for token in sentence_arr:
if token[12] == "Y":
predicates += [token[0]] # idx
deprel = get_dep_rel(token)
if deprel is not None:
outdata[sid].append(deprel)
# deprel["from"] points to n-th predicate
# replace with predicate's token index
for deprel in outdata[sid]:
deprel["from"] = predicates[deprel["from"]]
if DEBUG:
print(to_sentence(sentence_arr))
print(outdata[sid])
print(sid)
print()
print()
with outfile.open("w") as fp:
json.dump(outdata, fp)
logging.info("SRL relations written to: {}".format(outfile))
# main
par = Parser()
OUTPATH.mkdir(exist_ok=True)
infiles = list(enumerate([x for x in INPATH.iterdir() if x.is_file()]))
logging.info("Generating JSON SRL files from {} tsv files.".format(len(infiles)))
with Pool(CPU_CORES) as p:
p.map(handle_file, infiles)
logging.info("Finished generating .json files.")

View File

@ -1,3 +1,8 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import pickle
from pathlib import Path
from parser.parser import Parser
import configparser
@ -8,10 +13,11 @@ from multiprocessing import Pool
# parse config
config = configparser.ConfigParser()
config.read("tools.cfg")
ORIGPATH = Path(config["tools"]["kres_orig"])
INPATH = Path(config["tools"]["kres_srl"])
OUTPATH = Path(config["tools"]["kres_json"])
config.read("tools.cfg.ssj500k2.3")
ORIGPATH = Path(config["tools"]["ssj500k_orig_folder"])
INPATH = Path(config["tools"]["ssj500k_srl"])
OUTPATH = Path(config["tools"]["ssj500k_json"])
INTERNAL_DATA = Path(config["tools"]["internal_data"])
DEBUG = config["tools"]["debug"] == "True"
CPU_CORES = int(config["tools"]["cpu_cores"])
@ -48,6 +54,13 @@ def match_sentence_id(sentence, orig_dict):
return k
raise KeyError
def match_sentence_id_giga(sentence, orig_dict):
for k, e in orig_dict.items():
# orig_sentence = " ".join(token[2] for token in e["tokens"])
if sentence == e["text"]:
return k
raise KeyError
def get_dep_rel(token):
logging.debug(token)
for i, field in enumerate(token[14:]):
@ -59,7 +72,7 @@ def get_dep_rel(token):
}
return None
def handle_file(infile_tpl):
def handle_file_old(infile_tpl):
i = infile_tpl[0]
infile = infile_tpl[1]
outfile = (OUTPATH / infile.name).with_suffix(".json")
@ -101,14 +114,275 @@ def handle_file(infile_tpl):
logging.info("SRL relations written to: {}".format(outfile))
def handle_file(whole_input):
# sentence_id = whole_input[0][3]
# orig_infile = whole_input[0][1]
sentence_id = whole_input[3]
orig_infile = whole_input[1]
# origfile = origfiles[0][1]
# infile_tpl = infile_tpl[0]
# i = infile_tpl[0]
# infile = infile_tpl[1]
outfile = (OUTPATH / orig_infile.name).with_suffix(".json")
if outfile.exists():
return
# origfile = get_origfile()
orig_dict = par.parse_tei(orig_infile)
outdata = {}
gen = srl_multiple_files_sentences_generator(sentence_id)
# gen = srl_multiple_files_sentences_generator(whole_input[1])
mismatch_sentences = 0
for sentence_i, (orig_id, orig_val) in enumerate(orig_dict.items()):
if orig_id == 'GF0014802.2685.7':
print('PAUSE')
# look at neighbouring sentences if they are correct
sentence, sentence_arr = next(gen)
# orig_sentence = " ".join(token[2] for token in e["tokens"])
assert sentence.replace(' ', '') == orig_val['text']
# if i != 10 and i != 0:
# print('OK!')
sid = orig_id
outdata[sid] = []
# find all predicate indices in the sentence
predicates = []
for token in sentence_arr:
if token[12] == "Y":
predicates += [token[0]] # idx
deprel = get_dep_rel(token)
if deprel is not None:
outdata[sid].append(deprel)
# deprel["from"] points to n-th predicate
# replace with predicate's token index
for deprel in outdata[sid]:
deprel["from"] = predicates[deprel["from"]]
if DEBUG:
print(to_sentence(sentence_arr))
print(outdata[sid])
print(sid)
print()
print()
if mismatch_sentences > 0:
if mismatch_sentences / len(orig_dict.items()) < 0.1:
print('Slight mismatch - %d' % sentence_id)
print(whole_input)
print('ABS mitigated %d' % mismatch_sentences)
print('------------------------------------------------')
else:
print('ERRRRRRRRRRRRRRRROOOOOOORRRRRRRRRRR')
print('Big mismatch - %d' % sentence_id)
print(whole_input)
print('ABS mitigated errors:')
print(mismatch_sentences)
print('------------------------------------------------')
with outfile.open("w") as fp:
json.dump(outdata, fp)
logging.info("SRL relations written to: {}".format(outfile))
def count_orig_file_sentences(filename):
if os.path.exists(os.path.join(INTERNAL_DATA, 'orig_chunks', filename[1].name)):
return
print(filename[0])
orig_dict = par.parse_tei(filename[1])
# return filename[0], filename[1], len(orig_dict)
with open(os.path.join(INTERNAL_DATA, 'orig_chunks', filename[1].name), 'wb') as output:
pickle.dump((filename[0], filename[1], len(orig_dict)), output)
def count_srl_file_sentences(filename):
if os.path.exists(os.path.join(INTERNAL_DATA, 'srl_chunks', filename[1].name)):
return
print(filename[0])
num_sentences = 0
with filename[1].open("r") as fp:
for line in fp:
if line == '\n':
num_sentences += 1
# return filename[0], filename[1], num_sentences
with open(os.path.join(INTERNAL_DATA, 'srl_chunks', filename[1].name), 'wb') as output:
pickle.dump((filename[0], filename[1], num_sentences), output)
def srl_sentences_generator(infile, curr_index, sen_start_index):
with infile.open("rb") as fp:
outdata = {}
for sentence_arr in extract_sentences(fp.readlines()):
if curr_index < sen_start_index:
curr_index += 1
else:
yield to_sentence(sentence_arr), sentence_arr
yield None
def srl_multiple_files_sentences_generator(sentence_id): # srl_files):
sentence_id = max(0, sentence_id - 10)
for i, srl_file in enumerate(srl_file_sizes):
if sentence_id >= srl_file[3] and sentence_id < srl_file[3] + srl_file[2]:
srl_files = srl_file_sizes[i:]
break
for file_info in srl_files:
# srl_gen = srl_sentences_generator(file_info[1], file_info[3], file_info[4])
srl_gen = srl_sentences_generator(file_info[1], file_info[3], sentence_id)
el = next(srl_gen)
while el is not None:
yield el
el = next(srl_gen)
yield None
# main
par = Parser()
OUTPATH.mkdir(exist_ok=True)
infiles = list(enumerate([x for x in INPATH.iterdir() if x.is_file()]))
infiles = list(enumerate([x for x in iter(sorted(INPATH.iterdir())) if x.is_file()]))
logging.info("Generating JSON SRL files from {} tsv files.".format(len(infiles)))
origfiles = []
for subdir, dirs, files in os.walk(ORIGPATH):
for file in files:
origfiles.append(Path(os.path.join(subdir, file)))
origfiles=list(enumerate(sorted(origfiles)))
##### REMOVE ############
# origfiles = origfiles[:3]
# count sentences in orig (if not counted before)
# os.remove(os.path.join(INTERNAL_DATA, 'orig_counted_sentences.pkl'))
if not os.path.exists(os.path.join(INTERNAL_DATA, 'orig_counted_sentences.pkl')):
# srl_file_sizes = {}
if not os.path.exists(os.path.join(INTERNAL_DATA, 'orig_chunks')):
os.makedirs(os.path.join(INTERNAL_DATA, 'orig_chunks'))
# with Pool(CPU_CORES) as p:
# # p.map(handle_file, infiles)
# p.map(count_orig_file_sentences, origfiles)
for i in range(len(origfiles)):
count_orig_file_sentences(origfiles[i])
orig_file_sizes = []
for x in iter(sorted(Path(os.path.join(INTERNAL_DATA, 'orig_chunks')).iterdir())):
print(x.name)
if x.is_file():
with x.open('rb') as pkl_small_file:
orig_file_sizes.append(pickle.load(pkl_small_file))
# orig_file_sizes = list(enumerate([x for x in iter(sorted(INPATH.iterdir())) if x.is_file()]))
print("Sorting orig files")
orig_file_sizes = sorted(orig_file_sizes)
total_size = 0
orig_file_sizes_final = []
print("Calculating orig files size")
for n, pa, si in orig_file_sizes:
orig_file_sizes_final.append((n, pa, si, total_size))
total_size += si
orig_file_sizes = orig_file_sizes_final
print("Saving orig files size")
with open(os.path.join(INTERNAL_DATA, 'orig_counted_sentences.pkl'), 'wb') as output:
pickle.dump(orig_file_sizes, output)
print("Orig files saved")
else:
with open(os.path.join(INTERNAL_DATA, 'orig_counted_sentences.pkl'), 'rb') as pkl_file:
orig_file_sizes = pickle.load(pkl_file)
# count sentences in srl (if not counted before)
# os.remove(os.path.join(INTERNAL_DATA, 'srl_counted_sentences.pkl'))
if not os.path.exists(os.path.join(INTERNAL_DATA, 'srl_counted_sentences.pkl')):
# srl_file_sizes = {}
if not os.path.exists(os.path.join(INTERNAL_DATA, 'srl_chunks')):
os.makedirs(os.path.join(INTERNAL_DATA, 'srl_chunks'))
# with Pool(CPU_CORES) as p:
# # p.map(handle_file, infiles)
# p.map(count_srl_file_sentences, infiles)
for i in range(len(infiles)):
count_srl_file_sentences(infiles[i])
srl_file_sizes = []
for x in iter(sorted(Path(os.path.join(INTERNAL_DATA, 'srl_chunks')).iterdir())):
print(x.name)
if x.is_file():
with x.open('rb') as pkl_small_file:
srl_file_sizes.append(pickle.load(pkl_small_file))
print("Sorting srl files")
srl_file_sizes = sorted(srl_file_sizes)
total_size = 0
srl_file_sizes_final = []
print("Calculating srl files size")
for n, pa, si in srl_file_sizes:
srl_file_sizes_final.append((n, pa, si, total_size))
total_size += si
srl_file_sizes = srl_file_sizes_final
print("Saving srl files size")
with open(os.path.join(INTERNAL_DATA, 'srl_counted_sentences.pkl'), 'wb') as output:
pickle.dump(srl_file_sizes, output)
print("Srl files saved")
else:
with open(os.path.join(INTERNAL_DATA, 'srl_counted_sentences.pkl'), 'rb') as pkl_file:
srl_file_sizes = pickle.load(pkl_file)
# print(len(orig_file_sizes))
# print('asd' + 2)
# inputs = []
# srl_i = 0
# srl_file = srl_file_sizes[srl_i]
# for orig_i, orig_path, orig_size, orig_first_sent_i in orig_file_sizes:
# interesting_srl_files = []
# # beginning of srl chunk in range of orig chunk or ending of srl chunk in range of orig chunk
# # while srl_file[3] >= orig_first_sent_i and srl_file[3] < orig_first_sent_i + orig_size or \
# # srl_file[3] + srl_file[2] - 1 >= orig_first_sent_i and srl_file[3] + srl_file[2] - 1 < orig_first_sent_i + orig_size:
# while srl_file[3] < orig_first_sent_i + orig_size and srl_file[3] + srl_file[2] > orig_first_sent_i:
# # if beginning of file is in
# if srl_file[3] > orig_first_sent_i:
# interesting_srl_files.append((srl_file[0], srl_file[1], srl_file[2], srl_file[3], srl_file[3]))
# # print('if %d' % srl_file[3])
# else:
# interesting_srl_files.append((srl_file[0], srl_file[1], srl_file[2], srl_file[3], orig_first_sent_i))
# # print('else %d' % orig_first_sent_i)
#
# if orig_first_sent_i + orig_size >= srl_file[3] + srl_file[2]:
# srl_i += 1
# if srl_i < len(srl_file_sizes):
# srl_file = srl_file_sizes[srl_i]
# else:
# break
# # print(srl_i)
# # print('a ' + 2)
# else:
# break
#
# inputs.append([[orig_i, orig_path, orig_size, orig_first_sent_i], interesting_srl_files])
# print(inputs[-1])
# srl_gen = srl_sentences_generator(srl_file_sizes[0][1], 0, 533)
# a = next(srl_gen)
# b = next(srl_gen)
# c = next(srl_gen)
print('beginning processing')
with Pool(CPU_CORES) as p:
p.map(handle_file, infiles)
# p.map(handle_file, inputs)
p.map(handle_file, orig_file_sizes)
# for of in orig_file_sizes:
# handle_file(of)
logging.info("Finished generating .json files.")

View File

@ -0,0 +1,294 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import pickle
from pathlib import Path
from parser.parser import Parser
import configparser
import json
import sys
import logging
from multiprocessing import Pool
# parse config
config = configparser.ConfigParser()
config.read("tools.cfg")
ORIGPATH = Path(config["tools"]["giga"])
INPATH = Path(config["tools"]["giga_srl_errors"])
OUTPATH = Path(config["tools"]["giga_json"])
INTERNAL_DATA = Path(config["tools"]["internal_data"])
DEBUG = config["tools"]["debug"] == "True"
CPU_CORES = int(config["tools"]["cpu_cores"])
LOGFILE = Path(config["tools"]["logfile"]).absolute()
LOGFILE.touch(exist_ok=True)
LOGFILE.resolve()
logging.basicConfig(filename=str(LOGFILE), level=logging.INFO)
error_sentences = [line.rstrip('\n') for line in open(os.path.join(INTERNAL_DATA, 'sentences_with_less_than_token.txt'))]
def get_origfile(filename):
for origfile in ORIGPATH.iterdir():
if filename.name.split('.')[0] == origfile.name.split('.')[0]:
return origfile
raise FileNotFoundError
def extract_sentences(line_reader):
acc = []
# last char in line is \n, remove it
for line in [x.decode("utf-8")[:-1].split('\t') for x in line_reader]:
if len(line) == 1: # empty line
tmp = acc
acc = []
yield tmp
else:
acc.append(line)
def to_sentence(sentence_arr):
return " ".join([token[1] for token in sentence_arr])
def match_sentence_id(sentence, orig_dict):
for k, e in orig_dict.items():
orig_sentence = " ".join(token[2] for token in e["tokens"])
if sentence == orig_sentence:
return k
raise KeyError
def match_sentence_id_giga(sentence, orig_dict):
for k, e in orig_dict.items():
# orig_sentence = " ".join(token[2] for token in e["tokens"])
if sentence == e["text"]:
return k
raise KeyError
def get_dep_rel(token):
logging.debug(token)
for i, field in enumerate(token[14:]):
if field != "_":
return {
"arg": field,
"from": i, # i-th predicate in sentence
"dep": token[0],
}
return None
def handle_file_old(infile_tpl):
i = infile_tpl[0]
infile = infile_tpl[1]
outfile = (OUTPATH / infile.name).with_suffix(".json")
origfile = get_origfile(infile)
orig_dict = par.parse_tei(origfile)
with infile.open("rb") as fp:
outdata = {}
for sentence_arr in extract_sentences(fp.readlines()):
# the tsv files dropped the sentence ids; recover the ID by matching against the original data
sid = match_sentence_id(to_sentence(sentence_arr), orig_dict)
outdata[sid] = []
# find all predicate indices in the sentence
predicates = []
for token in sentence_arr:
if token[12] == "Y":
predicates += [token[0]] # idx
deprel = get_dep_rel(token)
if deprel is not None:
outdata[sid].append(deprel)
# deprel["from"] points to n-th predicate
# replace with predicate's token index
for deprel in outdata[sid]:
deprel["from"] = predicates[deprel["from"]]
if DEBUG:
print(to_sentence(sentence_arr))
print(outdata[sid])
print(sid)
print()
print()
with outfile.open("w") as fp:
json.dump(outdata, fp)
logging.info("SRL relations written to: {}".format(outfile))
def fix_json(srl_gen, error_sentence, orig_json_data):
# sentence_id = whole_input[0][3]
# orig_infile = whole_input[0][1]
# sentence_id = whole_input[3]
# orig_infile = whole_input[1]
# origfile = origfiles[0][1]
# infile_tpl = infile_tpl[0]
# i = infile_tpl[0]
# infile = infile_tpl[1]
# outfile = (OUTPATH / orig_infile.name).with_suffix(".json")
# if outfile.exists():
# return
# origfile = get_origfile()
# orig_dict = par.parse_tei(orig_infile)
# outdata = {}
# gen = srl_multiple_files_sentences_generator(sentence_id)
# gen = srl_multiple_files_sentences_generator(whole_input[1])
# mismatch_sentences = 0
# look at neighbouring sentences if they are correct
sentence, sentence_arr = next(srl_gen)
# orig_sentence = " ".join(token[2] for token in e["tokens"])
sid = error_sentence
# a = orig_json_data[sid]
if orig_json_data[sid] != []:
# print('POSSIBLE ERROR:')
# print(orig_json_data[sid])
orig_json_data[sid] = []
# find all predicate indices in the sentence
predicates = []
for token in sentence_arr:
if token[12] == "Y":
predicates += [token[0]] # idx
deprel = get_dep_rel(token)
if deprel is not None:
orig_json_data[sid].append(deprel)
# deprel["from"] points to n-th predicate
# replace with predicate's token index
for deprel in orig_json_data[sid]:
deprel["from"] = predicates[deprel["from"]]
if DEBUG:
print(to_sentence(sentence_arr))
print(orig_json_data[sid])
print(sid)
print()
print()
# a = orig_json_data[sid]
return orig_json_data
def count_orig_file_sentences(filename):
if os.path.exists(os.path.join(INTERNAL_DATA, 'orig_chunks', filename[1].name)):
return
print(filename[0])
orig_dict = par.parse_tei(filename[1])
# return filename[0], filename[1], len(orig_dict)
with open(os.path.join(INTERNAL_DATA, 'orig_chunks', filename[1].name), 'wb') as output:
pickle.dump((filename[0], filename[1], len(orig_dict)), output)
def count_srl_file_sentences(filename):
if os.path.exists(os.path.join(INTERNAL_DATA, 'srl_chunks', filename[1].name)):
return
print(filename[0])
num_sentences = 0
with filename[1].open("r") as fp:
for line in fp:
if line == '\n':
num_sentences += 1
# return filename[0], filename[1], num_sentences
with open(os.path.join(INTERNAL_DATA, 'srl_chunks', filename[1].name), 'wb') as output:
pickle.dump((filename[0], filename[1], num_sentences), output)
def srl_error_fix_generator(infile):
with infile.open("rb") as fp:
for sentence_arr in extract_sentences(fp.readlines()):
yield to_sentence(sentence_arr), sentence_arr
yield None
def srl_sentences_generator(infile, curr_index, sen_start_index):
with infile.open("rb") as fp:
outdata = {}
for sentence_arr in extract_sentences(fp.readlines()):
if curr_index < sen_start_index:
curr_index += 1
else:
yield to_sentence(sentence_arr), sentence_arr
yield None
def srl_multiple_files_sentences_generator(sentence_id): # srl_files):
sentence_id = max(0, sentence_id - 10)
for i, srl_file in enumerate(srl_file_sizes):
if sentence_id >= srl_file[3] and sentence_id < srl_file[3] + srl_file[2]:
srl_files = srl_file_sizes[i:]
break
for file_info in srl_files:
# srl_gen = srl_sentences_generator(file_info[1], file_info[3], file_info[4])
srl_gen = srl_sentences_generator(file_info[1], file_info[3], sentence_id)
el = next(srl_gen)
while el is not None:
yield el
el = next(srl_gen)
yield None
error_sentences_grouped = []
group = False
prev_name = ''
# group sentences by their files
for name in error_sentences:
if name[:9] == prev_name:
group.append(name)
else:
prev_name = name[:9]
if group:
error_sentences_grouped.append(group)
group = [name]
error_sentences_grouped.append(group)
srl_gen = srl_error_fix_generator(INPATH)
# find errors in json files:
# with open(os.path.join(INTERNAL_DATA, 'sentence_ids_list.pkl'), 'rb') as output:
# sentence_ids = pickle.load(output)
#
#
#
# origfiles = []
# for subdir, dirs, files in os.walk(OUTPATH):
# for file in files:
# origfiles.append(Path(os.path.join(subdir, file)))
# origfiles=sorted(origfiles)
#
#
#
# for sent in origfiles:
# # for sent in sentence_ids:
# # outfile = Path(OUTPATH, sent[:9] + '-dedup.json')
# outfile = sent
#
# try:
# with outfile.open() as json_file:
# json.load(json_file)
# pass
# except:
# print(outfile.name)
#
#
# raise Exception('test')
# iterate over all faulty sentences and fix them
for errors_in_file in error_sentences_grouped:
outfile = Path(OUTPATH, errors_in_file[0][:9] + '-dedup.json')
with outfile.open() as json_file:
print(outfile.name)
orig_json_data = json.load(json_file)
for error_sentence in errors_in_file:
orig_json_data = fix_json(srl_gen, error_sentence, orig_json_data)
with outfile.open('w') as json_file:
json.dump(orig_json_data, json_file)
logging.info("SRL relations written to: {}".format(outfile))

47
tools/gen_tei.py Normal file
View File

@ -0,0 +1,47 @@
# parse config
import configparser
import json
import logging
import os
from pathlib import Path
from tools.parser.parser import Parser
config = configparser.ConfigParser()
config.read("tools.cfg.ssj500k2.3")
ORIGPATH = Path(config["tools"]["ssj500k_orig_folder"])
JSONPATH = Path(config["tools"]["ssj500k_json"] + '/ssj500k-sl.body.json')
OUTPATH = Path(config["tools"]["ssj500k_tei"])
INTERNAL_DATA = Path(config["tools"]["internal_data"])
DEBUG = config["tools"]["debug"] == "True"
CPU_CORES = int(config["tools"]["cpu_cores"])
LOGFILE = Path(config["tools"]["logfile"]).absolute()
LOGFILE.touch(exist_ok=True)
LOGFILE.resolve()
logging.basicConfig(filename=str(LOGFILE), level=logging.INFO)
par = Parser()
OUTPATH.mkdir(exist_ok=True)
jsondata = []
with open(JSONPATH, 'r') as jf:
jsondata = json.load(jf)
logging.info("Generating TEI with annotated SRL.")
def handle_file(file, jsondata):
teifile = (ORIGPATH / file)
resfile = (OUTPATH / file)
orig_dict = par.parse_tei(teifile)
# origfile = get_origfile()
orig_dict = par.minimize_tei(teifile, jsondata)
origfiles = []
for subdir, dirs, files in os.walk(ORIGPATH):
for file in files:
handle_file(file, jsondata)
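
For reference, a hedged sketch of the JSON shape gen_tei.py expects in ssj500k-sl.body.json (the sentence id and indices below are invented; the arg/from/dep keys are the ones gen_json.py writes), together with the SRL <linkGrp> that minimize_tei() appends for it:

# Invented example of one entry in ssj500k-sl.body.json.
jsondata = {
    "ssj1.1.1": [
        {"arg": "TIME", "from": 6, "dep": 3},
        {"arg": "PAT", "from": 23, "dep": 21},
    ],
}
# For this entry minimize_tei() adds to sentence ssj1.1.1:
#   <linkGrp targFunc="head argument" type="SRL">
#     <link ana="srl:TIME" target="#ssj1.1.1.t6 #ssj1.1.1.t3"/>
#     <link ana="srl:PAT" target="#ssj1.1.1.t23 #ssj1.1.1.t21"/>
#   </linkGrp>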

View File

@ -1,3 +1,5 @@
import pickle
from parser.parser import Parser
import os
from os.path import join, dirname
@ -14,9 +16,31 @@ par = Parser()
# path to data
config = configparser.ConfigParser()
config.read("tools.cfg")
INDIR = Path(config["tools"]["kres_orig"])
OUTDIR = Path(config["tools"]["kres_tsv"])
# config.read("tools.cfg")
config.read("tools.cfg.ssj500k2.3")
analysis = ''
if 'kres_orig' in config["tools"]:
analysis = 'kres'
INDIR = Path(config["tools"]["kres_orig"])
OUTDIR = Path(config["tools"]["kres_tsv"])
elif 'giga_orig' in config["tools"]:
# analysis = 'gigafida'
analysis = 'giga'
INDIR_GIGA_ORIG = Path(config["tools"]["giga"])
INDIR_GIGA = Path(config["tools"]["giga_orig"])
INDIR_JOS = Path(config["tools"]["giga_jos"])
OUTDIR = Path(config["tools"]["giga_tsv"])
GIGA_PARTS = int(config["tools"]["giga_parts"])
INTERNAL_DATA = config["tools"]["internal_data"]
elif 'ssj500k_orig' in config["tools"]:
# analysis = 'gigafida'
analysis = 'ssj500k'
INDIR_SSJ500K_ORIG = Path(config["tools"]["ssj500k"])
INDIR_SSJ500K = Path(config["tools"]["ssj500k_orig"])
INDIR_JOS = Path(config["tools"]["ssj500k_jos"])
OUTDIR = Path(config["tools"]["ssj500k_tsv"])
INTERNAL_DATA = config["tools"]["internal_data"]
CPU_CORES = int(config["tools"]["cpu_cores"])
LOGFILE = Path(config["tools"]["logfile"]).absolute()
@ -34,41 +58,365 @@ print("end parsing ssj")
"""
# kres_file = "../data/kres_example/F0019343.xml.parsed.xml"
OUTDIR.mkdir(exist_ok=True)
# OUTDIR.mkdir(exist_ok=True)
infiles = list(enumerate([x for x in INDIR.iterdir() if x.is_file()]))
logging.info("Parsing kres: {} files.".format(len(infiles)))
if analysis == 'kres':
infiles = list(enumerate([x for x in INDIR.iterdir() if x.is_file()]))
logging.info("Parsing kres: {} files.".format(len(infiles)))
def handle_file(infile):
i = infile[0]
kres_file = infile[1]
outfile = (OUTDIR / kres_file.name).with_suffix(".tsv")
def handle_ssj500k_file():
kres_file = INDIR_SSJ500K_ORIG
outfile = OUTDIR
if outfile.is_file():
logging.info("Skipping existing file: {}.".format(str(kres_file)))
return True
try:
res_dict = par.parse_tei(kres_file)
kres_out_str = ""
for _, sentence in res_dict.items():
kres_out_str += par.to_conll_2009_SRL(sentence)
except Exception as exc:
logging.info("Failed processing file: {}".format(str(kres_file)))
logging.error(exc)
return False
# try:
res_dict = par.parse_tei(kres_file)
kres_out_str = ""
for _, sentence in res_dict.items():
kres_out_str += par.to_conll_2009_SRL(sentence)
# except Exception as exc:
# logging.info("Failed processing file: {}".format(str(kres_file)))
# logging.error(exc)
# return False
with outfile.open("wb+") as fp:
fp.write(kres_out_str.encode("utf-8"))
logging.info("Processed file ({}/{}): {}".format(i+1, len(infiles), str(kres_file)))
# logging.info("Processed file ({}/{}): {}".format(i+1, len(infiles), str(kres_file)))
return True
return False
with Pool(CPU_CORES) as p:
p.map(handle_file, infiles)
def ssj500k_orig_generator():
with open(INDIR_SSJ500K, 'r') as gof:
previous_new_line = False
for l_gof in gof:
if l_gof == '\n':
if previous_new_line:
continue
previous_new_line = True
elif previous_new_line:
previous_new_line = False
yield l_gof
def handle_gigafida_file():
"""
Splits the big text file into smaller files, splitting only on empty lines.
"""
# with open(INDIR_GIGA, 'r') as gof:
# with open(INDIR_JOS, 'r') as gjf:
# for i, (l_gof, l_gjf) in enumerate(zip(gof, gjf)):
# pass
# num_lines = i + 1
# print(num_lines)
gof_generator = giga_orig_generator()
# with open(INDIR_GIGA, 'r') as gof:
with open(INDIR_JOS, 'r') as gjf:
sentence = {}
sentence['tokens'] = []
sentence['links'] = {}
if os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % 0)):
ignore_lines = True
wf = False
else:
wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part), 'a')
ignore_lines = False
# for i, (l_gof, l_gjf) in enumerate(zip(gof, gjf)):
for i, l_gjf in enumerate(gjf):
l_gof = next(gof_generator)
if ignore_lines:
if i > num_lines_per_part * curr_part and l_gof == '\n':
if not os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % (curr_part + 2))):
ignore_lines = False
# delete last file (probably not whole)
os.remove(os.path.join(OUTDIR, 'giga%07d.tsv' % (curr_part + 1)))
if ignore_lines:
print(curr_part)
curr_part += 1
continue
else:
continue
l_gof_split = l_gof.split('\t')
l_gjf_split = l_gjf.split('\t')
# if punctuation
if l_gof != '\n':
if l_gof_split[1][-1] == 'u':
# print(l_gjf_split)
sentence['tokens'].append(('c', l_gjf_split[0], l_gjf_split[1]))
else:
sentence['tokens'].append(('w', l_gjf_split[0], l_gof_split[0], l_gof_split[1][:-2], l_gof_split[3][:-1]))
sentence['links'][l_gjf_split[0]] = (l_gjf_split[7], l_gjf_split[0], l_gjf_split[6])
# if l_gof == '\n':
else:
if wf:
# print(i)
wf.write(par.to_conll_2009_SRL(sentence))
sentence['tokens'] = []
sentence['links'] = {}
# wf.flush()
# if i > num_lines_per_part * (curr_part + 1) and l_gof == '\n':
if i > num_lines_per_part * (curr_part + 1):
curr_part += 1
# if wf doesn't exist (first one)
if wf:
wf.close()
wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part), 'a')
curr_part += 1
wf.close()
def handle_ssj500k_file2():
"""
Splits the big text file into smaller files, splitting only on empty lines.
"""
gof_generator = ssj500k_orig_generator()
# with open(INDIR_GIGA, 'r') as gof:
with open(INDIR_JOS, 'r') as gjf:
sentence = {}
sentence['tokens'] = []
sentence['links'] = {}
if os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % 0)):
ignore_lines = True
wf = False
else:
wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part), 'a')
ignore_lines = False
# for i, (l_gof, l_gjf) in enumerate(zip(gof, gjf)):
for i, l_gjf in enumerate(gjf):
l_gof = next(gof_generator)
if ignore_lines:
if i > num_lines_per_part * curr_part and l_gof == '\n':
if not os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % (curr_part + 2))):
ignore_lines = False
# delete last file (probably not whole)
os.remove(os.path.join(OUTDIR, 'giga%07d.tsv' % (curr_part + 1)))
if ignore_lines:
print(curr_part)
curr_part += 1
continue
else:
continue
l_gof_split = l_gof.split('\t')
l_gjf_split = l_gjf.split('\t')
# if punctuation
if l_gof != '\n':
if l_gof_split[1][-1] == 'u':
# print(l_gjf_split)
sentence['tokens'].append(('c', l_gjf_split[0], l_gjf_split[1]))
else:
sentence['tokens'].append(('w', l_gjf_split[0], l_gof_split[0], l_gof_split[1][:-2], l_gof_split[3][:-1]))
sentence['links'][l_gjf_split[0]] = (l_gjf_split[7], l_gjf_split[0], l_gjf_split[6])
# if l_gof == '\n':
else:
if wf:
# print(i)
wf.write(par.to_conll_2009_SRL(sentence))
sentence['tokens'] = []
sentence['links'] = {}
# wf.flush()
# if i > num_lines_per_part * (curr_part + 1) and l_gof == '\n':
if i > num_lines_per_part * (curr_part + 1):
curr_part += 1
# if wf doesn't exist (first one)
if wf:
wf.close()
wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part), 'a')
curr_part += 1
wf.close()
import time
def handle_giga_file(ran):
"""
Splits the big text file into smaller files, splitting only on empty lines.
"""
# with open(INDIR_GIGA, 'r') as gof:
# with open(INDIR_JOS, 'r') as gjf:
# for i, (l_gof, l_gjf) in enumerate(zip(gof, gjf)):
# pass
# num_lines = i + 1
# print(num_lines)
num_lines = 1393184026
# 1393184026
# 1393184033
# return
num_lines_per_part = num_lines / GIGA_PARTS
curr_part = 0
gof_generator = giga_orig_generator()
# with open(INDIR_GIGA, 'r') as gof:
with open(INDIR_JOS, 'r') as gjf:
sentence = {}
sentence['tokens'] = []
sentence['links'] = {}
wf = None
if curr_part in file_indices:
if os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % ran[0])):
os.remove(os.path.join(OUTDIR, 'giga%07d.tsv' % ran[0]))
wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % ran[0]), 'a')
# for i, (l_gof, l_gjf) in enumerate(zip(gof, gjf)):
for i, l_gjf in enumerate(gjf):
l_gof = next(gof_generator)
if curr_part < ran[0]:
if i > num_lines_per_part * (curr_part + 1) and l_gof == '\n' :
if curr_part < ran[0]:
print(curr_part)
curr_part += 1
continue
else:
continue
l_gof_split = l_gof.split('\t')
l_gjf_split = l_gjf.split('\t')
# if punctuation
if l_gof != '\n':
if curr_part not in file_indices:
continue
if l_gof_split[1][-1] == 'u':
# print(l_gjf_split)
sentence['tokens'].append(('c', l_gjf_split[0], l_gjf_split[1]))
else:
sentence['tokens'].append(('w', l_gjf_split[0], l_gof_split[0], l_gof_split[1][:-2], l_gof_split[3][:-1]))
sentence['links'][l_gjf_split[0]] = (l_gjf_split[7], l_gjf_split[0], l_gjf_split[6])
# if l_gof == '\n':
else:
if curr_part in file_indices:
wf.write(par.to_conll_2009_SRL(sentence))
sentence['tokens'] = []
sentence['links'] = {}
# wf.flush()
# if i > num_lines_per_part * (curr_part + 1) and l_gof == '\n':
if i > num_lines_per_part * (curr_part + 1):
curr_part += 1
# if wf doesn't exist (first one)
if curr_part in file_indices and wf:
wf.close()
if curr_part >= ran[1]:
break
if curr_part in file_indices:
if os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part)):
os.remove(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part))
wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part), 'a')
curr_part += 1
wf.close()
def handle_giga_file_selected_sentences(error_sentences):
"""
Extracts only the given (error) sentences from the big text file and writes them to a single output file, splitting on empty lines.
"""
# with open(INDIR_GIGA, 'r') as gof:
# with open(INDIR_JOS, 'r') as gjf:
# for i, (l_gof, l_gjf) in enumerate(zip(gof, gjf)):
# pass
# num_lines = i + 1
# print(num_lines)
# print('num_lines' + 3)
# num_lines = 1393184026
num_lines = 1393222523
# 1393184026
# 1393184033
# return
# num_lines_per_part = num_lines / GIGA_PARTS
# curr_part = 0
gof_generator = giga_orig_generator()
# with open(INDIR_GIGA, 'r') as gof:
with open(INDIR_JOS, 'r') as gjf:
sentence = {}
sentence['tokens'] = []
sentence['links'] = {}
wf = None
if os.path.exists(os.path.join(OUTDIR, 'giga_errors')):
os.remove(os.path.join(OUTDIR, 'giga_errors'))
wf = open(os.path.join(OUTDIR, 'giga_errors'), 'a')
with open(os.path.join(INTERNAL_DATA, 'sentence_ids_list.pkl'), 'rb') as pkl_file:
sentence_ids_list = pickle.load(pkl_file)
sentence_id = 0
skip_sentence = not sentence_ids_list[sentence_id] in error_sentences
# for i, (l_gof, l_gjf) in enumerate(zip(gof, gjf)):
for i, l_gjf in enumerate(gjf):
l_gof = next(gof_generator)
if l_gjf == '\n':
if not skip_sentence:
wf.write(par.to_conll_2009_SRL(sentence))
sentence['tokens'] = []
sentence['links'] = {}
sentence_id += 1
if sentence_ids_list[sentence_id] in error_sentences:
print(sentence_ids_list[sentence_id])
skip_sentence = False
else:
skip_sentence = True
if skip_sentence:
continue
# if curr_part < ran[0]:
# if i > num_lines_per_part * (curr_part + 1) and l_gof == '\n' :
# if curr_part < ran[0]:
# print(curr_part)
# curr_part += 1
# continue
# else:
# continue
l_gof_split = l_gof.split('\t')
l_gjf_split = l_gjf.split('\t')
# if punctuation
if l_gof != '\n':
if l_gof_split[1][-1] == 'u':
# print(l_gjf_split)
sentence['tokens'].append(('c', l_gjf_split[0], l_gjf_split[1]))
else:
sentence['tokens'].append(('w', l_gjf_split[0], l_gof_split[0], l_gof_split[1][:-2], l_gof_split[3][:-1]))
sentence['links'][l_gjf_split[0]] = (l_gjf_split[7], l_gjf_split[0], l_gjf_split[6])
# if l_gof == '\n':
# wf.flush()
# if i > num_lines_per_part * (curr_part + 1) and l_gof == '\n':
# if i > num_lines_per_part * (curr_part + 1):
# curr_part += 1
# # if wf doesn't exist (first one)
# if curr_part in file_indices and wf:
# wf.close()
# if curr_part >= ran[1]:
# break
# if curr_part in file_indices:
# if os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part)):
# os.remove(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part))
#
# wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part), 'a')
# curr_part += 1
wf.close()
handle_ssj500k_file()
logging.info("end parsing kres")

View File

@ -1,3 +1,5 @@
import copy
from lxml import etree
import re
from parser.msd.msdmap import Msdmap
@ -5,6 +7,7 @@ import pickle
from pathlib import Path
from fillpred_model.step1 import build_model_row
import sys
import xml.etree.ElementTree as ET
class Parser:
# reads a TEI xml file and returns a dictionary:
@ -29,17 +32,23 @@ class Parser:
def parse_tei(self, filepath):
def parse_links(s_el):
lgrps = s_el.findall(".//links")
sent_id = '#' + s_el.get('id')
lgrps = s_el.findall(".//linkGrp")
if len(lgrps) < 1:
raise IOError("Can't find links.")
res_links = {}
for link in lgrps[0]:
dep = int(link.get("dep").split(".")[-1])
res_links[dep] = (
link.get("afun"),
dep,
int(link.get("from").split(".")[-1]),
)
for lgrp in lgrps:
if lgrp.get("type") == "JOS-SYN":
for link in lgrp:
jos_type = link.get("ana").split(":")[-1]
link_data = link.get("target").split(" ")
link_from = int(link_data[1].split('.')[-1][1:])
link_to = int(link_data[0].split('.')[-1][1:]) if sent_id != link_data[0] else 0
res_links[link_from] = (
jos_type,
link_from,
link_to,
)
return res_links
guess_corpus = None # SSJ | KRES
@ -57,7 +66,10 @@ class Parser:
divs = [] # in ssj, there are divs, in Kres, there are separate files
if "id" in root.keys():
# Kres files start with <TEI id=...>
guess_corpus = "KRES"
if root.get("id")[0:2] == 'GF':
guess_corpus = "GIGA"
else:
guess_corpus = "KRES"
divs = [root]
else:
guess_corpus = "SSJ"
@ -65,7 +77,10 @@ class Parser:
# parse divs
for div in divs:
f_id = div.get("id")
f_id = div.get("id")[:-6]
if guess_corpus == "GIGA":
div = div.findall(".//body")[0]
# parse paragraphs
for p in div.findall(".//p"):
@ -73,8 +88,14 @@ class Parser:
# parse sentences
for s in p.findall(".//s"):
# skip sentences that lack JOS-SYN or UD-SYN annotations or that already have SRL
sent_annot_type_list = [links.get('type') for links in s.findall(".//linkGrp")]
if 'JOS-SYN' not in sent_annot_type_list or 'UD-SYN' not in sent_annot_type_list or 'SRL' in sent_annot_type_list:
continue
s_id = s.get("id").split(".")[-1]
sentence_text = ""
sentence_list = []
sentence_tokens = []
# parse tokens
@ -84,46 +105,279 @@ class Parser:
if el_id[0] == 't':
el_id = el_id[1:] # ssj W_TAG ids start with t
sentence_text += el.text
uPosTag = None
uPosFeats = []
for msd_el in el.get("msd").split('|'):
key, val = msd_el.split('=')
if key == 'UPosTag':
uPosTag = val
else:
uPosFeats.append(msd_el)
uPosFeats = '|'.join(uPosFeats)
sentence_tokens += [(
"w",
int(el_id),
el.text,
el.get("lemma"),
(el.get("msd") if guess_corpus == "KRES"
else el.get("ana").split(":")[-1]),
(el.get("msd") if guess_corpus == "KRES" or guess_corpus == "GIGA"
else el.get("ana").split(":")[-1]),
uPosTag,
uPosFeats
)]
elif el.tag in self.C_TAGS:
# only Kres' C_TAGS have ids
el_id = el.get("id") or "none"
el_id = el_id.split(".")[-1]
sentence_text += el.text
sentence_tokens += [("c", el_id, el.text,)]
if guess_corpus != "GIGA":
el_id = el.get("id") or "none"
el_id = el_id.split(".")[-1]
sentence_text += el.text
sentence_tokens += [("c", el_id, el.text,)]
elif el.tag in self.S_TAGS:
# Kres' <S /> doesn't contain .text
sentence_text += " "
el_id = el.get("id").split(".")[-1]
if el_id[0] == 't':
el_id = el_id[1:] # ssj W_TAG ids start with t
sentence_text += el.text
uPosTag = None
uPosFeats = []
for msd_el in el.get("msd").split('|'):
key, val = msd_el.split('=')
if key == 'UPosTag':
uPosTag = val
else:
uPosFeats.append(msd_el)
uPosFeats = '|'.join(uPosFeats)
sentence_tokens += [(
"pc",
int(el_id),
el.text,
el.text,
(el.get("msd") if guess_corpus == "KRES" or guess_corpus == "GIGA"
else el.get("ana").split(":")[-1]),
uPosTag,
uPosFeats
)]
else:
# pass links and linkGroups
pass
sentence_id = "{}.{}.{}".format(f_id, p_id, s_id)
sentence_id = s.get("id")
if sentence_id in res_dict:
raise KeyError("duplicated id: {}".format(sentence_id))
res_dict[sentence_id] = {
"sid": sentence_id,
"text": sentence_text,
"tokens": sentence_tokens,
"links": (
parse_links(s) if guess_corpus == "KRES" else None
parse_links(s)
)
}
fp.close()
return res_dict
def minimize_tei(self, filepath, jsondata):
def set_xml_attr(node, attribute, value):
node.attrib['{http://www.w3.org/XML/1998/namespace}' + attribute] = value
def parse_links(s_el):
sent_id = '#' + s_el.get('id')
lgrps = s_el.findall(".//linkGrp")
if len(lgrps) < 1:
raise IOError("Can't find links.")
res_links = {}
for lgrp in lgrps:
if lgrp.get("type") == "JOS-SYN":
for link in lgrp:
jos_type = link.get("ana").split(":")[-1]
link_data = link.get("target").split(" ")
link_from = int(link_data[1].split('.')[-1][1:])
link_to = int(link_data[0].split('.')[-1][1:]) if sent_id != link_data[0] else 0
res_links[link_from] = (
jos_type,
link_from,
link_to,
)
return res_links
guess_corpus = None # SSJ | KRES
res_dict = {}
# with filepath.open("rb") as fp, open("../data/ssj500k2.3/final_tei/res.xml", 'w') as sf:
with filepath.open("rb") as fp:
used_ssj_documents = set([k.split('.')[0] for k, v in jsondata.items()])
used_ssj_paragraphs = set(['.'.join(k.split('.')[:-1]) for k, v in jsondata.items()])
used_ssj_sentences = set([k for k, v in jsondata.items()])
ET.register_namespace("", "http://www.tei-c.org/ns/1.0")
tree = ET.parse(fp)
root_res = tree.getroot()
# root_res = copy.deepcopy(root)
ns = '{http://www.w3.org/XML/1998/namespace}'
ns2 = '{http://www.tei-c.org/ns/1.0}'
for doc in list(root_res):
doc_id = doc.get(ns + 'id')
if doc_id not in used_ssj_documents:
root_res.remove(doc)
continue
for par in list(doc):
par_id = par.get(ns + 'id')
if par_id not in used_ssj_paragraphs:
if par.tag != ns2 + 'bibl':
doc.remove(par)
continue
for sen in list(par):
sen_id = sen.get(ns + 'id')
if sen_id not in used_ssj_sentences:
par.remove(sen)
continue
linkGrp = ET.Element(f'{ns2}linkGrp')
linkGrp.attrib[f'targFunc'] = 'head argument'
linkGrp.attrib[f'type'] = 'SRL'
for srl_el in jsondata[sen_id]:
link = ET.Element(f'{ns2}link')
link.attrib['ana'] = f'srl:{srl_el["arg"]}'
link.attrib['target'] = f'#{sen_id}.t{srl_el["from"]} #{sen_id}.t{srl_el["dep"]}'
linkGrp.append(link)
sen.append(linkGrp)
# <linkGrp corresp="#ssj1.1.1" targFunc="head argument" type="SRL">
# <link ana="srl:TIME" target="#ssj1.1.1.t6 #ssj1.1.1.t3"/>
# <link ana="srl:QUANT" target="#ssj1.1.1.t6 #ssj1.1.1.t5"/>
# <link ana="srl:TIME" target="#ssj1.1.1.t8 #ssj1.1.1.t11"/>
# <link ana="srl:PAT" target="#ssj1.1.1.t23 #ssj1.1.1.t21"/>
# <link ana="srl:ACT" target="#ssj1.1.1.t23 #ssj1.1.1.t22"/>
# <link ana="srl:RESLT" target="#ssj1.1.1.t18 #ssj1.1.1.t23"/>
# </linkGrp>
# print('aaa')
# sf.write(etree.tostring(tree, pretty_print=True, encoding='utf-8').decode())
tree.write("../data/ssj500k2.3/final_tei/res.xml", encoding='utf-8')
return
divs = [] # in ssj, there are divs, in Kres, there are separate files
if "id" in root.keys():
# Kres files start with <TEI id=...>
if root.get("id")[0:2] == 'GF':
guess_corpus = "GIGA"
else:
guess_corpus = "KRES"
divs = [root]
else:
guess_corpus = "SSJ"
divs = root.findall(".//div")
# parse divs
for div in divs:
f_id = div.get("id")
if guess_corpus == "GIGA":
div = div.findall(".//body")[0]
# parse paragraphs
for p in div.findall(".//p"):
p_id = p.get("id").split(".")[-1]
# parse sentences
for s in p.findall(".//s"):
# skip sentences that lack JOS-SYN or UD-SYN annotations or that already have SRL
sent_annot_type_list = [links.get('type') for links in s.findall(".//linkGrp")]
if 'JOS-SYN' not in sent_annot_type_list or 'UD-SYN' not in sent_annot_type_list or 'SRL' in sent_annot_type_list:
del s
continue
s_id = s.get("id").split(".")[-1]
sentence_text = ""
sentence_list = []
sentence_tokens = []
# parse tokens
for el in s.iter():
if el.tag in self.W_TAGS:
el_id = el.get("id").split(".")[-1]
if el_id[0] == 't':
el_id = el_id[1:] # ssj W_TAG ids start with t
sentence_text += el.text
uPosTag = None
uPosFeats = []
for msd_el in el.get("msd").split('|'):
key, val = msd_el.split('=')
if key == 'UPosTag':
uPosTag = val
else:
uPosFeats.append(msd_el)
uPosFeats = '|'.join(uPosFeats)
sentence_tokens += [(
"w",
int(el_id),
el.text,
el.get("lemma"),
(el.get("msd") if guess_corpus == "KRES" or guess_corpus == "GIGA"
else el.get("ana").split(":")[-1]),
uPosTag,
uPosFeats
)]
elif el.tag in self.C_TAGS:
# only Kres' C_TAGS have ids
if guess_corpus != "GIGA":
el_id = el.get("id") or "none"
el_id = el_id.split(".")[-1]
sentence_text += el.text
sentence_tokens += [("c", el_id, el.text,)]
elif el.tag in self.S_TAGS:
el_id = el.get("id").split(".")[-1]
if el_id[0] == 't':
el_id = el_id[1:] # ssj W_TAG ids start with t
sentence_text += el.text
uPosTag = None
uPosFeats = []
for msd_el in el.get("msd").split('|'):
key, val = msd_el.split('=')
if key == 'UPosTag':
uPosTag = val
else:
uPosFeats.append(msd_el)
uPosFeats = '|'.join(uPosFeats)
sentence_tokens += [(
"pc",
int(el_id),
el.text,
el.text,
(el.get("msd") if guess_corpus == "KRES" or guess_corpus == "GIGA"
else el.get("ana").split(":")[-1]),
uPosTag,
uPosFeats
)]
else:
# pass links and linkGroups
pass
sentence_id = s.get("id")
if sentence_id in res_dict:
raise KeyError("duplicated id: {}".format(sentence_id))
res_dict[sentence_id] = {
"sid": sentence_id,
"text": sentence_text,
"tokens": sentence_tokens,
"links": (
parse_links(s)
)
}
et = etree.ElementTree(root)
et.write("../data/ssj500k2.3/final_tei/res.xml", pretty_print=True, encoding='unicode')
fp.close()
return res_dict
def to_conll_2009_SRL(self, sentence_entry):
def fillpred(tsv_row):
mrow = build_model_row(tsv_row)
x = mrow[:-1]
x = mrow[:-1]
y = self.fillpred_model.predict([x])
return y[0] # bool
@ -135,12 +389,8 @@ class Parser:
# handle punctuation tokens
if token[0] != "w":
out_str += '\t'.join(
[t_id] +
[form for x in range(7)] +
["0", "0", "modra", "modra", "_", "_"] +
["\n"]
)
out_list = [t_id] + [form for x in range(7)] + ["0", "0", "modra", "modra", "_", "_"] + ["\n"]
out_str += '\t'.join(map(str, out_list))
continue
pos = self.msdmap.slo_msd_to_eng_pos(token[4])
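
To make the target convention in parse_links() above concrete, a small self-contained sketch (the sentence id, token ids, label and the 'jos-syn' prefix are invented for illustration):

# Illustration of the JOS-SYN link parsing done in parse_links() above.
import xml.etree.ElementTree as ET

sent_id = "#ssj1.1.1"
link = ET.fromstring('<link ana="jos-syn:Sb" target="#ssj1.1.1.t2 #ssj1.1.1.t1"/>')

jos_type = link.get("ana").split(":")[-1]   # 'Sb'
head, dep = link.get("target").split(" ")   # head id first, dependent id second
link_from = int(dep.split(".")[-1][1:])     # strip the leading 't' -> 1
# a head equal to the sentence id itself marks the root and becomes 0
link_to = int(head.split(".")[-1][1:]) if sent_id != head else 0

print({link_from: (jos_type, link_from, link_to)})  # {1: ('Sb', 1, 2)}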

View File

@ -34,7 +34,8 @@ JVM_ARGS="-cp $CP -Xmx$MEM"
NOPI="-nopi" #Uncomment this if you want to skip the predicate identification step. This setting is equivalent to the CoNLL 2009 ST.
CMD="$JAVA $JVM_ARGS se.lth.cs.srl.Parse $Lang $INPUT $MODEL $RERANKER $NOPI $OUTPUT"
echo "Executing: $CMD"
$JAVA $JVM_ARGS se.lth.cs.srl.Parse $Lang "$INPUT" $MODEL $RERANKER $NOPI "$OUTPUT"
# CMD="$JAVA $JVM_ARGS se.lth.cs.srl.Parse $Lang '$INPUT' $MODEL $RERANKER $NOPI '$OUTPUT'"
# echo "Executing: $CMD"
$CMD
# $CMD

View File

@ -0,0 +1,29 @@
#!/bin/bash
# parsing tools.cfg values
IN_FOLDER="$(sed -n -e 's/^\s*giga_tsv\s*=\s*//p' ../tools.cfg)"
echo "input folder: $IN_FOLDER"
OUT_FOLDER="$(sed -n -e 's/^\s*giga_srl\s*=\s*//p' ../tools.cfg)"
echo "output folder: $OUT_FOLDER"
SUFFIX="srl.tsv"
mkdir -p "$OUT_FOLDER"
rm "$OUT_FOLDER"/*${SUFFIX} &> /dev/null
for infile in "$IN_FOLDER"/*; do
echo "Tagging: ${infile}"
base=$(basename $infile | cut -d'.' -f1)
outfile="${OUT_FOLDER}/${base}.${SUFFIX}"
# mate-tools tagger
./scripts/parse_srl_only_mod.sh "$infile" "$outfile"
if [ $? -eq 0 ]; then
echo "Saved as ${outfile}"
else
echo "ERR"
exit 1
fi
done

View File

@ -0,0 +1,29 @@
#!/bin/bash
# parsing tools.cfg values
IN_FOLDER="../$(sed -n -e 's/^\s*kres_tsv\s*=\s*//p' ../tools.cfg)"
echo "input folder: $IN_FOLDER"
OUT_FOLDER="../$(sed -n -e 's/^\s*kres_srl\s*=\s*//p' ../tools.cfg)"
echo "output folder: $OUT_FOLDER"
SUFFIX="srl.tsv"
mkdir -p $OUT_FOLDER
rm $OUT_FOLDER/*${SUFFIX} &> /dev/null
for infile in $IN_FOLDER/*; do
echo "Tagging: ${infile}"
base=$(basename $infile | cut -d'.' -f1)
outfile=${OUT_FOLDER}/${base}.${SUFFIX}
# mate-tools tagger
./scripts/parse_srl_only_mod.sh $infile $outfile
if [ $? -eq 0 ]; then
echo "Saved as ${outfile}"
else
echo "ERR"
exit 1
fi
done

View File

@ -1,15 +1,16 @@
#!/bin/bash
# parsing tools.cfg values
IN_FOLDER="../$(sed -n -e 's/^\s*kres_tsv\s*=\s*//p' ../tools.cfg)"
IN_FOLDER="../$(sed -n -e 's/^\s*giga_tsv\s*=\s*//p' ../tools.cfg.kres_new)"
IN_FOLDER=$IN_FOLDER$1
echo "input folder: $IN_FOLDER"
OUT_FOLDER="../$(sed -n -e 's/^\s*kres_srl\s*=\s*//p' ../tools.cfg)"
OUT_FOLDER="../$(sed -n -e 's/^\s*giga_srl\s*=\s*//p' ../tools.cfg.kres_new)"
echo "output folder: $OUT_FOLDER"
SUFFIX="srl.tsv"
mkdir -p $OUT_FOLDER
rm $OUT_FOLDER/*${SUFFIX} &> /dev/null
# rm $OUT_FOLDER/*${SUFFIX} &> /dev/null
for infile in $IN_FOLDER/*; do
echo "Tagging: ${infile}"

View File

@ -0,0 +1,30 @@
#!/bin/bash
# parsing tools.cfg values
IN_FOLDER="../$(sed -n -e 's/^\s*ssj500k_tsv_folder\s*=\s*//p' ../tools.cfg.ssj500k2.3)"
IN_FOLDER=$IN_FOLDER$1
echo "input folder: $IN_FOLDER"
OUT_FOLDER="../$(sed -n -e 's/^\s*ssj500k_srl\s*=\s*//p' ../tools.cfg.ssj500k2.3)"
echo "output folder: $OUT_FOLDER"
SUFFIX="srl.tsv"
mkdir -p $OUT_FOLDER
# rm $OUT_FOLDER/*${SUFFIX} &> /dev/null
for infile in $IN_FOLDER/*; do
echo "Tagging: ${infile}"
base=$(basename $infile | cut -d'.' -f1)
outfile=${OUT_FOLDER}/${base}.${SUFFIX}
# mate-tools tagger
./scripts/parse_srl_only_mod.sh $infile $outfile
if [ $? -eq 0 ]; then
echo "Saved as ${outfile}"
else
echo "ERR"
exit 1
fi
done

View File

@ -1,8 +1,13 @@
[tools]
kres_orig = /kres_mount/kres_parsed/tei
kres_tsv = ../data/kres_out/1_tsv
kres_srl = ../data/kres_out/2_srl
kres_json = ../data/kres_out/final_json
logfile = ../progress.log
cpu_cores = 5
debug = False
giga = ../data/gf_example/gf2_orig
giga_orig = ../data/gf_example/gf2-dedup.patch0001
giga_jos = ../data/gf_example/gf2-dedup.jos.patch0001
giga_tsv = ../data/gf_example/gf_files_part
giga_srl = ../data/gf_example/2_srl
;giga_srl_errors = /media/luka/Portable Disk/Datasets/gigafida_jos/2_srl_errors/giga_errors.srl.tsv
giga_json = ../data/gf_example/final_json
internal_data = ../data/gf_example/internal_data
giga_parts = 100000
logfile = ../data/gf_example/progress.log
cpu_cores = 1
debug = True

16
tools/tools.cfg.gigafida Normal file
View File

@ -0,0 +1,16 @@
[tools]
giga = /media/luka/Portable Disk/Datasets/gigafida_jos/gf2_orig
giga_orig = /media/luka/Portable Disk/Datasets/gigafida_jos/gf2-dedup.patch0001
; giga_orig_old = /media/luka/Portable Disk/Datasets/gigafida_jos/gf2-dedup
giga_jos = /media/luka/Portable Disk/Datasets/gigafida_jos/gf2-dedup.jos.patch0001
giga_tsv = /media/luka/Portable Disk/Datasets/gigafida_jos/gf_files_part
; giga_tsv = /media/luka/Portable Disk/Datasets/gigafida_jos/TEMP
; giga_tsv = /media/luka/Portable Disk/Datasets/gigafida_jos/gf2_files_copy
; giga_srl = /media/luka/Portable Disk/Datasets/gigafida_jos/TEMP
giga_srl = /media/luka/Portable Disk/Datasets/gigafida_jos/2_srl
giga_json = /media/luka/Portable Disk/Datasets/gigafida_jos/final_json
internal_data = /media/luka/Portable Disk/Datasets/gigafida_jos/internal_data
giga_parts = 100000
logfile = ../progress.log
cpu_cores = 1
debug = False

8
tools/tools.cfg.kres Normal file
View File

@ -0,0 +1,8 @@
[tools]
kres_orig = /home/luka/Development/srl/data/kres_parsed/tei
kres_tsv = ../data/kres_out/1_tsv
kres_srl = ../data/kres_out/2_srl
kres_json = ../data/kres_out/final_json
logfile = ../progress.log
cpu_cores = 5
debug = False

8
tools/tools.cfg.kres_new Normal file
View File

@ -0,0 +1,8 @@
[tools]
kres_orig = /home/luka/Development/srl/data/kres_parsed/tei
giga_tsv = ../data/giga_out/1_tsv
giga_srl = ../data/giga_out/2_srl
kres_json = ../data/giga_out/final_json
logfile = ../progress.log
cpu_cores = 5
debug = False

View File

@ -0,0 +1,15 @@
[tools]
ssj500k = ../data/ssj500k2.3/orig/ssj500k-sl.body.xml
ssj500k_orig = ../data/ssj500k2.3/orig/ssj500k-sl.body.xml
ssj500k_orig_folder = ../data/ssj500k2.3/orig
ssj500k_jos = ../data/ssj500k2.3/orig/ssj500k-sl.body.xml
ssj500k_tsv = ../data/ssj500k2.3/tsvs/tsvs.tsv
ssj500k_tsv_folder = ../data/ssj500k2.3/tsvs
ssj500k_srl = ../data/ssj500k2.3/srls
ssj500k_json = ../data/ssj500k2.3/final_json
ssj500k_tei = ../data/ssj500k2.3/final_tei
internal_data = ../data/ssj500k2.3/internal_data
;internal_data = ../data/gf_example/internal_data
logfile = ../data/ssj500k2.3/progress.log
cpu_cores = 1
debug = True
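
The keys above are consumed with configparser in the same way as the other tools in this diff; a minimal sketch, assuming it is run from the tools/ directory:

# Mirrors how gen_tei.py / gen_json.py read tools.cfg.ssj500k2.3.
import configparser
from pathlib import Path

config = configparser.ConfigParser()
config.read("tools.cfg.ssj500k2.3")

ORIGPATH = Path(config["tools"]["ssj500k_orig_folder"])
JSONPATH = Path(config["tools"]["ssj500k_json"] + "/ssj500k-sl.body.json")
OUTPATH = Path(config["tools"]["ssj500k_tei"])
DEBUG = config["tools"]["debug"] == "True"
CPU_CORES = int(config["tools"]["cpu_cores"])

print(ORIGPATH, JSONPATH, OUTPATH, DEBUG, CPU_CORES)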