forked from kristjan/cjvt-srl-tagging
Compare commits
No commits in common. "ssj500k" and "master" have entirely different histories.
3
.gitignore
vendored
3
.gitignore
vendored
|
@ -6,6 +6,3 @@ nohup.out
|
|||
|
||||
data/kres_out/*
|
||||
data/kres_example/
|
||||
venv/
|
||||
.idea/
|
||||
data/
|
||||
|
|
3
Makefile
3
Makefile
|
@ -6,9 +6,8 @@ json_files: # srl_tagged_files
|
|||
cd tools; python3 gen_json.py
|
||||
|
||||
srl_tagged_files: # tsv_files
|
||||
# # cd tools/srl-20131216; ./scripts/parse_srl_only_mod.sh; cd -
|
||||
# cd tools/srl-20131216; ./scripts/parse_srl_only_mod.sh; cd -
|
||||
cd tools/srl-20131216; ./tag_all.sh
|
||||
# cd tools/srl-20131216; ./tag_ssj500k2.3.sh
|
||||
|
||||
tsv_files: # tools/fillpred_model/model.pickle
|
||||
cd tools; python3 parse_all.py
|
||||
|
|
|
@ -1,11 +1,3 @@
|
|||
# Instructions
|
||||
To mine ssj500k, <b>check out the ssj500k branch</b>.
|
||||
For the order in which to run the tools, see the Makefile. In general it works like this:
|
||||
- tools/parse_all.py - Creates the mate file that is required to run the Java-based srl.jar
|
||||
- tools/srl-20131216/tag_all.sh - Tags ssj500k
|
||||
- tools/gen_json.py - Mine SRL to json
|
||||
- tools/gen_tei.py - Mine SRL to tei
|
||||
|
||||
# cjvt-srl-tagging
|
||||
We'll be using mate-tools to perform SRL on Kres.
|
||||
|
||||
|
|
|
@ -15,6 +15,6 @@ run:
|
|||
-v /etc/group:/etc/group \
|
||||
-v $(shell pwd)/../../:/cjvt-srl-tagging \
|
||||
-w /cjvt-srl-tagging \
|
||||
-v /home/luka/Development/srl/data:/kres_mount:ro \
|
||||
-v /home/kristjan/kres_mount:/kres_mount:ro \
|
||||
python-java \
|
||||
/bin/bash
|
||||
|
|
|
@ -1,19 +0,0 @@
|
|||
# For every chunk index, check whether the tagged file giga<index>.srl.tsv
# exists in INPATH; if it does not, copy the source chunk from SOURCEPATH
# into OUTPATH and print the name of the missing tagged file.
import os
from shutil import copyfile

SOURCEPATH = '/media/luka/Portable Disk/Datasets/gigafida_jos/gf2_files'
INPATH = '/media/luka/Portable Disk/Datasets/gigafida_jos/2_srl'
OUTPATH = '/home/luka/Development/srl/cjvt-srl-tagging/data/giga_out/1_tsv'

for i in range(100000):
    # NOTE(review): indentation was reconstructed; the name print is assumed
    # to belong to the "tagged file missing" branch - confirm.
    if not os.path.exists(os.path.join(INPATH, 'giga%07d.srl.tsv' % i)):
        # source chunks are named "giga.<n>.tsv", the copies "giga<n>.tsv"
        copyfile(os.path.join(SOURCEPATH, 'giga.%07d.tsv' % i),
                 os.path.join(OUTPATH, 'giga%07d.tsv' % i))
        print('giga%07d.srl.tsv' % i)

    # progress indicator
    if i % 1000 == 0:
        print(i)
|
|
@ -1,192 +0,0 @@
|
|||
import pickle
|
||||
|
||||
from parser.parser import Parser
|
||||
import os
|
||||
from os.path import join, dirname
|
||||
from pathlib import Path
|
||||
import re
|
||||
import sys
|
||||
import cProfile
|
||||
import configparser
|
||||
import logging
|
||||
from multiprocessing import Pool
|
||||
|
||||
SSJ500K_2_1 = 27829 # number of sentences
|
||||
par = Parser()
|
||||
|
||||
# path to data
|
||||
config = configparser.ConfigParser()
|
||||
config.read("tools.cfg")
|
||||
analysis = ''
|
||||
if 'kres_orig' in config["tools"]:
|
||||
analysis = 'kres'
|
||||
INDIR = Path(config["tools"]["kres_orig"])
|
||||
OUTDIR = Path(config["tools"]["kres_tsv"])
|
||||
elif 'giga_orig' in config["tools"]:
|
||||
# analysis = 'gigafida'
|
||||
analysis = 'giga'
|
||||
INDIR_GIGA = Path(config["tools"]["giga_orig"])
|
||||
INDIR_GIGA_ORIG = Path(config["tools"]["giga"])
|
||||
INDIR_JOS = Path(config["tools"]["giga_jos"])
|
||||
OUTDIR = Path(config["tools"]["giga_tsv"])
|
||||
GIGA_PARTS = int(config["tools"]["giga_parts"])
|
||||
INTERNAL_DATA = config["tools"]["internal_data"]
|
||||
|
||||
CPU_CORES = int(config["tools"]["cpu_cores"])
|
||||
|
||||
LOGFILE = Path(config["tools"]["logfile"]).absolute()
|
||||
LOGFILE.touch(exist_ok=True)
|
||||
LOGFILE.resolve()
|
||||
|
||||
logging.basicConfig(filename=str(LOGFILE), level=logging.INFO)
|
||||
|
||||
# Every file below INDIR_GIGA_ORIG (recursively), numbered in sorted path
# order; each entry is an (index, Path) tuple.
# NOTE(review): INDIR_GIGA_ORIG appears to be assigned only in the
# 'giga_orig' configuration branch - confirm it is defined for 'kres' runs.
origfiles = list(enumerate(sorted(
    Path(os.path.join(subdir, name))
    for subdir, _dirs, names in os.walk(INDIR_GIGA_ORIG)
    for name in names
)))
|
||||
|
||||
def giga_orig_sentence_generator():
    """Yield the sentences of the file at ``INDIR_GIGA`` one at a time.

    The file is read line by line. The first tab-separated field of every
    other line is collected as a word; a line consisting of a lone newline
    ends the current sentence, which is yielded as the collected words
    joined by single spaces. Words after the last such line are not yielded.
    """
    sentence_words = []
    # decode explicitly: the sibling scripts read their tsv data as UTF-8
    with open(INDIR_GIGA, 'r', encoding='utf-8') as gof:
        for l_gof in gof:
            if l_gof == '\n':
                yield ' '.join(sentence_words)
                sentence_words = []
            else:
                sentence_words.append(l_gof.split('\t')[0])
|
||||
|
||||
sentence_generator = giga_orig_sentence_generator()

# Walk the parsed original files in order and pair every parsed sentence with
# the next sentence of the flat file. Ids of sentences whose text agrees are
# collected; every disagreement is printed.
sentence_ids = []
for origfile in origfiles:
    split_file_sentences = par.parse_tei(origfile[1])
    for v in split_file_sentences.values():
        one_file_sentence = next(sentence_generator)
        if one_file_sentence == v['text']:
            sentence_ids.append(v['sid'])
        else:
            print('----------------')
            print('ERROR')
            print(v['sid'])
            print(one_file_sentence)
            print(v['text'])
            print(origfile[0])

# Mode 'wb' truncates an existing file, so no separate exists/remove step is
# needed before writing the list.
with open(os.path.join(INTERNAL_DATA, 'sentence_ids_list.pkl'), 'wb') as output:
    pickle.dump(sentence_ids, output)
|
||||
|
||||
# def giga_orig_generator():
|
||||
# with open(INDIR_GIGA, 'r') as gof:
|
||||
# previous_new_line = False
|
||||
# for l_gof in gof:
|
||||
# if l_gof == '\n':
|
||||
# if previous_new_line:
|
||||
# continue
|
||||
# previous_new_line = True
|
||||
# elif previous_new_line:
|
||||
# previous_new_line = False
|
||||
# yield l_gof
|
||||
|
||||
# import time
|
||||
# def handle_giga_file(ran):
|
||||
# """
|
||||
# File that splits big text file into more minor files. Only split on empty lines.
|
||||
# """
|
||||
# # with open(INDIR_GIGA, 'r') as gof:
|
||||
# # with open(INDIR_JOS, 'r') as gjf:
|
||||
# # for i, (l_gof, l_gjf) in enumerate(zip(gof, gjf)):
|
||||
# # pass
|
||||
# # num_lines = i + 1
|
||||
# # print(num_lines)
|
||||
# num_lines = 1393184026
|
||||
# # 1393184026
|
||||
# # 1393184033
|
||||
# # return
|
||||
# num_lines_per_part = num_lines / GIGA_PARTS
|
||||
# curr_part = 0
|
||||
# gof_generator = giga_orig_generator()
|
||||
#
|
||||
# diff_files = set()
|
||||
# # with open(INDIR_GIGA, 'r') as gof:
|
||||
# with open(INDIR_GIGA_OLD, 'r') as gjf:
|
||||
# # sentence = {}
|
||||
# # sentence['tokens'] = []
|
||||
# # sentence['links'] = {}
|
||||
# # if os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % ran[0])):
|
||||
# # os.remove(os.path.join(OUTDIR, 'giga%07d.tsv' % ran[0]))
|
||||
#
|
||||
# # wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % ran[0]), 'a')
|
||||
#
|
||||
# # for i, (l_gof, l_gjf) in enumerate(zip(gof, gjf)):
|
||||
# for i, l_gjf in enumerate(gjf):
|
||||
# l_gof = next(gof_generator)
|
||||
# if curr_part < ran[0]:
|
||||
# if i > num_lines_per_part * (curr_part + 1) and l_gof == '\n' :
|
||||
# if curr_part < ran[0]:
|
||||
# print(curr_part)
|
||||
# curr_part += 1
|
||||
# continue
|
||||
# else:
|
||||
# continue
|
||||
#
|
||||
# l_gof_split = l_gof.split('\t')
|
||||
# l_gjf_split = l_gjf.split('\t')
|
||||
#
|
||||
# # if punctuation
|
||||
# if l_gof != '\n':
|
||||
# if l_gof_split != l_gjf_split:
|
||||
# print(curr_part)
|
||||
# diff_files.add(curr_part)
|
||||
# l_gof = next(gof_generator)
|
||||
#
|
||||
#
|
||||
# # if l_gof == '\n':
|
||||
# else:
|
||||
# # wf.flush()
|
||||
# # if i > num_lines_per_part * (curr_part + 1) and l_gof == '\n':
|
||||
# if i > num_lines_per_part * (curr_part + 1):
|
||||
# curr_part += 1
|
||||
# # if wf doesn't exist (first one)
|
||||
# # wf.close()
|
||||
# if curr_part >= ran[1]:
|
||||
# break
|
||||
# # if os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part)):
|
||||
# # os.remove(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part))
|
||||
#
|
||||
# # wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part), 'a')
|
||||
#
|
||||
# curr_part += 1
|
||||
# return diff_files
|
||||
# # wf.close()
|
||||
#
|
||||
# with Pool(CPU_CORES) as p:
|
||||
# final_range = [0, 100000]
|
||||
# # final_range = [0, 150]
|
||||
# # size_per_proc = (final_range[1] - final_range[0]) / CPU_CORES
|
||||
# # # splits = [int(final_range[0] + size_per_proc) for i in range(CPU_CORES)]
|
||||
# # ranges = []
|
||||
# # ps = None
|
||||
# # for i in range(CPU_CORES):
|
||||
# # s = int(final_range[0] + size_per_proc * i)
|
||||
# # ns = int(final_range[0] + size_per_proc * (i + 1))
|
||||
# # ranges.append([s, ns])
|
||||
# # # ranges = [[0, 1]]
|
||||
# # res = p.map(handle_giga_file, ranges)
|
||||
#
|
||||
# res = handle_giga_file(final_range)
|
||||
# res = sorted(list(res))
|
||||
# if os.path.exists(os.path.join(INTERNAL_DATA, 'diffs_updated_gigafida.pkl')):
|
||||
# os.remove(os.path.join(INTERNAL_DATA, 'diffs_updated_gigafida.pkl'))
|
||||
# with open(os.path.join(INTERNAL_DATA, 'diffs_updated_gigafida.pkl'), 'wb') as pkl_file:
|
||||
# pickle.dump(res, pkl_file)
|
||||
# # with open(os.path.join(INTERNAL_DATA, 'diffs_updated_gigafida.pkl'), 'rb') as pkl_file:
|
||||
# # mydict2 = pickle.load(pkl_file)
|
||||
# print('test')
|
|
@ -1,114 +0,0 @@
|
|||
from pathlib import Path
|
||||
from parser.parser import Parser
|
||||
import configparser
|
||||
import json
|
||||
import sys
|
||||
import logging
|
||||
from multiprocessing import Pool
|
||||
|
||||
# parse config
|
||||
config = configparser.ConfigParser()
|
||||
config.read("tools.cfg")
|
||||
# ORIGPATH = Path(config["tools"]["kres_orig"])
|
||||
INPATH = Path(config["tools"]["giga_srl"])
|
||||
OUTPATH = Path(config["tools"]["kres_json"])
|
||||
DEBUG = config["tools"]["debug"] == "True"
|
||||
CPU_CORES = int(config["tools"]["cpu_cores"])
|
||||
|
||||
LOGFILE = Path(config["tools"]["logfile"]).absolute()
|
||||
LOGFILE.touch(exist_ok=True)
|
||||
LOGFILE.resolve()
|
||||
|
||||
logging.basicConfig(filename=str(LOGFILE), level=logging.INFO)
|
||||
|
||||
def get_origfile(filename):
    """Return the entry of ``ORIGPATH`` whose base name matches ``filename``.

    Names are compared on the part before their first dot.

    :param filename: path-like object exposing ``.name``.
    :raises FileNotFoundError: if no entry of ``ORIGPATH`` matches.
    """
    # NOTE(review): the ORIGPATH assignment in this script's config section
    # is commented out, so this call raises NameError until it is restored.
    wanted = filename.name.split('.')[0]
    for origfile in ORIGPATH.iterdir():
        if origfile.name.split('.')[0] == wanted:
            return origfile
    raise FileNotFoundError(
        "no original file matching {!r} in {}".format(wanted, ORIGPATH))
|
||||
|
||||
def extract_sentences(line_reader):
    """Group tab-separated token lines into sentences.

    :param line_reader: iterable of ``bytes`` lines (UTF-8).
    :returns: generator of sentences; each sentence is a list of token rows
        and each row is the list of the line's tab-separated fields.

    A line with a single field (normally the empty line) ends the current
    sentence. Rows after the last such line are not yielded. Lines are
    consumed lazily, one at a time.
    """
    acc = []
    for raw in line_reader:
        text = raw.decode("utf-8")
        # drop the line terminator only if there is one, so that a final
        # line without "\n" keeps its last character
        if text.endswith("\n"):
            text = text[:-1]
        fields = text.split('\t')
        if len(fields) == 1:  # empty line
            yield acc
            acc = []
        else:
            acc.append(fields)
|
||||
|
||||
def to_sentence(sentence_arr):
    """Return field 1 of every token row, joined by single spaces."""
    return " ".join(token[1] for token in sentence_arr)
|
||||
|
||||
def match_sentence_id(sentence, orig_dict):
    """Return the key of the first original sentence equal to ``sentence``.

    :param sentence: sentence text with tokens separated by single spaces.
    :param orig_dict: mapping of sentence id to an entry whose ``"tokens"``
        value is a sequence of rows; item 2 of each row is compared.
    :raises KeyError: if no entry matches.
    """
    for k, e in orig_dict.items():
        orig_sentence = " ".join(token[2] for token in e["tokens"])
        if sentence == orig_sentence:
            return k
    # carry the unmatched text so the failure can be diagnosed
    raise KeyError(sentence)
|
||||
|
||||
def get_dep_rel(token):
    """Return the first SRL label found in a token row, or ``None``.

    Fields from index 14 onward are scanned; the first one that is not
    ``"_"`` is reported as ``{"arg": label, "from": n, "dep": token[0]}``,
    where ``n`` is the position of that field counted from index 14.
    Later labelled fields of the same row are not reported.
    """
    logging.debug(token)
    for i, field in enumerate(token[14:]):
        if field != "_":
            return {
                "arg": field,
                "from": i,  # i-th predicate in sentence
                "dep": token[0],
            }
    return None
|
||||
|
||||
def handle_file(infile_tpl):
    """Convert one SRL-tagged tsv file into a JSON file of SRL relations.

    :param infile_tpl: ``(index, path)`` tuple; only the path is used.

    The result maps each matched sentence id to a list of
    ``{"arg", "from", "dep"}`` dicts and is written to ``OUTPATH`` under the
    input's name with its suffix replaced by ``.json``.
    """
    infile = infile_tpl[1]
    outfile = (OUTPATH / infile.name).with_suffix(".json")
    origfile = get_origfile(infile)
    orig_dict = par.parse_tei(origfile)

    outdata = {}
    with infile.open("rb") as fp:
        for sentence_arr in extract_sentences(fp.readlines()):
            # tsv dropped sentence ids, match the ID, using original data
            sid = match_sentence_id(to_sentence(sentence_arr), orig_dict)

            relations = []
            # token ids (field 0) of the rows flagged "Y" in field 12
            predicates = []
            for token in sentence_arr:
                if token[12] == "Y":
                    predicates.append(token[0])  # idx

                deprel = get_dep_rel(token)
                if deprel is not None:
                    relations.append(deprel)

            # deprel["from"] points to n-th predicate
            # replace with predicate's token index
            for deprel in relations:
                deprel["from"] = predicates[deprel["from"]]
            outdata[sid] = relations

            if DEBUG:
                print(to_sentence(sentence_arr))
                print(outdata[sid])
                print(sid)
                print()
                print()

    with outfile.open("w") as fp:
        json.dump(outdata, fp)
    logging.info("SRL relations written to: {}".format(outfile))
|
||||
|
||||
|
||||
# main
# The parser and the output directory are set up at import time so that
# worker processes that re-import this module also have them available.
par = Parser()
OUTPATH.mkdir(exist_ok=True)

# Start the pool only when run as a script: with the "spawn" start method
# every worker re-imports this module and must not start a pool of its own.
if __name__ == "__main__":
    infiles = list(enumerate([x for x in INPATH.iterdir() if x.is_file()]))
    logging.info("Generating JSON SRL files from {} tsv files.".format(len(infiles)))

    with Pool(CPU_CORES) as p:
        p.map(handle_file, infiles)

    logging.info("Finished generating .json files.")
|
|
@ -1,8 +1,3 @@
|
|||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import os
|
||||
import pickle
|
||||
from pathlib import Path
|
||||
from parser.parser import Parser
|
||||
import configparser
|
||||
|
@ -13,11 +8,10 @@ from multiprocessing import Pool
|
|||
|
||||
# parse config
|
||||
config = configparser.ConfigParser()
|
||||
config.read("tools.cfg.ssj500k2.3")
|
||||
ORIGPATH = Path(config["tools"]["ssj500k_orig_folder"])
|
||||
INPATH = Path(config["tools"]["ssj500k_srl"])
|
||||
OUTPATH = Path(config["tools"]["ssj500k_json"])
|
||||
INTERNAL_DATA = Path(config["tools"]["internal_data"])
|
||||
config.read("tools.cfg")
|
||||
ORIGPATH = Path(config["tools"]["kres_orig"])
|
||||
INPATH = Path(config["tools"]["kres_srl"])
|
||||
OUTPATH = Path(config["tools"]["kres_json"])
|
||||
DEBUG = config["tools"]["debug"] == "True"
|
||||
CPU_CORES = int(config["tools"]["cpu_cores"])
|
||||
|
||||
|
@ -54,13 +48,6 @@ def match_sentence_id(sentence, orig_dict):
|
|||
return k
|
||||
raise KeyError
|
||||
|
||||
def match_sentence_id_giga(sentence, orig_dict):
    """Return the key of the first entry whose ``"text"`` equals ``sentence``.

    :param sentence: sentence text to look up.
    :param orig_dict: mapping of sentence id to an entry with a ``"text"`` value.
    :raises KeyError: if no entry matches.
    """
    for k, e in orig_dict.items():
        if sentence == e["text"]:
            return k
    # carry the unmatched text so the failure can be diagnosed
    raise KeyError(sentence)
|
||||
|
||||
def get_dep_rel(token):
|
||||
logging.debug(token)
|
||||
for i, field in enumerate(token[14:]):
|
||||
|
@ -72,7 +59,7 @@ def get_dep_rel(token):
|
|||
}
|
||||
return None
|
||||
|
||||
def handle_file_old(infile_tpl):
|
||||
def handle_file(infile_tpl):
|
||||
i = infile_tpl[0]
|
||||
infile = infile_tpl[1]
|
||||
outfile = (OUTPATH / infile.name).with_suffix(".json")
|
||||
|
@ -114,275 +101,14 @@ def handle_file_old(infile_tpl):
|
|||
logging.info("SRL relations written to: {}".format(outfile))
|
||||
|
||||
|
||||
def handle_file(whole_input):
    """Write the SRL relations of one original file to a JSON file.

    :param whole_input: ``(index, path, sentence_count, first_sentence_index)``
        tuple; item 1 is the original file and item 3 is handed to
        ``srl_multiple_files_sentences_generator`` as the starting sentence.
    :raises RuntimeError: if the tagged data ends before the original file does.
    :raises AssertionError: if a tagged sentence, with its spaces removed,
        differs from the ``"text"`` of the original sentence it is paired with.

    Nothing is done if the output file already exists. Otherwise every
    sentence of the original file is paired, in order, with the next tagged
    sentence, and the result is written to ``OUTPATH`` under the original's
    name with its suffix replaced by ``.json``.
    """
    sentence_id = whole_input[3]
    orig_infile = whole_input[1]
    outfile = (OUTPATH / orig_infile.name).with_suffix(".json")

    if outfile.exists():
        return

    orig_dict = par.parse_tei(orig_infile)
    gen = srl_multiple_files_sentences_generator(sentence_id)

    outdata = {}
    for orig_id, orig_val in orig_dict.items():
        tagged = next(gen)
        if tagged is None:
            # the generator signals the end of the tagged data with None
            raise RuntimeError(
                "tagged sentences exhausted before {}".format(orig_id))
        sentence, sentence_arr = tagged

        # explicit check instead of `assert`, so it also runs under -O
        if sentence.replace(' ', '') != orig_val['text']:
            raise AssertionError(
                "sentence {} does not match its tagged counterpart: {!r}".format(
                    orig_id, sentence))
        sid = orig_id

        relations = []
        # token ids (field 0) of the rows flagged "Y" in field 12
        predicates = []
        for token in sentence_arr:
            if token[12] == "Y":
                predicates.append(token[0])  # idx

            deprel = get_dep_rel(token)
            if deprel is not None:
                relations.append(deprel)

        # deprel["from"] points to n-th predicate
        # replace with predicate's token index
        for deprel in relations:
            deprel["from"] = predicates[deprel["from"]]
        outdata[sid] = relations

        if DEBUG:
            print(to_sentence(sentence_arr))
            print(outdata[sid])
            print(sid)
            print()
            print()

    with outfile.open("w") as fp:
        json.dump(outdata, fp)
    logging.info("SRL relations written to: {}".format(outfile))
|
||||
|
||||
def count_orig_file_sentences(filename):
    """Store the size of one parsed original file as a small pickle.

    :param filename: ``(index, path)`` tuple.

    Writes ``(index, path, len(par.parse_tei(path)))`` to
    ``INTERNAL_DATA/orig_chunks/<path.name>``. Returns without doing
    anything if that file already exists.
    """
    index, path = filename[0], filename[1]
    chunk_path = os.path.join(INTERNAL_DATA, 'orig_chunks', path.name)
    if os.path.exists(chunk_path):
        return
    print(index)
    orig_dict = par.parse_tei(path)
    with open(chunk_path, 'wb') as output:
        pickle.dump((index, path, len(orig_dict)), output)
|
||||
|
||||
|
||||
def count_srl_file_sentences(filename):
    """Store the number of blank lines of one tagged file as a small pickle.

    :param filename: ``(index, path)`` tuple.

    Every line that consists of a lone newline is counted as one sentence.
    Writes ``(index, path, count)`` to
    ``INTERNAL_DATA/srl_chunks/<path.name>``. Returns without doing anything
    if that file already exists.
    """
    index, path = filename[0], filename[1]
    chunk_path = os.path.join(INTERNAL_DATA, 'srl_chunks', path.name)
    if os.path.exists(chunk_path):
        return

    print(index)
    with path.open("r") as fp:
        num_sentences = sum(1 for line in fp if line == '\n')

    with open(chunk_path, 'wb') as output:
        pickle.dump((index, path, num_sentences), output)
|
||||
|
||||
def srl_sentences_generator(infile, curr_index, sen_start_index):
    """Yield the sentences of one tagged file from a given index onward.

    :param infile: path object of the tagged file (opened in binary mode).
    :param curr_index: index assigned to the first sentence of the file.
    :param sen_start_index: first index to yield; earlier sentences are
        skipped.
    :returns: generator of ``(sentence_text, token_rows)`` tuples, followed
        by a final ``None`` that marks the end of the file.
    """
    with infile.open("rb") as fp:
        for sentence_arr in extract_sentences(fp.readlines()):
            if curr_index < sen_start_index:
                curr_index += 1
            else:
                yield to_sentence(sentence_arr), sentence_arr
    # end-of-file sentinel expected by the caller
    yield None
|
||||
|
||||
|
||||
def srl_multiple_files_sentences_generator(sentence_id):  # srl_files):
    """Yield tagged sentences across files, starting near ``sentence_id``.

    :param sentence_id: global sentence index; iteration starts ten
        sentences earlier (but not before 0).
    :returns: generator of ``(sentence_text, token_rows)`` tuples read from
        the entries of ``srl_file_sizes``, followed by a final ``None``.

    Each entry of ``srl_file_sizes`` is read as
    ``(index, path, sentence_count, first_sentence_index)``. If no entry
    contains the starting index, only the final ``None`` is yielded.
    """
    # NOTE(review): the start is moved back by ten sentences while
    # handle_file pairs sentences strictly in order - confirm this is intended.
    sentence_id = max(0, sentence_id - 10)

    # the file that contains the starting sentence, and all files after it
    srl_files = []
    for i, srl_file in enumerate(srl_file_sizes):
        if srl_file[3] <= sentence_id < srl_file[3] + srl_file[2]:
            srl_files = srl_file_sizes[i:]
            break

    for file_info in srl_files:
        srl_gen = srl_sentences_generator(file_info[1], file_info[3], sentence_id)
        for el in srl_gen:
            if el is None:  # per-file end sentinel
                break
            yield el

    yield None
|
||||
|
||||
|
||||
# main
|
||||
par = Parser()
|
||||
OUTPATH.mkdir(exist_ok=True)
|
||||
|
||||
infiles = list(enumerate([x for x in iter(sorted(INPATH.iterdir())) if x.is_file()]))
|
||||
infiles = list(enumerate([x for x in INPATH.iterdir() if x.is_file()]))
|
||||
logging.info("Generating JSON SRL files from {} tsv files.".format(len(infiles)))
|
||||
|
||||
origfiles = []
|
||||
for subdir, dirs, files in os.walk(ORIGPATH):
|
||||
for file in files:
|
||||
origfiles.append(Path(os.path.join(subdir, file)))
|
||||
origfiles=list(enumerate(sorted(origfiles)))
|
||||
##### REMOVE ############
|
||||
# origfiles = origfiles[:3]
|
||||
|
||||
# count sentences in orig (if not counted before)
|
||||
# os.remove(os.path.join(INTERNAL_DATA, 'orig_counted_sentences.pkl'))
|
||||
if not os.path.exists(os.path.join(INTERNAL_DATA, 'orig_counted_sentences.pkl')):
|
||||
# srl_file_sizes = {}
|
||||
if not os.path.exists(os.path.join(INTERNAL_DATA, 'orig_chunks')):
|
||||
os.makedirs(os.path.join(INTERNAL_DATA, 'orig_chunks'))
|
||||
# with Pool(CPU_CORES) as p:
|
||||
# # p.map(handle_file, infiles)
|
||||
# p.map(count_orig_file_sentences, origfiles)
|
||||
for i in range(len(origfiles)):
|
||||
count_orig_file_sentences(origfiles[i])
|
||||
orig_file_sizes = []
|
||||
for x in iter(sorted(Path(os.path.join(INTERNAL_DATA, 'orig_chunks')).iterdir())):
|
||||
print(x.name)
|
||||
if x.is_file():
|
||||
with x.open('rb') as pkl_small_file:
|
||||
orig_file_sizes.append(pickle.load(pkl_small_file))
|
||||
# orig_file_sizes = list(enumerate([x for x in iter(sorted(INPATH.iterdir())) if x.is_file()]))
|
||||
print("Sorting orig files")
|
||||
orig_file_sizes = sorted(orig_file_sizes)
|
||||
total_size = 0
|
||||
orig_file_sizes_final = []
|
||||
print("Calculating orig files size")
|
||||
for n, pa, si in orig_file_sizes:
|
||||
orig_file_sizes_final.append((n, pa, si, total_size))
|
||||
total_size += si
|
||||
orig_file_sizes = orig_file_sizes_final
|
||||
print("Saving orig files size")
|
||||
with open(os.path.join(INTERNAL_DATA, 'orig_counted_sentences.pkl'), 'wb') as output:
|
||||
pickle.dump(orig_file_sizes, output)
|
||||
print("Orig files saved")
|
||||
else:
|
||||
with open(os.path.join(INTERNAL_DATA, 'orig_counted_sentences.pkl'), 'rb') as pkl_file:
|
||||
orig_file_sizes = pickle.load(pkl_file)
|
||||
|
||||
|
||||
# count sentences in srl (if not counted before)
|
||||
# os.remove(os.path.join(INTERNAL_DATA, 'srl_counted_sentences.pkl'))
|
||||
if not os.path.exists(os.path.join(INTERNAL_DATA, 'srl_counted_sentences.pkl')):
|
||||
# srl_file_sizes = {}
|
||||
if not os.path.exists(os.path.join(INTERNAL_DATA, 'srl_chunks')):
|
||||
os.makedirs(os.path.join(INTERNAL_DATA, 'srl_chunks'))
|
||||
# with Pool(CPU_CORES) as p:
|
||||
# # p.map(handle_file, infiles)
|
||||
# p.map(count_srl_file_sentences, infiles)
|
||||
|
||||
for i in range(len(infiles)):
|
||||
count_srl_file_sentences(infiles[i])
|
||||
|
||||
srl_file_sizes = []
|
||||
for x in iter(sorted(Path(os.path.join(INTERNAL_DATA, 'srl_chunks')).iterdir())):
|
||||
print(x.name)
|
||||
if x.is_file():
|
||||
with x.open('rb') as pkl_small_file:
|
||||
srl_file_sizes.append(pickle.load(pkl_small_file))
|
||||
print("Sorting srl files")
|
||||
srl_file_sizes = sorted(srl_file_sizes)
|
||||
total_size = 0
|
||||
srl_file_sizes_final = []
|
||||
print("Calculating srl files size")
|
||||
for n, pa, si in srl_file_sizes:
|
||||
srl_file_sizes_final.append((n, pa, si, total_size))
|
||||
total_size += si
|
||||
srl_file_sizes = srl_file_sizes_final
|
||||
print("Saving srl files size")
|
||||
with open(os.path.join(INTERNAL_DATA, 'srl_counted_sentences.pkl'), 'wb') as output:
|
||||
pickle.dump(srl_file_sizes, output)
|
||||
print("Srl files saved")
|
||||
else:
|
||||
with open(os.path.join(INTERNAL_DATA, 'srl_counted_sentences.pkl'), 'rb') as pkl_file:
|
||||
srl_file_sizes = pickle.load(pkl_file)
|
||||
|
||||
|
||||
# print(len(orig_file_sizes))
|
||||
# print('asd' + 2)
|
||||
|
||||
# inputs = []
|
||||
# srl_i = 0
|
||||
# srl_file = srl_file_sizes[srl_i]
|
||||
# for orig_i, orig_path, orig_size, orig_first_sent_i in orig_file_sizes:
|
||||
# interesting_srl_files = []
|
||||
# # beginning of srl chunk in range of orig chunk or ending of srl chunk in range of orig chunk
|
||||
# # while srl_file[3] >= orig_first_sent_i and srl_file[3] < orig_first_sent_i + orig_size or \
|
||||
# # srl_file[3] + srl_file[2] - 1 >= orig_first_sent_i and srl_file[3] + srl_file[2] - 1 < orig_first_sent_i + orig_size:
|
||||
# while srl_file[3] < orig_first_sent_i + orig_size and srl_file[3] + srl_file[2] > orig_first_sent_i:
|
||||
# # if beginning of file is in
|
||||
# if srl_file[3] > orig_first_sent_i:
|
||||
# interesting_srl_files.append((srl_file[0], srl_file[1], srl_file[2], srl_file[3], srl_file[3]))
|
||||
# # print('if %d' % srl_file[3])
|
||||
# else:
|
||||
# interesting_srl_files.append((srl_file[0], srl_file[1], srl_file[2], srl_file[3], orig_first_sent_i))
|
||||
# # print('else %d' % orig_first_sent_i)
|
||||
#
|
||||
# if orig_first_sent_i + orig_size >= srl_file[3] + srl_file[2]:
|
||||
# srl_i += 1
|
||||
# if srl_i < len(srl_file_sizes):
|
||||
# srl_file = srl_file_sizes[srl_i]
|
||||
# else:
|
||||
# break
|
||||
# # print(srl_i)
|
||||
# # print('a ' + 2)
|
||||
# else:
|
||||
# break
|
||||
#
|
||||
# inputs.append([[orig_i, orig_path, orig_size, orig_first_sent_i], interesting_srl_files])
|
||||
# print(inputs[-1])
|
||||
|
||||
|
||||
|
||||
# srl_gen = srl_sentences_generator(srl_file_sizes[0][1], 0, 533)
|
||||
# a = next(srl_gen)
|
||||
# b = next(srl_gen)
|
||||
# c = next(srl_gen)
|
||||
|
||||
print('beginning processing')
|
||||
with Pool(CPU_CORES) as p:
|
||||
# p.map(handle_file, inputs)
|
||||
p.map(handle_file, orig_file_sizes)
|
||||
|
||||
# for of in orig_file_sizes:
|
||||
# handle_file(of)
|
||||
p.map(handle_file, infiles)
|
||||
|
||||
logging.info("Finished generating .json files.")
|
||||
|
|
|
@ -1,294 +0,0 @@
|
|||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import os
|
||||
import pickle
|
||||
from pathlib import Path
|
||||
from parser.parser import Parser
|
||||
import configparser
|
||||
import json
|
||||
import sys
|
||||
import logging
|
||||
from multiprocessing import Pool
|
||||
|
||||
# parse config
|
||||
config = configparser.ConfigParser()
|
||||
config.read("tools.cfg")
|
||||
ORIGPATH = Path(config["tools"]["giga"])
|
||||
INPATH = Path(config["tools"]["giga_srl_errors"])
|
||||
OUTPATH = Path(config["tools"]["giga_json"])
|
||||
INTERNAL_DATA = Path(config["tools"]["internal_data"])
|
||||
DEBUG = config["tools"]["debug"] == "True"
|
||||
CPU_CORES = int(config["tools"]["cpu_cores"])
|
||||
|
||||
LOGFILE = Path(config["tools"]["logfile"]).absolute()
|
||||
LOGFILE.touch(exist_ok=True)
|
||||
LOGFILE.resolve()
|
||||
|
||||
logging.basicConfig(filename=str(LOGFILE), level=logging.INFO)
|
||||
# One entry per line of the file; the context manager closes the handle.
with open(os.path.join(INTERNAL_DATA, 'sentences_with_less_than_token.txt')) as error_file:
    error_sentences = [line.rstrip('\n') for line in error_file]
|
||||
|
||||
|
||||
|
||||
|
||||
def get_origfile(filename):
    """Return the entry of ``ORIGPATH`` whose base name matches ``filename``.

    Names are compared on the part before their first dot.

    :param filename: path-like object exposing ``.name``.
    :raises FileNotFoundError: if no entry of ``ORIGPATH`` matches.
    """
    wanted = filename.name.split('.')[0]
    for origfile in ORIGPATH.iterdir():
        if origfile.name.split('.')[0] == wanted:
            return origfile
    raise FileNotFoundError(
        "no original file matching {!r} in {}".format(wanted, ORIGPATH))
|
||||
|
||||
def extract_sentences(line_reader):
    """Group tab-separated token lines into sentences.

    :param line_reader: iterable of ``bytes`` lines (UTF-8).
    :returns: generator of sentences; each sentence is a list of token rows
        and each row is the list of the line's tab-separated fields.

    A line with a single field (normally the empty line) ends the current
    sentence. Rows after the last such line are not yielded. Lines are
    consumed lazily, one at a time.
    """
    acc = []
    for raw in line_reader:
        text = raw.decode("utf-8")
        # drop the line terminator only if there is one, so that a final
        # line without "\n" keeps its last character
        if text.endswith("\n"):
            text = text[:-1]
        fields = text.split('\t')
        if len(fields) == 1:  # empty line
            yield acc
            acc = []
        else:
            acc.append(fields)
|
||||
|
||||
def to_sentence(sentence_arr):
    """Return field 1 of every token row, joined by single spaces."""
    return " ".join(token[1] for token in sentence_arr)
|
||||
|
||||
def match_sentence_id(sentence, orig_dict):
    """Return the key of the first original sentence equal to ``sentence``.

    :param sentence: sentence text with tokens separated by single spaces.
    :param orig_dict: mapping of sentence id to an entry whose ``"tokens"``
        value is a sequence of rows; item 2 of each row is compared.
    :raises KeyError: if no entry matches.
    """
    for k, e in orig_dict.items():
        orig_sentence = " ".join(token[2] for token in e["tokens"])
        if sentence == orig_sentence:
            return k
    # carry the unmatched text so the failure can be diagnosed
    raise KeyError(sentence)
|
||||
|
||||
def match_sentence_id_giga(sentence, orig_dict):
    """Return the key of the first entry whose ``"text"`` equals ``sentence``.

    :param sentence: sentence text to look up.
    :param orig_dict: mapping of sentence id to an entry with a ``"text"`` value.
    :raises KeyError: if no entry matches.
    """
    for k, e in orig_dict.items():
        if sentence == e["text"]:
            return k
    # carry the unmatched text so the failure can be diagnosed
    raise KeyError(sentence)
|
||||
|
||||
def get_dep_rel(token):
    """Return the first SRL label found in a token row, or ``None``.

    Fields from index 14 onward are scanned; the first one that is not
    ``"_"`` is reported as ``{"arg": label, "from": n, "dep": token[0]}``,
    where ``n`` is the position of that field counted from index 14.
    Later labelled fields of the same row are not reported.
    """
    logging.debug(token)
    for i, field in enumerate(token[14:]):
        if field != "_":
            return {
                "arg": field,
                "from": i,  # i-th predicate in sentence
                "dep": token[0],
            }
    return None
|
||||
|
||||
def handle_file_old(infile_tpl):
    """Convert one SRL-tagged tsv file into a JSON file of SRL relations.

    :param infile_tpl: ``(index, path)`` tuple; only the path is used.

    The result maps each matched sentence id to a list of
    ``{"arg", "from", "dep"}`` dicts and is written to ``OUTPATH`` under the
    input's name with its suffix replaced by ``.json``.
    """
    infile = infile_tpl[1]
    outfile = (OUTPATH / infile.name).with_suffix(".json")
    origfile = get_origfile(infile)
    orig_dict = par.parse_tei(origfile)

    outdata = {}
    with infile.open("rb") as fp:
        for sentence_arr in extract_sentences(fp.readlines()):
            # tsv dropped sentence ids, match the ID, using original data
            sid = match_sentence_id(to_sentence(sentence_arr), orig_dict)

            relations = []
            # token ids (field 0) of the rows flagged "Y" in field 12
            predicates = []
            for token in sentence_arr:
                if token[12] == "Y":
                    predicates.append(token[0])  # idx

                deprel = get_dep_rel(token)
                if deprel is not None:
                    relations.append(deprel)

            # deprel["from"] points to n-th predicate
            # replace with predicate's token index
            for deprel in relations:
                deprel["from"] = predicates[deprel["from"]]
            outdata[sid] = relations

            if DEBUG:
                print(to_sentence(sentence_arr))
                print(outdata[sid])
                print(sid)
                print()
                print()

    with outfile.open("w") as fp:
        json.dump(outdata, fp)
    logging.info("SRL relations written to: {}".format(outfile))
|
||||
|
||||
|
||||
def fix_json(srl_gen, error_sentence, orig_json_data):
    """Replace the SRL relations of one sentence with freshly tagged ones.

    :param srl_gen: generator of ``(sentence_text, token_rows)`` tuples; the
        next tuple is consumed and its rows supply the new relations.
    :param error_sentence: key of the sentence in ``orig_json_data``.
    :param orig_json_data: mapping of sentence id to relation list; it is
        modified in place.
    :returns: ``orig_json_data``.
    :raises KeyError: if ``error_sentence`` is not a key of ``orig_json_data``.
    """
    _sentence, sentence_arr = next(srl_gen)
    sid = error_sentence
    if sid not in orig_json_data:
        raise KeyError(sid)

    # Build the new list separately so that the stored entry is replaced
    # only once it is complete.
    relations = []
    # token ids (field 0) of the rows flagged "Y" in field 12
    predicates = []
    for token in sentence_arr:
        if token[12] == "Y":
            predicates.append(token[0])  # idx

        deprel = get_dep_rel(token)
        if deprel is not None:
            relations.append(deprel)

    # deprel["from"] points to n-th predicate
    # replace with predicate's token index
    for deprel in relations:
        deprel["from"] = predicates[deprel["from"]]
    orig_json_data[sid] = relations

    if DEBUG:
        print(to_sentence(sentence_arr))
        print(orig_json_data[sid])
        print(sid)
        print()
        print()
    return orig_json_data
|
||||
|
||||
def count_orig_file_sentences(filename):
    """Store the size of one parsed original file as a small pickle.

    :param filename: ``(index, path)`` tuple.

    Writes ``(index, path, len(par.parse_tei(path)))`` to
    ``INTERNAL_DATA/orig_chunks/<path.name>``. Returns without doing
    anything if that file already exists.
    """
    index, path = filename[0], filename[1]
    chunk_path = os.path.join(INTERNAL_DATA, 'orig_chunks', path.name)
    if os.path.exists(chunk_path):
        return
    print(index)
    orig_dict = par.parse_tei(path)
    with open(chunk_path, 'wb') as output:
        pickle.dump((index, path, len(orig_dict)), output)
|
||||
|
||||
|
||||
def count_srl_file_sentences(filename):
    """Store the number of blank lines of one tagged file as a small pickle.

    :param filename: ``(index, path)`` tuple.

    Every line that consists of a lone newline is counted as one sentence.
    Writes ``(index, path, count)`` to
    ``INTERNAL_DATA/srl_chunks/<path.name>``. Returns without doing anything
    if that file already exists.
    """
    index, path = filename[0], filename[1]
    chunk_path = os.path.join(INTERNAL_DATA, 'srl_chunks', path.name)
    if os.path.exists(chunk_path):
        return

    print(index)
    with path.open("r") as fp:
        num_sentences = sum(1 for line in fp if line == '\n')

    with open(chunk_path, 'wb') as output:
        pickle.dump((index, path, num_sentences), output)
|
||||
|
||||
def srl_error_fix_generator(infile):
    """Yield ``(sentence_text, sentence_arr)`` for every sentence in ``infile``.

    The file is read in binary mode and split with ``extract_sentences``;
    ``to_sentence`` produces the text form of each sentence. A final ``None``
    is yielded as an end-of-data sentinel.
    """
    with infile.open("rb") as fp:
        raw_lines = fp.readlines()
        for tokens in extract_sentences(raw_lines):
            sentence_text = to_sentence(tokens)
            yield sentence_text, tokens
        # sentinel: consumers poll with next() and stop on None
        yield None
|
||||
|
||||
def srl_sentences_generator(infile, curr_index, sen_start_index):
    """Yield the sentences of ``infile`` starting at a given sentence index.

    ``curr_index`` is the index of the first sentence in ``infile``;
    sentences are skipped (advancing ``curr_index``) until it reaches
    ``sen_start_index``. From then on ``(sentence_text, sentence_arr)`` pairs
    are yielded. A final ``None`` is yielded as an end-of-data sentinel.
    """
    with infile.open("rb") as fp:
        for sentence_arr in extract_sentences(fp.readlines()):
            if curr_index < sen_start_index:
                # still before the requested start: skip this sentence
                curr_index += 1
            else:
                yield to_sentence(sentence_arr), sentence_arr
        # sentinel: consumers poll with next() and stop on None
        yield None
|
||||
|
||||
|
||||
def srl_multiple_files_sentences_generator(sentence_id):  # srl_files):
    """Yield SRL sentences across consecutive files, starting near ``sentence_id``.

    The start position is moved back by 10 sentences (never below 0). The
    module-level ``srl_file_sizes`` is scanned for the first entry whose range
    ``[entry[3], entry[3] + entry[2])`` contains that position; that file and
    all following ones are then streamed through ``srl_sentences_generator``.
    A final ``None`` is yielded as an end-of-data sentinel.

    If no file range contains the position, only the sentinel is yielded
    (previously this raised ``UnboundLocalError``).
    """
    sentence_id = max(0, sentence_id - 10)

    # default to "no files" so an out-of-range id cannot leave this unbound
    srl_files = []
    for i, srl_file in enumerate(srl_file_sizes):
        if srl_file[3] <= sentence_id < srl_file[3] + srl_file[2]:
            srl_files = srl_file_sizes[i:]
            break

    for file_info in srl_files:
        # srl_gen = srl_sentences_generator(file_info[1], file_info[3], file_info[4])
        for el in srl_sentences_generator(file_info[1], file_info[3], sentence_id):
            # the per-file generator ends with a None sentinel; move to next file
            if el is None:
                break
            yield el

    yield None
|
||||
|
||||
error_sentences_grouped = []
# start with an empty list (not a False sentinel) so that an empty
# error_sentences yields an empty result and .append is always valid
group = []
prev_name = ''
# group sentences by their files: consecutive ids sharing the same
# 9-character prefix end up in the same group
for name in error_sentences:
    if name[:9] == prev_name:
        group.append(name)
    else:
        prev_name = name[:9]
        if group:
            error_sentences_grouped.append(group)
        group = [name]
# flush the last group, if there is one
if group:
    error_sentences_grouped.append(group)
|
||||
|
||||
srl_gen = srl_error_fix_generator(INPATH)
|
||||
|
||||
# find errors in json files:
|
||||
# with open(os.path.join(INTERNAL_DATA, 'sentence_ids_list.pkl'), 'rb') as output:
|
||||
# sentence_ids = pickle.load(output)
|
||||
#
|
||||
#
|
||||
#
|
||||
# origfiles = []
|
||||
# for subdir, dirs, files in os.walk(OUTPATH):
|
||||
# for file in files:
|
||||
# origfiles.append(Path(os.path.join(subdir, file)))
|
||||
# origfiles=sorted(origfiles)
|
||||
#
|
||||
#
|
||||
#
|
||||
# for sent in origfiles:
|
||||
# # for sent in sentence_ids:
|
||||
# # outfile = Path(OUTPATH, sent[:9] + '-dedup.json')
|
||||
# outfile = sent
|
||||
#
|
||||
# try:
|
||||
# with outfile.open() as json_file:
|
||||
# json.load(json_file)
|
||||
# pass
|
||||
# except:
|
||||
# print(outfile.name)
|
||||
#
|
||||
#
|
||||
# raise Exception('test')
|
||||
# iterate over all wronged sentences and fix them
|
||||
for errors_in_file in error_sentences_grouped:
    # every id in a group shares its first 9 characters, which name the JSON file
    outfile = Path(OUTPATH, '{}-dedup.json'.format(errors_in_file[0][:9]))
    with outfile.open() as json_file:
        print(outfile.name)
        orig_json_data = json.load(json_file)

    # srl_gen is shared across files, so sentences are consumed in order
    for error_sentence in errors_in_file:
        orig_json_data = fix_json(srl_gen, error_sentence, orig_json_data)

    # overwrite the JSON file with the corrected relations
    with outfile.open('w') as json_file:
        json.dump(orig_json_data, json_file)
    logging.info("SRL relations written to: {}".format(outfile))
|
|
@ -1,47 +0,0 @@
|
|||
# parse config
|
||||
import configparser
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
from tools.parser.parser import Parser
|
||||
|
||||
config = configparser.ConfigParser()
|
||||
config.read("tools.cfg.ssj500k2.3")
|
||||
ORIGPATH = Path(config["tools"]["ssj500k_orig_folder"])
|
||||
JSONPATH = Path(config["tools"]["ssj500k_json"] + '/ssj500k-sl.body.json')
|
||||
OUTPATH = Path(config["tools"]["ssj500k_tei"])
|
||||
INTERNAL_DATA = Path(config["tools"]["internal_data"])
|
||||
DEBUG = config["tools"]["debug"] == "True"
|
||||
CPU_CORES = int(config["tools"]["cpu_cores"])
|
||||
|
||||
LOGFILE = Path(config["tools"]["logfile"]).absolute()
|
||||
LOGFILE.touch(exist_ok=True)
|
||||
LOGFILE.resolve()
|
||||
|
||||
logging.basicConfig(filename=str(LOGFILE), level=logging.INFO)
|
||||
|
||||
|
||||
par = Parser()
|
||||
OUTPATH.mkdir(exist_ok=True)
|
||||
|
||||
jsondata = []
|
||||
with open(JSONPATH, 'r') as jf:
|
||||
jsondata = json.load(jf)
|
||||
|
||||
logging.info("Generating TEI with annotated SRL.")
|
||||
|
||||
def handle_file(file, jsondata):
    """Run the TEI parser and the TEI minimizer on one original file.

    ``file`` is a path relative to ``ORIGPATH``; ``jsondata`` is the loaded
    SRL JSON that is handed to ``Parser.minimize_tei``. Nothing is returned.
    """
    teifile = ORIGPATH / file

    # NOTE(review): the parse result is not used here; the call is kept
    # because it may raise on malformed input — confirm whether it is needed.
    par.parse_tei(teifile)

    # origfile = get_origfile()
    par.minimize_tei(teifile, jsondata)
|
||||
|
||||
origfiles = []
for subdir, dirs, files in os.walk(ORIGPATH):
    for file in files:
        # handle_file() joins its argument onto ORIGPATH, so pass the path
        # relative to ORIGPATH; a bare file name would drop the sub-directory
        # for files that os.walk finds below the top level
        handle_file(Path(subdir, file).relative_to(ORIGPATH), jsondata)
|
|
@ -1,5 +1,3 @@
|
|||
import pickle
|
||||
|
||||
from parser.parser import Parser
|
||||
import os
|
||||
from os.path import join, dirname
|
||||
|
@ -16,31 +14,9 @@ par = Parser()
|
|||
|
||||
# path to data
|
||||
config = configparser.ConfigParser()
|
||||
# config.read("tools.cfg")
|
||||
config.read("tools.cfg.ssj500k2.3")
|
||||
analysis = ''
|
||||
if 'kres_orig' in config["tools"]:
|
||||
analysis = 'kres'
|
||||
INDIR = Path(config["tools"]["kres_orig"])
|
||||
OUTDIR = Path(config["tools"]["kres_tsv"])
|
||||
elif 'giga_orig' in config["tools"]:
|
||||
# analysis = 'gigafida'
|
||||
analysis = 'giga'
|
||||
INDIR_GIGA_ORIG = Path(config["tools"]["giga"])
|
||||
INDIR_GIGA = Path(config["tools"]["giga_orig"])
|
||||
INDIR_JOS = Path(config["tools"]["giga_jos"])
|
||||
OUTDIR = Path(config["tools"]["giga_tsv"])
|
||||
GIGA_PARTS = int(config["tools"]["giga_parts"])
|
||||
INTERNAL_DATA = config["tools"]["internal_data"]
|
||||
elif 'ssj500k_orig' in config["tools"]:
|
||||
# analysis = 'gigafida'
|
||||
analysis = 'ssj500k'
|
||||
INDIR_SSJ500K_ORIG = Path(config["tools"]["ssj500k"])
|
||||
INDIR_SSJ500K = Path(config["tools"]["ssj500k_orig"])
|
||||
INDIR_JOS = Path(config["tools"]["ssj500k_jos"])
|
||||
OUTDIR = Path(config["tools"]["ssj500k_tsv"])
|
||||
INTERNAL_DATA = config["tools"]["internal_data"]
|
||||
|
||||
config.read("tools.cfg")
|
||||
INDIR = Path(config["tools"]["kres_orig"])
|
||||
OUTDIR = Path(config["tools"]["kres_tsv"])
|
||||
CPU_CORES = int(config["tools"]["cpu_cores"])
|
||||
|
||||
LOGFILE = Path(config["tools"]["logfile"]).absolute()
|
||||
|
@ -58,365 +34,41 @@ print("end parsing ssj")
|
|||
"""
|
||||
|
||||
# kres_file = "../data/kres_example/F0019343.xml.parsed.xml"
|
||||
# OUTDIR.mkdir(exist_ok=True)
|
||||
OUTDIR.mkdir(exist_ok=True)
|
||||
|
||||
if analysis == 'kres':
|
||||
infiles = list(enumerate([x for x in INDIR.iterdir() if x.is_file()]))
|
||||
logging.info("Parsing kres: {} files.".format(len(infiles)))
|
||||
infiles = list(enumerate([x for x in INDIR.iterdir() if x.is_file()]))
|
||||
logging.info("Parsing kres: {} files.".format(len(infiles)))
|
||||
|
||||
|
||||
def handle_ssj500k_file():
|
||||
kres_file = INDIR_SSJ500K_ORIG
|
||||
outfile = OUTDIR
|
||||
def handle_file(infile):
|
||||
i = infile[0]
|
||||
kres_file = infile[1]
|
||||
outfile = (OUTDIR / kres_file.name).with_suffix(".tsv")
|
||||
|
||||
if outfile.is_file():
|
||||
logging.info("Skipping existing file: {}.".format(str(kres_file)))
|
||||
return True
|
||||
|
||||
# try:
|
||||
res_dict = par.parse_tei(kres_file)
|
||||
kres_out_str = ""
|
||||
for _, sentence in res_dict.items():
|
||||
kres_out_str += par.to_conll_2009_SRL(sentence)
|
||||
# except Exception as exc:
|
||||
# logging.info("Failed processing file: {}".format(str(kres_file)))
|
||||
# logging.error(exc)
|
||||
# return False
|
||||
try:
|
||||
res_dict = par.parse_tei(kres_file)
|
||||
kres_out_str = ""
|
||||
for _, sentence in res_dict.items():
|
||||
kres_out_str += par.to_conll_2009_SRL(sentence)
|
||||
except Exception as exc:
|
||||
logging.info("Failed processing file: {}".format(str(kres_file)))
|
||||
logging.error(exc)
|
||||
return False
|
||||
|
||||
|
||||
with outfile.open("wb+") as fp:
|
||||
fp.write(kres_out_str.encode("utf-8"))
|
||||
# logging.info("Processed file ({}/{}): {}".format(i+1, len(infiles), str(kres_file)))
|
||||
logging.info("Processed file ({}/{}): {}".format(i+1, len(infiles), str(kres_file)))
|
||||
return True
|
||||
return False
|
||||
|
||||
def ssj500k_orig_generator():
    """Yield the lines of ``INDIR_SSJ500K``, collapsing runs of blank lines.

    A line equal to ``'\\n'`` is yielded only when the previous line was not
    also ``'\\n'``; every other line is yielded unchanged.
    """
    with open(INDIR_SSJ500K, 'r') as gof:
        last_was_blank = False
        for l_gof in gof:
            is_blank = l_gof == '\n'
            # drop the second and later blank lines of a run
            if is_blank and last_was_blank:
                continue
            last_was_blank = is_blank
            yield l_gof
|
||||
|
||||
|
||||
def handle_gigafida_file():
|
||||
"""
|
||||
File that splits big text file into more minor files. Only split on empty lines.
|
||||
"""
|
||||
# with open(INDIR_GIGA, 'r') as gof:
|
||||
# with open(INDIR_JOS, 'r') as gjf:
|
||||
# for i, (l_gof, l_gjf) in enumerate(zip(gof, gjf)):
|
||||
# pass
|
||||
# num_lines = i + 1
|
||||
# print(num_lines)
|
||||
gof_generator = giga_orig_generator()
|
||||
# with open(INDIR_GIGA, 'r') as gof:
|
||||
with open(INDIR_JOS, 'r') as gjf:
|
||||
sentence = {}
|
||||
sentence['tokens'] = []
|
||||
sentence['links'] = {}
|
||||
if os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % 0)):
|
||||
ignore_lines = True
|
||||
wf = False
|
||||
else:
|
||||
wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part), 'a')
|
||||
ignore_lines = False
|
||||
# for i, (l_gof, l_gjf) in enumerate(zip(gof, gjf)):
|
||||
for i, l_gjf in enumerate(gjf):
|
||||
l_gof = next(gof_generator)
|
||||
if ignore_lines:
|
||||
if i > num_lines_per_part * curr_part and l_gof == '\n':
|
||||
if not os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % (curr_part + 2))):
|
||||
ignore_lines = False
|
||||
# delete last file (probably not whole)
|
||||
os.remove(os.path.join(OUTDIR, 'giga%07d.tsv' % (curr_part + 1)))
|
||||
if ignore_lines:
|
||||
print(curr_part)
|
||||
curr_part += 1
|
||||
continue
|
||||
else:
|
||||
continue
|
||||
l_gof_split = l_gof.split('\t')
|
||||
l_gjf_split = l_gjf.split('\t')
|
||||
|
||||
# if punctuation
|
||||
if l_gof != '\n':
|
||||
if l_gof_split[1][-1] == 'u':
|
||||
# print(l_gjf_split)
|
||||
sentence['tokens'].append(('c', l_gjf_split[0], l_gjf_split[1]))
|
||||
else:
|
||||
sentence['tokens'].append(('w', l_gjf_split[0], l_gof_split[0], l_gof_split[1][:-2], l_gof_split[3][:-1]))
|
||||
|
||||
sentence['links'][l_gjf_split[0]] = (l_gjf_split[7], l_gjf_split[0], l_gjf_split[6])
|
||||
|
||||
# if l_gof == '\n':
|
||||
else:
|
||||
if wf:
|
||||
# print(i)
|
||||
wf.write(par.to_conll_2009_SRL(sentence))
|
||||
sentence['tokens'] = []
|
||||
sentence['links'] = {}
|
||||
# wf.flush()
|
||||
# if i > num_lines_per_part * (curr_part + 1) and l_gof == '\n':
|
||||
if i > num_lines_per_part * (curr_part + 1):
|
||||
curr_part += 1
|
||||
# if wf doesn't exist (first one)
|
||||
if wf:
|
||||
wf.close()
|
||||
wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part), 'a')
|
||||
curr_part += 1
|
||||
wf.close()
|
||||
|
||||
|
||||
def handle_ssj500k_file2():
|
||||
"""
|
||||
File that splits big text file into more minor files. Only split on empty lines.
|
||||
"""
|
||||
gof_generator = ssj500k_orig_generator()
|
||||
# with open(INDIR_GIGA, 'r') as gof:
|
||||
with open(INDIR_JOS, 'r') as gjf:
|
||||
sentence = {}
|
||||
sentence['tokens'] = []
|
||||
sentence['links'] = {}
|
||||
if os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % 0)):
|
||||
ignore_lines = True
|
||||
wf = False
|
||||
else:
|
||||
wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part), 'a')
|
||||
ignore_lines = False
|
||||
# for i, (l_gof, l_gjf) in enumerate(zip(gof, gjf)):
|
||||
for i, l_gjf in enumerate(gjf):
|
||||
l_gof = next(gof_generator)
|
||||
if ignore_lines:
|
||||
if i > num_lines_per_part * curr_part and l_gof == '\n':
|
||||
if not os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % (curr_part + 2))):
|
||||
ignore_lines = False
|
||||
# delete last file (probably not whole)
|
||||
os.remove(os.path.join(OUTDIR, 'giga%07d.tsv' % (curr_part + 1)))
|
||||
if ignore_lines:
|
||||
print(curr_part)
|
||||
curr_part += 1
|
||||
continue
|
||||
else:
|
||||
continue
|
||||
l_gof_split = l_gof.split('\t')
|
||||
l_gjf_split = l_gjf.split('\t')
|
||||
|
||||
# if punctuation
|
||||
if l_gof != '\n':
|
||||
if l_gof_split[1][-1] == 'u':
|
||||
# print(l_gjf_split)
|
||||
sentence['tokens'].append(('c', l_gjf_split[0], l_gjf_split[1]))
|
||||
else:
|
||||
sentence['tokens'].append(('w', l_gjf_split[0], l_gof_split[0], l_gof_split[1][:-2], l_gof_split[3][:-1]))
|
||||
|
||||
sentence['links'][l_gjf_split[0]] = (l_gjf_split[7], l_gjf_split[0], l_gjf_split[6])
|
||||
|
||||
# if l_gof == '\n':
|
||||
else:
|
||||
if wf:
|
||||
# print(i)
|
||||
wf.write(par.to_conll_2009_SRL(sentence))
|
||||
sentence['tokens'] = []
|
||||
sentence['links'] = {}
|
||||
# wf.flush()
|
||||
# if i > num_lines_per_part * (curr_part + 1) and l_gof == '\n':
|
||||
if i > num_lines_per_part * (curr_part + 1):
|
||||
curr_part += 1
|
||||
# if wf doesn't exist (first one)
|
||||
if wf:
|
||||
wf.close()
|
||||
wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part), 'a')
|
||||
curr_part += 1
|
||||
wf.close()
|
||||
|
||||
|
||||
import time
|
||||
def handle_giga_file(ran):
|
||||
"""
|
||||
File that splits big text file into more minor files. Only split on empty lines.
|
||||
"""
|
||||
# with open(INDIR_GIGA, 'r') as gof:
|
||||
# with open(INDIR_JOS, 'r') as gjf:
|
||||
# for i, (l_gof, l_gjf) in enumerate(zip(gof, gjf)):
|
||||
# pass
|
||||
# num_lines = i + 1
|
||||
# print(num_lines)
|
||||
num_lines = 1393184026
|
||||
# 1393184026
|
||||
# 1393184033
|
||||
# return
|
||||
num_lines_per_part = num_lines / GIGA_PARTS
|
||||
curr_part = 0
|
||||
gof_generator = giga_orig_generator()
|
||||
# with open(INDIR_GIGA, 'r') as gof:
|
||||
with open(INDIR_JOS, 'r') as gjf:
|
||||
sentence = {}
|
||||
sentence['tokens'] = []
|
||||
sentence['links'] = {}
|
||||
wf = None
|
||||
if curr_part in file_indices:
|
||||
if os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % ran[0])):
|
||||
os.remove(os.path.join(OUTDIR, 'giga%07d.tsv' % ran[0]))
|
||||
|
||||
wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % ran[0]), 'a')
|
||||
|
||||
# for i, (l_gof, l_gjf) in enumerate(zip(gof, gjf)):
|
||||
for i, l_gjf in enumerate(gjf):
|
||||
l_gof = next(gof_generator)
|
||||
if curr_part < ran[0]:
|
||||
if i > num_lines_per_part * (curr_part + 1) and l_gof == '\n' :
|
||||
if curr_part < ran[0]:
|
||||
print(curr_part)
|
||||
curr_part += 1
|
||||
continue
|
||||
else:
|
||||
continue
|
||||
|
||||
l_gof_split = l_gof.split('\t')
|
||||
l_gjf_split = l_gjf.split('\t')
|
||||
|
||||
# if punctuation
|
||||
if l_gof != '\n':
|
||||
if curr_part not in file_indices:
|
||||
continue
|
||||
if l_gof_split[1][-1] == 'u':
|
||||
# print(l_gjf_split)
|
||||
sentence['tokens'].append(('c', l_gjf_split[0], l_gjf_split[1]))
|
||||
else:
|
||||
sentence['tokens'].append(('w', l_gjf_split[0], l_gof_split[0], l_gof_split[1][:-2], l_gof_split[3][:-1]))
|
||||
|
||||
sentence['links'][l_gjf_split[0]] = (l_gjf_split[7], l_gjf_split[0], l_gjf_split[6])
|
||||
|
||||
# if l_gof == '\n':
|
||||
else:
|
||||
if curr_part in file_indices:
|
||||
wf.write(par.to_conll_2009_SRL(sentence))
|
||||
sentence['tokens'] = []
|
||||
sentence['links'] = {}
|
||||
# wf.flush()
|
||||
# if i > num_lines_per_part * (curr_part + 1) and l_gof == '\n':
|
||||
if i > num_lines_per_part * (curr_part + 1):
|
||||
curr_part += 1
|
||||
# if wf doesn't exist (first one)
|
||||
if curr_part in file_indices and wf:
|
||||
wf.close()
|
||||
if curr_part >= ran[1]:
|
||||
break
|
||||
if curr_part in file_indices:
|
||||
if os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part)):
|
||||
os.remove(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part))
|
||||
|
||||
wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part), 'a')
|
||||
|
||||
curr_part += 1
|
||||
wf.close()
|
||||
|
||||
def handle_giga_file_selected_sentences(error_sentences):
|
||||
"""
|
||||
File that splits big text file into more minor files. Only split on empty lines.
|
||||
"""
|
||||
# with open(INDIR_GIGA, 'r') as gof:
|
||||
# with open(INDIR_JOS, 'r') as gjf:
|
||||
# for i, (l_gof, l_gjf) in enumerate(zip(gof, gjf)):
|
||||
# pass
|
||||
# num_lines = i + 1
|
||||
# print(num_lines)
|
||||
# print('num_lines' + 3)
|
||||
# num_lines = 1393184026
|
||||
num_lines = 1393222523
|
||||
# 1393184026
|
||||
# 1393184033
|
||||
# return
|
||||
# num_lines_per_part = num_lines / GIGA_PARTS
|
||||
# curr_part = 0
|
||||
gof_generator = giga_orig_generator()
|
||||
# with open(INDIR_GIGA, 'r') as gof:
|
||||
with open(INDIR_JOS, 'r') as gjf:
|
||||
sentence = {}
|
||||
sentence['tokens'] = []
|
||||
sentence['links'] = {}
|
||||
wf = None
|
||||
if os.path.exists(os.path.join(OUTDIR, 'giga_errors')):
|
||||
os.remove(os.path.join(OUTDIR, 'giga_errors'))
|
||||
|
||||
wf = open(os.path.join(OUTDIR, 'giga_errors'), 'a')
|
||||
|
||||
with open(os.path.join(INTERNAL_DATA, 'sentence_ids_list.pkl'), 'rb') as pkl_file:
|
||||
sentence_ids_list = pickle.load(pkl_file)
|
||||
|
||||
sentence_id = 0
|
||||
skip_sentence = not sentence_ids_list[sentence_id] in error_sentences
|
||||
|
||||
# for i, (l_gof, l_gjf) in enumerate(zip(gof, gjf)):
|
||||
for i, l_gjf in enumerate(gjf):
|
||||
l_gof = next(gof_generator)
|
||||
|
||||
|
||||
if l_gjf == '\n':
|
||||
if not skip_sentence:
|
||||
wf.write(par.to_conll_2009_SRL(sentence))
|
||||
sentence['tokens'] = []
|
||||
sentence['links'] = {}
|
||||
sentence_id += 1
|
||||
if sentence_ids_list[sentence_id] in error_sentences:
|
||||
print(sentence_ids_list[sentence_id])
|
||||
skip_sentence = False
|
||||
else:
|
||||
skip_sentence = True
|
||||
|
||||
if skip_sentence:
|
||||
continue
|
||||
|
||||
|
||||
# if curr_part < ran[0]:
|
||||
# if i > num_lines_per_part * (curr_part + 1) and l_gof == '\n' :
|
||||
# if curr_part < ran[0]:
|
||||
# print(curr_part)
|
||||
# curr_part += 1
|
||||
# continue
|
||||
# else:
|
||||
# continue
|
||||
|
||||
l_gof_split = l_gof.split('\t')
|
||||
l_gjf_split = l_gjf.split('\t')
|
||||
|
||||
# if punctuation
|
||||
if l_gof != '\n':
|
||||
if l_gof_split[1][-1] == 'u':
|
||||
# print(l_gjf_split)
|
||||
sentence['tokens'].append(('c', l_gjf_split[0], l_gjf_split[1]))
|
||||
else:
|
||||
sentence['tokens'].append(('w', l_gjf_split[0], l_gof_split[0], l_gof_split[1][:-2], l_gof_split[3][:-1]))
|
||||
|
||||
sentence['links'][l_gjf_split[0]] = (l_gjf_split[7], l_gjf_split[0], l_gjf_split[6])
|
||||
|
||||
# if l_gof == '\n':
|
||||
# wf.flush()
|
||||
# if i > num_lines_per_part * (curr_part + 1) and l_gof == '\n':
|
||||
# if i > num_lines_per_part * (curr_part + 1):
|
||||
# curr_part += 1
|
||||
# # if wf doesn't exist (first one)
|
||||
# if curr_part in file_indices and wf:
|
||||
# wf.close()
|
||||
# if curr_part >= ran[1]:
|
||||
# break
|
||||
# if curr_part in file_indices:
|
||||
# if os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part)):
|
||||
# os.remove(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part))
|
||||
#
|
||||
# wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part), 'a')
|
||||
|
||||
# curr_part += 1
|
||||
wf.close()
|
||||
|
||||
|
||||
|
||||
handle_ssj500k_file()
|
||||
with Pool(CPU_CORES) as p:
|
||||
p.map(handle_file, infiles)
|
||||
|
||||
|
||||
logging.info("end parsing kres")
|
||||
|
||||
|
||||
|
|
|
@ -1,5 +1,3 @@
|
|||
import copy
|
||||
|
||||
from lxml import etree
|
||||
import re
|
||||
from parser.msd.msdmap import Msdmap
|
||||
|
@ -7,7 +5,6 @@ import pickle
|
|||
from pathlib import Path
|
||||
from fillpred_model.step1 import build_model_row
|
||||
import sys
|
||||
import xml.etree.ElementTree as ET
|
||||
|
||||
class Parser:
|
||||
# reads a TEI xml file and returns a dictionary:
|
||||
|
@ -32,23 +29,17 @@ class Parser:
|
|||
def parse_tei(self, filepath):
|
||||
|
||||
def parse_links(s_el):
|
||||
sent_id = '#' + s_el.get('id')
|
||||
lgrps = s_el.findall(".//linkGrp")
|
||||
lgrps = s_el.findall(".//links")
|
||||
if len(lgrps) < 1:
|
||||
raise IOError("Can't find links.")
|
||||
res_links = {}
|
||||
for lgrp in lgrps:
|
||||
if lgrp.get("type") == "JOS-SYN":
|
||||
for link in lgrp:
|
||||
jos_type = link.get("ana").split(":")[-1]
|
||||
link_data = link.get("target").split(" ")
|
||||
link_from = int(link_data[1].split('.')[-1][1:])
|
||||
link_to = int(link_data[0].split('.')[-1][1:]) if sent_id != link_data[0] else 0
|
||||
res_links[link_from] = (
|
||||
jos_type,
|
||||
link_from,
|
||||
link_to,
|
||||
)
|
||||
for link in lgrps[0]:
|
||||
dep = int(link.get("dep").split(".")[-1])
|
||||
res_links[dep] = (
|
||||
link.get("afun"),
|
||||
dep,
|
||||
int(link.get("from").split(".")[-1]),
|
||||
)
|
||||
return res_links
|
||||
|
||||
guess_corpus = None # SSJ | KRES
|
||||
|
@ -66,206 +57,7 @@ class Parser:
|
|||
divs = [] # in ssj, there are divs, in Kres, there are separate files
|
||||
if "id" in root.keys():
|
||||
# Kres files start with <TEI id=...>
|
||||
if root.get("id")[0:2] == 'GF':
|
||||
guess_corpus = "GIGA"
|
||||
else:
|
||||
guess_corpus = "KRES"
|
||||
divs = [root]
|
||||
else:
|
||||
guess_corpus = "SSJ"
|
||||
divs = root.findall(".//div")
|
||||
|
||||
# parse divs
|
||||
for div in divs:
|
||||
f_id = div.get("id")[:-6]
|
||||
|
||||
if guess_corpus == "GIGA":
|
||||
div = div.findall(".//body")[0]
|
||||
|
||||
# parse paragraphs
|
||||
for p in div.findall(".//p"):
|
||||
p_id = p.get("id").split(".")[-1]
|
||||
|
||||
# parse sentences
|
||||
for s in p.findall(".//s"):
|
||||
# test if sentence has jos-syn annotations and doesn't have SRL
|
||||
sent_annot_type_list = [links.get('type') for links in s.findall(".//linkGrp")]
|
||||
if 'JOS-SYN' not in sent_annot_type_list or 'UD-SYN' not in sent_annot_type_list or 'SRL' in sent_annot_type_list:
|
||||
continue
|
||||
|
||||
s_id = s.get("id").split(".")[-1]
|
||||
sentence_text = ""
|
||||
sentence_list = []
|
||||
sentence_tokens = []
|
||||
|
||||
# parse tokens
|
||||
for el in s.iter():
|
||||
if el.tag in self.W_TAGS:
|
||||
el_id = el.get("id").split(".")[-1]
|
||||
if el_id[0] == 't':
|
||||
el_id = el_id[1:] # ssj W_TAG ids start with t
|
||||
sentence_text += el.text
|
||||
uPosTag = None
|
||||
uPosFeats = []
|
||||
for msd_el in el.get("msd").split('|'):
|
||||
key, val = msd_el.split('=')
|
||||
if key == 'UPosTag':
|
||||
uPosTag = val
|
||||
else:
|
||||
uPosFeats.append(msd_el)
|
||||
uPosFeats = '|'.join(uPosFeats)
|
||||
sentence_tokens += [(
|
||||
"w",
|
||||
int(el_id),
|
||||
el.text,
|
||||
el.get("lemma"),
|
||||
(el.get("msd") if guess_corpus == "KRES" or guess_corpus == "GIGA"
|
||||
else el.get("ana").split(":")[-1]),
|
||||
uPosTag,
|
||||
uPosFeats
|
||||
)]
|
||||
elif el.tag in self.C_TAGS:
|
||||
# only Kres' C_TAGS have ids
|
||||
if guess_corpus != "GIGA":
|
||||
el_id = el.get("id") or "none"
|
||||
el_id = el_id.split(".")[-1]
|
||||
sentence_text += el.text
|
||||
sentence_tokens += [("c", el_id, el.text,)]
|
||||
elif el.tag in self.S_TAGS:
|
||||
el_id = el.get("id").split(".")[-1]
|
||||
if el_id[0] == 't':
|
||||
el_id = el_id[1:] # ssj W_TAG ids start with t
|
||||
sentence_text += el.text
|
||||
uPosTag = None
|
||||
uPosFeats = []
|
||||
for msd_el in el.get("msd").split('|'):
|
||||
key, val = msd_el.split('=')
|
||||
if key == 'UPosTag':
|
||||
uPosTag = val
|
||||
else:
|
||||
uPosFeats.append(msd_el)
|
||||
uPosFeats = '|'.join(uPosFeats)
|
||||
sentence_tokens += [(
|
||||
"pc",
|
||||
int(el_id),
|
||||
el.text,
|
||||
el.text,
|
||||
(el.get("msd") if guess_corpus == "KRES" or guess_corpus == "GIGA"
|
||||
else el.get("ana").split(":")[-1]),
|
||||
uPosTag,
|
||||
uPosFeats
|
||||
)]
|
||||
else:
|
||||
# pass links and linkGroups
|
||||
pass
|
||||
sentence_id = s.get("id")
|
||||
if sentence_id in res_dict:
|
||||
raise KeyError("duplicated id: {}".format(sentence_id))
|
||||
|
||||
res_dict[sentence_id] = {
|
||||
"sid": sentence_id,
|
||||
"text": sentence_text,
|
||||
"tokens": sentence_tokens,
|
||||
"links": (
|
||||
parse_links(s)
|
||||
)
|
||||
}
|
||||
fp.close()
|
||||
return res_dict
|
||||
|
||||
|
||||
def minimize_tei(self, filepath, jsondata):
|
||||
def set_xml_attr(node, attribute, value):
|
||||
node.attrib['{http://www.w3.org/XML/1998/namespace}' + attribute] = value
|
||||
|
||||
def parse_links(s_el):
|
||||
sent_id = '#' + s_el.get('id')
|
||||
lgrps = s_el.findall(".//linkGrp")
|
||||
if len(lgrps) < 1:
|
||||
raise IOError("Can't find links.")
|
||||
res_links = {}
|
||||
for lgrp in lgrps:
|
||||
if lgrp.get("type") == "JOS-SYN":
|
||||
for link in lgrp:
|
||||
jos_type = link.get("ana").split(":")[-1]
|
||||
link_data = link.get("target").split(" ")
|
||||
link_from = int(link_data[1].split('.')[-1][1:])
|
||||
link_to = int(link_data[0].split('.')[-1][1:]) if sent_id != link_data[0] else 0
|
||||
res_links[link_from] = (
|
||||
jos_type,
|
||||
link_from,
|
||||
link_to,
|
||||
)
|
||||
return res_links
|
||||
|
||||
guess_corpus = None # SSJ | KRES
|
||||
res_dict = {}
|
||||
# with filepath.open("rb") as fp, open("../data/ssj500k2.3/final_tei/res.xml", 'w') as sf:
|
||||
with filepath.open("rb") as fp:
|
||||
used_ssj_documents = set([k.split('.')[0] for k, v in jsondata.items()])
|
||||
used_ssj_paragraphs = set(['.'.join(k.split('.')[:-1]) for k, v in jsondata.items()])
|
||||
used_ssj_sentences = set([k for k, v in jsondata.items()])
|
||||
|
||||
ET.register_namespace("", "http://www.tei-c.org/ns/1.0")
|
||||
tree = ET.parse(fp)
|
||||
root_res = tree.getroot()
|
||||
# root_res = copy.deepcopy(root)
|
||||
ns = '{http://www.w3.org/XML/1998/namespace}'
|
||||
ns2 = '{http://www.tei-c.org/ns/1.0}'
|
||||
|
||||
for doc in list(root_res):
|
||||
doc_id = doc.get(ns + 'id')
|
||||
if doc_id not in used_ssj_documents:
|
||||
root_res.remove(doc)
|
||||
continue
|
||||
|
||||
for par in list(doc):
|
||||
par_id = par.get(ns + 'id')
|
||||
if par_id not in used_ssj_paragraphs:
|
||||
if par.tag != ns2 + 'bibl':
|
||||
doc.remove(par)
|
||||
continue
|
||||
|
||||
for sen in list(par):
|
||||
sen_id = sen.get(ns + 'id')
|
||||
if sen_id not in used_ssj_sentences:
|
||||
par.remove(sen)
|
||||
continue
|
||||
|
||||
linkGrp = ET.Element(f'{ns2}linkGrp')
|
||||
|
||||
linkGrp.attrib[f'targFunc'] = 'head argument'
|
||||
linkGrp.attrib[f'type'] = 'SRL'
|
||||
|
||||
for srl_el in jsondata[sen_id]:
|
||||
link = ET.Element(f'{ns2}link')
|
||||
link.attrib['ana'] = f'srl:{srl_el["arg"]}'
|
||||
link.attrib['target'] = f'#{sen_id}.t{srl_el["from"]} #{sen_id}.t{srl_el["dep"]}'
|
||||
linkGrp.append(link)
|
||||
sen.append(linkGrp)
|
||||
|
||||
|
||||
# <linkGrp corresp="#ssj1.1.1" targFunc="head argument" type="SRL">
|
||||
# <link ana="srl:TIME" target="#ssj1.1.1.t6 #ssj1.1.1.t3"/>
|
||||
# <link ana="srl:QUANT" target="#ssj1.1.1.t6 #ssj1.1.1.t5"/>
|
||||
# <link ana="srl:TIME" target="#ssj1.1.1.t8 #ssj1.1.1.t11"/>
|
||||
# <link ana="srl:PAT" target="#ssj1.1.1.t23 #ssj1.1.1.t21"/>
|
||||
# <link ana="srl:ACT" target="#ssj1.1.1.t23 #ssj1.1.1.t22"/>
|
||||
# <link ana="srl:RESLT" target="#ssj1.1.1.t18 #ssj1.1.1.t23"/>
|
||||
# </linkGrp>
|
||||
# print('aaa')
|
||||
|
||||
# sf.write(etree.tostring(tree, pretty_print=True, encoding='utf-8').decode())
|
||||
tree.write("../data/ssj500k2.3/final_tei/res.xml", encoding='utf-8')
|
||||
|
||||
return
|
||||
divs = [] # in ssj, there are divs, in Kres, there are separate files
|
||||
if "id" in root.keys():
|
||||
# Kres files start with <TEI id=...>
|
||||
if root.get("id")[0:2] == 'GF':
|
||||
guess_corpus = "GIGA"
|
||||
else:
|
||||
guess_corpus = "KRES"
|
||||
guess_corpus = "KRES"
|
||||
divs = [root]
|
||||
else:
|
||||
guess_corpus = "SSJ"
|
||||
|
@ -275,24 +67,14 @@ class Parser:
|
|||
for div in divs:
|
||||
f_id = div.get("id")
|
||||
|
||||
if guess_corpus == "GIGA":
|
||||
div = div.findall(".//body")[0]
|
||||
|
||||
# parse paragraphs
|
||||
for p in div.findall(".//p"):
|
||||
p_id = p.get("id").split(".")[-1]
|
||||
|
||||
# parse sentences
|
||||
for s in p.findall(".//s"):
|
||||
# test if sentence has jos-syn annotations and doesn't have SRL
|
||||
sent_annot_type_list = [links.get('type') for links in s.findall(".//linkGrp")]
|
||||
if 'JOS-SYN' not in sent_annot_type_list or 'UD-SYN' not in sent_annot_type_list or 'SRL' in sent_annot_type_list:
|
||||
del s
|
||||
continue
|
||||
|
||||
s_id = s.get("id").split(".")[-1]
|
||||
sentence_text = ""
|
||||
sentence_list = []
|
||||
sentence_tokens = []
|
||||
|
||||
# parse tokens
|
||||
|
@ -302,73 +84,37 @@ class Parser:
|
|||
if el_id[0] == 't':
|
||||
el_id = el_id[1:] # ssj W_TAG ids start with t
|
||||
sentence_text += el.text
|
||||
uPosTag = None
|
||||
uPosFeats = []
|
||||
for msd_el in el.get("msd").split('|'):
|
||||
key, val = msd_el.split('=')
|
||||
if key == 'UPosTag':
|
||||
uPosTag = val
|
||||
else:
|
||||
uPosFeats.append(msd_el)
|
||||
uPosFeats = '|'.join(uPosFeats)
|
||||
sentence_tokens += [(
|
||||
"w",
|
||||
int(el_id),
|
||||
el.text,
|
||||
el.get("lemma"),
|
||||
(el.get("msd") if guess_corpus == "KRES" or guess_corpus == "GIGA"
|
||||
else el.get("ana").split(":")[-1]),
|
||||
uPosTag,
|
||||
uPosFeats
|
||||
(el.get("msd") if guess_corpus == "KRES"
|
||||
else el.get("ana").split(":")[-1]),
|
||||
)]
|
||||
elif el.tag in self.C_TAGS:
|
||||
# only Kres' C_TAGS have ids
|
||||
if guess_corpus != "GIGA":
|
||||
el_id = el.get("id") or "none"
|
||||
el_id = el_id.split(".")[-1]
|
||||
sentence_text += el.text
|
||||
sentence_tokens += [("c", el_id, el.text,)]
|
||||
elif el.tag in self.S_TAGS:
|
||||
el_id = el.get("id").split(".")[-1]
|
||||
if el_id[0] == 't':
|
||||
el_id = el_id[1:] # ssj W_TAG ids start with t
|
||||
el_id = el.get("id") or "none"
|
||||
el_id = el_id.split(".")[-1]
|
||||
sentence_text += el.text
|
||||
uPosTag = None
|
||||
uPosFeats = []
|
||||
for msd_el in el.get("msd").split('|'):
|
||||
key, val = msd_el.split('=')
|
||||
if key == 'UPosTag':
|
||||
uPosTag = val
|
||||
else:
|
||||
uPosFeats.append(msd_el)
|
||||
uPosFeats = '|'.join(uPosFeats)
|
||||
sentence_tokens += [(
|
||||
"pc",
|
||||
int(el_id),
|
||||
el.text,
|
||||
el.text,
|
||||
(el.get("msd") if guess_corpus == "KRES" or guess_corpus == "GIGA"
|
||||
else el.get("ana").split(":")[-1]),
|
||||
uPosTag,
|
||||
uPosFeats
|
||||
)]
|
||||
sentence_tokens += [("c", el_id, el.text,)]
|
||||
elif el.tag in self.S_TAGS:
|
||||
# Kres' <S /> doesn't contain .text
|
||||
sentence_text += " "
|
||||
else:
|
||||
# pass links and linkGroups
|
||||
pass
|
||||
sentence_id = s.get("id")
|
||||
sentence_id = "{}.{}.{}".format(f_id, p_id, s_id)
|
||||
if sentence_id in res_dict:
|
||||
raise KeyError("duplicated id: {}".format(sentence_id))
|
||||
|
||||
res_dict[sentence_id] = {
|
||||
"sid": sentence_id,
|
||||
"text": sentence_text,
|
||||
"tokens": sentence_tokens,
|
||||
"links": (
|
||||
parse_links(s)
|
||||
parse_links(s) if guess_corpus == "KRES" else None
|
||||
)
|
||||
}
|
||||
et = etree.ElementTree(root)
|
||||
et.write("../data/ssj500k2.3/final_tei/res.xml", pretty_print=True, encoding='unicode')
|
||||
fp.close()
|
||||
return res_dict
|
||||
|
||||
|
@ -377,7 +123,7 @@ class Parser:
|
|||
|
||||
def fillpred(tsv_row):
|
||||
mrow = build_model_row(tsv_row)
|
||||
x = mrow[:-1]
|
||||
x = mrow[:-1]
|
||||
y = self.fillpred_model.predict([x])
|
||||
return y[0] # bool
|
||||
|
||||
|
@ -389,8 +135,12 @@ class Parser:
|
|||
|
||||
# handle stop signs
|
||||
if token[0] != "w":
|
||||
out_list = [t_id] + [form for x in range(7)] + ["0", "0", "modra", "modra", "_", "_"] + ["\n"]
|
||||
out_str += '\t'.join(map(str, out_list))
|
||||
out_str += '\t'.join(
|
||||
[t_id] +
|
||||
[form for x in range(7)] +
|
||||
["0", "0", "modra", "modra", "_", "_"] +
|
||||
["\n"]
|
||||
)
|
||||
continue
|
||||
|
||||
pos = self.msdmap.slo_msd_to_eng_pos(token[4])
|
||||
|
|
|
@ -34,8 +34,7 @@ JVM_ARGS="-cp $CP -Xmx$MEM"
|
|||
NOPI="-nopi" #Uncomment this if you want to skip the predicate identification step. This setting is equivalent to the CoNLL 2009 ST.
|
||||
|
||||
|
||||
$JAVA $JVM_ARGS se.lth.cs.srl.Parse $Lang "$INPUT" $MODEL $RERANKER $NOPI "$OUTPUT"
|
||||
# CMD="$JAVA $JVM_ARGS se.lth.cs.srl.Parse $Lang '$INPUT' $MODEL $RERANKER $NOPI '$OUTPUT'"
|
||||
# echo "Executing: $CMD"
|
||||
CMD="$JAVA $JVM_ARGS se.lth.cs.srl.Parse $Lang $INPUT $MODEL $RERANKER $NOPI $OUTPUT"
|
||||
echo "Executing: $CMD"
|
||||
|
||||
# $CMD
|
||||
$CMD
|
||||
|
|
|
@ -1,29 +0,0 @@
|
|||
#!/bin/bash
# Tag every file found in the giga_tsv folder with the mate-tools SRL tagger
# and store the result as "<base>.srl.tsv" in the giga_srl folder.
# Both folder values are read from ../tools.cfg, so the script depends on the
# directory it is started from.
# Exits with status 1 as soon as the tagger fails on a file.

# parsing tools.cfg values
IN_FOLDER="$(sed -n -e 's/^\s*giga_tsv\s*=\s*//p' ../tools.cfg)"
echo "input folder: $IN_FOLDER"
OUT_FOLDER="$(sed -n -e 's/^\s*giga_srl\s*=\s*//p' ../tools.cfg)"
echo "output folder: $OUT_FOLDER"

SUFFIX="srl.tsv"

mkdir -p "$OUT_FOLDER"
# Remove outputs of a previous run. The glob must stay OUTSIDE the quotes:
# a quoted "*" is taken literally and nothing would ever be matched.
rm "$OUT_FOLDER"/*"${SUFFIX}" &> /dev/null

# Only the directory part is quoted so that paths containing spaces survive
# while the trailing "*" is still expanded to the individual input files.
for infile in "$IN_FOLDER"/*; do
    # With an empty input folder the pattern is left unexpanded; skip it.
    [ -e "$infile" ] || continue

    echo "Tagging: ${infile}"
    # Output name is the input file name up to its first dot.
    base=$(basename "$infile" | cut -d'.' -f1)
    outfile="${OUT_FOLDER}/${base}.${SUFFIX}"

    # mate-tools tagger
    if ./scripts/parse_srl_only_mod.sh "$infile" "$outfile"; then
        echo "Saved as ${outfile}"
    else
        echo "ERR"
        exit 1
    fi
done
|
||||
|
|
@ -1,29 +0,0 @@
|
|||
#!/bin/bash
# Tag every file found in the kres_tsv folder with the mate-tools SRL tagger
# and store the result as "<base>.srl.tsv" in the kres_srl folder.
# Both folder values are read from ../tools.cfg and prefixed with "../", so
# the script depends on the directory it is started from.
# Exits with status 1 as soon as the tagger fails on a file.

# parsing tools.cfg values
IN_FOLDER="../$(sed -n -e 's/^\s*kres_tsv\s*=\s*//p' ../tools.cfg)"
echo "input folder: $IN_FOLDER"
OUT_FOLDER="../$(sed -n -e 's/^\s*kres_srl\s*=\s*//p' ../tools.cfg)"
echo "output folder: $OUT_FOLDER"

SUFFIX="srl.tsv"

# Every path expansion below is quoted so that a configured folder containing
# whitespace or glob characters is passed on as a single argument.
mkdir -p "$OUT_FOLDER"
# Remove outputs of a previous run; only the "*" is left unquoted so it expands.
rm "$OUT_FOLDER"/*"${SUFFIX}" &> /dev/null

for infile in "$IN_FOLDER"/*; do
    # With an empty input folder the pattern is left unexpanded; skip it.
    [ -e "$infile" ] || continue

    echo "Tagging: ${infile}"
    # Output name is the input file name up to its first dot.
    base=$(basename "$infile" | cut -d'.' -f1)
    outfile="${OUT_FOLDER}/${base}.${SUFFIX}"

    # mate-tools tagger
    if ./scripts/parse_srl_only_mod.sh "$infile" "$outfile"; then
        echo "Saved as ${outfile}"
    else
        echo "ERR"
        exit 1
    fi
done
|
||||
|
|
@ -1,16 +1,15 @@
|
|||
#!/bin/bash
|
||||
|
||||
# parsing tools.cfg values
|
||||
IN_FOLDER="../$(sed -n -e 's/^\s*giga_tsv\s*=\s*//p' ../tools.cfg.kres_new)"
|
||||
IN_FOLDER=$IN_FOLDER$1
|
||||
IN_FOLDER="../$(sed -n -e 's/^\s*kres_tsv\s*=\s*//p' ../tools.cfg)"
|
||||
echo "input folder: $IN_FOLDER"
|
||||
OUT_FOLDER="../$(sed -n -e 's/^\s*giga_srl\s*=\s*//p' ../tools.cfg.kres_new)"
|
||||
OUT_FOLDER="../$(sed -n -e 's/^\s*kres_srl\s*=\s*//p' ../tools.cfg)"
|
||||
echo "output folder: $OUT_FOLDER"
|
||||
|
||||
SUFFIX="srl.tsv"
|
||||
|
||||
mkdir -p $OUT_FOLDER
|
||||
# rm $OUT_FOLDER/*${SUFFIX} &> /dev/null
|
||||
rm $OUT_FOLDER/*${SUFFIX} &> /dev/null
|
||||
|
||||
for infile in $IN_FOLDER/*; do
|
||||
echo "Tagging: ${infile}"
|
||||
|
|
|
@ -1,30 +0,0 @@
|
|||
#!/bin/bash
# Tag every file found in the ssj500k_tsv_folder with the mate-tools SRL
# tagger and store the result as "<base>.srl.tsv" in the ssj500k_srl folder.
# Both folder values are read from ../tools.cfg.ssj500k2.3 and prefixed with
# "../", so the script depends on the directory it is started from.
#
# Usage: <this script> [SUFFIX_TO_APPEND]
#   The optional first argument is appended verbatim to the input folder path.
# Exits with status 1 as soon as the tagger fails on a file.

# parsing tools.cfg values
IN_FOLDER="../$(sed -n -e 's/^\s*ssj500k_tsv_folder\s*=\s*//p' ../tools.cfg.ssj500k2.3)"
# Quoted so that an argument containing whitespace is not split.
IN_FOLDER="${IN_FOLDER}${1}"
echo "input folder: $IN_FOLDER"
OUT_FOLDER="../$(sed -n -e 's/^\s*ssj500k_srl\s*=\s*//p' ../tools.cfg.ssj500k2.3)"
echo "output folder: $OUT_FOLDER"

SUFFIX="srl.tsv"

mkdir -p "$OUT_FOLDER"
# Cleanup is disabled here, so files already present in OUT_FOLDER are kept.
# rm $OUT_FOLDER/*${SUFFIX} &> /dev/null

# Only the directory part is quoted so that paths containing spaces survive
# while the trailing "*" is still expanded to the individual input files.
for infile in "$IN_FOLDER"/*; do
    # With an empty input folder the pattern is left unexpanded; skip it.
    [ -e "$infile" ] || continue

    echo "Tagging: ${infile}"
    # Output name is the input file name up to its first dot.
    base=$(basename "$infile" | cut -d'.' -f1)
    outfile="${OUT_FOLDER}/${base}.${SUFFIX}"

    # mate-tools tagger
    if ./scripts/parse_srl_only_mod.sh "$infile" "$outfile"; then
        echo "Saved as ${outfile}"
    else
        echo "ERR"
        exit 1
    fi
done
|
||||
|
|
@ -1,13 +1,8 @@
|
|||
[tools]
|
||||
giga = ../data/gf_example/gf2_orig
|
||||
giga_orig = ../data/gf_example/gf2-dedup.patch0001
|
||||
giga_jos = ../data/gf_example/gf2-dedup.jos.patch0001
|
||||
giga_tsv = ../data/gf_example/gf_files_part
|
||||
giga_srl = ../data/gf_example/2_srl
|
||||
;giga_srl_errors = /media/luka/Portable Disk/Datasets/gigafida_jos/2_srl_errors/giga_errors.srl.tsv
|
||||
giga_json = ../data/gf_example/final_json
|
||||
internal_data = ../data/gf_example/internal_data
|
||||
giga_parts = 100000
|
||||
logfile = ../data/gf_example/progress.log
|
||||
cpu_cores = 1
|
||||
debug = True
|
||||
kres_orig = /kres_mount/kres_parsed/tei
|
||||
kres_tsv = ../data/kres_out/1_tsv
|
||||
kres_srl = ../data/kres_out/2_srl
|
||||
kres_json = ../data/kres_out/final_json
|
||||
logfile = ../progress.log
|
||||
cpu_cores = 5
|
||||
debug = False
|
||||
|
|
|
@ -1,16 +0,0 @@
|
|||
[tools]
|
||||
giga = /media/luka/Portable Disk/Datasets/gigafida_jos/gf2_orig
|
||||
giga_orig = /media/luka/Portable Disk/Datasets/gigafida_jos/gf2-dedup.patch0001
|
||||
; giga_orig_old = /media/luka/Portable Disk/Datasets/gigafida_jos/gf2-dedup
|
||||
giga_jos = /media/luka/Portable Disk/Datasets/gigafida_jos/gf2-dedup.jos.patch0001
|
||||
giga_tsv = /media/luka/Portable Disk/Datasets/gigafida_jos/gf_files_part
|
||||
; giga_tsv = /media/luka/Portable Disk/Datasets/gigafida_jos/TEMP
|
||||
; giga_tsv = /media/luka/Portable Disk/Datasets/gigafida_jos/gf2_files_copy
|
||||
; giga_srl = /media/luka/Portable Disk/Datasets/gigafida_jos/TEMP
|
||||
giga_srl = /media/luka/Portable Disk/Datasets/gigafida_jos/2_srl
|
||||
giga_json = /media/luka/Portable Disk/Datasets/gigafida_jos/final_json
|
||||
internal_data = /media/luka/Portable Disk/Datasets/gigafida_jos/internal_data
|
||||
giga_parts = 100000
|
||||
logfile = ../progress.log
|
||||
cpu_cores = 1
|
||||
debug = False
|
|
@ -1,8 +0,0 @@
|
|||
[tools]
|
||||
kres_orig = /home/luka/Development/srl/data/kres_parsed/tei
|
||||
kres_tsv = ../data/kres_out/1_tsv
|
||||
kres_srl = ../data/kres_out/2_srl
|
||||
kres_json = ../data/kres_out/final_json
|
||||
logfile = ../progress.log
|
||||
cpu_cores = 5
|
||||
debug = False
|
|
@ -1,8 +0,0 @@
|
|||
[tools]
|
||||
kres_orig = /home/luka/Development/srl/data/kres_parsed/tei
|
||||
giga_tsv = ../data/giga_out/1_tsv
|
||||
giga_srl = ../data/giga_out/2_srl
|
||||
kres_json = ../data/giga_out/final_json
|
||||
logfile = ../progress.log
|
||||
cpu_cores = 5
|
||||
debug = False
|
|
@ -1,15 +0,0 @@
|
|||
[tools]
|
||||
ssj500k = ../data/ssj500k2.3/orig/ssj500k-sl.body.xml
|
||||
ssj500k_orig = ../data/ssj500k2.3/orig/ssj500k-sl.body.xml
|
||||
ssj500k_orig_folder = ../data/ssj500k2.3/orig
|
||||
ssj500k_jos = ../data/ssj500k2.3/orig/ssj500k-sl.body.xml
|
||||
ssj500k_tsv = ../data/ssj500k2.3/tsvs/tsvs.tsv
|
||||
ssj500k_tsv_folder = ../data/ssj500k2.3/tsvs
|
||||
ssj500k_srl = ../data/ssj500k2.3/srls
|
||||
ssj500k_json = ../data/ssj500k2.3/final_json
|
||||
ssj500k_tei = ../data/ssj500k2.3/final_tei
|
||||
internal_data = ../data/ssj500k2.3/internal_data
|
||||
;internal_data = ../data/gf_example/internal_data
|
||||
logfile = ../data/ssj500k2.3/progress.log
|
||||
cpu_cores = 1
|
||||
debug = True
|
Loading…
Reference in New Issue
Block a user