18 Commits

Author SHA1 Message Date
c1ecc4cdbc Big changes 2022-02-04 11:24:47 +01:00
a6cee3d459 migrated to cjvt-gitea 2019-03-03 21:35:05 +01:00
b32bd3e7c6 Setup that SRL tagged kres 2019-03-03 21:10:23 +01:00
044fae2001 added parallel json output creation 2019-02-28 23:37:47 +01:00
406e88ade8 added msd-not-found exception 2019-02-28 21:49:49 +01:00
bf999a965f sending some pipe-breaking files 2019-02-28 15:05:10 +01:00
d45b6d9f47 added number of cores to config 2019-02-28 13:57:27 +01:00
a61ec8770a parsing... 2019-02-28 11:12:12 +01:00
ff25acd3c7 small cfg fix 2019-02-28 10:54:37 +01:00
3881c74613 added multiprocessing to parse_all.py 2019-02-28 10:53:27 +01:00
17cb0677a7 added logging; paralelize the first part now 2019-02-28 10:34:12 +01:00
fd0f9794f1 added logger 2019-02-28 10:15:14 +01:00
12f3994115 todo: add logger 2019-02-28 09:57:46 +01:00
dcc2935c3c some changes on server 2019-02-28 09:40:25 +01:00
60ac569f40 ready to go 2019-02-28 08:20:21 +01:00
b4c7ac5427 fixed paths 2019-02-27 17:32:19 +01:00
5c9cf59723 testing new config 2019-02-27 17:04:03 +01:00
577c8418d2 tmp 2019-02-27 16:58:04 +01:00
28 changed files with 1725 additions and 114 deletions

10
.gitignore vendored
View File

@@ -1,5 +1,11 @@
*.pyc
*.pickle
*.log
data/*/*.xml
data/*/*.tsv
nohup.out
data/kres_out/*
data/kres_example/
venv/
.idea/
data/

View File

@@ -1,19 +1,22 @@
.PHONY: tsv_files srl_tagged_files json_files env
.PHONY: tsv_files srl_tagged_files json_files env clean
all: json_files
all: tools/fillpred_model/model.pickle tsv_files srl_tagged_files json_files
json_files: #TODO srl_tagged_files
json_files: # srl_tagged_files
cd tools; python3 gen_json.py
srl_tagged_files: tsv_files
srl_tagged_files: # tsv_files
# cd tools/srl-20131216; ./scripts/parse_srl_only_mod.sh; cd -
cd tools/srl-20131216; ./tag_all.sh
tsv_files: fillpred_model/model.pickle
tsv_files: # tools/fillpred_model/model.pickle
cd tools; python3 parse_all.py
fillpred_model/model.pickle:
tools/fillpred_model/model.pickle:
cd tools/fillpred_model; $(MAKE)
env:
cd dockerfiles; cd python-java; $(MAKE)
clean:
rm tools/fillpred_model/model.pickle

View File

@@ -11,7 +11,9 @@ Check out `./tools/srl-20131216/README.md`.
## Scripts
Check all possible xml tags (those that occur after the `<body>` tag):
'cat F0006347.xml.parsed.xml | grep -A 999999999999 -e '<body>' | grep -o -e '<[^" "]*' | sort | uniq'
``` bash
cat F0006347.xml.parsed.xml | grep -A 999999999999 -e '<body>' | grep -o -e '<[^" "]*' | sort | uniq
```
## Tools
* Parser for reading both `SSJ500k 2.1 TEI xml` and `Kres F....xml.parsed.xml"` files found in `./tools/parser/parser.py`.
@@ -26,6 +28,12 @@ $ cd ./cjvt-srl-tagging
$ make
```
If you want to run it on a server overnight, you might want to use `nohup`, so you can close the ssh connection without closing the process.
```
$ nohup make &
```
See progress in generated logfile (check git root).
# Makefile
The Makefile follows certain steps:
1. Create a fillpred model.

Binary file not shown.

View File

@@ -7,6 +7,8 @@ default-jdk \
python3 \
python3-pip
RUN apt-get install -y sshfs
RUN pip3 install lxml pandas sklearn
ENV PYTHONIOENCODING UTF-8

View File

@@ -5,14 +5,16 @@ all: build run
build:
docker build . -t $(IMAGE_NAME)
run:
docker run \
-it \
--user $(shell id -u):$(shell id -g) \
-v /home/${USER}:/home/${USER} \
--user $(shell id -u):$(shell id -g) \
-v /etc/passwd:/etc/passwd \
-v /etc/group:/etc/group \
-v $(shell pwd)/../../:/cjvt-srl-tagging \
-w /cjvt-srl-tagging \
python-java \
/bin/bash
-v /home/luka/Development/srl/data:/kres_mount:ro \
python-java \
/bin/bash

15
parser/tei_to_dict.py Normal file
View File

@@ -0,0 +1,15 @@
#!/usr/bin/python3
from lxml import etree
def tei_to_dict(s_el):
if __name__ == "__main__":
with open("/home/kristjan/kres_mount/kres_parsed/tei/F0025751.xml.parsed.xml") as f:
xmlstring = re.sub(' xmlns="[^"]+"', '', fp.read(), count=1)
xmlstring = xmlstring.replace(' xml:', ' ')
xml_tree = ElementTree.XML(xmlstring)

151
parser/test.py Executable file
View File

@@ -0,0 +1,151 @@
#!/usr/bin/python2
from __future__ import print_function, unicode_literals, division
import sys
import os
import re
import pickle
from pathlib import Path
try:
from lxml import etree as ElementTree
except ImportError:
import xml.etree.ElementTree as ElementTree
# attributes read from TEI word elements
ID_ATTR = "id"
LEMMA_ATTR = "lemma"
ANA_ATTR = "ana"
# tags: TEI element names encountered while walking a sentence
SENTENCE_TAG = 's'
BIBL_TAG = 'bibl'
PARAGRAPH_TAG = 'p'
PC_TAG = 'pc'
WORD_TAG = 'w'       # word token
C_TAG = 'c'          # punctuation token
S_TAG = 'S'          # whitespace marker element <S/>
SEG_TAG = 'seg'      # group of words; recursed into
class Sentence:
    """One TEI <s> sentence flattened into CoNLL-U-style word rows.

    For every word/punctuation child of the sentence element a row of
    [id, token, lemma, xpos] is collected in self.words; the sentence's
    surface text (spaces come from <S/> elements) accumulates in self.text.
    """

    def __init__(self, sentence, s_id):
        # sentence: ElementTree element with word-level children
        # s_id: "paragraph.sentence" position string
        self.id = s_id
        self.words = []
        self.text = ""
        for word in sentence:
            self.handle_word(word)

    def handle_word(self, word):
        # handle space after
        if word.tag == S_TAG:
            # <S/> marks whitespace between tokens and carries no text
            assert(word.text is None)
            self.text += ' '
            return
        # ASK am I handling this correctly?
        elif word.tag == SEG_TAG:
            # <seg> groups several words; recurse into its children
            for segword in word:
                self.handle_word(segword)
            return
        # ASK handle unknown tags (are there others?)
        elif word.tag not in (WORD_TAG, C_TAG):
            return
        # ID: 1-based position within the sentence
        idx = str(len(self.words) + 1)
        # TOKEN: the element's text content
        token = word.text
        # LEMMA: only <w> carries one; punctuation reuses the token
        if word.tag == WORD_TAG:
            lemma = word.get(LEMMA_ATTR)
            assert(lemma is not None)
        else:
            lemma = token
        # XPOS: msd attribute, with special cases
        xpos = word.get('msd')
        if word.tag == C_TAG:
            xpos = "Z"
        elif xpos in ("Gp-ppdzn", "Gp-spmzd"):
            # NOTE(review): these two msd values are remapped to "N";
            # rationale not visible here -- confirm against the tagset.
            xpos = "N"
        elif xpos is None:
            # missing msd on a <w>: log the sentence id to stdout
            print(self.id)
        # save word entry
        self.words.append(['F{}.{}'.format(self.id, idx), token, lemma, xpos])
        # save for text
        self.text += word.text

    def to_conllu(self):
        # Render the collected rows as tab-separated lines; None -> "_".
        lines = []
        # lines.append('# sent_id = ' + self.id)
        # CONLLu does not like spaces at the end of # text
        # lines.append('# text = ' + self.text.strip())
        for word in self.words:
            lines.append('\t'.join('_' if data is None else data for data in word))
        return lines
def convert_file(in_file, out_file):
    """Convert one parsed TEI xml file into a CoNLL-U-like TSV file.

    in_file: path to the TEI xml; out_file: path of the tsv to write.
    Raises RuntimeError when no sentences are found.
    """
    print("Nalaganje xml: {}".format(in_file))
    # Read as *bytes* and decode explicitly: the original opened the file
    # in text mode and called .decode(), which only works on Python 2
    # (str.decode does not exist on Python 3 text strings), while the
    # write path below is already version-aware. Binary read + decode
    # behaves identically on both interpreters.
    with open(str(in_file), 'rb') as fp:
        uni_str = fp.read().decode("utf-8")
    # Drop the default namespace and the "xml:" prefix so plain tag names
    # can be used in iterfind()/tag comparisons below.
    xmlstring = re.sub(' xmlns="[^"]+"', '', uni_str, count=1)
    xmlstring = xmlstring.replace(' xml:', ' ')
    print(xmlstring[:1000])  # debug: peek at the de-namespaced head
    xml_tree = ElementTree.XML(xmlstring)
    print("Pretvarjanje TEI -> TSV-U ...")
    lines = []
    for pidx, paragraph in enumerate(xml_tree.iterfind('.//body/p')):
        sidx = 1
        for sentence in paragraph:
            if sentence.tag != SENTENCE_TAG:
                continue
            sentence = Sentence(sentence, "{}.{}".format(pidx + 1, sidx))
            lines.extend(sentence.to_conllu())
            lines.append('')  # ASK newline between sentences
            sidx += 1
    if len(lines) == 0:
        raise RuntimeError("Nobenih stavkov najdenih")
    print("Zapisovanje izhodne datoteke: {}".format(out_file))
    with open(out_file, 'w') as fp:
        for line in lines:
            if sys.version_info < (3, 0):
                # Python 2 needs explicit utf-8 encoding before print
                line = line.encode('utf-8')
            print(line, file=fp)
if __name__ == "__main__":
    """
    Input: folder of TEI files, msds are encoded as msd="Z"
    Ouput: just a folder
    """
    # Hard-coded single-file smoke test; everything after sys.exit() is
    # currently unreachable.
    infile = "/home/kristjan/kres_mount/kres_parsed/tei/F0025751.xml.parsed.xml"
    outfile = "test.out"
    convert_file(infile, outfile)
    sys.exit()
    # Dead code below: intended batch mode (in_folder out_folder n_procs);
    # num_processes and in_out are collected but never used.
    in_folder = sys.argv[1]
    out_folder = sys.argv[2]
    num_processes = int(sys.argv[3])
    files = Path(in_folder).rglob("*.xml")
    in_out = []
    for filename in files:
        out_file = out_folder + "/" + filename.name[:-4] + ".txt"
        convert_file(filename, out_file)

View File

@@ -0,0 +1,19 @@
import os
# INPATH = Path(config["tools"]["giga_srl"])
# infiles = list(enumerate([x for x in iter(sorted(INPATH.iterdir())) if x.is_file()]))
# One-off repair script: for every chunk index whose SRL output
# (giga%07d.srl.tsv) is missing in INPATH, copy the source tsv chunk
# from SOURCEPATH into OUTPATH so it can be re-tagged.
SOURCEPATH = '/media/luka/Portable Disk/Datasets/gigafida_jos/gf2_files'
from shutil import copyfile
INPATH = '/media/luka/Portable Disk/Datasets/gigafida_jos/2_srl'
OUTPATH = '/home/luka/Development/srl/cjvt-srl-tagging/data/giga_out/1_tsv'
for i in range(100000):
    # print(os.path.join(INPATH, 'giga.%07d.tsv' % i))
    # if not os.path.exists(os.path.join(INPATH, 'giga.%07d.tsv' % i)):
    #     print('giga.%07d.tsv' % i)
    if not os.path.exists(os.path.join(INPATH, 'giga%07d.srl.tsv' % i)):
        # NOTE(review): the source name has a dot (giga.%07d.tsv) while the
        # destination drops it (giga%07d.tsv) -- confirm this is intended.
        copyfile(os.path.join(SOURCEPATH, 'giga.%07d.tsv' % i), os.path.join(OUTPATH, 'giga%07d.tsv' % i))
        print('giga%07d.srl.tsv' % i)
    if i % 1000 == 0:
        # progress marker every 1000 indices
        print(i)

View File

View File

@@ -51,4 +51,4 @@ if __name__ == "__main__":
print(i, df.shape)
print(ndf.head())
ndf.to_pickle(OUTFILE)
ndf.to_pickle(Path(OUTFILE))

View File

@@ -27,4 +27,6 @@ if __name__ == "__main__":
clf_full = DecisionTreeClassifier()
clf_full.fit(X, y)
pickle.dump(clf_full, open(OUTFILE, "wb"))
with open(OUTFILE, "wb") as fp:
pickle.dump(clf_full, fp)

View File

@@ -0,0 +1,192 @@
import pickle
from parser.parser import Parser
import os
from os.path import join, dirname
from pathlib import Path
import re
import sys
import cProfile
import configparser
import logging
from multiprocessing import Pool
SSJ500K_2_1 = 27829  # number of sentences
par = Parser()
# path to data: tools.cfg selects either the kres or the gigafida layout
config = configparser.ConfigParser()
config.read("tools.cfg")
analysis = ''
if 'kres_orig' in config["tools"]:
    # kres layout: one input dir, one tsv output dir
    analysis = 'kres'
    INDIR = Path(config["tools"]["kres_orig"])
    OUTDIR = Path(config["tools"]["kres_tsv"])
elif 'giga_orig' in config["tools"]:
    # analysis = 'gigafida'
    # gigafida layout: one large token file (giga_orig), the original TEI
    # files (giga), JOS annotations (giga_jos), split into giga_parts chunks
    analysis = 'giga'
    INDIR_GIGA = Path(config["tools"]["giga_orig"])
    INDIR_GIGA_ORIG = Path(config["tools"]["giga"])
    INDIR_JOS = Path(config["tools"]["giga_jos"])
    OUTDIR = Path(config["tools"]["giga_tsv"])
    GIGA_PARTS = int(config["tools"]["giga_parts"])
INTERNAL_DATA = config["tools"]["internal_data"]
CPU_CORES = int(config["tools"]["cpu_cores"])
LOGFILE = Path(config["tools"]["logfile"]).absolute()
# make sure the logfile exists before logging is configured
LOGFILE.touch(exist_ok=True)
LOGFILE.resolve()
logging.basicConfig(filename=str(LOGFILE), level=logging.INFO)
# collect every original TEI file (recursively), then index them sorted
origfiles = []
for subdir, dirs, files in os.walk(INDIR_GIGA_ORIG):
    for file in files:
        origfiles.append(Path(os.path.join(subdir, file)))
origfiles = list(enumerate(sorted(origfiles)))
def giga_orig_sentence_generator():
    """Yield one surface sentence per blank-line-delimited block of the
    big token-per-line gigafida file INDIR_GIGA (tokens joined by spaces).

    NOTE(review): a trailing block not followed by a blank line is never
    yielded -- the input is assumed to end with an empty line.
    """
    with open(INDIR_GIGA, 'r') as token_file:
        collected = []
        for raw_line in token_file:
            if raw_line != '\n':
                # first tab-separated column is the token itself
                collected.append(raw_line.split('\t')[0])
            else:
                yield ' '.join(collected)
                collected = []
sentence_generator = giga_orig_sentence_generator()
sentence_ids = []
# Walk the parsed TEI files in sorted order and align each parsed sentence
# with the next sentence from the flat token file; on a match record the
# TEI sentence id, otherwise print both sides for manual inspection.
for origfile in origfiles:
    split_file_sentences = par.parse_tei(origfile[1])
    for k, v in split_file_sentences.items():
        one_file_sentence = next(sentence_generator)
        if one_file_sentence == v['text']:
            sentence_ids.append(v['sid'])
        else:
            print('----------------')
            print('ERROR')
            print(v['sid'])
            print(one_file_sentence)
            print(v['text'])
            print(origfile[0])
# count sentences in orig (if not counted before)
# os.remove(os.path.join(INTERNAL_DATA, 'orig_counted_sentences.pkl'))
# Persist the aligned id list, replacing any previous run's pickle.
if os.path.exists(os.path.join(INTERNAL_DATA, 'sentence_ids_list.pkl')):
    os.remove(os.path.join(INTERNAL_DATA, 'sentence_ids_list.pkl'))
with open(os.path.join(INTERNAL_DATA, 'sentence_ids_list.pkl'), 'wb') as output:
    pickle.dump(sentence_ids, output)
# def giga_orig_generator():
# with open(INDIR_GIGA, 'r') as gof:
# previous_new_line = False
# for l_gof in gof:
# if l_gof == '\n':
# if previous_new_line:
# continue
# previous_new_line = True
# elif previous_new_line:
# previous_new_line = False
# yield l_gof
# import time
# def handle_giga_file(ran):
# """
# File that splits big text file into more minor files. Only split on empty lines.
# """
# # with open(INDIR_GIGA, 'r') as gof:
# # with open(INDIR_JOS, 'r') as gjf:
# # for i, (l_gof, l_gjf) in enumerate(zip(gof, gjf)):
# # pass
# # num_lines = i + 1
# # print(num_lines)
# num_lines = 1393184026
# # 1393184026
# # 1393184033
# # return
# num_lines_per_part = num_lines / GIGA_PARTS
# curr_part = 0
# gof_generator = giga_orig_generator()
#
# diff_files = set()
# # with open(INDIR_GIGA, 'r') as gof:
# with open(INDIR_GIGA_OLD, 'r') as gjf:
# # sentence = {}
# # sentence['tokens'] = []
# # sentence['links'] = {}
# # if os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % ran[0])):
# # os.remove(os.path.join(OUTDIR, 'giga%07d.tsv' % ran[0]))
#
# # wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % ran[0]), 'a')
#
# # for i, (l_gof, l_gjf) in enumerate(zip(gof, gjf)):
# for i, l_gjf in enumerate(gjf):
# l_gof = next(gof_generator)
# if curr_part < ran[0]:
# if i > num_lines_per_part * (curr_part + 1) and l_gof == '\n' :
# if curr_part < ran[0]:
# print(curr_part)
# curr_part += 1
# continue
# else:
# continue
#
# l_gof_split = l_gof.split('\t')
# l_gjf_split = l_gjf.split('\t')
#
# # if punctuation
# if l_gof != '\n':
# if l_gof_split != l_gjf_split:
# print(curr_part)
# diff_files.add(curr_part)
# l_gof = next(gof_generator)
#
#
# # if l_gof == '\n':
# else:
# # wf.flush()
# # if i > num_lines_per_part * (curr_part + 1) and l_gof == '\n':
# if i > num_lines_per_part * (curr_part + 1):
# curr_part += 1
# # if wf doesn't exist (first one)
# # wf.close()
# if curr_part >= ran[1]:
# break
# # if os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part)):
# # os.remove(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part))
#
# # wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part), 'a')
#
# curr_part += 1
# return diff_files
# # wf.close()
#
# with Pool(CPU_CORES) as p:
# final_range = [0, 100000]
# # final_range = [0, 150]
# # size_per_proc = (final_range[1] - final_range[0]) / CPU_CORES
# # # splits = [int(final_range[0] + size_per_proc) for i in range(CPU_CORES)]
# # ranges = []
# # ps = None
# # for i in range(CPU_CORES):
# # s = int(final_range[0] + size_per_proc * i)
# # ns = int(final_range[0] + size_per_proc * (i + 1))
# # ranges.append([s, ns])
# # # ranges = [[0, 1]]
# # res = p.map(handle_giga_file, ranges)
#
# res = handle_giga_file(final_range)
# res = sorted(list(res))
# if os.path.exists(os.path.join(INTERNAL_DATA, 'diffs_updated_gigafida.pkl')):
# os.remove(os.path.join(INTERNAL_DATA, 'diffs_updated_gigafida.pkl'))
# with open(os.path.join(INTERNAL_DATA, 'diffs_updated_gigafida.pkl'), 'wb') as pkl_file:
# pickle.dump(res, pkl_file)
# # with open(os.path.join(INTERNAL_DATA, 'diffs_updated_gigafida.pkl'), 'rb') as pkl_file:
# # mydict2 = pickle.load(pkl_file)
# print('test')

114
tools/gen_json.kres.py Normal file
View File

@@ -0,0 +1,114 @@
from pathlib import Path
from parser.parser import Parser
import configparser
import json
import sys
import logging
from multiprocessing import Pool
# parse config: paths, parallelism and logging all come from tools.cfg
config = configparser.ConfigParser()
config.read("tools.cfg")
# ORIGPATH = Path(config["tools"]["kres_orig"])
INPATH = Path(config["tools"]["giga_srl"])
OUTPATH = Path(config["tools"]["kres_json"])
DEBUG = config["tools"]["debug"] == "True"
CPU_CORES = int(config["tools"]["cpu_cores"])
LOGFILE = Path(config["tools"]["logfile"]).absolute()
# make sure the logfile exists before logging is configured
LOGFILE.touch(exist_ok=True)
LOGFILE.resolve()
logging.basicConfig(filename=str(LOGFILE), level=logging.INFO)
def get_origfile(filename):
    """Return the original corpus file whose basename (up to the first
    dot) matches the given srl tsv file; raise FileNotFoundError if none.

    NOTE(review): ORIGPATH is commented out in the config section above,
    so as written this raises NameError when called -- confirm which
    config key ORIGPATH should be read from.
    """
    for origfile in ORIGPATH.iterdir():
        if filename.name.split('.')[0] == origfile.name.split('.')[0]:
            return origfile
    raise FileNotFoundError
def extract_sentences(line_reader):
    """Group raw tsv byte lines into sentences.

    line_reader yields byte strings, each ending in a newline; a line
    with no tab-separated fields (an empty line) terminates a sentence.
    Yields one list of token rows (lists of column strings) per sentence.
    """
    current = []
    for raw in line_reader:
        # drop the trailing '\n', then split the row on tabs
        fields = raw.decode("utf-8")[:-1].split('\t')
        if len(fields) != 1:
            current.append(fields)
        else:
            # blank line: emit the sentence collected so far
            finished, current = current, []
            yield finished
def to_sentence(sentence_arr):
    """Join the token column (index 1) of each row into a plain sentence."""
    tokens = [row[1] for row in sentence_arr]
    return " ".join(tokens)
def match_sentence_id(sentence, orig_dict):
    """Return the key of the orig_dict entry whose token column 2 values
    join (space-separated) to the given sentence string.

    Raises KeyError when no entry matches.
    """
    for sid, entry in orig_dict.items():
        candidate = " ".join(token[2] for token in entry["tokens"])
        if candidate == sentence:
            return sid
    raise KeyError
def get_dep_rel(token):
    """Return the first SRL argument found after column 13 of a token row,
    or None when the row carries no argument.

    The dict holds the argument label ("arg"), the 0-based index of the
    predicate it belongs to ("from"), and this token's id ("dep").
    """
    logging.debug(token)
    for pred_idx, label in enumerate(token[14:]):
        if label == "_":
            continue
        return {
            "arg": label,
            "from": pred_idx,  # i-th predicate in sentence
            "dep": token[0],
        }
    return None
def handle_file(infile_tpl):
    """Convert one SRL tsv file into a json file mapping sentence ids to
    their semantic-role relations.

    infile_tpl: (index, Path) pair as produced by enumerate() below.
    Relies on module globals OUTPATH, par, DEBUG and on get_origfile().
    """
    i = infile_tpl[0]
    infile = infile_tpl[1]
    outfile = (OUTPATH / infile.name).with_suffix(".json")
    origfile = get_origfile(infile)
    orig_dict = par.parse_tei(origfile)
    with infile.open("rb") as fp:
        outdata = {}
        for sentence_arr in extract_sentences(fp.readlines()):
            # tsv dropped sentence ids, match the ID, using original data
            sid = match_sentence_id(to_sentence(sentence_arr), orig_dict)
            outdata[sid] = []
            # find all predicate indices in the sentence
            predicates = []
            for token in sentence_arr:
                if token[12] == "Y":
                    predicates += [token[0]]  # idx
                deprel = get_dep_rel(token)
                if deprel is not None:
                    outdata[sid].append(deprel)
            # deprel["from"] points to n-th predicate
            # replace with predicate's token index
            for deprel in outdata[sid]:
                deprel["from"] = predicates[deprel["from"]]
            if DEBUG:
                print(to_sentence(sentence_arr))
                print(outdata[sid])
                print(sid)
                print()
                print()
    with outfile.open("w") as fp:
        json.dump(outdata, fp)
    logging.info("SRL relations written to: {}".format(outfile))
# main: fan the tsv chunks out over a process pool
par = Parser()
OUTPATH.mkdir(exist_ok=True)
# every regular file in INPATH is one tsv chunk
infiles = list(enumerate([x for x in INPATH.iterdir() if x.is_file()]))
logging.info("Generating JSON SRL files from {} tsv files.".format(len(infiles)))
with Pool(CPU_CORES) as p:
    p.map(handle_file, infiles)
logging.info("Finished generating .json files.")

View File

@@ -1,57 +1,396 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import pickle
from pathlib import Path
from parser.parser import Parser
import configparser
# defaults
ORIGPATH = Path("../data/kres_example") # we need the IDs
INPATH = Path("../data/kres_example_srl")
OUTPATH = Path("../data/kres_example_json")
import json
import sys
import logging
from multiprocessing import Pool
# parse config
config = configparser.ConfigParser()
config.read("tools.cfg")
ORIGPATH = Path(config["tools"]["kres_orig"])
INPATH = Path(config["tools"]["kres_srl"])
OUTPATH = Path(config["tools"]["kres_json"])
ORIGPATH = Path(config["tools"]["giga"])
INPATH = Path(config["tools"]["giga_srl"])
OUTPATH = Path(config["tools"]["giga_json"])
INTERNAL_DATA = Path(config["tools"]["internal_data"])
DEBUG = config["tools"]["debug"] == "True"
CPU_CORES = int(config["tools"]["cpu_cores"])
LOGFILE = Path(config["tools"]["logfile"]).absolute()
LOGFILE.touch(exist_ok=True)
LOGFILE.resolve()
logging.basicConfig(filename=str(LOGFILE), level=logging.INFO)
def get_origfile(filename):
for origfile in ORIGPATH.iterdir():
if filename.name.split('.')[0] == origfile.name.split('.')[0]:
return origfile
raise FileNotFoundError
for origfile in ORIGPATH.iterdir():
if filename.name.split('.')[0] == origfile.name.split('.')[0]:
return origfile
raise FileNotFoundError
def extract_sentences(line_reader):
acc = []
for line in [x.decode("utf-8").split('\t') for x in line_reader]:
if line[0] == '\n':
tmp = acc
acc = []
yield tmp
else:
acc.append(line)
acc = []
# last char in line is \n, remove it
for line in [x.decode("utf-8")[:-1].split('\t') for x in line_reader]:
if len(line) == 1: # empty line
tmp = acc
acc = []
yield tmp
else:
acc.append(line)
def match_sentence_id(string, rd):
str1 = " ".join([token[1] for token in sentence_arr])
for k, e in rd.items():
str2 = " ".join(token[2] for token in dict_entry["tokens"])
if str1 == str2:
return k
raise KeyError
def to_sentence(sentence_arr):
return " ".join([token[1] for token in sentence_arr])
def match_sentence_id(sentence, orig_dict):
for k, e in orig_dict.items():
orig_sentence = " ".join(token[2] for token in e["tokens"])
if sentence == orig_sentence:
return k
raise KeyError
def match_sentence_id_giga(sentence, orig_dict):
for k, e in orig_dict.items():
# orig_sentence = " ".join(token[2] for token in e["tokens"])
if sentence == e["text"]:
return k
raise KeyError
def get_dep_rel(token):
logging.debug(token)
for i, field in enumerate(token[14:]):
if field != "_":
return {
"arg": field,
"from": i, # i-th predicate in sentence
"dep": token[0],
}
return None
def handle_file_old(infile_tpl):
i = infile_tpl[0]
infile = infile_tpl[1]
outfile = (OUTPATH / infile.name).with_suffix(".json")
origfile = get_origfile(infile)
orig_dict = par.parse_tei(origfile)
with infile.open("rb") as fp:
outdata = {}
for sentence_arr in extract_sentences(fp.readlines()):
# tsv dropped sentence ids, match the ID, using original data
sid = match_sentence_id(to_sentence(sentence_arr), orig_dict)
outdata[sid] = []
# find all predicate indices in the sentence
predicates = []
for token in sentence_arr:
if token[12] == "Y":
predicates += [token[0]] # idx
deprel = get_dep_rel(token)
if deprel is not None:
outdata[sid].append(deprel)
# deprel["from"] points to n-th predicate
# replace with predicate's token index
for deprel in outdata[sid]:
deprel["from"] = predicates[deprel["from"]]
if DEBUG:
print(to_sentence(sentence_arr))
print(outdata[sid])
print(sid)
print()
print()
with outfile.open("w") as fp:
json.dump(outdata, fp)
logging.info("SRL relations written to: {}".format(outfile))
if __name__ == "__main__":
def handle_file(whole_input):
# sentence_id = whole_input[0][3]
# orig_infile = whole_input[0][1]
sentence_id = whole_input[3]
orig_infile = whole_input[1]
par = Parser()
# origfile = origfiles[0][1]
# infile_tpl = infile_tpl[0]
for infile in [x for x in INPATH.iterdir() if x.is_file()]:
origfile = get_origfile(infile)
rd = par.parse_tei(origfile)
# i = infile_tpl[0]
# infile = infile_tpl[1]
outfile = (OUTPATH / orig_infile.name).with_suffix(".json")
fp = infile.open("rb")
for sentence_arr in extract_sentences(fp.readlines()):
sid = match_sentence_id(sentence_arr, rd)
print(sid)
# OK, we got the sentence id, now generate the predicate map!
if outfile.exists():
return
# origfile = get_origfile()
orig_dict = par.parse_tei(orig_infile)
outdata = {}
gen = srl_multiple_files_sentences_generator(sentence_id)
# gen = srl_multiple_files_sentences_generator(whole_input[1])
mismatch_sentences = 0
for sentence_i, (orig_id, orig_val) in enumerate(orig_dict.items()):
if orig_id == 'GF0014802.2685.7':
print('PAUSE')
# look at neighbouring sentences if they are correct
for i in range(100):
sentence, sentence_arr = next(gen)
# orig_sentence = " ".join(token[2] for token in e["tokens"])
if sentence == orig_val["text"]:
# if i != 10 and i != 0:
# print('OK!')
sid = orig_id
outdata[sid] = []
# find all predicate indices in the sentence
predicates = []
for token in sentence_arr:
if token[12] == "Y":
predicates += [token[0]] # idx
deprel = get_dep_rel(token)
if deprel is not None:
outdata[sid].append(deprel)
# deprel["from"] points to n-th predicate
# replace with predicate's token index
for deprel in outdata[sid]:
deprel["from"] = predicates[deprel["from"]]
if DEBUG:
print(to_sentence(sentence_arr))
print(outdata[sid])
print(sid)
print()
print()
break
else:
if i == 99:
mismatch_sentences += 1
sid = orig_id
outdata[sid] = []
gen = srl_multiple_files_sentences_generator(sentence_id + sentence_i)
if mismatch_sentences > 0:
if mismatch_sentences / len(orig_dict.items()) < 0.1:
print('Slight mismatch - %d' % sentence_id)
print(whole_input)
print('ABS mitigated %d' % mismatch_sentences)
print('------------------------------------------------')
else:
print('ERRRRRRRRRRRRRRRROOOOOOORRRRRRRRRRR')
print('Big mismatch - %d' % sentence_id)
print(whole_input)
print('ABS mitigated errors:')
print(mismatch_sentences)
print('------------------------------------------------')
outfile = (OUTPATH / infile.name).with_suffix(".json")
with outfile.open("w") as fp:
json.dump(outdata, fp)
logging.info("SRL relations written to: {}".format(outfile))
def count_orig_file_sentences(filename):
if os.path.exists(os.path.join(INTERNAL_DATA, 'orig_chunks', filename[1].name)):
return
print(filename[0])
orig_dict = par.parse_tei(filename[1])
# return filename[0], filename[1], len(orig_dict)
with open(os.path.join(INTERNAL_DATA, 'orig_chunks', filename[1].name), 'wb') as output:
pickle.dump((filename[0], filename[1], len(orig_dict)), output)
def count_srl_file_sentences(filename):
if os.path.exists(os.path.join(INTERNAL_DATA, 'srl_chunks', filename[1].name)):
return
print(filename[0])
num_sentences = 0
with filename[1].open("r") as fp:
for line in fp:
if line == '\n':
num_sentences += 1
# return filename[0], filename[1], num_sentences
with open(os.path.join(INTERNAL_DATA, 'srl_chunks', filename[1].name), 'wb') as output:
pickle.dump((filename[0], filename[1], num_sentences), output)
def srl_sentences_generator(infile, curr_index, sen_start_index):
with infile.open("rb") as fp:
outdata = {}
for sentence_arr in extract_sentences(fp.readlines()):
if curr_index < sen_start_index:
curr_index += 1
else:
yield to_sentence(sentence_arr), sentence_arr
yield None
def srl_multiple_files_sentences_generator(sentence_id): # srl_files):
sentence_id = max(0, sentence_id - 10)
for i, srl_file in enumerate(srl_file_sizes):
if sentence_id >= srl_file[3] and sentence_id < srl_file[3] + srl_file[2]:
srl_files = srl_file_sizes[i:]
break
for file_info in srl_files:
# srl_gen = srl_sentences_generator(file_info[1], file_info[3], file_info[4])
srl_gen = srl_sentences_generator(file_info[1], file_info[3], sentence_id)
el = next(srl_gen)
while el is not None:
yield el
el = next(srl_gen)
yield None
# main
par = Parser()
OUTPATH.mkdir(exist_ok=True)
infiles = list(enumerate([x for x in iter(sorted(INPATH.iterdir())) if x.is_file()]))
logging.info("Generating JSON SRL files from {} tsv files.".format(len(infiles)))
origfiles = []
for subdir, dirs, files in os.walk(ORIGPATH):
for file in files:
origfiles.append(Path(os.path.join(subdir, file)))
origfiles=list(enumerate(sorted(origfiles)))
##### REMOVE ############
# origfiles = origfiles[:3]
# count sentences in orig (if not counted before)
# os.remove(os.path.join(INTERNAL_DATA, 'orig_counted_sentences.pkl'))
if not os.path.exists(os.path.join(INTERNAL_DATA, 'orig_counted_sentences.pkl')):
# srl_file_sizes = {}
if not os.path.exists(os.path.join(INTERNAL_DATA, 'orig_chunks')):
os.makedirs(os.path.join(INTERNAL_DATA, 'orig_chunks'))
# with Pool(CPU_CORES) as p:
# # p.map(handle_file, infiles)
# p.map(count_orig_file_sentences, origfiles)
for i in range(len(origfiles)):
count_orig_file_sentences(origfiles[i])
orig_file_sizes = []
for x in iter(sorted(Path(os.path.join(INTERNAL_DATA, 'orig_chunks')).iterdir())):
print(x.name)
if x.is_file():
with x.open('rb') as pkl_small_file:
orig_file_sizes.append(pickle.load(pkl_small_file))
# orig_file_sizes = list(enumerate([x for x in iter(sorted(INPATH.iterdir())) if x.is_file()]))
print("Sorting orig files")
orig_file_sizes = sorted(orig_file_sizes)
total_size = 0
orig_file_sizes_final = []
print("Calculating orig files size")
for n, pa, si in orig_file_sizes:
orig_file_sizes_final.append((n, pa, si, total_size))
total_size += si
orig_file_sizes = orig_file_sizes_final
print("Saving orig files size")
with open(os.path.join(INTERNAL_DATA, 'orig_counted_sentences.pkl'), 'wb') as output:
pickle.dump(orig_file_sizes, output)
print("Orig files saved")
else:
with open(os.path.join(INTERNAL_DATA, 'orig_counted_sentences.pkl'), 'rb') as pkl_file:
orig_file_sizes = pickle.load(pkl_file)
# count sentences in srl (if not counted before)
# os.remove(os.path.join(INTERNAL_DATA, 'srl_counted_sentences.pkl'))
if not os.path.exists(os.path.join(INTERNAL_DATA, 'srl_counted_sentences.pkl')):
# srl_file_sizes = {}
if not os.path.exists(os.path.join(INTERNAL_DATA, 'srl_chunks')):
os.makedirs(os.path.join(INTERNAL_DATA, 'srl_chunks'))
# with Pool(CPU_CORES) as p:
# # p.map(handle_file, infiles)
# p.map(count_srl_file_sentences, infiles)
for i in range(len(infiles)):
count_srl_file_sentences(infiles[i])
srl_file_sizes = []
for x in iter(sorted(Path(os.path.join(INTERNAL_DATA, 'srl_chunks')).iterdir())):
print(x.name)
if x.is_file():
with x.open('rb') as pkl_small_file:
srl_file_sizes.append(pickle.load(pkl_small_file))
print("Sorting srl files")
srl_file_sizes = sorted(srl_file_sizes)
total_size = 0
srl_file_sizes_final = []
print("Calculating srl files size")
for n, pa, si in srl_file_sizes:
srl_file_sizes_final.append((n, pa, si, total_size))
total_size += si
srl_file_sizes = srl_file_sizes_final
print("Saving srl files size")
with open(os.path.join(INTERNAL_DATA, 'srl_counted_sentences.pkl'), 'wb') as output:
pickle.dump(srl_file_sizes, output)
print("Srl files saved")
else:
with open(os.path.join(INTERNAL_DATA, 'srl_counted_sentences.pkl'), 'rb') as pkl_file:
srl_file_sizes = pickle.load(pkl_file)
# print(len(orig_file_sizes))
# print('asd' + 2)
# inputs = []
# srl_i = 0
# srl_file = srl_file_sizes[srl_i]
# for orig_i, orig_path, orig_size, orig_first_sent_i in orig_file_sizes:
# interesting_srl_files = []
# # beginning of srl chunk in range of orig chunk or ending of srl chunk in range of orig chunk
# # while srl_file[3] >= orig_first_sent_i and srl_file[3] < orig_first_sent_i + orig_size or \
# # srl_file[3] + srl_file[2] - 1 >= orig_first_sent_i and srl_file[3] + srl_file[2] - 1 < orig_first_sent_i + orig_size:
# while srl_file[3] < orig_first_sent_i + orig_size and srl_file[3] + srl_file[2] > orig_first_sent_i:
# # if beginning of file is in
# if srl_file[3] > orig_first_sent_i:
# interesting_srl_files.append((srl_file[0], srl_file[1], srl_file[2], srl_file[3], srl_file[3]))
# # print('if %d' % srl_file[3])
# else:
# interesting_srl_files.append((srl_file[0], srl_file[1], srl_file[2], srl_file[3], orig_first_sent_i))
# # print('else %d' % orig_first_sent_i)
#
# if orig_first_sent_i + orig_size >= srl_file[3] + srl_file[2]:
# srl_i += 1
# if srl_i < len(srl_file_sizes):
# srl_file = srl_file_sizes[srl_i]
# else:
# break
# # print(srl_i)
# # print('a ' + 2)
# else:
# break
#
# inputs.append([[orig_i, orig_path, orig_size, orig_first_sent_i], interesting_srl_files])
# print(inputs[-1])
# srl_gen = srl_sentences_generator(srl_file_sizes[0][1], 0, 533)
# a = next(srl_gen)
# b = next(srl_gen)
# c = next(srl_gen)
print('beginning processing')
with Pool(CPU_CORES) as p:
# p.map(handle_file, inputs)
p.map(handle_file, orig_file_sizes)
# for of in orig_file_sizes:
# handle_file(of)
logging.info("Finished generating .json files.")

View File

@@ -0,0 +1,294 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import pickle
from pathlib import Path
from parser.parser import Parser
import configparser
import json
import sys
import logging
from multiprocessing import Pool
# parse config: paths, parallelism and logging all come from tools.cfg
config = configparser.ConfigParser()
config.read("tools.cfg")
ORIGPATH = Path(config["tools"]["giga"])
INPATH = Path(config["tools"]["giga_srl_errors"])
OUTPATH = Path(config["tools"]["giga_json"])
INTERNAL_DATA = Path(config["tools"]["internal_data"])
DEBUG = config["tools"]["debug"] == "True"
CPU_CORES = int(config["tools"]["cpu_cores"])
LOGFILE = Path(config["tools"]["logfile"]).absolute()
# make sure the logfile exists before logging is configured
LOGFILE.touch(exist_ok=True)
LOGFILE.resolve()
logging.basicConfig(filename=str(LOGFILE), level=logging.INFO)
# ids of sentences that previously failed token-count validation
# NOTE(review): the file handle opened inline here is never closed
error_sentences = [line.rstrip('\n') for line in open(os.path.join(INTERNAL_DATA, 'sentences_with_less_than_token.txt'))]
def get_origfile(filename):
    """Return the file in ORIGPATH whose basename (up to the first dot)
    matches the given srl tsv file; raise FileNotFoundError if none."""
    for origfile in ORIGPATH.iterdir():
        if filename.name.split('.')[0] == origfile.name.split('.')[0]:
            return origfile
    raise FileNotFoundError
def extract_sentences(line_reader):
    """Split decoded tsv byte lines into per-sentence lists of rows.

    An empty line (one with no tab-separated fields) closes the current
    sentence; each yielded sentence is a list of column-string lists.
    """
    sentence_rows = []
    for raw_line in line_reader:
        # strip the trailing newline, then split on tabs
        columns = raw_line.decode("utf-8")[:-1].split('\t')
        if len(columns) == 1:  # empty line -> sentence boundary
            completed = sentence_rows
            sentence_rows = []
            yield completed
        else:
            sentence_rows.append(columns)
def to_sentence(sentence_arr):
    """Reconstruct the sentence text from column 1 of each token row."""
    return " ".join(row[1] for row in sentence_arr)
def match_sentence_id(sentence, orig_dict):
    """Look up the id whose token column 2 values join to `sentence`.

    Raises KeyError when no entry matches.
    """
    for key in orig_dict:
        joined = " ".join(tok[2] for tok in orig_dict[key]["tokens"])
        if joined == sentence:
            return key
    raise KeyError
def match_sentence_id_giga(sentence, orig_dict):
    """Return the key of the entry whose precomputed "text" field equals
    the given sentence string; raise KeyError when nothing matches."""
    for key, entry in orig_dict.items():
        # orig_sentence = " ".join(token[2] for token in e["tokens"])
        if entry["text"] == sentence:
            return key
    raise KeyError
def get_dep_rel(token):
    """Return the first non-"_" SRL argument after column 13 as a dict
    ({"arg", "from", "dep"}), or None when the row has no argument."""
    logging.debug(token)
    arg_columns = token[14:]
    for position in range(len(arg_columns)):
        if arg_columns[position] != "_":
            return {
                "arg": arg_columns[position],
                "from": position,  # i-th predicate in sentence
                "dep": token[0],
            }
    return None
def handle_file_old(infile_tpl):
    """Previous-generation converter: one SRL tsv file -> one json file
    mapping sentence ids to semantic-role relations.

    infile_tpl: (index, Path) pair. Relies on module globals OUTPATH,
    par, DEBUG and on get_origfile(). Kept for reference; the error-fix
    path below uses fix_json() instead.
    """
    i = infile_tpl[0]
    infile = infile_tpl[1]
    outfile = (OUTPATH / infile.name).with_suffix(".json")
    origfile = get_origfile(infile)
    orig_dict = par.parse_tei(origfile)
    with infile.open("rb") as fp:
        outdata = {}
        for sentence_arr in extract_sentences(fp.readlines()):
            # tsv dropped sentence ids, match the ID, using original data
            sid = match_sentence_id(to_sentence(sentence_arr), orig_dict)
            outdata[sid] = []
            # find all predicate indices in the sentence
            predicates = []
            for token in sentence_arr:
                if token[12] == "Y":
                    predicates += [token[0]]  # idx
                deprel = get_dep_rel(token)
                if deprel is not None:
                    outdata[sid].append(deprel)
            # deprel["from"] points to n-th predicate
            # replace with predicate's token index
            for deprel in outdata[sid]:
                deprel["from"] = predicates[deprel["from"]]
            if DEBUG:
                print(to_sentence(sentence_arr))
                print(outdata[sid])
                print(sid)
                print()
                print()
    with outfile.open("w") as fp:
        json.dump(outdata, fp)
    logging.info("SRL relations written to: {}".format(outfile))
def fix_json(srl_gen, error_sentence, orig_json_data):
    # Re-derive the SRL relations for one known-bad sentence id and patch
    # them into orig_json_data, which is returned (and also mutated in place).
    # srl_gen yields (sentence_text, token_rows) pairs in the same order as
    # the error sentences being fixed.
    # NOTE(review): indentation below is reconstructed from a flattened
    # source; the recompute steps are assumed to run only when the stored
    # relations were non-empty -- confirm against the original file.
    sentence, sentence_arr = next(srl_gen)
    sid = error_sentence
    if orig_json_data[sid] != []:
        # existing relations are suspect -- wipe and rebuild them
        orig_json_data[sid] = []
        # find all predicate indices in the sentence
        predicates = []
        for token in sentence_arr:
            if token[12] == "Y":
                predicates += [token[0]]  # idx
            deprel = get_dep_rel(token)
            if deprel is not None:
                orig_json_data[sid].append(deprel)
        # deprel["from"] points to the n-th predicate;
        # replace it with that predicate's token index
        for deprel in orig_json_data[sid]:
            deprel["from"] = predicates[deprel["from"]]
        if DEBUG:
            print(to_sentence(sentence_arr))
            print(orig_json_data[sid])
            print(sid)
            print()
            print()
    return orig_json_data
def count_orig_file_sentences(filename):
    """Cache (index, path, sentence count) of one original TEI file as a pickle.

    filename is an (index, Path) pair; the cached chunk lives under
    INTERNAL_DATA/orig_chunks and is skipped when it already exists.
    """
    cache_path = os.path.join(INTERNAL_DATA, 'orig_chunks', filename[1].name)
    if os.path.exists(cache_path):
        return
    print(filename[0])
    orig_dict = par.parse_tei(filename[1])
    with open(cache_path, 'wb') as output:
        pickle.dump((filename[0], filename[1], len(orig_dict)), output)
def count_srl_file_sentences(filename):
    """Cache (index, path, sentence count) of one SRL .tsv file as a pickle.

    Sentences are delimited by blank lines; the cached chunk lives under
    INTERNAL_DATA/srl_chunks and is skipped when it already exists.
    """
    cache_path = os.path.join(INTERNAL_DATA, 'srl_chunks', filename[1].name)
    if os.path.exists(cache_path):
        return
    print(filename[0])
    with filename[1].open("r") as fp:
        num_sentences = sum(1 for line in fp if line == '\n')
    with open(cache_path, 'wb') as output:
        pickle.dump((filename[0], filename[1], num_sentences), output)
def srl_error_fix_generator(infile):
    """Yield (sentence_text, token_rows) for every sentence in infile.

    A final None sentinel marks exhaustion, matching the convention used by
    the other srl_* generators in this file.
    """
    with infile.open("rb") as fp:
        for rows in extract_sentences(fp.readlines()):
            yield to_sentence(rows), rows
    yield None
def srl_sentences_generator(infile, curr_index, sen_start_index):
    """Yield (sentence_text, token_rows) from infile, skipping early sentences.

    Sentences are skipped until curr_index (the running sentence counter)
    catches up with sen_start_index; a final None sentinel marks exhaustion.

    Fix: removed the unused local ``outdata`` dict left over from an earlier
    revision of this function.
    """
    with infile.open("rb") as fp:
        for sentence_arr in extract_sentences(fp.readlines()):
            if curr_index < sen_start_index:
                curr_index += 1
            else:
                yield to_sentence(sentence_arr), sentence_arr
    yield None
def srl_multiple_files_sentences_generator(sentence_id):  # srl_files):
    # Yield (text, rows) sentence pairs from the chunked SRL files, starting
    # roughly 10 sentences before sentence_id, then a final None sentinel.
    # Relies on the module-level srl_file_sizes list, whose entries appear to
    # be (index, path, sentence_count, start_offset, ...) tuples -- TODO
    # confirm against the code that builds it (not visible here).
    sentence_id = max(0, sentence_id - 10)
    for i, srl_file in enumerate(srl_file_sizes):
        if sentence_id >= srl_file[3] and sentence_id < srl_file[3] + srl_file[2]:
            srl_files = srl_file_sizes[i:]
            break
    # NOTE(review): if no entry covers sentence_id, srl_files is never bound
    # and the loop below raises NameError.
    for file_info in srl_files:
        # srl_gen = srl_sentences_generator(file_info[1], file_info[3], file_info[4])
        srl_gen = srl_sentences_generator(file_info[1], file_info[3], sentence_id)
        el = next(srl_gen)
        while el is not None:
            yield el
            el = next(srl_gen)
    yield None
# Group the flat list of error sentence ids by source file: the first nine
# characters of an id name the file it came from.  Assumes error_sentences
# (defined earlier in this file, above this excerpt) is sorted so that ids
# from the same file are adjacent -- TODO confirm.
error_sentences_grouped = []
group = False  # False until the first group is started; then a list
prev_name = ''
# group sentences by their files
for name in error_sentences:
    if name[:9] == prev_name:
        group.append(name)
    else:
        prev_name = name[:9]
        if group:
            error_sentences_grouped.append(group)
        group = [name]
# flush the final group (non-empty whenever error_sentences was non-empty)
error_sentences_grouped.append(group)
# Walk every group of bad sentence ids, re-deriving their SRL relations from
# the consolidated error file (INPATH) and rewriting the affected
# <file>-dedup.json outputs in place.  srl_gen must yield sentences in the
# same order as the grouped error ids -- TODO confirm both come from the
# same sorted listing.
srl_gen = srl_error_fix_generator(INPATH)
# (a long commented-out debugging block that located unparseable json files
# was removed from the comments here)
# iterate over all wronged sentences and fix them
for errors_in_file in error_sentences_grouped:
    outfile = Path(OUTPATH, errors_in_file[0][:9] + '-dedup.json')
    with outfile.open() as json_file:
        print(outfile.name)
        orig_json_data = json.load(json_file)
    for error_sentence in errors_in_file:
        orig_json_data = fix_json(srl_gen, error_sentence, orig_json_data)
    # write the patched relations back over the original json file
    with outfile.open('w') as json_file:
        json.dump(orig_json_data, json_file)
    logging.info("SRL relations written to: {}".format(outfile))

View File

@@ -1,3 +1,5 @@
import pickle
from parser.parser import Parser
import os
from os.path import join, dirname
@@ -6,10 +8,8 @@ import re
import sys
import cProfile
import configparser
# some defaults
INDIR = Path("../data/kres_example")
OUTDIR = Path("../data/kres_example_tsv")
import logging
from multiprocessing import Pool
SSJ500K_2_1 = 27829 # number of sentences
par = Parser()
@@ -17,8 +17,28 @@ par = Parser()
# path to data
config = configparser.ConfigParser()
config.read("tools.cfg")
INDIR = Path(config["tools"]["kres_orig"])
OUTDIR = Path(config["tools"]["kres_tsv"])
analysis = ''
if 'kres_orig' in config["tools"]:
analysis = 'kres'
INDIR = Path(config["tools"]["kres_orig"])
OUTDIR = Path(config["tools"]["kres_tsv"])
elif 'giga_orig' in config["tools"]:
# analysis = 'gigafida'
analysis = 'giga'
INDIR_GIGA_ORIG = Path(config["tools"]["giga"])
INDIR_GIGA = Path(config["tools"]["giga_orig"])
INDIR_JOS = Path(config["tools"]["giga_jos"])
OUTDIR = Path(config["tools"]["giga_tsv"])
GIGA_PARTS = int(config["tools"]["giga_parts"])
INTERNAL_DATA = config["tools"]["internal_data"]
CPU_CORES = int(config["tools"]["cpu_cores"])
LOGFILE = Path(config["tools"]["logfile"]).absolute()
LOGFILE.touch(exist_ok=True)
LOGFILE.resolve()
logging.basicConfig(filename=str(LOGFILE), level=logging.INFO)
"""
print("parsing ssj")
@@ -28,22 +48,330 @@ ssj_dict = par.parse_tei(ssj_file)
print("end parsing ssj")
"""
print("parsing kres")
# kres_file = "../data/kres_example/F0019343.xml.parsed.xml"
OUTDIR.mkdir(exist_ok=True)
for kres_file in [x for x in INDIR.iterdir() if x.is_file()]:
if analysis == 'kres':
infiles = list(enumerate([x for x in INDIR.iterdir() if x.is_file()]))
logging.info("Parsing kres: {} files.".format(len(infiles)))
print("Processing file: " + str(kres_file))
res_dict = par.parse_tei(kres_file)
longest_sent = max([len(e["tokens"]) for k, e in res_dict.items()])
print("Longest sentence: ", longest_sent)
kres_out_str = ""
def handle_file(infile):
i = infile[0]
kres_file = infile[1]
outfile = (OUTDIR / kres_file.name).with_suffix(".tsv")
for _, sentence in res_dict.items():
kres_out_str += par.to_conll_2009_SRL(sentence, longest_sent)
if outfile.is_file():
logging.info("Skipping existing file: {}.".format(str(kres_file)))
return True
with (OUTDIR / kres_file.name).with_suffix(".tsv").open("wb+") as fp:
try:
res_dict = par.parse_tei(kres_file)
kres_out_str = ""
for _, sentence in res_dict.items():
kres_out_str += par.to_conll_2009_SRL(sentence)
except Exception as exc:
logging.info("Failed processing file: {}".format(str(kres_file)))
logging.error(exc)
return False
with outfile.open("wb+") as fp:
fp.write(kres_out_str.encode("utf-8"))
fp.close()
print("end parsing kres")
logging.info("Processed file ({}/{}): {}".format(i+1, len(infiles), str(kres_file)))
return True
return False
def giga_orig_generator():
    """Yield lines of the giga original dump, collapsing blank-line runs.

    Consecutive empty lines are reduced to a single one so that the stream
    stays in lockstep with the jos dump iterated alongside it.
    """
    with open(INDIR_GIGA, 'r') as gof:
        prev_blank = False
        for line in gof:
            if line == '\n':
                if prev_blank:
                    # second (or later) blank in a row -- drop it
                    continue
                prev_blank = True
            else:
                prev_blank = False
            yield line
def handle_gigafida_file():
    """
    File that splits big text file into more minor files. Only split on empty lines.
    """
    # NOTE(review): indentation below is reconstructed from a flattened
    # source -- the resume logic is intricate; verify against the original.
    # (one-off counting pass, kept for reference)
    # with open(INDIR_GIGA, 'r') as gof:
    #     with open(INDIR_JOS, 'r') as gjf:
    #         for i, (l_gof, l_gjf) in enumerate(zip(gof, gjf)):
    #             pass
    #     num_lines = i + 1
    #     print(num_lines)
    num_lines = 1393184026  # precomputed line count of the jos dump
    num_lines_per_part = num_lines / GIGA_PARTS
    curr_part = 0
    gof_generator = giga_orig_generator()
    with open(INDIR_JOS, 'r') as gjf:
        sentence = {}
        sentence['tokens'] = []
        sentence['links'] = {}
        # resume support: if part 0 already exists we fast-forward over lines
        # already processed instead of rewriting them
        if os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % 0)):
            ignore_lines = True
            wf = False  # no output file while skipping
        else:
            wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part), 'a')
            ignore_lines = False
        for i, l_gjf in enumerate(gjf):
            l_gof = next(gof_generator)
            if ignore_lines:
                if i > num_lines_per_part * curr_part and l_gof == '\n':
                    # stop skipping at the first part whose successor file is
                    # missing (that part may be incomplete)
                    if not os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % (curr_part + 2))):
                        ignore_lines = False
                        # delete last file (probably not whole)
                        os.remove(os.path.join(OUTDIR, 'giga%07d.tsv' % (curr_part + 1)))
                    if ignore_lines:
                        print(curr_part)
                        curr_part += 1
                        continue
                else:
                    continue
            l_gof_split = l_gof.split('\t')
            l_gjf_split = l_gjf.split('\t')
            if l_gof != '\n':
                # if punctuation (orig token tag appears to end in 'u')
                if l_gof_split[1][-1] == 'u':
                    sentence['tokens'].append(('c', l_gjf_split[0], l_gjf_split[1]))
                else:
                    sentence['tokens'].append(('w', l_gjf_split[0], l_gof_split[0], l_gof_split[1][:-2], l_gof_split[3][:-1]))
                sentence['links'][l_gjf_split[0]] = (l_gjf_split[7], l_gjf_split[0], l_gjf_split[6])
            else:
                # sentence boundary: flush the accumulated sentence
                if wf:
                    wf.write(par.to_conll_2009_SRL(sentence))
                sentence['tokens'] = []
                sentence['links'] = {}
                # roll over to the next output part on the size boundary
                if i > num_lines_per_part * (curr_part + 1):
                    curr_part += 1
                    # if wf doesn't exist (first one)
                    if wf:
                        wf.close()
                    wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part), 'a')
    # NOTE(review): placement of these two trailing lines is a guess; wf may
    # still be False here if the whole input was skipped.
    curr_part += 1
    wf.close()
import time
def handle_giga_file(ran):
    """
    File that splits big text file into more minor files. Only split on empty lines.
    """
    # Worker variant of handle_gigafida_file: processes only output parts in
    # the half-open range ran = [start, end), and only parts listed in the
    # module-level file_indices set.  Indentation reconstructed -- verify.
    num_lines = 1393184026  # precomputed line count of the jos dump
    num_lines_per_part = num_lines / GIGA_PARTS
    curr_part = 0
    gof_generator = giga_orig_generator()
    with open(INDIR_JOS, 'r') as gjf:
        sentence = {}
        sentence['tokens'] = []
        sentence['links'] = {}
        wf = None
        # NOTE(review): this guards on curr_part (always 0 here) but opens the
        # file for ran[0] -- looks like it should test `ran[0] in file_indices`.
        if curr_part in file_indices:
            if os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % ran[0])):
                os.remove(os.path.join(OUTDIR, 'giga%07d.tsv' % ran[0]))
            wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % ran[0]), 'a')
        for i, l_gjf in enumerate(gjf):
            l_gof = next(gof_generator)
            # fast-forward until we reach the first part of our range
            if curr_part < ran[0]:
                if i > num_lines_per_part * (curr_part + 1) and l_gof == '\n':
                    if curr_part < ran[0]:
                        print(curr_part)
                        curr_part += 1
                        continue
                else:
                    continue
            l_gof_split = l_gof.split('\t')
            l_gjf_split = l_gjf.split('\t')
            if l_gof != '\n':
                # parts not in file_indices are read but not accumulated
                if curr_part not in file_indices:
                    continue
                # if punctuation
                if l_gof_split[1][-1] == 'u':
                    sentence['tokens'].append(('c', l_gjf_split[0], l_gjf_split[1]))
                else:
                    sentence['tokens'].append(('w', l_gjf_split[0], l_gof_split[0], l_gof_split[1][:-2], l_gof_split[3][:-1]))
                sentence['links'][l_gjf_split[0]] = (l_gjf_split[7], l_gjf_split[0], l_gjf_split[6])
            else:
                # sentence boundary: flush, then roll over parts as needed
                if curr_part in file_indices:
                    wf.write(par.to_conll_2009_SRL(sentence))
                sentence['tokens'] = []
                sentence['links'] = {}
                if i > num_lines_per_part * (curr_part + 1):
                    curr_part += 1
                    # if wf doesn't exist (first one)
                    if curr_part in file_indices and wf:
                        wf.close()
                    if curr_part >= ran[1]:
                        break
                    if curr_part in file_indices:
                        if os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part)):
                            os.remove(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part))
                        wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part), 'a')
    # NOTE(review): trailing placement is a guess; wf may be None here.
    curr_part += 1
    wf.close()
def handle_giga_file_selected_sentences(error_sentences):
    """
    File that splits big text file into more minor files. Only split on empty lines.
    """
    # Re-runs only the sentences whose ids are in error_sentences, writing
    # them all to a single OUTDIR/giga_errors file.  Sentence ids come from
    # the pickled sentence_ids_list, which must be aligned 1:1 with the
    # blank-line-delimited sentences of the jos dump -- TODO confirm.
    # Indentation reconstructed from a flattened source.
    # print('num_lines' + 3)
    # num_lines = 1393184026
    num_lines = 1393222523  # precomputed line count (unused below)
    gof_generator = giga_orig_generator()
    with open(INDIR_JOS, 'r') as gjf:
        sentence = {}
        sentence['tokens'] = []
        sentence['links'] = {}
        wf = None
        # start fresh: any previous error output is discarded
        if os.path.exists(os.path.join(OUTDIR, 'giga_errors')):
            os.remove(os.path.join(OUTDIR, 'giga_errors'))
        wf = open(os.path.join(OUTDIR, 'giga_errors'), 'a')
        with open(os.path.join(INTERNAL_DATA, 'sentence_ids_list.pkl'), 'rb') as pkl_file:
            sentence_ids_list = pickle.load(pkl_file)
        sentence_id = 0
        skip_sentence = not sentence_ids_list[sentence_id] in error_sentences
        for i, l_gjf in enumerate(gjf):
            l_gof = next(gof_generator)
            if l_gjf == '\n':
                # sentence boundary: flush if this one was selected
                if not skip_sentence:
                    wf.write(par.to_conll_2009_SRL(sentence))
                sentence['tokens'] = []
                sentence['links'] = {}
                sentence_id += 1
                # NOTE(review): indexes past the end if the dump has more
                # sentences than sentence_ids_list -- IndexError hazard.
                if sentence_ids_list[sentence_id] in error_sentences:
                    print(sentence_ids_list[sentence_id])
                    skip_sentence = False
                else:
                    skip_sentence = True
            if skip_sentence:
                continue
            l_gof_split = l_gof.split('\t')
            l_gjf_split = l_gjf.split('\t')
            # if punctuation
            if l_gof != '\n':
                if l_gof_split[1][-1] == 'u':
                    sentence['tokens'].append(('c', l_gjf_split[0], l_gjf_split[1]))
                else:
                    sentence['tokens'].append(('w', l_gjf_split[0], l_gof_split[0], l_gof_split[1][:-2], l_gof_split[3][:-1]))
                sentence['links'][l_gjf_split[0]] = (l_gjf_split[7], l_gjf_split[0], l_gjf_split[6])
        # (a large commented-out block of the per-part chunking logic from
        # handle_giga_file was removed from the comments here)
    wf.close()
# Entry point: dispatch on the `analysis` mode resolved from tools.cfg.
# file_indices restricts which giga output parts the workers may (re)write;
# it is overwritten by the pickled diff list when present.
file_indices = set(range(0, 100000))
with open(os.path.join(INTERNAL_DATA, 'diffs_updated_gigafida.pkl'), 'rb') as pkl_file:
    file_indices = set(pickle.load(pkl_file))
with Pool(CPU_CORES) as p:
    if analysis == 'kres':
        p.map(handle_file, infiles)
    elif analysis == 'gigafida':
        handle_gigafida_file()
    elif analysis == 'giga':
        # split the part range evenly across workers
        final_range = [0, 100000]
        size_per_proc = (final_range[1] - final_range[0]) / CPU_CORES
        ranges = []
        ps = None
        for i in range(CPU_CORES):
            s = int(final_range[0] + size_per_proc * i)
            ns = int(final_range[0] + size_per_proc * (i + 1))
            ranges.append([s, ns])
        # NOTE(review): the parallel run is disabled -- `ranges` is computed
        # but unused while the map call below stays commented out.
        # p.map(handle_giga_file, ranges)
        # NOTE(review): this open() leaks its file handle (never closed).
        error_sentences = [line.rstrip('\n') for line in open(os.path.join(INTERNAL_DATA, 'sentences_with_less_than_token.txt'))]
        handle_giga_file_selected_sentences(set(error_sentences))
logging.info("end parsing kres")

View File

@@ -35,7 +35,11 @@ class Msdmap:
def slo_msd_to_eng_long(self, slo_msd):
    """Map a Slovene MSD tag to its long English description.

    Looks the tag up in self.msd_table (a DataFrame with "slo_msd" and
    "eng_long" columns); returns "No-matching-msd-found" when absent.
    """
    # NOTE(review): slo_msd is interpolated into the query expression; a tag
    # containing a quote would break it.  Fine for MSD tags, not general input.
    matches = self.msd_table.query("slo_msd == '{}'".format(slo_msd))
    if matches.empty:
        return "No-matching-msd-found"
    return matches["eng_long"].values[0]
def slo_msd_to_eng_pos(self, slo_msd):
# first letter in slo_msd == slo_pos

View File

@@ -57,7 +57,10 @@ class Parser:
divs = [] # in ssj, there are divs, in Kres, there are separate files
if "id" in root.keys():
# Kres files start with <TEI id=...>
guess_corpus = "KRES"
if root.get("id")[0:2] == 'GF':
guess_corpus = "GIGA"
else:
guess_corpus = "KRES"
divs = [root]
else:
guess_corpus = "SSJ"
@@ -65,7 +68,10 @@ class Parser:
# parse divs
for div in divs:
f_id = div.get("id")
f_id = div.get("id")[:-6]
if guess_corpus == "GIGA":
div = div.findall(".//body")[0]
# parse paragraphs
for p in div.findall(".//p"):
@@ -75,60 +81,74 @@ class Parser:
for s in p.findall(".//s"):
s_id = s.get("id").split(".")[-1]
sentence_text = ""
sentence_list = []
sentence_tokens = []
# parse tokens
for el in s.iter():
if el.tag in self.W_TAGS:
el_id = el.get("id").split(".")[-1]
if el_id[0] == 't':
el_id = el_id[1:] # ssj W_TAG ids start with t
sentence_text += el.text
sentence_tokens += [(
"w",
int(el_id),
el.text,
el.get("lemma"),
(el.get("msd") if guess_corpus == "KRES"
else el.get("ana").split(":")[-1]),
)]
if guess_corpus != "GIGA":
el_id = el.get("id").split(".")[-1]
if el_id[0] == 't':
el_id = el_id[1:] # ssj W_TAG ids start with t
sentence_text += el.text
sentence_tokens += [(
"w",
int(el_id),
el.text,
el.get("lemma"),
(el.get("msd") if guess_corpus == "KRES" or guess_corpus == "GIGA"
else el.get("ana").split(":")[-1]),
)]
else:
sentence_list.append(el.text)
elif el.tag in self.C_TAGS:
# only Kres' C_TAGS have ids
el_id = el.get("id") or "none"
el_id = el_id.split(".")[-1]
sentence_text += el.text
sentence_tokens += [("c", el_id, el.text,)]
if guess_corpus != "GIGA":
el_id = el.get("id") or "none"
el_id = el_id.split(".")[-1]
sentence_text += el.text
sentence_tokens += [("c", el_id, el.text,)]
elif el.tag in self.S_TAGS:
# Kres' <S /> doesn't contain .text
sentence_text += " "
if guess_corpus == "GIGA":
sentence_list.append(el.text)
else:
sentence_text += " "
else:
# pass links and linkGroups
pass
sentence_id = "{}.{}.{}".format(f_id, p_id, s_id)
if sentence_id in res_dict:
raise KeyError("duplicated id: {}".format(sentence_id))
res_dict[sentence_id] = {
"sid": sentence_id,
"text": sentence_text,
"tokens": sentence_tokens,
"links": (
parse_links(s) if guess_corpus == "KRES" else None
)
}
if guess_corpus == "GIGA":
res_dict[sentence_id] = {
"sid": sentence_id,
"text": ' '.join(sentence_list),
"tokens": None,
"links": None
}
else:
res_dict[sentence_id] = {
"sid": sentence_id,
"text": sentence_text,
"tokens": sentence_tokens,
"links": (
parse_links(s) if guess_corpus == "KRES" else None
)
}
fp.close()
return res_dict
def to_conll_2009_SRL(self, sentence_entry, napreds=9):
def to_conll_2009_SRL(self, sentence_entry):
def fillpred(tsv_row):
mrow = build_model_row(tsv_row)
x = mrow[:-1]
x = mrow[:-1]
y = self.fillpred_model.predict([x])
return y[0] # bool
apreds_string = '\t'.join(["_" for x in range(napreds)])
# works with kres, with parsed links
out_str = ""
for token in sentence_entry["tokens"]:
@@ -141,7 +161,7 @@ class Parser:
[t_id] +
[form for x in range(7)] +
["0", "0", "modra", "modra", "_", "_"] +
[apreds_string, "\n"]
["\n"]
)
continue
@@ -170,7 +190,6 @@ class Parser:
sentence_entry["links"][t_id][0], # pdeprel
"_", # fillpred
"_", # pred
apreds_string,
"\n",
]
fprd = fillpred(row_list)

View File

@@ -34,7 +34,8 @@ JVM_ARGS="-cp $CP -Xmx$MEM"
NOPI="-nopi" #Uncomment this if you want to skip the predicate identification step. This setting is equivalent to the CoNLL 2009 ST.
CMD="$JAVA $JVM_ARGS se.lth.cs.srl.Parse $Lang $INPUT $MODEL $RERANKER $NOPI $OUTPUT"
echo "Executing: $CMD"
$JAVA $JVM_ARGS se.lth.cs.srl.Parse $Lang "$INPUT" $MODEL $RERANKER $NOPI "$OUTPUT"
# CMD="$JAVA $JVM_ARGS se.lth.cs.srl.Parse $Lang '$INPUT' $MODEL $RERANKER $NOPI '$OUTPUT'"
# echo "Executing: $CMD"
$CMD
# $CMD

View File

@@ -0,0 +1,29 @@
#!/bin/bash
# Run the mate-tools SRL tagger over every giga .tsv chunk.
# Folder locations are parsed from ../tools.cfg.

# parsing tools.cfg values
IN_FOLDER="$(sed -n -e 's/^\s*giga_tsv\s*=\s*//p' ../tools.cfg)"
echo "input folder: $IN_FOLDER"
OUT_FOLDER="$(sed -n -e 's/^\s*giga_srl\s*=\s*//p' ../tools.cfg)"
echo "output folder: $OUT_FOLDER"
SUFFIX="srl.tsv"

mkdir -p "$OUT_FOLDER"
# Fix: the glob must stay OUTSIDE the quotes -- "$OUT_FOLDER/*$SUFFIX" would
# try to delete a file literally named '*srl.tsv'.
rm "$OUT_FOLDER"/*"${SUFFIX}" &> /dev/null

# Fix: likewise the loop glob must be unquoted to expand; the folder part is
# quoted because configured paths may contain spaces.
for infile in "$IN_FOLDER"/*; do
	echo "Tagging: ${infile}"
	base=$(basename "$infile" | cut -d'.' -f1)
	outfile="${OUT_FOLDER}/${base}.${SUFFIX}"
	# mate-tools tagger
	./scripts/parse_srl_only_mod.sh "$infile" "$outfile"
	if [ $? -eq 0 ]; then
		echo "Saved as ${outfile}"
	else
		echo "ERR"
		exit 1
	fi
done

View File

@@ -0,0 +1,29 @@
#!/bin/bash
# Run the mate-tools SRL tagger over every kres .tsv chunk.
# Folder locations are parsed from ../tools.cfg (relative to its values).

# parsing tools.cfg values
IN_FOLDER="../$(sed -n -e 's/^\s*kres_tsv\s*=\s*//p' ../tools.cfg)"
echo "input folder: $IN_FOLDER"
OUT_FOLDER="../$(sed -n -e 's/^\s*kres_srl\s*=\s*//p' ../tools.cfg)"
echo "output folder: $OUT_FOLDER"
SUFFIX="srl.tsv"

# Fix: quote every expansion -- configured paths contain spaces, which break
# unquoted $VAR under word splitting.  Globs stay outside the quotes.
mkdir -p "$OUT_FOLDER"
rm "$OUT_FOLDER"/*"${SUFFIX}" &> /dev/null

for infile in "$IN_FOLDER"/*; do
	echo "Tagging: ${infile}"
	base=$(basename "$infile" | cut -d'.' -f1)
	outfile="${OUT_FOLDER}/${base}.${SUFFIX}"
	# mate-tools tagger
	./scripts/parse_srl_only_mod.sh "$infile" "$outfile"
	if [ $? -eq 0 ]; then
		echo "Saved as ${outfile}"
	else
		echo "ERR"
		exit 1
	fi
done

View File

@@ -1,15 +1,16 @@
#!/bin/bash
# parsing tools.cfg values
IN_FOLDER="../$(sed -n -e 's/^\s*kres_tsv\s*=\s*//p' ../tools.cfg)"
IN_FOLDER="../$(sed -n -e 's/^\s*giga_tsv\s*=\s*//p' ../tools.cfg.kres_new)"
IN_FOLDER=$IN_FOLDER$1
echo "input folder: $IN_FOLDER"
OUT_FOLDER="../$(sed -n -e 's/^\s*kres_srl\s*=\s*//p' ../tools.cfg)"
OUT_FOLDER="../$(sed -n -e 's/^\s*giga_srl\s*=\s*//p' ../tools.cfg.kres_new)"
echo "output folder: $OUT_FOLDER"
SUFFIX="srl.tsv"
mkdir -p $OUT_FOLDER
rm $OUT_FOLDER/*${SUFFIX} &> /dev/null
# rm $OUT_FOLDER/*${SUFFIX} &> /dev/null
for infile in $IN_FOLDER/*; do
echo "Tagging: ${infile}"

View File

@@ -1,5 +1,18 @@
[tools]
kres_orig = ../data/kres_example
kres_tsv = ../data/kres_example_tsv
kres_srl = ../data/kres_example_srl
kres_json = ../data/kres/example_json
giga = /media/luka/Portable Disk/Datasets/gigafida_jos/gf2_orig
giga_orig = /media/luka/Portable Disk/Datasets/gigafida_jos/gf2-dedup.patch0001
; giga_orig_old = /media/luka/Portable Disk/Datasets/gigafida_jos/gf2-dedup
giga_jos = /media/luka/Portable Disk/Datasets/gigafida_jos/gf2-dedup.jos.patch0001
giga_tsv = /media/luka/Portable Disk/Datasets/gigafida_jos/gf_files_part
; giga_tsv = /media/luka/Portable Disk/Datasets/gigafida_jos/TEMP
; giga_tsv = /media/luka/Portable Disk/Datasets/gigafida_jos/gf2_files_copy
; giga_srl = /media/luka/Portable Disk/Datasets/gigafida_jos/TEMP
giga_srl = /media/luka/Portable Disk/Datasets/gigafida_jos/2_srl
giga_srl_errors = /media/luka/Portable Disk/Datasets/gigafida_jos/2_srl_errors/giga_errors.srl.tsv
; giga_json = /media/luka/Portable Disk/Datasets/gigafida_jos/final_json_TEMP
giga_json = /media/luka/Portable Disk/Datasets/gigafida_jos/final_json
internal_data = /media/luka/Portable Disk/Datasets/gigafida_jos/internal_data
giga_parts = 100000
logfile = ../progress.log
cpu_cores = 16
debug = False

16
tools/tools.cfg.gigafida Normal file
View File

@@ -0,0 +1,16 @@
[tools]
giga = /media/luka/Portable Disk/Datasets/gigafida_jos/gf2_orig
giga_orig = /media/luka/Portable Disk/Datasets/gigafida_jos/gf2-dedup.patch0001
; giga_orig_old = /media/luka/Portable Disk/Datasets/gigafida_jos/gf2-dedup
giga_jos = /media/luka/Portable Disk/Datasets/gigafida_jos/gf2-dedup.jos.patch0001
giga_tsv = /media/luka/Portable Disk/Datasets/gigafida_jos/gf_files_part
; giga_tsv = /media/luka/Portable Disk/Datasets/gigafida_jos/TEMP
; giga_tsv = /media/luka/Portable Disk/Datasets/gigafida_jos/gf2_files_copy
; giga_srl = /media/luka/Portable Disk/Datasets/gigafida_jos/TEMP
giga_srl = /media/luka/Portable Disk/Datasets/gigafida_jos/2_srl
giga_json = /media/luka/Portable Disk/Datasets/gigafida_jos/final_json
internal_data = /media/luka/Portable Disk/Datasets/gigafida_jos/internal_data
giga_parts = 100000
logfile = ../progress.log
cpu_cores = 1
debug = False

8
tools/tools.cfg.kres Normal file
View File

@@ -0,0 +1,8 @@
[tools]
kres_orig = /home/luka/Development/srl/data/kres_parsed/tei
kres_tsv = ../data/kres_out/1_tsv
kres_srl = ../data/kres_out/2_srl
kres_json = ../data/kres_out/final_json
logfile = ../progress.log
cpu_cores = 5
debug = False

8
tools/tools.cfg.kres_new Normal file
View File

@@ -0,0 +1,8 @@
[tools]
kres_orig = /home/luka/Development/srl/data/kres_parsed/tei
giga_tsv = ../data/giga_out/1_tsv
giga_srl = ../data/giga_out/2_srl
kres_json = ../data/giga_out/final_json
logfile = ../progress.log
cpu_cores = 5
debug = False

8
tools/tools.cfg.local Normal file
View File

@@ -0,0 +1,8 @@
[tools]
kres_orig = ../data/kres_example
kres_tsv = ../data/kres_out/1_tsv
kres_srl = ../data/kres_out/2_srl
kres_json = ../data/kres_out/final_json
logfile = ../progress.log
cpu_cores = 1
debug = False