testing new config

This commit is contained in:
voje 2019-02-27 17:04:03 +01:00
commit 5c9cf59723
6 changed files with 94 additions and 67 deletions

View File

@ -7,7 +7,7 @@ json_files: srl_tagged_files
srl_tagged_files: tsv_files
# cd tools/srl-20131216; ./scripts/parse_srl_only_mod.sh; cd -
cd tools/srl-20131216; ./tag_all.sh ../../data/kres_example_tsv ../../data/kres_example_srl
cd tools/srl-20131216; ./tag_all.sh
tsv_files: tools/fillpred_model/model.pickle
cd tools; python3 parse_all.py

View File

@ -1,12 +1,21 @@
from pathlib import Path
from parser.parser import Parser
import json
import configparser
# Built-in defaults; each one is used when tools.cfg does not supply a value.
ORIGPATH = Path("../data/kres_example")  # we need the IDs
INPATH = Path("../data/kres_example_srl")
OUTPATH = Path("../data/kres_example_json")
DEBUG = False

# Override the defaults with the [tools] section of tools.cfg. The file name
# is relative, so it is looked up in the current working directory.
# ConfigParser.read() silently ignores a missing file; the fallback= arguments
# then keep the defaults above instead of raising on the absent section/key.
config = configparser.ConfigParser()
config.read("tools.cfg")
ORIGPATH = Path(config.get("tools", "kres_orig", fallback=str(ORIGPATH)))
INPATH = Path(config.get("tools", "kres_srl", fallback=str(INPATH)))
OUTPATH = Path(config.get("tools", "kres_json", fallback=str(OUTPATH)))
# Config values are strings and bool("False") is True, so bool() can never
# turn debugging off. getboolean() parses yes/no, on/off, true/false and 1/0
# (case-insensitive) and raises ValueError for anything else.
DEBUG = config.getboolean("tools", "debug", fallback=DEBUG)
def get_origfile(filename):
for origfile in ORIGPATH.iterdir():
if filename.name.split('.')[0] == origfile.name.split('.')[0]:
@ -47,45 +56,43 @@ def get_dep_rel(token):
return None
if __name__ == "__main__":
par = Parser()
OUTPATH.mkdir(exist_ok=True)
par = Parser()
OUTPATH.mkdir(exist_ok=True)
for infile in [x for x in INPATH.iterdir() if x.is_file()]:
origfile = get_origfile(infile)
orig_dict = par.parse_tei(origfile)
for infile in [x for x in INPATH.iterdir() if x.is_file()]:
origfile = get_origfile(infile)
orig_dict = par.parse_tei(origfile)
fp = infile.open("rb")
outdata = {}
for sentence_arr in extract_sentences(fp.readlines()):
# tsv dropped sentence ids, match the ID, using original data
sid = match_sentence_id(to_sentence(sentence_arr), orig_dict)
fp = infile.open("rb")
outdata = {}
for sentence_arr in extract_sentences(fp.readlines()):
# tsv dropped sentence ids, match the ID, using original data
sid = match_sentence_id(to_sentence(sentence_arr), orig_dict)
outdata[sid] = []
outdata[sid] = []
# find all predicate indices in the sentence
predicates = []
for token in sentence_arr:
if token[12] == "Y":
predicates += [token[0]] # idx
# find all predicate indices in the sentence
predicates = []
for token in sentence_arr:
if token[12] == "Y":
predicates += [token[0]] # idx
deprel = get_dep_rel(token)
if deprel is not None:
outdata[sid].append(deprel)
deprel = get_dep_rel(token)
if deprel is not None:
outdata[sid].append(deprel)
# deprel["from"] points to n-th predicate
# replace with predicate's token index
for deprel in outdata[sid]:
deprel["from"] = predicates[deprel["from"]]
# deprel["from"] points to n-th predicate
# replace with predicate's token index
for deprel in outdata[sid]:
deprel["from"] = predicates[deprel["from"]]
if DEBUG:
print(to_sentence(sentence_arr))
print(outdata[sid])
print(sid)
print()
print()
if DEBUG:
print(to_sentence(sentence_arr))
print(outdata[sid])
print(sid)
print()
print()
outfile = (OUTPATH / infile.name).with_suffix(".json")
# print(outdata)
json.dump(outdata, outfile.open("w"))
outfile = (OUTPATH / infile.name).with_suffix(".json")
# print(outdata)
json.dump(outdata, outfile.open("w"))

View File

@ -5,40 +5,45 @@ from pathlib import Path
import re
import sys
import cProfile
import configparser
# some defaults
INDIR = Path("../data/kres_example")
OUTDIR = Path("../data/kres_example_tsv")
if __name__ == "__main__":
# make sure you sanitize every input into unicode
SSJ500K_2_1 = 27829 # number of sentences
par = Parser()
SSJ500K_2_1 = 27829 # number of sentences
par = Parser()
# path to data
config = configparser.ConfigParser()
config.read("tools.cfg")
INDIR = Path(config["tools"]["kres_orig"])
OUTDIR = Path(config["tools"]["kres_tsv"])
"""
print("parsing ssj")
ssj_file = "../data/ssj500k-sl.sample.xml"
ssj_dict = par.parse_tei(ssj_file)
# assert (len(ssj_dict) == 27829), "Parsed wrong number of sentences."
print("end parsing ssj")
"""
"""
print("parsing ssj")
ssj_file = "../data/ssj500k-sl.sample.xml"
ssj_dict = par.parse_tei(ssj_file)
# assert (len(ssj_dict) == 27829), "Parsed wrong number of sentences."
print("end parsing ssj")
"""
print("parsing kres")
# kres_file = "../data/kres_example/F0019343.xml.parsed.xml"
kres_dir = Path("../data/kres_example/").resolve()
print("parsing kres")
# kres_file = "../data/kres_example/F0019343.xml.parsed.xml"
OUTDIR.mkdir(exist_ok=True)
kres_out_dir = kres_dir.parent / (kres_dir.name + "_tsv")
kres_out_dir.mkdir(exist_ok=True)
for kres_file in [x for x in INDIR.iterdir() if x.is_file()]:
for kres_file in [x for x in kres_dir.iterdir() if x.is_file()]:
print("Processing file: " + str(kres_file))
res_dict = par.parse_tei(kres_file)
longest_sent = max([len(e["tokens"]) for k, e in res_dict.items()])
print("Longest sentence: ", longest_sent)
kres_out_str = ""
print("Processing file: " + str(kres_file))
res_dict = par.parse_tei(kres_file)
kres_out_str = ""
for _, sentence in res_dict.items():
kres_out_str += par.to_conll_2009_SRL(sentence, longest_sent)
for _, sentence in res_dict.items():
kres_out_str += par.to_conll_2009_SRL(sentence)
with (kres_out_dir / kres_file.name).with_suffix(".tsv").open("wb+") as fp:
fp.write(kres_out_str.encode("utf-8"))
fp.close()
print("end parsing kres")
with (OUTDIR / kres_file.name).with_suffix(".tsv").open("wb+") as fp:
fp.write(kres_out_str.encode("utf-8"))
fp.close()
print("end parsing kres")

View File

@ -4,6 +4,7 @@ from parser.msd.msdmap import Msdmap
import pickle
from pathlib import Path
from fillpred_model.step1 import build_model_row
import sys
class Parser:
# reads a TEI xml file and returns a dictionary:
@ -18,8 +19,12 @@ class Parser:
self.W_TAGS = ['w']
self.C_TAGS = ['c']
self.S_TAGS = ['S', 'pc']
# Load the pickled fill-predicate model. The path is relative to the current
# working directory. Only load a model file you generated yourself:
# unpickling untrusted data can execute arbitrary code.
try:
    # The context manager closes the file handle even if unpickling fails.
    with Path("./fillpred_model/model.pickle").open("rb") as fp:
        self.fillpred_model = pickle.load(fp)
except IOError:
    # Model file missing or unreadable: tell the user how to build it.
    print("Generate the model first: $ make tools/fillpred_model/model.pickle")
    sys.exit(1)
def parse_tei(self, filepath):

View File

@ -1,7 +1,11 @@
#!/bin/bash
IN_FOLDER="$1"
OUT_FOLDER="$2"
# parsing tools.cfg values
# Each sed call prints whatever follows "<key> =" in ../tools.cfg. The value
# is then prefixed with "../" before use.
IN_FOLDER="../$(sed -n -e 's/^\s*kres_tsv\s*=\s*//p' ../tools.cfg)"
echo "input folder: $IN_FOLDER"
OUT_FOLDER="../$(sed -n -e 's/^\s*kres_srl\s*=\s*//p' ../tools.cfg)"
echo "output folder: $OUT_FOLDER"
SUFFIX="srl.tsv"
# Quoted so a path containing spaces or glob characters stays one argument.
mkdir -p "$OUT_FOLDER"

6
tools/tools.cfg Normal file
View File

@ -0,0 +1,6 @@
[tools]
# Input/output locations used by the scripts that read this file.
# Relative paths are resolved from the directory the scripts are run in.
kres_orig = ../data/kres_example
kres_tsv = ../data/kres_example_tsv
kres_srl = ../data/kres_example_srl
kres_json = ../data/kres_example_json
# Set to True to print per-sentence debugging output.
debug = False