added tools.cfg for configurable paths
This commit is contained in:
parent
bcaf226b9e
commit
de7035dfe5
2
Makefile
2
Makefile
@ -7,7 +7,7 @@ json_files: #TODO srl_tagged_files
|
|||||||
|
|
||||||
srl_tagged_files: tsv_files
|
srl_tagged_files: tsv_files
|
||||||
# cd tools/srl-20131216; ./scripts/parse_srl_only_mod.sh; cd -
|
# cd tools/srl-20131216; ./scripts/parse_srl_only_mod.sh; cd -
|
||||||
cd tools/srl-20131216; ./tag_all.sh ../../data/kres_example_tsv ../../data/kres_example_srl
|
cd tools/srl-20131216; ./tag_all.sh
|
||||||
|
|
||||||
tsv_files: fillpred_model/model.pickle
|
tsv_files: fillpred_model/model.pickle
|
||||||
cd tools; python3 parse_all.py
|
cd tools; python3 parse_all.py
|
||||||
|
@ -1,10 +1,19 @@
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from parser.parser import Parser
|
from parser.parser import Parser
|
||||||
|
import configparser
|
||||||
|
|
||||||
|
# defaults
|
||||||
ORIGPATH = Path("../data/kres_example") # we need the IDs
|
ORIGPATH = Path("../data/kres_example") # we need the IDs
|
||||||
INPATH = Path("../data/kres_example_srl")
|
INPATH = Path("../data/kres_example_srl")
|
||||||
OUTPATH = Path("../data/kres_example_json")
|
OUTPATH = Path("../data/kres_example_json")
|
||||||
|
|
||||||
|
# parse config
|
||||||
|
config = configparser.ConfigParser()
|
||||||
|
config.read("tools.cfg")
|
||||||
|
ORIGPATH = Path(config["tools"]["kres_orig"])
|
||||||
|
INPATH = Path(config["tools"]["kres_srl"])
|
||||||
|
OUTPATH = Path(config["tools"]["kres_json"])
|
||||||
|
|
||||||
def get_origfile(filename):
|
def get_origfile(filename):
|
||||||
for origfile in ORIGPATH.iterdir():
|
for origfile in ORIGPATH.iterdir():
|
||||||
if filename.name.split('.')[0] == origfile.name.split('.')[0]:
|
if filename.name.split('.')[0] == origfile.name.split('.')[0]:
|
||||||
@ -25,7 +34,7 @@ def match_sentence_id(string, rd):
|
|||||||
str1 = " ".join([token[1] for token in sentence_arr])
|
str1 = " ".join([token[1] for token in sentence_arr])
|
||||||
for k, e in rd.items():
|
for k, e in rd.items():
|
||||||
str2 = " ".join(token[2] for token in dict_entry["tokens"])
|
str2 = " ".join(token[2] for token in dict_entry["tokens"])
|
||||||
if str1 == str2
|
if str1 == str2:
|
||||||
return k
|
return k
|
||||||
raise KeyError
|
raise KeyError
|
||||||
|
|
||||||
|
@ -5,42 +5,45 @@ from pathlib import Path
|
|||||||
import re
|
import re
|
||||||
import sys
|
import sys
|
||||||
import cProfile
|
import cProfile
|
||||||
|
import configparser
|
||||||
|
|
||||||
|
# some defaults
|
||||||
|
INDIR = Path("../data/kres_example")
|
||||||
|
OUTDIR = Path("../data/kres_example_tsv")
|
||||||
|
|
||||||
if __name__ == "__main__":
|
SSJ500K_2_1 = 27829 # number of sentences
|
||||||
# make sure you sanitize every input into unicode
|
par = Parser()
|
||||||
|
|
||||||
SSJ500K_2_1 = 27829 # number of sentences
|
# path to data
|
||||||
par = Parser()
|
config = configparser.ConfigParser()
|
||||||
|
config.read("tools.cfg")
|
||||||
|
INDIR = Path(config["tools"]["kres_orig"])
|
||||||
|
OUTDIR = Path(config["tools"]["kres_tsv"])
|
||||||
|
|
||||||
"""
|
"""
|
||||||
print("parsing ssj")
|
print("parsing ssj")
|
||||||
ssj_file = "../data/ssj500k-sl.sample.xml"
|
ssj_file = "../data/ssj500k-sl.sample.xml"
|
||||||
ssj_dict = par.parse_tei(ssj_file)
|
ssj_dict = par.parse_tei(ssj_file)
|
||||||
# assert (len(ssj_dict) == 27829), "Parsed wrong number of sentences."
|
# assert (len(ssj_dict) == 27829), "Parsed wrong number of sentences."
|
||||||
print("end parsing ssj")
|
print("end parsing ssj")
|
||||||
"""
|
"""
|
||||||
|
|
||||||
print("parsing kres")
|
print("parsing kres")
|
||||||
# kres_file = "../data/kres_example/F0019343.xml.parsed.xml"
|
# kres_file = "../data/kres_example/F0019343.xml.parsed.xml"
|
||||||
kres_dir = Path("../data/kres_example/").resolve()
|
OUTDIR.mkdir(exist_ok=True)
|
||||||
|
|
||||||
kres_out_dir = kres_dir.parent / (kres_dir.name + "_tsv")
|
for kres_file in [x for x in INDIR.iterdir() if x.is_file()]:
|
||||||
kres_out_dir.mkdir(exist_ok=True)
|
|
||||||
|
|
||||||
for kres_file in [x for x in kres_dir.iterdir() if x.is_file()]:
|
print("Processing file: " + str(kres_file))
|
||||||
|
res_dict = par.parse_tei(kres_file)
|
||||||
|
longest_sent = max([len(e["tokens"]) for k, e in res_dict.items()])
|
||||||
|
print("Longest sentence: ", longest_sent)
|
||||||
|
kres_out_str = ""
|
||||||
|
|
||||||
print("Processing file: " + str(kres_file))
|
for _, sentence in res_dict.items():
|
||||||
res_dict = par.parse_tei(kres_file)
|
kres_out_str += par.to_conll_2009_SRL(sentence, longest_sent)
|
||||||
longest_sent = max([len(e["tokens"]) for k, e in res_dict.items()])
|
|
||||||
print("Longest sentence: ", longest_sent)
|
|
||||||
kres_out_str = ""
|
|
||||||
|
|
||||||
for _, sentence in res_dict.items():
|
with (OUTDIR / kres_file.name).with_suffix(".tsv").open("wb+") as fp:
|
||||||
kres_out_str += par.to_conll_2009_SRL(sentence, longest_sent)
|
fp.write(kres_out_str.encode("utf-8"))
|
||||||
|
fp.close()
|
||||||
with (kres_out_dir / kres_file.name).with_suffix(".tsv").open("wb+") as fp:
|
print("end parsing kres")
|
||||||
fp.write(kres_out_str.encode("utf-8"))
|
|
||||||
fp.close()
|
|
||||||
|
|
||||||
print("end parsing kres")
|
|
||||||
|
@ -4,6 +4,7 @@ from parser.msd.msdmap import Msdmap
|
|||||||
import pickle
|
import pickle
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from fillpred_model.step1 import build_model_row
|
from fillpred_model.step1 import build_model_row
|
||||||
|
import sys
|
||||||
|
|
||||||
class Parser:
|
class Parser:
|
||||||
# reads a TEI xml file and returns a dictionary:
|
# reads a TEI xml file and returns a dictionary:
|
||||||
@ -18,8 +19,12 @@ class Parser:
|
|||||||
self.W_TAGS = ['w']
|
self.W_TAGS = ['w']
|
||||||
self.C_TAGS = ['c']
|
self.C_TAGS = ['c']
|
||||||
self.S_TAGS = ['S', 'pc']
|
self.S_TAGS = ['S', 'pc']
|
||||||
with Path("./fillpred_model/model.pickle").open("rb") as fp:
|
try:
|
||||||
|
fp = Path("./fillpred_model/model.pickle").open("rb")
|
||||||
self.fillpred_model = pickle.load(fp)
|
self.fillpred_model = pickle.load(fp)
|
||||||
|
except IOError:
|
||||||
|
print("Generate the model first: $ make tools/fillpred_mode/model.pickle")
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
def parse_tei(self, filepath):
|
def parse_tei(self, filepath):
|
||||||
|
|
||||||
|
@ -1,7 +1,11 @@
|
|||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
|
|
||||||
IN_FOLDER="$1"
|
# parsing tools.cfg values
|
||||||
OUT_FOLDER="$2"
|
IN_FOLDER="../$(sed -n -e 's/^\s*kres_tsv\s*=\s*//p' ../tools.cfg)"
|
||||||
|
echo "input folder: $IN_FOLDER"
|
||||||
|
OUT_FOLDER="../$(sed -n -e 's/^\s*kres_srl\s*=\s*//p' ../tools.cfg)"
|
||||||
|
echo "output folder: $OUT_FOLDER"
|
||||||
|
|
||||||
SUFFIX="srl.tsv"
|
SUFFIX="srl.tsv"
|
||||||
|
|
||||||
mkdir -p $OUT_FOLDER
|
mkdir -p $OUT_FOLDER
|
||||||
|
5
tools/tools.cfg
Normal file
5
tools/tools.cfg
Normal file
@ -0,0 +1,5 @@
|
|||||||
|
[tools]
|
||||||
|
kres_orig = ../data/kres_example
|
||||||
|
kres_tsv = ../data/kres_example_tsv
|
||||||
|
kres_srl = ../data/kres_example_srl
|
||||||
|
kres_json = ../data/kres_example_json
|
Loading…
Reference in New Issue
Block a user