2019-02-25 23:22:15 +00:00
|
|
|
from pathlib import Path
|
|
|
|
from parser.parser import Parser
|
2019-02-27 08:15:40 +00:00
|
|
|
import configparser
|
2019-02-25 23:22:15 +00:00
|
|
|
|
2019-02-27 08:15:40 +00:00
|
|
|
# defaults -- used whenever tools.cfg is missing or does not define a key
ORIGPATH = Path("../data/kres_example")  # we need the IDs
INPATH = Path("../data/kres_example_srl")
OUTPATH = Path("../data/kres_example_json")

# parse config
# ConfigParser.read() silently skips files it cannot open, so the lookups
# below must tolerate a missing [tools] section; the defaults above are
# passed as fallbacks instead of being unconditionally overwritten.
config = configparser.ConfigParser()
config.read("tools.cfg")
ORIGPATH = Path(config.get("tools", "kres_orig", fallback=str(ORIGPATH)))
INPATH = Path(config.get("tools", "kres_srl", fallback=str(INPATH)))
OUTPATH = Path(config.get("tools", "kres_json", fallback=str(OUTPATH)))
|
|
|
|
|
2019-02-25 23:22:15 +00:00
|
|
|
def get_origfile(filename, origpath=None):
    """Find the original file that corresponds to *filename*.

    Two files correspond when the part of their names before the first
    '.' is identical.

    Args:
        filename: path-like object with a ``.name`` attribute (e.g. a
            ``pathlib.Path``) whose counterpart is wanted.
        origpath: directory to search; defaults to the module-level
            ``ORIGPATH`` when omitted.

    Returns:
        The first entry yielded by ``origpath.iterdir()`` whose name has
        the same leading component as *filename*.

    Raises:
        FileNotFoundError: if no entry in the directory matches.
    """
    search_dir = ORIGPATH if origpath is None else origpath
    # Loop-invariant: compute the wanted prefix once, not per candidate.
    wanted = filename.name.split('.')[0]
    for origfile in search_dir.iterdir():
        if origfile.name.split('.')[0] == wanted:
            return origfile
    raise FileNotFoundError(
        "no original file matching '{}' in {}".format(wanted, search_dir)
    )
|
|
|
|
|
|
|
|
def extract_sentences(line_reader):
    """Group tab-separated lines into sentences separated by blank lines.

    Args:
        line_reader: iterable of ``bytes`` lines (UTF-8 encoded), e.g. a
            binary file object or the result of ``readlines()``.

    Yields:
        One list per sentence; each element is the list of fields obtained
        by splitting a line on tab characters. Line terminators are left
        on the last field, exactly as read.
    """
    acc = []
    for raw in line_reader:
        text = raw.decode("utf-8")
        # A line holding nothing but its terminator ends the sentence;
        # accept both LF and CRLF so Windows-style files work too.
        if text.strip("\r\n") == "":
            yield acc
            acc = []
        else:
            acc.append(text.split('\t'))
    # Input may not end with a blank line; do not drop the last sentence.
    if acc:
        yield acc
|
|
|
|
|
|
|
|
def match_sentence_id(string, rd):
    """Return the key of the entry in *rd* whose text equals the sentence.

    The sentence text is built by joining field 1 of every token row in
    *string* with single spaces; each candidate's text is built the same
    way from field 2 of every item in ``entry["tokens"]``.

    Args:
        string: sequence of token rows (indexable sequences); despite the
            name this is a parsed sentence, not a ``str``.
        rd: mapping from sentence id to an entry with a ``"tokens"`` item.

    Returns:
        The key of the first entry in *rd* whose joined text is identical.

    Raises:
        KeyError: if no entry matches.
    """
    # Use the argument itself, not a same-named global from the caller.
    str1 = " ".join(token[1] for token in string)
    for k, e in rd.items():
        str2 = " ".join(token[2] for token in e["tokens"])
        if str1 == str2:
            return k
    raise KeyError("no sentence id found for: {}".format(str1))
|
|
|
|
|
2019-02-25 12:44:24 +00:00
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: for every file in INPATH, parse its original
    # counterpart and print the id matched for each extracted sentence.
    par = Parser()

    for infile in [x for x in INPATH.iterdir() if x.is_file()]:
        origfile = get_origfile(infile)
        # presumably a mapping of sentence id -> parsed entry; it is passed
        # straight to match_sentence_id -- verify against Parser.parse_tei
        rd = par.parse_tei(origfile)

        # Context manager so the handle is closed even if matching raises.
        with infile.open("rb") as fp:
            for sentence_arr in extract_sentences(fp):
                sid = match_sentence_id(sentence_arr, rd)
                print(sid)
                # OK, we got the sentence id, now generate the predicate map!

        # NOTE(review): the output path is computed but nothing is written
        # to it yet.
        outfile = (OUTPATH / infile.name).with_suffix(".json")
|