# NOTE: removed file-listing paste artifact ("48 lines / 1.2 KiB / Python")
from pathlib import Path
|
|
from parser.parser import Parser
|
|
|
|
# Directory holding the original TEI files (source of the sentence IDs).
ORIGPATH = Path("../data/kres_example") # we need the IDs
# Directory holding the SRL-tagged input files to convert.
INPATH = Path("../data/kres_example_srl")
# Directory where the generated JSON output files are written.
OUTPATH = Path("../data/kres_example_json")
|
|
|
|
def get_origfile(filename):
    """Find the original file in ORIGPATH that corresponds to *filename*.

    Files are matched on the portion of the file name before the first
    dot, so differing suffixes (e.g. ``.xml`` vs ``.srl.tsv``) still pair up.

    Raises:
        FileNotFoundError: when no file in ORIGPATH shares the prefix.
    """
    wanted = filename.name.split('.')[0]
    for candidate in ORIGPATH.iterdir():
        if candidate.name.split('.')[0] == wanted:
            return candidate
    raise FileNotFoundError
|
|
|
|
def extract_sentences(line_reader):
    """Yield sentences from an iterable of CoNLL-style byte lines.

    Each input line is decoded as UTF-8 and split on tabs.  A line whose
    first field is ``'\n'`` (i.e. a blank line) terminates the current
    sentence, which is yielded as a list of token-field lists.

    Fix vs. original: a trailing sentence that is not followed by a final
    blank line is now flushed instead of being silently dropped.  The
    input is also streamed directly instead of first materializing a
    decoded copy of every line.

    Args:
        line_reader: iterable of ``bytes`` lines (e.g. ``f.readlines()``).

    Yields:
        list[list[str]]: one entry per sentence; each token is the
        tab-split fields of its line.
    """
    acc = []
    for raw in line_reader:
        fields = raw.decode("utf-8").split('\t')
        if fields[0] == '\n':
            # Blank separator line: emit the sentence gathered so far.
            yield acc
            acc = []
        else:
            acc.append(fields)
    if acc:
        # Input ended without a terminating blank line — flush the tail.
        yield acc
|
|
|
|
def match_sentence_id(string, rd):
    """Return the key in *rd* whose tokens spell the same sentence as *string*.

    Fixes vs. original: the body referenced undefined names
    (``sentence_arr`` instead of the parameter ``string``, ``dict_entry``
    instead of the loop variable ``e``) and the ``if`` was missing its
    colon (a SyntaxError).

    Args:
        string: sentence as a list of token rows; field 1 of each row is
            the surface word form.
        rd: mapping of sentence-id -> entry dict; each entry has a
            ``"tokens"`` list whose field 2 is the surface word form.

    Returns:
        The matching sentence id (key of *rd*).

    Raises:
        KeyError: when no entry in *rd* matches the sentence text.
    """
    str1 = " ".join(token[1] for token in string)
    for k, e in rd.items():
        str2 = " ".join(token[2] for token in e["tokens"])
        if str1 == str2:
            return k
    raise KeyError
|
|
|
|
|
|
if __name__ == "__main__":
    par = Parser()

    # Walk every SRL-tagged file, pair it with its original TEI file, and
    # map each SRL sentence back to the original sentence id.
    for infile in [x for x in INPATH.iterdir() if x.is_file()]:
        origfile = get_origfile(infile)
        rd = par.parse_tei(origfile)

        # Fix vs. original: the file handle was opened and never closed;
        # a context manager releases it deterministically.
        with infile.open("rb") as fp:
            for sentence_arr in extract_sentences(fp.readlines()):
                sid = match_sentence_id(sentence_arr, rd)
                print(sid)
                # OK, we got the sentence id, now generate the predicate map!

        # NOTE(review): outfile is computed but nothing is ever written to
        # it — the JSON-generation step appears to be unfinished.
        outfile = (OUTPATH / infile.name).with_suffix(".json")