parent
66c43b3d19
commit
bcaf226b9e
@ -1,5 +1,48 @@
|
||||
import Path
|
||||
from pathlib import Path
|
||||
from parser.parser import Parser
|
||||
|
||||
# Directory of the original TEI corpus files; read only to recover sentence IDs.
ORIGPATH = Path("../data/kres_example") # we need the IDs
# Directory of the SRL-annotated input files processed by this script.
INPATH = Path("../data/kres_example_srl")
# Directory where the generated JSON output is written.
OUTPATH = Path("../data/kres_example_json")
|
||||
|
||||
def get_origfile(filename, origdir=None):
    """Return the original corpus file matching *filename*.

    Two files match when the part of their names before the first ``'.'``
    is identical (so ``foo.tsv`` matches ``foo.xml.parsed``).

    :param filename: pathlib.Path of the SRL input file.
    :param origdir: directory to search; defaults to the module-level
        ORIGPATH (backward-compatible with the old zero-extra-arg call).
    :raises FileNotFoundError: if no file in *origdir* matches.
    """
    if origdir is None:
        origdir = ORIGPATH
    # Hoist the invariant base name out of the loop.
    base = filename.name.split('.')[0]
    for origfile in origdir.iterdir():
        if base == origfile.name.split('.')[0]:
            return origfile
    # Original raised a bare FileNotFoundError; include context for debugging.
    raise FileNotFoundError(
        "no original file matching {!r} in {}".format(filename.name, origdir))
|
||||
|
||||
def extract_sentences(line_reader):
    """Yield sentences from a CoNLL-style byte-line reader.

    Each yielded sentence is a list of token rows, where a row is the
    tab-split fields of one UTF-8 decoded line. A blank line (first field
    ``'\n'``) terminates the current sentence.

    Fix over the original: a trailing sentence with no terminating blank
    line was silently dropped; it is now yielded as well.

    :param line_reader: iterable of UTF-8 encoded byte lines.
    """
    acc = []
    for raw in line_reader:
        fields = raw.decode("utf-8").split('\t')
        if fields[0] == '\n':
            # Blank line ends the current sentence. Consecutive blank
            # lines still yield an empty list, matching the original.
            yield acc
            acc = []
        else:
            acc.append(fields)
    # Emit a final unterminated sentence instead of losing it.
    if acc:
        yield acc
|
||||
|
||||
def match_sentence_id(string, rd):
    """Return the key in *rd* whose token text equals the given sentence.

    Fixes over the original: the body referenced undefined names
    ``sentence_arr`` and ``dict_entry`` instead of the parameter ``string``
    and the loop variable ``e`` (NameError), and ``if str1 == str2`` was
    missing its colon (SyntaxError).

    :param string: sentence as a list of token rows; the token text is
        assumed to be at index 1 of each row (TODO confirm column layout).
    :param rd: mapping of sentence id -> entry dict with a "tokens" list,
        where each token's text is at index 2.
    :raises KeyError: if no entry's token text matches.
    """
    str1 = " ".join(token[1] for token in string)
    for k, e in rd.items():
        str2 = " ".join(token[2] for token in e["tokens"])
        if str1 == str2:
            return k
    # Original raised a bare KeyError; include the text for debugging.
    raise KeyError("no sentence id found for: {!r}".format(str1))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
print("TODO: take data/kres_example_srl/* and generate data/kres_example_json/*")
|
||||
print("TODO: check ssj and kres <links> for structure")
|
||||
|
||||
par = Parser()
|
||||
|
||||
for infile in [x for x in INPATH.iterdir() if x.is_file()]:
|
||||
origfile = get_origfile(infile)
|
||||
rd = par.parse_tei(origfile)
|
||||
|
||||
fp = infile.open("rb")
|
||||
for sentence_arr in extract_sentences(fp.readlines()):
|
||||
sid = match_sentence_id(sentence_arr, rd)
|
||||
print(sid)
|
||||
# OK, we got the sentence id, now generate the predicate map!
|
||||
|
||||
|
||||
outfile = (OUTPATH / infile.name).with_suffix(".json")
|
Loading…
Reference in new issue