Restructuring: words are now parsed right after loading each file, not after loading all of them. Should be easily parallelizable now

This commit is contained in:
Ozbolt Menegatti 2019-02-14 14:33:15 +01:00
parent f3fe981614
commit dec173ae33

22
wani.py
View File

@ -801,7 +801,7 @@ def load_tei_file(filename, skip_id_check, do_msd_translate, pc_tag, status):
# strange errors, just skip...
pass
return words.values()
return list(words.values())
class Writer:
def __init__(self, args):
@ -932,10 +932,17 @@ class ColocationIds:
def set_written(self, key):
self.data[key][2] = True
def merge_matches(self, matches, new_matches):
for _id, nms in new_matches.items():
for nm in nms:
matches[_id].append(nm)
self.add_match(nm[2])
def match_file(words, structures, colocation_ids, matches=None):
if matches is None:
matches = {s.id: [] for s in structures}
return matches
def match_file(words, structures):
matches = {s.id: [] for s in structures}
for idx, s in enumerate(structures):
logging.info("{}/{}: {:7s}".format(idx, len(structures), s.id))
@ -947,11 +954,11 @@ def match_file(words, structures, colocation_ids, matches=None):
colocation_id = [s.id] + list(sorted(colocation_id, key=lambda x:x[0]))
colocation_id = tuple(colocation_id)
colocation_ids.add_match(colocation_id)
matches[s.id].append((match, reason, colocation_id))
return matches
def main(input_file, structures_file, args):
writer = Writer(args)
structures = build_structures(structures_file)
@ -959,10 +966,11 @@ def main(input_file, structures_file, args):
logging.debug(str(s))
colocation_ids = ColocationIds()
matches = None
matches = {s.id: [] for s in structures}
for words in load_files(args):
matches = match_file(words, structures, colocation_ids, matches)
new_matches = match_file(words, structures)
matches = colocation_ids.merge_matches(matches, new_matches)
writer.write_out(matches, structures, colocation_ids)