From dec173ae33d0dbe4a11ddff01b4773ffe996fd3f Mon Sep 17 00:00:00 2001 From: Ozbolt Menegatti Date: Thu, 14 Feb 2019 14:33:15 +0100 Subject: [PATCH] Restructuring, now words are parsed right after loading one file, not after loading all of them. Should be easily parallelizable now --- wani.py | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/wani.py b/wani.py index 7ce397f..44ba831 100644 --- a/wani.py +++ b/wani.py @@ -801,7 +801,7 @@ def load_tei_file(filename, skip_id_check, do_msd_translate, pc_tag, status): # strange errors, just skip... pass - return words.values() + return list(words.values()) class Writer: def __init__(self, args): @@ -932,10 +932,17 @@ class ColocationIds: def set_written(self, key): self.data[key][2] = True + def merge_matches(self, matches, new_matches): + for _id, nms in new_matches.items(): + for nm in nms: + matches[_id].append(nm) + self.add_match(nm[2]) -def match_file(words, structures, colocation_ids, matches=None): - if matches is None: - matches = {s.id: [] for s in structures} + return matches + + +def match_file(words, structures): + matches = {s.id: [] for s in structures} for idx, s in enumerate(structures): logging.info("{}/{}: {:7s}".format(idx, len(structures), s.id)) @@ -947,11 +954,11 @@ def match_file(words, structures, colocation_ids, matches=None): colocation_id = [s.id] + list(sorted(colocation_id, key=lambda x:x[0])) colocation_id = tuple(colocation_id) - colocation_ids.add_match(colocation_id) matches[s.id].append((match, reason, colocation_id)) return matches + def main(input_file, structures_file, args): writer = Writer(args) structures = build_structures(structures_file) @@ -959,10 +966,11 @@ def main(input_file, structures_file, args): logging.debug(str(s)) colocation_ids = ColocationIds() - matches = None + matches = {s.id: [] for s in structures} for words in load_files(args): - matches = match_file(words, structures, colocation_ids, matches) + new_matches = 
match_file(words, structures) + matches = colocation_ids.merge_matches(matches, new_matches) writer.write_out(matches, structures, colocation_ids)