Restructuring: words are now parsed right after each file is loaded, not after all of the files have been loaded. Should be easily parallelizable now.
This commit is contained in:
parent
f3fe981614
commit
dec173ae33
22
wani.py
22
wani.py
|
@ -801,7 +801,7 @@ def load_tei_file(filename, skip_id_check, do_msd_translate, pc_tag, status):
|
|||
# strange errors, just skip...
|
||||
pass
|
||||
|
||||
return words.values()
|
||||
return list(words.values())
|
||||
|
||||
class Writer:
|
||||
def __init__(self, args):
|
||||
|
@ -932,10 +932,17 @@ class ColocationIds:
|
|||
def set_written(self, key):
|
||||
self.data[key][2] = True
|
||||
|
||||
def merge_matches(self, matches, new_matches):
|
||||
for _id, nms in new_matches.items():
|
||||
for nm in nms:
|
||||
matches[_id].append(nm)
|
||||
self.add_match(nm[2])
|
||||
|
||||
def match_file(words, structures, colocation_ids, matches=None):
|
||||
if matches is None:
|
||||
matches = {s.id: [] for s in structures}
|
||||
return matches
|
||||
|
||||
|
||||
def match_file(words, structures):
|
||||
matches = {s.id: [] for s in structures}
|
||||
|
||||
for idx, s in enumerate(structures):
|
||||
logging.info("{}/{}: {:7s}".format(idx, len(structures), s.id))
|
||||
|
@ -947,11 +954,11 @@ def match_file(words, structures, colocation_ids, matches=None):
|
|||
colocation_id = [s.id] + list(sorted(colocation_id, key=lambda x:x[0]))
|
||||
colocation_id = tuple(colocation_id)
|
||||
|
||||
colocation_ids.add_match(colocation_id)
|
||||
matches[s.id].append((match, reason, colocation_id))
|
||||
|
||||
return matches
|
||||
|
||||
|
||||
def main(input_file, structures_file, args):
|
||||
writer = Writer(args)
|
||||
structures = build_structures(structures_file)
|
||||
|
@ -959,10 +966,11 @@ def main(input_file, structures_file, args):
|
|||
logging.debug(str(s))
|
||||
|
||||
colocation_ids = ColocationIds()
|
||||
matches = None
|
||||
matches = {s.id: [] for s in structures}
|
||||
|
||||
for words in load_files(args):
|
||||
matches = match_file(words, structures, colocation_ids, matches)
|
||||
new_matches = match_file(words, structures)
|
||||
matches = colocation_ids.merge_matches(matches, new_matches)
|
||||
|
||||
writer.write_out(matches, structures, colocation_ids)
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user