Restructuring: words are now parsed right after each file is loaded, not after all files have been loaded. Should be easily parallelizable now.
This commit is contained in:
parent
f3fe981614
commit
dec173ae33
20
wani.py
20
wani.py
|
@ -801,7 +801,7 @@ def load_tei_file(filename, skip_id_check, do_msd_translate, pc_tag, status):
|
||||||
# strange errors, just skip...
|
# strange errors, just skip...
|
||||||
pass
|
pass
|
||||||
|
|
||||||
return words.values()
|
return list(words.values())
|
||||||
|
|
||||||
class Writer:
|
class Writer:
|
||||||
def __init__(self, args):
|
def __init__(self, args):
|
||||||
|
@ -932,9 +932,16 @@ class ColocationIds:
|
||||||
def set_written(self, key):
|
def set_written(self, key):
|
||||||
self.data[key][2] = True
|
self.data[key][2] = True
|
||||||
|
|
||||||
|
def merge_matches(self, matches, new_matches):
|
||||||
|
for _id, nms in new_matches.items():
|
||||||
|
for nm in nms:
|
||||||
|
matches[_id].append(nm)
|
||||||
|
self.add_match(nm[2])
|
||||||
|
|
||||||
def match_file(words, structures, colocation_ids, matches=None):
|
return matches
|
||||||
if matches is None:
|
|
||||||
|
|
||||||
|
def match_file(words, structures):
|
||||||
matches = {s.id: [] for s in structures}
|
matches = {s.id: [] for s in structures}
|
||||||
|
|
||||||
for idx, s in enumerate(structures):
|
for idx, s in enumerate(structures):
|
||||||
|
@ -947,11 +954,11 @@ def match_file(words, structures, colocation_ids, matches=None):
|
||||||
colocation_id = [s.id] + list(sorted(colocation_id, key=lambda x:x[0]))
|
colocation_id = [s.id] + list(sorted(colocation_id, key=lambda x:x[0]))
|
||||||
colocation_id = tuple(colocation_id)
|
colocation_id = tuple(colocation_id)
|
||||||
|
|
||||||
colocation_ids.add_match(colocation_id)
|
|
||||||
matches[s.id].append((match, reason, colocation_id))
|
matches[s.id].append((match, reason, colocation_id))
|
||||||
|
|
||||||
return matches
|
return matches
|
||||||
|
|
||||||
|
|
||||||
def main(input_file, structures_file, args):
|
def main(input_file, structures_file, args):
|
||||||
writer = Writer(args)
|
writer = Writer(args)
|
||||||
structures = build_structures(structures_file)
|
structures = build_structures(structures_file)
|
||||||
|
@ -959,10 +966,11 @@ def main(input_file, structures_file, args):
|
||||||
logging.debug(str(s))
|
logging.debug(str(s))
|
||||||
|
|
||||||
colocation_ids = ColocationIds()
|
colocation_ids = ColocationIds()
|
||||||
matches = None
|
matches = {s.id: [] for s in structures}
|
||||||
|
|
||||||
for words in load_files(args):
|
for words in load_files(args):
|
||||||
matches = match_file(words, structures, colocation_ids, matches)
|
new_matches = match_file(words, structures)
|
||||||
|
matches = colocation_ids.merge_matches(matches, new_matches)
|
||||||
|
|
||||||
writer.write_out(matches, structures, colocation_ids)
|
writer.write_out(matches, structures, colocation_ids)
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user