Chunk size is now handled in file_sentence_generator
This commit is contained in:
parent
0d8aeb2282
commit
f0109771aa
16
src/wani.py
16
src/wani.py
|
@@ -31,6 +31,8 @@ def load_files(args):
|
|||
do_msd_translate = not args.no_msd_translate
|
||||
|
||||
for n, fname in enumerate(filenames):
|
||||
et = load_xml(fname)
|
||||
yield from file_sentence_generator(et, skip_id_check, do_msd_translate, args.pc_tag, args.chunk_size)
|
||||
if args.count_files:
|
||||
status = " :: {} / {}".format(n, len(filenames))
|
||||
else:
|
||||
|
@@ -46,10 +48,10 @@ def load_xml(filename, status):
|
|||
xmlstring = xmlstring.replace(' xml:', ' ')
|
||||
return ElementTree.XML(xmlstring)
|
||||
|
||||
def file_sentence_generator(filename, skip_id_check, do_msd_translate, pc_tag, status):
|
||||
et = load_xml(filename, status)
|
||||
|
||||
def file_sentence_generator(et, skip_id_check, do_msd_translate, pc_tag, chunk_size):
|
||||
words = {}
|
||||
for sentence in et.iter('s'):
|
||||
words = {}
|
||||
for w in sentence.iter("w"):
|
||||
words[w.get('id')] = Word(w, do_msd_translate)
|
||||
for pc in sentence.iter(pc_tag):
|
||||
|
@@ -83,7 +85,11 @@ def file_sentence_generator(filename, skip_id_check, do_msd_translate, pc_tag, s
|
|||
# strange errors, just skip...
|
||||
pass
|
||||
|
||||
yield list(words.values())
|
||||
if chunk_size > 0 and len(words) > chunk_size:
|
||||
yield list(words.values())
|
||||
words = {}
|
||||
|
||||
yield list(words.values())
|
||||
|
||||
def match_file(words, structures):
|
||||
matches = {s: [] for s in structures}
|
||||
|
@@ -107,6 +113,8 @@ def main(structures_file, args):
|
|||
match_store = MatchStore(args)
|
||||
word_stats = WordStats(lemma_msds)
|
||||
|
||||
args.chunk_size = 50000
|
||||
|
||||
if args.parallel:
|
||||
num_parallel = int(args.parallel)
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user