Chunk size is now handled in file_sentence_generator

This commit is contained in:
Ozbolt Menegatti 2019-06-16 00:59:44 +02:00
parent 0d8aeb2282
commit f0109771aa

View File

@ -31,6 +31,8 @@ def load_files(args):
do_msd_translate = not args.no_msd_translate
for n, fname in enumerate(filenames):
et = load_xml(fname)
yield from file_sentence_generator(et, skip_id_check, do_msd_translate, args.pc_tag, args.chunk_size)
if args.count_files:
status = " :: {} / {}".format(n, len(filenames))
else:
@ -46,10 +48,10 @@ def load_xml(filename, status):
xmlstring = xmlstring.replace(' xml:', ' ')
return ElementTree.XML(xmlstring)
def file_sentence_generator(filename, skip_id_check, do_msd_translate, pc_tag, status):
et = load_xml(filename, status)
for sentence in et.iter('s'):
def file_sentence_generator(et, skip_id_check, do_msd_translate, pc_tag, chunk_size):
words = {}
for sentence in et.iter('s'):
for w in sentence.iter("w"):
words[w.get('id')] = Word(w, do_msd_translate)
for pc in sentence.iter(pc_tag):
@ -83,6 +85,10 @@ def file_sentence_generator(filename, skip_id_check, do_msd_translate, pc_tag, s
# strange errors, just skip...
pass
if chunk_size > 0 and len(words) > chunk_size:
yield list(words.values())
words = {}
yield list(words.values())
def match_file(words, structures):
@ -107,6 +113,8 @@ def main(structures_file, args):
match_store = MatchStore(args)
word_stats = WordStats(lemma_msds)
args.chunk_size = 50000
if args.parallel:
num_parallel = int(args.parallel)