From f0109771aa60d29da58c6185a1b01310f084b622 Mon Sep 17 00:00:00 2001 From: Ozbolt Menegatti Date: Sun, 16 Jun 2019 00:59:44 +0200 Subject: [PATCH] chunk size now handled in file-sentence-generator --- src/wani.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/src/wani.py b/src/wani.py index bf732eb..9f13196 100644 --- a/src/wani.py +++ b/src/wani.py @@ -31,6 +31,8 @@ def load_files(args): do_msd_translate = not args.no_msd_translate for n, fname in enumerate(filenames): + et = load_xml(fname) + yield from file_sentence_generator(et, skip_id_check, do_msd_translate, args.pc_tag, args.chunk_size) if args.count_files: status = " :: {} / {}".format(n, len(filenames)) else: @@ -46,10 +48,10 @@ def load_xml(filename, status): xmlstring = xmlstring.replace(' xml:', ' ') return ElementTree.XML(xmlstring) -def file_sentence_generator(filename, skip_id_check, do_msd_translate, pc_tag, status): - et = load_xml(filename, status) + +def file_sentence_generator(et, skip_id_check, do_msd_translate, pc_tag, chunk_size): + words = {} for sentence in et.iter('s'): - words = {} for w in sentence.iter("w"): words[w.get('id')] = Word(w, do_msd_translate) for pc in sentence.iter(pc_tag): @@ -83,7 +85,11 @@ def file_sentence_generator(filename, skip_id_check, do_msd_translate, pc_tag, s # strange errors, just skip... pass - yield list(words.values()) + if chunk_size > 0 and len(words) > chunk_size: + yield list(words.values()) + words = {} + + yield list(words.values()) def match_file(words, structures): matches = {s: [] for s in structures} @@ -107,6 +113,8 @@ def main(structures_file, args): match_store = MatchStore(args) word_stats = WordStats(lemma_msds) + args.chunk_size = 50000 + if args.parallel: num_parallel = int(args.parallel)