chunk size now handled in file-sentence-generator
This commit is contained in:
		
							parent
							
								
									0d8aeb2282
								
							
						
					
					
						commit
						f0109771aa
					
				
							
								
								
									
										14
									
								
								src/wani.py
									
									
									
									
									
								
							
							
						
						
									
										14
									
								
								src/wani.py
									
									
									
									
									
								
@@ -31,6 +31,8 @@ def load_files(args):
 | 
				
			|||||||
    do_msd_translate = not args.no_msd_translate
 | 
					    do_msd_translate = not args.no_msd_translate
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    for n, fname in enumerate(filenames):
 | 
					    for n, fname in enumerate(filenames):
 | 
				
			||||||
 | 
					        et = load_xml(fname)
 | 
				
			||||||
 | 
					        yield from file_sentence_generator(et, skip_id_check, do_msd_translate, args.pc_tag, args.chunk_size)
 | 
				
			||||||
        if args.count_files:
 | 
					        if args.count_files:
 | 
				
			||||||
            status = " :: {} / {}".format(n, len(filenames))
 | 
					            status = " :: {} / {}".format(n, len(filenames))
 | 
				
			||||||
        else:
 | 
					        else:
 | 
				
			||||||
@@ -46,10 +48,10 @@ def load_xml(filename, status):
 | 
				
			|||||||
    xmlstring = xmlstring.replace(' xml:', ' ')
 | 
					    xmlstring = xmlstring.replace(' xml:', ' ')
 | 
				
			||||||
    return ElementTree.XML(xmlstring)
 | 
					    return ElementTree.XML(xmlstring)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def file_sentence_generator(filename, skip_id_check, do_msd_translate, pc_tag, status):
 | 
					
 | 
				
			||||||
    et = load_xml(filename, status)
 | 
					def file_sentence_generator(et, skip_id_check, do_msd_translate, pc_tag, chunk_size):
 | 
				
			||||||
    for sentence in et.iter('s'):
 | 
					 | 
				
			||||||
    words = {}
 | 
					    words = {}
 | 
				
			||||||
 | 
					    for sentence in et.iter('s'):
 | 
				
			||||||
        for w in sentence.iter("w"):
 | 
					        for w in sentence.iter("w"):
 | 
				
			||||||
            words[w.get('id')] = Word(w, do_msd_translate)
 | 
					            words[w.get('id')] = Word(w, do_msd_translate)
 | 
				
			||||||
        for pc in sentence.iter(pc_tag):
 | 
					        for pc in sentence.iter(pc_tag):
 | 
				
			||||||
@@ -83,6 +85,10 @@ def file_sentence_generator(filename, skip_id_check, do_msd_translate, pc_tag, s
 | 
				
			|||||||
                # strange errors, just skip...
 | 
					                # strange errors, just skip...
 | 
				
			||||||
                pass
 | 
					                pass
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        if chunk_size > 0 and len(words) > chunk_size:
 | 
				
			||||||
 | 
					            yield list(words.values())
 | 
				
			||||||
 | 
					            words = {}
 | 
				
			||||||
 | 
					    
 | 
				
			||||||
    yield list(words.values())
 | 
					    yield list(words.values())
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def match_file(words, structures):
 | 
					def match_file(words, structures):
 | 
				
			||||||
@@ -107,6 +113,8 @@ def main(structures_file, args):
 | 
				
			|||||||
    match_store = MatchStore(args)
 | 
					    match_store = MatchStore(args)
 | 
				
			||||||
    word_stats = WordStats(lemma_msds)
 | 
					    word_stats = WordStats(lemma_msds)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    args.chunk_size = 50000
 | 
				
			||||||
 | 
					    
 | 
				
			||||||
    if args.parallel:
 | 
					    if args.parallel:
 | 
				
			||||||
        num_parallel = int(args.parallel)
 | 
					        num_parallel = int(args.parallel)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
				
			|||||||
		Loading…
	
		Reference in New Issue
	
	Block a user