Chunk size is now handled in file_sentence_generator
This commit is contained in:
parent
0d8aeb2282
commit
f0109771aa
16
src/wani.py
16
src/wani.py
|
@ -31,6 +31,8 @@ def load_files(args):
|
||||||
do_msd_translate = not args.no_msd_translate
|
do_msd_translate = not args.no_msd_translate
|
||||||
|
|
||||||
for n, fname in enumerate(filenames):
|
for n, fname in enumerate(filenames):
|
||||||
|
et = load_xml(fname)
|
||||||
|
yield from file_sentence_generator(et, skip_id_check, do_msd_translate, args.pc_tag, args.chunk_size)
|
||||||
if args.count_files:
|
if args.count_files:
|
||||||
status = " :: {} / {}".format(n, len(filenames))
|
status = " :: {} / {}".format(n, len(filenames))
|
||||||
else:
|
else:
|
||||||
|
@ -46,10 +48,10 @@ def load_xml(filename, status):
|
||||||
xmlstring = xmlstring.replace(' xml:', ' ')
|
xmlstring = xmlstring.replace(' xml:', ' ')
|
||||||
return ElementTree.XML(xmlstring)
|
return ElementTree.XML(xmlstring)
|
||||||
|
|
||||||
def file_sentence_generator(filename, skip_id_check, do_msd_translate, pc_tag, status):
|
|
||||||
et = load_xml(filename, status)
|
def file_sentence_generator(et, skip_id_check, do_msd_translate, pc_tag, chunk_size):
|
||||||
|
words = {}
|
||||||
for sentence in et.iter('s'):
|
for sentence in et.iter('s'):
|
||||||
words = {}
|
|
||||||
for w in sentence.iter("w"):
|
for w in sentence.iter("w"):
|
||||||
words[w.get('id')] = Word(w, do_msd_translate)
|
words[w.get('id')] = Word(w, do_msd_translate)
|
||||||
for pc in sentence.iter(pc_tag):
|
for pc in sentence.iter(pc_tag):
|
||||||
|
@ -83,7 +85,11 @@ def file_sentence_generator(filename, skip_id_check, do_msd_translate, pc_tag, s
|
||||||
# strange errors, just skip...
|
# strange errors, just skip...
|
||||||
pass
|
pass
|
||||||
|
|
||||||
yield list(words.values())
|
if chunk_size > 0 and len(words) > chunk_size:
|
||||||
|
yield list(words.values())
|
||||||
|
words = {}
|
||||||
|
|
||||||
|
yield list(words.values())
|
||||||
|
|
||||||
def match_file(words, structures):
|
def match_file(words, structures):
|
||||||
matches = {s: [] for s in structures}
|
matches = {s: [] for s in structures}
|
||||||
|
@ -107,6 +113,8 @@ def main(structures_file, args):
|
||||||
match_store = MatchStore(args)
|
match_store = MatchStore(args)
|
||||||
word_stats = WordStats(lemma_msds)
|
word_stats = WordStats(lemma_msds)
|
||||||
|
|
||||||
|
args.chunk_size = 50000
|
||||||
|
|
||||||
if args.parallel:
|
if args.parallel:
|
||||||
num_parallel = int(args.parallel)
|
num_parallel = int(args.parallel)
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user