chunk size now handled in file-sentence-generator
This commit is contained in:
		
							parent
							
								
									0d8aeb2282
								
							
						
					
					
						commit
						f0109771aa
					
				
							
								
								
									
										14
									
								
								src/wani.py
									
									
									
									
									
								
							
							
						
						
									
										14
									
								
								src/wani.py
									
									
									
									
									
								
@@ -31,6 +31,8 @@ def load_files(args):
 | 
				
			|||||||
    do_msd_translate = not args.no_msd_translate
 | 
					    do_msd_translate = not args.no_msd_translate
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    for n, fname in enumerate(filenames):
 | 
					    for n, fname in enumerate(filenames):
 | 
				
			||||||
 | 
					        et = load_xml(fname)
 | 
				
			||||||
 | 
					        yield from file_sentence_generator(et, skip_id_check, do_msd_translate, args.pc_tag, args.chunk_size)
 | 
				
			||||||
        if args.count_files:
 | 
					        if args.count_files:
 | 
				
			||||||
            status = " :: {} / {}".format(n, len(filenames))
 | 
					            status = " :: {} / {}".format(n, len(filenames))
 | 
				
			||||||
        else:
 | 
					        else:
 | 
				
			||||||
@@ -46,10 +48,10 @@ def load_xml(filename, status):
 | 
				
			|||||||
    xmlstring = xmlstring.replace(' xml:', ' ')
 | 
					    xmlstring = xmlstring.replace(' xml:', ' ')
 | 
				
			||||||
    return ElementTree.XML(xmlstring)
 | 
					    return ElementTree.XML(xmlstring)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def file_sentence_generator(filename, skip_id_check, do_msd_translate, pc_tag, status):
 | 
					
 | 
				
			||||||
    et = load_xml(filename, status)
 | 
					def file_sentence_generator(et, skip_id_check, do_msd_translate, pc_tag, chunk_size):
 | 
				
			||||||
    for sentence in et.iter('s'):
 | 
					 | 
				
			||||||
    words = {}
 | 
					    words = {}
 | 
				
			||||||
 | 
					    for sentence in et.iter('s'):
 | 
				
			||||||
        for w in sentence.iter("w"):
 | 
					        for w in sentence.iter("w"):
 | 
				
			||||||
            words[w.get('id')] = Word(w, do_msd_translate)
 | 
					            words[w.get('id')] = Word(w, do_msd_translate)
 | 
				
			||||||
        for pc in sentence.iter(pc_tag):
 | 
					        for pc in sentence.iter(pc_tag):
 | 
				
			||||||
@@ -83,6 +85,10 @@ def file_sentence_generator(filename, skip_id_check, do_msd_translate, pc_tag, s
 | 
				
			|||||||
                # strange errors, just skip...
 | 
					                # strange errors, just skip...
 | 
				
			||||||
                pass
 | 
					                pass
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        if chunk_size > 0 and len(words) > chunk_size:
 | 
				
			||||||
 | 
					            yield list(words.values())
 | 
				
			||||||
 | 
					            words = {}
 | 
				
			||||||
 | 
					    
 | 
				
			||||||
    yield list(words.values())
 | 
					    yield list(words.values())
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def match_file(words, structures):
 | 
					def match_file(words, structures):
 | 
				
			||||||
@@ -107,6 +113,8 @@ def main(structures_file, args):
 | 
				
			|||||||
    match_store = MatchStore(args)
 | 
					    match_store = MatchStore(args)
 | 
				
			||||||
    word_stats = WordStats(lemma_msds)
 | 
					    word_stats = WordStats(lemma_msds)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    args.chunk_size = 50000
 | 
				
			||||||
 | 
					    
 | 
				
			||||||
    if args.parallel:
 | 
					    if args.parallel:
 | 
				
			||||||
        num_parallel = int(args.parallel)
 | 
					        num_parallel = int(args.parallel)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
				
			|||||||
		Loading…
	
		Reference in New Issue
	
	Block a user