Move loader to its own module
This commit is contained in:
		
							parent
							
								
									51cf3e7064
								
							
						
					
					
						commit
						3552f14b81
					
				
							
								
								
									
										72
									
								
								src/loader.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										72
									
								
								src/loader.py
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,72 @@ | ||||
| from xml.etree import ElementTree | ||||
| import logging | ||||
| import re | ||||
| import sys | ||||
| 
 | ||||
| from tqdm import tqdm | ||||
| 
 | ||||
| from word import Word | ||||
| 
 | ||||
| 
 | ||||
def is_root_id(id_):
    """Return True when *id_* is a sentence/root-level id, i.e. it has
    exactly three dot-separated components (e.g. ``"F1.2.3"``)."""
    return id_.count('.') == 2
| 
 | ||||
| 
 | ||||
def load_files(args):
    """Lazily parse every input corpus file named in *args*.

    For each filename in ``args.input``, the XML is parsed and one list of
    ``Word`` objects (the whole file's tokens, linked) is yielded.

    Expects on *args*: ``input`` (list of paths), ``skip_id_check`` (bool),
    ``no_msd_translate`` (bool), ``pc_tag`` (str).
    """
    skip_id_check = args.skip_id_check
    do_msd_translate = not args.no_msd_translate

    for filename in args.input:
        tree = load_xml(filename)
        yield file_sentence_generator(tree, skip_id_check, do_msd_translate, args.pc_tag)
| 
 | ||||
| 
 | ||||
def load_xml(filename):
    """Read *filename* and return its parsed XML root element.

    The default XML namespace declaration is stripped (first occurrence
    only) and ``xml:``-prefixed attributes are de-prefixed, so callers can
    use plain tag/attribute names such as ``s`` and ``id`` instead of
    namespace-qualified ones.

    :param filename: path to the XML file.
    :return: the root ``xml.etree.ElementTree.Element``.
    :raises xml.etree.ElementTree.ParseError: if the content is not valid XML.
    """
    # Lazy %-formatting: the message is only built if INFO is enabled.
    logging.info("LOADING XML: %s", filename)
    # Corpus XML is UTF-8; relying on the platform default encoding would
    # break on systems with a non-UTF-8 locale.
    with open(filename, 'r', encoding='utf-8') as fp:
        content = fp.read()

    # Drop the default-namespace declaration so ElementTree does not
    # qualify every tag with "{uri}".
    xmlstring = re.sub(' xmlns="[^"]+"', '', content, count=1)
    # "xml:id" and friends become plain "id" attributes.
    xmlstring = xmlstring.replace(' xml:', ' ')
    return ElementTree.XML(xmlstring)
| 
 | ||||
| 
 | ||||
def file_sentence_generator(et, skip_id_check, do_msd_translate, pc_tag):
    """Build every Word in the parsed document *et* and connect their links.

    Walks each ``<s>`` sentence element: registers word tokens (``<w>``)
    and punctuation tokens (*pc_tag*) in an id-keyed dict, then resolves
    the sentence's ``<link>`` elements into ``add_link`` calls between the
    registered words.

    :param et: parsed XML root element.
    :param skip_id_check: when True, do not abort on root-level source ids.
    :param do_msd_translate: forwarded to the ``Word`` constructors.
    :param pc_tag: element tag used for punctuation tokens.
    :return: list of all ``Word`` objects found in the document.
    """
    words = {}
    all_sentences = list(et.iter('s'))
    for sentence in tqdm(all_sentences, desc="load-text"):
        # First pass: register every token of the sentence by its id.
        for token in sentence.iter("w"):
            words[token.get('id')] = Word(token, do_msd_translate)
        for punct in sentence.iter(pc_tag):
            words[punct.get('id')] = Word.pc_word(punct, do_msd_translate)

        # Second pass: resolve dependency links. Two encodings appear:
        # explicit dep/afun/from attributes, or ana/target pairs.
        for link in sentence.iter("link"):
            if 'dep' in link.keys():
                ana = link.get('afun')
                lfrom = link.get('from')
                dest = link.get('dep')
            else:
                ana = link.get('ana')
                if ana[:4] != 'syn:':  # dont bother...
                    continue
                ana = ana[4:]
                lfrom, dest = link.get('target').replace('#', '').split()

            if lfrom not in words:
                # strange errors, just skip...
                continue

            if not skip_id_check and is_root_id(lfrom):
                logging.error("NOO: {}".format(lfrom))
                sys.exit(1)

            if dest not in words:
                logging.error("Unknown id: {}".format(dest))
                sys.exit(1)

            words[lfrom].add_link(ana, words[dest])

    return list(words.values())
							
								
								
									
										68
									
								
								src/wani.py
									
									
									
									
									
								
							
							
						
						
									
										68
									
								
								src/wani.py
									
									
									
									
									
								
							| @ -19,73 +19,9 @@ from syntactic_structure import build_structures | ||||
| from match_store import MatchStore | ||||
| from word_stats import WordStats | ||||
| from writer import Writer | ||||
| from loader import load_files | ||||
| 
 | ||||
| 
 | ||||
| def is_root_id(id_): | ||||
|     return len(id_.split('.')) == 3 | ||||
| 
 | ||||
| 
 | ||||
| def load_files(args): | ||||
|     filenames = args.input | ||||
|     skip_id_check = args.skip_id_check | ||||
|     do_msd_translate = not args.no_msd_translate | ||||
| 
 | ||||
|     for n, fname in enumerate(filenames): | ||||
|         et = load_xml(fname) | ||||
|         yield from file_sentence_generator(et, skip_id_check, do_msd_translate, args.pc_tag, args.chunk_size) | ||||
| 
 | ||||
| 
 | ||||
| def load_xml(filename): | ||||
|     logging.info("\rLOADING XML: {}".format(filename)) | ||||
|     with open(filename, 'r') as fp: | ||||
|         content = fp.read() | ||||
| 
 | ||||
|     xmlstring = re.sub(' xmlns="[^"]+"', '', content, count=1) | ||||
|     xmlstring = xmlstring.replace(' xml:', ' ') | ||||
|     return ElementTree.XML(xmlstring) | ||||
| 
 | ||||
| 
 | ||||
| def file_sentence_generator(et, skip_id_check, do_msd_translate, pc_tag, chunk_size): | ||||
|     words = {} | ||||
|     for sentence in et.iter('s'): | ||||
|         for w in sentence.iter("w"): | ||||
|             words[w.get('id')] = Word(w, do_msd_translate) | ||||
|         for pc in sentence.iter(pc_tag): | ||||
|             words[pc.get('id')] = Word.pc_word(pc, do_msd_translate) | ||||
| 
 | ||||
|         for l in sentence.iter("link"): | ||||
|             if 'dep' in l.keys(): | ||||
|                 ana = l.get('afun') | ||||
|                 lfrom = l.get('from') | ||||
|                 dest = l.get('dep') | ||||
|             else: | ||||
|                 ana = l.get('ana') | ||||
|                 if ana[:4] != 'syn:': # dont bother... | ||||
|                     continue | ||||
|                 ana = ana[4:] | ||||
|                 lfrom, dest = l.get('target').replace('#', '').split() | ||||
| 
 | ||||
|             if lfrom in words: | ||||
|                 if not skip_id_check and is_root_id(lfrom): | ||||
|                     logging.error("NOO: {}".format(lfrom)) | ||||
|                     sys.exit(1) | ||||
| 
 | ||||
|                 if dest in words: | ||||
|                     next_word = words[dest] | ||||
|                     words[lfrom].add_link(ana, next_word) | ||||
|                 else: | ||||
|                     logging.error("Unknown id: {}".format(dest)) | ||||
|                     sys.exit(1) | ||||
| 
 | ||||
|             else: | ||||
|                 # strange errors, just skip... | ||||
|                 pass | ||||
| 
 | ||||
|         if chunk_size > 0 and len(words) > chunk_size: | ||||
|             yield list(words.values()) | ||||
|             words = {} | ||||
|      | ||||
|     yield list(words.values()) | ||||
| 
 | ||||
| def match_file(words, structures): | ||||
|     matches = {s: [] for s in structures} | ||||
| @ -109,8 +45,6 @@ def main(args): | ||||
|     match_store = MatchStore(args) | ||||
|     word_stats = WordStats(lemma_msds) | ||||
| 
 | ||||
|     args.chunk_size = 50000 | ||||
|      | ||||
|     if args.parallel: | ||||
|         num_parallel = int(args.parallel) | ||||
| 
 | ||||
|  | ||||
		Loading…
	
		Reference in New Issue
	
	Block a user