Lemma/MSD representations are now global! They still need to be applied to agreements as well.
This commit is contained in:
		
							parent
							
								
									d14efff709
								
							
						
					
					
						commit
						e99ba59908
					
				
							
								
								
									
										35
									
								
								wani.py
									
									
									
									
									
								
							
							
						
						
									
										35
									
								
								wani.py
									
									
									
									
									
								
							| @ -217,7 +217,7 @@ class ComponentRendition: | ||||
|         return self.rendition is rendition | ||||
|      | ||||
|     @staticmethod | ||||
|     def set_representations(matches, structure): | ||||
|     def set_representations(matches, structure, word_renderer): | ||||
|         representations = { | ||||
|             c.idx: [[], None] if c.representation.isit(Rendition.WordForm) else [True, ""] | ||||
|             for c in structure.components | ||||
| @ -236,11 +236,8 @@ class ComponentRendition: | ||||
|             for word in sorted_lst: | ||||
|                 othw = are_agreements_ok(word, representations_to_check) | ||||
|                 if othw is not None: | ||||
|                     if doprint: | ||||
|                         print("AOK", othw.text, othw) | ||||
| 
 | ||||
|                     matches.representations[word_component_id[othw.id]] = othw.text | ||||
|                     matches.representations[word_component_id[word.id]] = word.text | ||||
|                     matches.representations[word_component_id[othw.id]] = othw.most_frequent_text(word_renderer) | ||||
|                     matches.representations[word_component_id[word.id]] = word.most_frequent_text(word_renderer) | ||||
|                     return | ||||
|          | ||||
|         def are_agreements_ok(word, words_to_try): | ||||
| @ -858,17 +855,20 @@ class Word: | ||||
|                 self.links[link].extend(self.links[l]) | ||||
| 
 | ||||
|         return self.links[link] | ||||
|      | ||||
|     def most_frequent_text(self, word_renderer): | ||||
|         return word_renderer.render(self.lemma, self.msd) | ||||
| 
 | ||||
| class WordMsdRenderer: | ||||
|     def __init__(self): | ||||
|         self.all_words = [] | ||||
|         self.rendered_words = {} | ||||
|      | ||||
|     def add_word(self, word): | ||||
|         self.all_words.append(word) | ||||
|     def add_words(self, words): | ||||
|         self.all_words.extend(words) | ||||
|      | ||||
|     def generate_renders(self): | ||||
|         data = defaultdict(lambda: defaultdict([])) | ||||
|         data = defaultdict(lambda: defaultdict(list)) | ||||
|         for w in self.all_words: | ||||
|             data[w.lemma][w.msd].append(w.text) | ||||
| 
 | ||||
| @ -1114,10 +1114,10 @@ class ColocationIds: | ||||
|                 if group: | ||||
|                     break | ||||
| 
 | ||||
|     def set_representations(self, structures): | ||||
|     def set_representations(self, structures, word_renderer): | ||||
|         components_dict = {structure.id: structure for structure in structures} | ||||
|         for _1, sm in self.data.items(): | ||||
|             ComponentRendition.set_representations(sm, components_dict[sm.structure_id]) | ||||
|             ComponentRendition.set_representations(sm, components_dict[sm.structure_id], word_renderer) | ||||
| 
 | ||||
| 
 | ||||
| def match_file(words, structures): | ||||
| @ -1144,6 +1144,7 @@ def main(input_file, structures_file, args): | ||||
|         logging.debug(str(s)) | ||||
| 
 | ||||
|     colocation_ids = ColocationIds() | ||||
|     word_renderer = WordMsdRenderer() | ||||
| 
 | ||||
|     if args.parallel: | ||||
|         num_parallel = int(args.parallel) | ||||
| @ -1170,22 +1171,28 @@ def main(input_file, structures_file, args): | ||||
|                 # fancy interface to wait for threads to finish | ||||
|                 for id_input in executor.map(func, [i for i, _ in enumerate(args.input)]): | ||||
|                     with open("{}/{}.p".format(tmpdirname, id_input), "rb") as fp: | ||||
|                         matches = pickle.load(fp) | ||||
|                         words, matches = pickle.load(fp) | ||||
| 
 | ||||
|                     colocation_ids.add_matches(matches) | ||||
|                     word_renderer.add_words(words) | ||||
| 
 | ||||
|     else: | ||||
|         for words in load_files(args): | ||||
|             matches = match_file(words, structures) | ||||
|             # just save to temporary file, used for children of a parallel process | ||||
|             # MUST NOT have more than one file | ||||
|             if args.match_to_file is not None: | ||||
|                 with open(args.match_to_file, "wb") as fp: | ||||
|                     pickle.dump(matches, fp) | ||||
|                     pickle.dump((words, matches), fp) | ||||
|                     return | ||||
|             else: | ||||
|                 colocation_ids.add_matches(matches) | ||||
|                 word_renderer.add_words(words) | ||||
| 
 | ||||
|     # get word renders for lemma/msd | ||||
|     word_renderer.generate_renders() | ||||
|     # figure out representations! | ||||
|     colocation_ids.set_representations(structures) | ||||
|     colocation_ids.set_representations(structures, word_renderer) | ||||
| 
 | ||||
|     if args.all: | ||||
|         Writer.make_all_writer(args).write_out(structures, colocation_ids) | ||||
|  | ||||
		Loading…
	
		Reference in New Issue
	
	Block a user