Adding restriction on spaces around punctuation.
This commit is contained in:
		
							parent
							
								
									6dd97838b4
								
							
						
					
					
						commit
						c63a9d47da
					
				| @ -1,9 +1,10 @@ | ||||
| from enum import Enum | ||||
| import logging | ||||
| 
 | ||||
| from luscenje_struktur.restriction import Restriction | ||||
| # from luscenje_struktur.restriction import Restriction | ||||
| from luscenje_struktur.order import Order | ||||
| from luscenje_struktur.representation_assigner import RepresentationAssigner | ||||
| from luscenje_struktur.restriction_group import RestrictionGroup | ||||
| 
 | ||||
| 
 | ||||
| class ComponentStatus(Enum): | ||||
| @ -38,7 +39,7 @@ class Component: | ||||
|         self.status = status | ||||
|         self.name = name | ||||
|         self.idx = idx | ||||
|         self.restrictions = [Restriction(None)] if 'restriction' in info else [] | ||||
|         self.restrictions = RestrictionGroup([None]) if 'restriction' in info else [] | ||||
|         self.next_element = [] | ||||
|         self.representation = [] | ||||
|         self.selection = {} | ||||
| @ -49,15 +50,17 @@ class Component: | ||||
|     def add_next(self, next_component, link_label, order): | ||||
|         self.next_element.append((next_component, link_label, Order.new(order))) | ||||
| 
 | ||||
|     def set_restriction(self, restrictions_tag): | ||||
|         if restrictions_tag is None: | ||||
|             self.restrictions = [Restriction(None)] | ||||
|     def set_restriction(self, restrictions_tags): | ||||
|         if not restrictions_tags: | ||||
|             self.restrictions = RestrictionGroup([None]) | ||||
| 
 | ||||
|         elif restrictions_tag.tag == "restriction": | ||||
|             self.restrictions = [Restriction(restrictions_tag)] | ||||
|         # if first element is of type restriction all following are as well | ||||
|         elif restrictions_tags[0].tag == "restriction": | ||||
|             self.restrictions = RestrictionGroup(restrictions_tags) | ||||
| 
 | ||||
|         elif restrictions_tag.tag == "restriction_or": | ||||
|             self.restrictions = [Restriction(el) for el in restrictions_tag] | ||||
|         # combinations of 'and' and 'or' restrictions are currently not implemented | ||||
|         elif restrictions_tags[0].tag == "restriction_or": | ||||
|             self.restrictions = RestrictionGroup(restrictions_tags[0], group_type='or') | ||||
| 
 | ||||
|         else: | ||||
|             raise RuntimeError("Unreachable") | ||||
| @ -118,9 +121,8 @@ class Component: | ||||
| 
 | ||||
|     def _match_self(self, word): | ||||
|         # matching | ||||
|         for restr in self.restrictions: | ||||
|             if restr.match(word): # match either | ||||
|                 return {self.idx: word} | ||||
|         if self.restrictions.match(word): | ||||
|             return {self.idx: word} | ||||
| 
 | ||||
|     def _match_next(self, word): | ||||
|         # matches for every component in links from this component | ||||
|  | ||||
| @ -41,7 +41,7 @@ def load_files(args, database, w_collection=None, input_corpus=None): | ||||
|         if extension == ".xml": | ||||
|             et = load_xml(fname) | ||||
|             if input_corpus is None: | ||||
|                 yield file_sentence_generator(et, skip_id_check, do_msd_translate, args.pc_tag) | ||||
|                 yield file_sentence_generator(et, args) | ||||
|             else: | ||||
|                 sentence_generator = file_sentence_generator_valency(et, skip_id_check, do_msd_translate, args.pc_tag, w_collection) | ||||
|                 for sent_id, sentence, othr_attributes in sentence_generator: | ||||
| @ -189,16 +189,43 @@ def load_xml(filename): | ||||
|     return ElementTree.XML(xmlstring) | ||||
| 
 | ||||
| 
 | ||||
| def file_sentence_generator(et, skip_id_check, do_msd_translate, pc_tag): | ||||
| def file_sentence_generator(et, args): | ||||
|     skip_id_check = args.skip_id_check | ||||
|     do_msd_translate = not args.no_msd_translate | ||||
|     pc_tag = args.pc_tag | ||||
|     use_punctuations = not args.ignore_punctuations | ||||
|     previous_glue = '' | ||||
|     previous_pc = False | ||||
| 
 | ||||
|     words = {} | ||||
|     sentences = list(et.iter('s')) | ||||
|     for sentence in progress(sentences, "load-text"): | ||||
|         # create fake root word | ||||
|         words[sentence.get('id')] = Word.fake_root_word(sentence.get('id')) | ||||
|         for w in sentence.iter("w"): | ||||
|             words[w.get('id')] = Word.from_xml(w, do_msd_translate) | ||||
|         for pc in sentence.iter(pc_tag): | ||||
|             words[pc.get('id')] = Word.pc_word(pc, do_msd_translate) | ||||
|         last_word_id = None | ||||
| 
 | ||||
|         for w in sentence.iter(): | ||||
|             if w.tag == 'w': | ||||
|                 words[w.get('id')] = Word.from_xml(w, do_msd_translate) | ||||
|                 if use_punctuations: | ||||
|                     previous_glue = '' | ||||
|                     last_word_id = None | ||||
|             elif w.tag == pc_tag: | ||||
|                 words[w.get('id')] = Word.pc_word(w, do_msd_translate) | ||||
|                 if use_punctuations: | ||||
|                     last_word_id = w.get('id') | ||||
|                     words[w.get('id')].previous_glue = previous_glue | ||||
|                     previous_glue = '' | ||||
|             elif use_punctuations and w.tag == 'c': | ||||
|                 # always save previous glue | ||||
|                 previous_glue = w.text | ||||
|                 if last_word_id: | ||||
|                     words[last_word_id].glue += w.text | ||||
| 
 | ||||
|         # for w in sentence.iter("w"): | ||||
|         #     words[w.get('id')] = Word.from_xml(w, do_msd_translate) | ||||
|         # for pc in sentence.iter(pc_tag): | ||||
|         #     words[pc.get('id')] = Word.pc_word(pc, do_msd_translate) | ||||
| 
 | ||||
|         for l in sentence.iter("link"): | ||||
|             if 'dep' in l.keys(): | ||||
|  | ||||
| @ -8,6 +8,7 @@ class RestrictionType(Enum): | ||||
|     Morphology = 0 | ||||
|     Lexis = 1 | ||||
|     MatchAll = 2 | ||||
|     Space = 3 | ||||
| 
 | ||||
| 
 | ||||
| def determine_ppb(rgxs): | ||||
| @ -123,6 +124,32 @@ class LexisRegex: | ||||
|     def __call__(self, text): | ||||
|         return text in self.match_list | ||||
| 
 | ||||
| 
 | ||||
class SpaceRegex:
    """Matcher for 'space' restrictions, based on the glue around a word.

    The restriction is given as an iterable of feature elements whose
    attributes are merged; the merged attributes must contain ``contact``,
    a ``|``-separated list of accepted situations:

    * ``both``    - ``previous_glue`` and ``glue`` are both non-empty
    * ``right``   - only ``glue`` is non-empty
    * ``left``    - only ``previous_glue`` is non-empty
    * ``neither`` - both are empty

    Raises:
        ValueError: if ``contact`` is missing or lists an unsupported value.
            (The missing-attribute case used to be an ``assert``, which is
            stripped when Python runs with ``-O``.)
    """

    # Accepted values of the ``contact`` attribute.
    SUPPORTED_VALUES = ('both', 'right', 'left', 'neither')

    def __init__(self, restriction):
        restr_dict = {}
        for feature in restriction:
            restr_dict.update(feature.items())

        # Explicit check instead of ``assert`` so validation survives ``-O``.
        if "contact" not in restr_dict:
            raise ValueError("Space restriction requires a 'contact' attribute.")

        self.space = restr_dict['contact'].split('|')
        for el in self.space:
            if el not in self.SUPPORTED_VALUES:
                # ValueError is still an Exception, so existing handlers keep working.
                raise ValueError('Value of space restriction is not supported (it may be both, left, right or neither).')

    def __call__(self, word):
        """Return True when the glue around ``word`` is one of the accepted situations."""
        glue_before = word.previous_glue != ''
        glue_after = word.glue != ''

        # Exactly one situation describes the word; accept it if it was listed.
        if glue_before and glue_after:
            situation = 'both'
        elif glue_after:
            situation = 'right'
        elif glue_before:
            situation = 'left'
        else:
            situation = 'neither'

        return situation in self.space
| 
 | ||||
| class Restriction: | ||||
|     def __init__(self, restriction_tag): | ||||
|         self.ppb = 4 # polnopomenska beseda (0-4) | ||||
| @ -142,6 +169,10 @@ class Restriction: | ||||
|         elif restriction_type == "lexis": | ||||
|             self.type = RestrictionType.Lexis | ||||
|             self.matcher = LexisRegex(list(restriction_tag)) | ||||
| 
 | ||||
|         elif restriction_type == "space": | ||||
|             self.type = RestrictionType.Space | ||||
|             self.matcher = SpaceRegex(list(restriction_tag)) | ||||
|         else: | ||||
|             raise NotImplementedError() | ||||
| 
 | ||||
| @ -152,6 +183,8 @@ class Restriction: | ||||
|             match_to = word.lemma | ||||
|         elif self.type == RestrictionType.MatchAll: | ||||
|             return True | ||||
|         elif self.type == RestrictionType.Space: | ||||
|             match_to = word | ||||
|         else: | ||||
|             raise RuntimeError("Unreachable!") | ||||
| 
 | ||||
|  | ||||
							
								
								
									
										24
									
								
								luscenje_struktur/restriction_group.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										24
									
								
								luscenje_struktur/restriction_group.py
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,24 @@ | ||||
| from luscenje_struktur.restriction import Restriction | ||||
| 
 | ||||
class RestrictionGroup:
    """A set of restrictions combined with either 'and' or 'or' semantics.

    Args:
        restrictions_tag: iterable of restriction tags; each item is wrapped
            in a ``Restriction``.
        group_type: ``'and'`` (every restriction must match, the default) or
            ``'or'`` (at least one restriction must match).

    Raises:
        ValueError: if ``group_type`` is neither ``'and'`` nor ``'or'``.
    """

    # Supported ways of combining the individual restrictions.
    GROUP_TYPES = ('and', 'or')

    def __init__(self, restrictions_tag, group_type='and'):
        # Fail at construction time rather than at the first match() call,
        # so a wrong group_type is reported where it was passed in.
        if group_type not in self.GROUP_TYPES:
            raise ValueError("Unsupported group_type - it may only be 'and' or 'or'")
        self.restrictions = [Restriction(el) for el in restrictions_tag]
        self.group_type = group_type

    def __iter__(self):
        """Iterate over the contained restrictions in definition order."""
        return iter(self.restrictions)

    def match(self, word):
        """Return True when ``word`` satisfies the group.

        An empty 'and' group matches everything; an empty 'or' group matches
        nothing.
        """
        if self.group_type == 'or':
            return any(restr.match(word) for restr in self.restrictions)
        if self.group_type == 'and':
            return all(restr.match(word) for restr in self.restrictions)
        # Reachable only if group_type was reassigned after construction.
        raise ValueError("Unsupported group_type - it may only be 'and' or 'or'")
| @ -34,13 +34,12 @@ class SyntacticStructure: | ||||
| 
 | ||||
|         for comp in definitions: | ||||
|             n = comp.get('cid') | ||||
|             restrs[n] = None | ||||
|             restrs[n] = [] | ||||
|             forms[n] = [] | ||||
| 
 | ||||
|             for el in comp: | ||||
|                 if el.tag.startswith("restriction"): | ||||
|                     assert restrs[n] is None | ||||
|                     restrs[n] = el | ||||
|                     restrs[n].append(el) | ||||
|                 elif el.tag.startswith("representation"): | ||||
|                     st.add_representation(n, el, forms) | ||||
|                 else: | ||||
|  | ||||
| @ -32,13 +32,14 @@ class WordDummy: | ||||
| 
 | ||||
| 
 | ||||
| class Word: | ||||
|     def __init__(self, lemma, msd, wid, text, do_msd_translate, fake_word=False): | ||||
|     def __init__(self, lemma, msd, wid, text, do_msd_translate, fake_word=False, previous_punctuation=None): | ||||
|         self.lemma = lemma | ||||
|         self.msd = MSD_TRANSLATE[msd] if do_msd_translate else msd | ||||
|         self.id = wid | ||||
|         self.idi = None | ||||
|         self.text = text | ||||
|         self.glue = '' | ||||
|         self.previous_glue = '' if previous_punctuation is None else previous_punctuation | ||||
|         self.fake_word = fake_word | ||||
| 
 | ||||
|         self.links = defaultdict(list) | ||||
|  | ||||
							
								
								
									
										3
									
								
								wani.py
									
									
									
									
									
								
							
							
						
						
									
										3
									
								
								wani.py
									
									
									
									
									
								
							| @ -153,7 +153,8 @@ if __name__ == '__main__': | ||||
|                         help='Tag for separators, usually pc or c', default="pc") | ||||
|     parser.add_argument('--separator', | ||||
|                         help='Separator in output file', default="\t") | ||||
| 
 | ||||
|     parser.add_argument('--ignore-punctuations', | ||||
|                         help="Sort in reversed ored", action='store_true') | ||||
|     args = parser.parse_args() | ||||
|     logging.basicConfig(stream=sys.stderr, level=args.verbose.upper()) | ||||
| 
 | ||||
|  | ||||
		Loading…
	
		Reference in New Issue
	
	Block a user