Added some functions for compatibility with valency, fixed readme and fixed some minor bugs.
This commit is contained in:
		
							parent
							
								
									1b0e6a27eb
								
							
						
					
					
						commit
						01b08667d2
					
				
							
								
								
									
										12
									
								
								README.md
									
									
									
									
									
								
							
							
						
						
									
										12
									
								
								README.md
									
									
									
									
									
								
							| @ -26,18 +26,22 @@ pip install -r requirements.txt | |||||||
| # Running | # Running | ||||||
| 
 | 
 | ||||||
| ```bash | ```bash | ||||||
| python3 wani.py <LOCATION TO STRUCTURES> <EXTRACTION TEXT> --out <RESULTS FILE> --sloleks_db <PATH TO SLOLEKS DB> | python3 wani.py <LOCATION TO STRUCTURES> <EXTRACTION TEXT> --out <RESULTS FILE> | ||||||
| python3 wani.py ../data/Kolokacije_strukture_JOS-32-representation_3D_08.xml ../data/ssj500k-sl.body.small.xml --out ../data/izhod.csv --sloleks_db luka:akul:superdb_small:127.0.0.1 --collocation_sentence_map_dest ../data/collocation_sentence_mapper --db /mnt/tmp/mysql-wani-ssj500k |  | ||||||
| ``` | ``` | ||||||
| 
 | 
 | ||||||
| ## Most important optional parameters | ## Most important optional parameters | ||||||
| 
 | 
 | ||||||
| ### --sloleks_db | ### --sloleks_db | ||||||
|  | This parameter may be used if you have access to sloleks_db. It is useful when lemma_fallback would otherwise be shown in the results file: if sloleks_db is available, the script looks into this database to find the correct replacement.  | ||||||
|  | 
 | ||||||
| To use this, sqlalchemy has to be installed as well. | To use this, sqlalchemy has to be installed as well. | ||||||
| PATH TO SLOLEKS DB | 
 | ||||||
|  | This parameter has to include information about the database in the following order: | ||||||
|  | 
 | ||||||
|  | <DB_USERNAME>:<DB_PASSWORD>:<DB_NAME>:<DB_URL> | ||||||
| 
 | 
 | ||||||
| ### --collocation_sentence_map_dest | ### --collocation_sentence_map_dest | ||||||
| ../data/collocation_sentence_mapper  | If a value for this parameter is given (it should be a string path to a directory), files will be generated that include links between collocation ids and sentence ids. | ||||||
| 
 | 
 | ||||||
| ### --db | ### --db | ||||||
| This is the path to the file which will contain the sqlite database with internal states. It is used to save internal states in case the code gets modified. | This is the path to the file which will contain the sqlite database with internal states. It is used to save internal states in case the code gets modified. | ||||||
|  | |||||||
| @ -135,6 +135,7 @@ class Component: | |||||||
|         # matches for every component in links from this component |         # matches for every component in links from this component | ||||||
|         to_ret = [] |         to_ret = [] | ||||||
| 
 | 
 | ||||||
|  | 
 | ||||||
|         # need to get all links that match |         # need to get all links that match | ||||||
|         for next, link, order in self.next_element: |         for next, link, order in self.next_element: | ||||||
|             next_links = word.get_links(link) |             next_links = word.get_links(link) | ||||||
| @ -146,6 +147,9 @@ class Component: | |||||||
|                 if not order.match(word, next_word): |                 if not order.match(word, next_word): | ||||||
|                     continue |                     continue | ||||||
| 
 | 
 | ||||||
|  |                 if word.lemma == 'aktivirati' and next_word.text == 'potomcih': | ||||||
|  |                     a = 0 | ||||||
|  | 
 | ||||||
|                 match = next.match(next_word) |                 match = next.match(next_word) | ||||||
| 
 | 
 | ||||||
|                 if match is not None: |                 if match is not None: | ||||||
|  | |||||||
| @ -82,7 +82,7 @@ class AllFormatter(Formatter): | |||||||
|         word = words[idx] |         word = words[idx] | ||||||
|         return [word.id, word.text, word.lemma, word.msd] |         return [word.id, word.text, word.lemma, word.msd] | ||||||
|      |      | ||||||
|     def content_right(self, _freq): |     def content_right(self, _freq, variable_word_order=None): | ||||||
|         return [] |         return [] | ||||||
|      |      | ||||||
|     def group(self): |     def group(self): | ||||||
|  | |||||||
| @ -8,7 +8,7 @@ def get_lemma_features(et): | |||||||
| 
 | 
 | ||||||
|     result = {} |     result = {} | ||||||
|     for pos in lf.iter('POS'): |     for pos in lf.iter('POS'): | ||||||
|         rgx_list = MorphologyRegex(pos).rgx |         rgx_list = MorphologyRegex(pos).rgxs[0] | ||||||
|         rgx_str = "" |         rgx_str = "" | ||||||
|         for position in rgx_list: |         for position in rgx_list: | ||||||
|             if position == ".": |             if position == ".": | ||||||
|  | |||||||
| @ -10,7 +10,10 @@ class RestrictionType(Enum): | |||||||
|     MatchAll = 2 |     MatchAll = 2 | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def determine_ppb(rgx): | def determine_ppb(rgxs): | ||||||
|  |     if len(rgxs) != 1: | ||||||
|  |         return 0 | ||||||
|  |     rgx = rgxs[0] | ||||||
|     if rgx[0] in ("A", "N", "R"): |     if rgx[0] in ("A", "N", "R"): | ||||||
|         return 0 |         return 0 | ||||||
|     elif rgx[0] == "V": |     elif rgx[0] == "V": | ||||||
| @ -27,7 +30,7 @@ def determine_ppb(rgx): | |||||||
| 
 | 
 | ||||||
| class MorphologyRegex: | class MorphologyRegex: | ||||||
|     def __init__(self, restriction): |     def __init__(self, restriction): | ||||||
|         self.min_msd_length = 1 |         # self.min_msd_length = 1 | ||||||
| 
 | 
 | ||||||
|         restr_dict = {} |         restr_dict = {} | ||||||
|         for feature in restriction: |         for feature in restriction: | ||||||
| @ -44,13 +47,30 @@ class MorphologyRegex: | |||||||
|             restr_dict[key] = (value, match_type) |             restr_dict[key] = (value, match_type) | ||||||
| 
 | 
 | ||||||
|         assert 'POS' in restr_dict |         assert 'POS' in restr_dict | ||||||
|         category = restr_dict['POS'][0].capitalize() | 
 | ||||||
|         cat_code = CODES[category] |         # handle multiple word types | ||||||
|         rgx = [cat_code] + ['.'] * 10 |         if '|' in restr_dict['POS'][0]: | ||||||
|  |             categories = restr_dict['POS'][0].split('|') | ||||||
|  |         else: | ||||||
|  |             categories = [restr_dict['POS'][0]] | ||||||
|  | 
 | ||||||
|  |         self.rgxs = [] | ||||||
|  |         self.re_objects = [] | ||||||
|  |         self.min_msd_lengths = [] | ||||||
| 
 | 
 | ||||||
|         del restr_dict['POS'] |         del restr_dict['POS'] | ||||||
| 
 | 
 | ||||||
|  |         for category in categories: | ||||||
|  |             min_msd_length = 1 | ||||||
|  |             category = category.capitalize() | ||||||
|  |             cat_code = CODES[category] | ||||||
|  |             rgx = [cat_code] + ['.'] * 10 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|             for attribute, (value, typ) in restr_dict.items(): |             for attribute, (value, typ) in restr_dict.items(): | ||||||
|  |                 if attribute.lower() not in TAGSET[cat_code]: | ||||||
|  |                     continue | ||||||
|                 index = TAGSET[cat_code].index(attribute.lower()) |                 index = TAGSET[cat_code].index(attribute.lower()) | ||||||
|                 assert index >= 0 |                 assert index >= 0 | ||||||
| 
 | 
 | ||||||
| @ -63,7 +83,7 @@ class MorphologyRegex: | |||||||
|                 rgx[index + 1] = match |                 rgx[index + 1] = match | ||||||
| 
 | 
 | ||||||
|                 if typ: |                 if typ: | ||||||
|                 self.min_msd_length = max(index + 1, self.min_msd_length) |                     min_msd_length = max(index + 1, min_msd_length) | ||||||
| 
 | 
 | ||||||
|             # strip rgx |             # strip rgx | ||||||
|             for i in reversed(range(len(rgx))): |             for i in reversed(range(len(rgx))): | ||||||
| @ -72,17 +92,30 @@ class MorphologyRegex: | |||||||
|                 else: |                 else: | ||||||
|                     break |                     break | ||||||
| 
 | 
 | ||||||
|         self.re_objects = [re.compile(r) for r in rgx] |             self.re_objects.append([re.compile(r) for r in rgx]) | ||||||
|         self.rgx = rgx |             self.rgxs.append(rgx) | ||||||
|  |             self.min_msd_lengths.append(min_msd_length) | ||||||
|  | 
 | ||||||
|  |         # self.re_objects = [re.compile(r) for r in rgx] | ||||||
|  |         # self.rgx = rgx | ||||||
|      |      | ||||||
|     def __call__(self, text): |     def __call__(self, text): | ||||||
|         if len(text) <= self.min_msd_length: |         # if len(text) <= self.min_msd_length: | ||||||
|             return False |         #     return False | ||||||
|  |         # if len(self.rgxs[0]) > 1 and len(self.rgxs) > 1: | ||||||
|  |         #     a = 1 | ||||||
|  |         for i, re_object in enumerate(self.re_objects): | ||||||
|  |             if len(text) <= self.min_msd_lengths[i]: | ||||||
|  |                 continue | ||||||
|  |             match = True | ||||||
| 
 | 
 | ||||||
|         for c, r in zip(text, self.re_objects): |             for c, r in zip(text, re_object): | ||||||
|                 if not r.match(c): |                 if not r.match(c): | ||||||
|                 return False |                     match = False | ||||||
|  |                     break | ||||||
|  |             if match: | ||||||
|                 return True |                 return True | ||||||
|  |         return False | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| class LexisRegex: | class LexisRegex: | ||||||
| @ -111,7 +144,7 @@ class Restriction: | |||||||
|         if restriction_type == "morphology": |         if restriction_type == "morphology": | ||||||
|             self.type = RestrictionType.Morphology |             self.type = RestrictionType.Morphology | ||||||
|             self.matcher = MorphologyRegex(list(restriction_tag)) |             self.matcher = MorphologyRegex(list(restriction_tag)) | ||||||
|             self.ppb = determine_ppb(self.matcher.rgx) |             self.ppb = determine_ppb(self.matcher.rgxs) | ||||||
| 
 | 
 | ||||||
|         elif restriction_type == "lexis": |         elif restriction_type == "lexis": | ||||||
|             self.type = RestrictionType.Lexis |             self.type = RestrictionType.Lexis | ||||||
|  | |||||||
| @ -134,7 +134,7 @@ if __name__ == '__main__': | |||||||
|                         action='store_true') |                         action='store_true') | ||||||
| 
 | 
 | ||||||
|     parser.add_argument('--load-sloleks', |     parser.add_argument('--load-sloleks', | ||||||
|                         help='Tells weather sloleks is loaded into memory at the beginning of processing or not.', |                         help='Tells weather sloleks is loaded into memory at the beginning of processing or not. Should be in', | ||||||
|                         action='store_true') |                         action='store_true') | ||||||
| 
 | 
 | ||||||
|     parser.add_argument('--sort-by', |     parser.add_argument('--sort-by', | ||||||
|  | |||||||
| @ -36,7 +36,9 @@ class Word: | |||||||
|         self.lemma = lemma |         self.lemma = lemma | ||||||
|         self.msd = MSD_TRANSLATE[msd] if do_msd_translate else msd |         self.msd = MSD_TRANSLATE[msd] if do_msd_translate else msd | ||||||
|         self.id = wid |         self.id = wid | ||||||
|  |         self.idi = None | ||||||
|         self.text = text |         self.text = text | ||||||
|  |         self.glue = '' | ||||||
| 
 | 
 | ||||||
|         self.links = defaultdict(list) |         self.links = defaultdict(list) | ||||||
| 
 | 
 | ||||||
|  | |||||||
		Loading…
	
		Reference in New Issue
	
	Block a user