Added some functions for compatibility with valency, fixed the README, and fixed some minor bugs.

Luka 2020-09-10 15:06:09 +02:00
parent 1b0e6a27eb
commit 01b08667d2
7 changed files with 81 additions and 38 deletions

View File

@@ -26,18 +26,22 @@ pip install -r requirements.txt
# Running
```bash
-python3 wani.py <LOCATION TO STRUCTURES> <EXTRACTION TEXT> --out <RESULTS FILE> --sloleks_db <PATH TO SLOLEKS DB>
+python3 wani.py <LOCATION TO STRUCTURES> <EXTRACTION TEXT> --out <RESULTS FILE>
+python3 wani.py ../data/Kolokacije_strukture_JOS-32-representation_3D_08.xml ../data/ssj500k-sl.body.small.xml --out ../data/izhod.csv --sloleks_db luka:akul:superdb_small:127.0.0.1 --collocation_sentence_map_dest ../data/collocation_sentence_mapper --db /mnt/tmp/mysql-wani-ssj500k
```
## Most important optional parameters
### --sloleks_db
+This parameter may be used if you have access to the Sloleks database. It is useful when lemma_fallback would otherwise appear in the results file: if the Sloleks database is available, the script looks up the correct replacement in it.
To use this, sqlalchemy has to be installed as well.
+PATH TO SLOLEKS DB
+This parameter has to contain the database connection information in the following order:
+<DB_USERNAME>:<DB_PASSWORD>:<DB_NAME>:<DB_URL>
### --collocation_sentence_map_dest
-../data/collocation_sentence_mapper
+If a value is given for this parameter (it should be a string path to a directory), files are generated that link collocation ids to sentence ids.
### --db
This is the path to a file which will contain an sqlite database with internal states. It is used to save internal states in case the code gets modified.
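For the --sloleks_db value documented above, the four colon-separated fields presumably get combined into an sqlalchemy connection. Below is a minimal sketch of that idea, assuming a MySQL backend; the helper name and the mysql+pymysql dialect are illustrative and not taken from this repository:

```python
from sqlalchemy import create_engine

def engine_from_sloleks_arg(arg: str):
    # arg has the form <DB_USERNAME>:<DB_PASSWORD>:<DB_NAME>:<DB_URL>
    user, password, db_name, host = arg.split(':')
    # illustrative dialect/driver; the one actually used by wani.py may differ
    return create_engine(f"mysql+pymysql://{user}:{password}@{host}/{db_name}")

# example value from the README command above
engine = engine_from_sloleks_arg("luka:akul:superdb_small:127.0.0.1")
```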

View File

@@ -135,6 +135,7 @@ class Component:
        # matches for every component in links from this component
        to_ret = []
        # need to get all links that match
        for next, link, order in self.next_element:
            next_links = word.get_links(link)

@@ -146,6 +147,9 @@ class Component:
            if not order.match(word, next_word):
                continue
+            if word.lemma == 'aktivirati' and next_word.text == 'potomcih':
+                a = 0
            match = next.match(next_word)
            if match is not None:

View File

@@ -82,7 +82,7 @@ class AllFormatter(Formatter):
        word = words[idx]
        return [word.id, word.text, word.lemma, word.msd]

-    def content_right(self, _freq):
+    def content_right(self, _freq, variable_word_order=None):
        return []

    def group(self):

View File

@@ -8,7 +8,7 @@ def get_lemma_features(et):
    result = {}
    for pos in lf.iter('POS'):
-        rgx_list = MorphologyRegex(pos).rgx
+        rgx_list = MorphologyRegex(pos).rgxs[0]
        rgx_str = ""
        for position in rgx_list:
            if position == ".":

View File

@@ -10,7 +10,10 @@ class RestrictionType(Enum):
    MatchAll = 2

-def determine_ppb(rgx):
+def determine_ppb(rgxs):
+    if len(rgxs) != 1:
+        return 0
+    rgx = rgxs[0]
    if rgx[0] in ("A", "N", "R"):
        return 0
    elif rgx[0] == "V":

@@ -27,7 +30,7 @@ def determine_ppb(rgx):
class MorphologyRegex:
    def __init__(self, restriction):
-        self.min_msd_length = 1
+        # self.min_msd_length = 1

        restr_dict = {}
        for feature in restriction:

@@ -44,13 +47,30 @@ class MorphologyRegex:
            restr_dict[key] = (value, match_type)

        assert 'POS' in restr_dict
-        category = restr_dict['POS'][0].capitalize()
-        cat_code = CODES[category]
-        rgx = [cat_code] + ['.'] * 10
+        # handle multiple word types
+        if '|' in restr_dict['POS'][0]:
+            categories = restr_dict['POS'][0].split('|')
+        else:
+            categories = [restr_dict['POS'][0]]
+
+        self.rgxs = []
+        self.re_objects = []
+        self.min_msd_lengths = []

        del restr_dict['POS']

+        for category in categories:
+            min_msd_length = 1
+            category = category.capitalize()
+            cat_code = CODES[category]
+            rgx = [cat_code] + ['.'] * 10

            for attribute, (value, typ) in restr_dict.items():
+                if attribute.lower() not in TAGSET[cat_code]:
+                    continue
                index = TAGSET[cat_code].index(attribute.lower())
                assert index >= 0

@@ -63,7 +83,7 @@ class MorphologyRegex:
                rgx[index + 1] = match

                if typ:
-                    self.min_msd_length = max(index + 1, self.min_msd_length)
+                    min_msd_length = max(index + 1, min_msd_length)

            # strip rgx
            for i in reversed(range(len(rgx))):

@@ -72,17 +92,30 @@ class MorphologyRegex:
                else:
                    break

-            self.re_objects = [re.compile(r) for r in rgx]
-            self.rgx = rgx
+            self.re_objects.append([re.compile(r) for r in rgx])
+            self.rgxs.append(rgx)
+            self.min_msd_lengths.append(min_msd_length)
+        # self.re_objects = [re.compile(r) for r in rgx]
+        # self.rgx = rgx

    def __call__(self, text):
-        if len(text) <= self.min_msd_length:
-            return False
+        # if len(text) <= self.min_msd_length:
+        #     return False
+        # if len(self.rgxs[0]) > 1 and len(self.rgxs) > 1:
+        #     a = 1
+        for i, re_object in enumerate(self.re_objects):
+            if len(text) <= self.min_msd_lengths[i]:
+                continue
+            match = True
-            for c, r in zip(text, self.re_objects):
+            for c, r in zip(text, re_object):
                if not r.match(c):
-                    return False
+                    match = False
+                    break
+            if match:
                return True
+        return False

class LexisRegex:

@@ -111,7 +144,7 @@ class Restriction:
        if restriction_type == "morphology":
            self.type = RestrictionType.Morphology
            self.matcher = MorphologyRegex(list(restriction_tag))
-            self.ppb = determine_ppb(self.matcher.rgx)
+            self.ppb = determine_ppb(self.matcher.rgxs)

        elif restriction_type == "lexis":
            self.type = RestrictionType.Lexis
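To make the intent of the MorphologyRegex change easier to follow: instead of a single rgx, the class now keeps one per-position regex list per POS alternative (the POS value is split on '|'), and an MSD tag matches if any alternative matches. Below is a simplified, self-contained sketch of that matching idea; the class name, the omitted CODES/TAGSET handling, and the example tags are illustrative only:

```python
import re

class MultiPosMatcher:
    """One per-position regex list per POS alternative; a tag matches if any alternative matches."""

    def __init__(self, alternatives):
        # alternatives: one list of per-position patterns per POS alternative
        self.re_objects = [[re.compile(p) for p in rgx] for rgx in alternatives]
        # the real class computes a minimum MSD length per alternative; fixed to 1 here for brevity
        self.min_msd_lengths = [1 for _ in alternatives]

    def __call__(self, text):
        for i, re_object in enumerate(self.re_objects):
            if len(text) <= self.min_msd_lengths[i]:
                continue
            # every checked position of the tag must match its pattern
            if all(r.match(c) for c, r in zip(text, re_object)):
                return True
        return False

# a noun-or-verb restriction, analogous to a POS value of "noun|verb" in the structures file
matcher = MultiPosMatcher([['N'], ['V']])
print(matcher("Ncfpd"), matcher("Vmpp"), matcher("Agpmsn"))  # True True False
```

As in the updated __call__, the sketch returns True as soon as one alternative's positional patterns all match, and False only after every alternative has been tried.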

View File

@@ -134,7 +134,7 @@ if __name__ == '__main__':
                        action='store_true')
    parser.add_argument('--load-sloleks',
-                       help='Tells weather sloleks is loaded into memory at the beginning of processing or not.',
+                       help='Tells weather sloleks is loaded into memory at the beginning of processing or not. Should be in',
                        action='store_true')
    parser.add_argument('--sort-by',

View File

@@ -36,7 +36,9 @@ class Word:
        self.lemma = lemma
        self.msd = MSD_TRANSLATE[msd] if do_msd_translate else msd
        self.id = wid
+        self.idi = None
        self.text = text
+        self.glue = ''
        self.links = defaultdict(list)