Added some functions for compatibility with valency, fixed readme and fixed some minor bugs.

This commit is contained in:
Luka 2020-09-10 15:06:09 +02:00
parent 1b0e6a27eb
commit 01b08667d2
7 changed files with 81 additions and 38 deletions

View File

@ -26,18 +26,22 @@ pip install -r requirements.txt
# Running
```bash
python3 wani.py <LOCATION TO STRUCTURES> <EXTRACTION TEXT> --out <RESULTS FILE> --sloleks_db <PATH TO SLOLEKS DB>
python3 wani.py ../data/Kolokacije_strukture_JOS-32-representation_3D_08.xml ../data/ssj500k-sl.body.small.xml --out ../data/izhod.csv --sloleks_db luka:akul:superdb_small:127.0.0.1 --collocation_sentence_map_dest ../data/collocation_sentence_mapper --db /mnt/tmp/mysql-wani-ssj500k
python3 wani.py <LOCATION TO STRUCTURES> <EXTRACTION TEXT> --out <RESULTS FILE>
```
## Most important optional parameters
### --sloleks_db
This parameter may be used if you have access to sloleks_db. The parameter is useful when lemma_fallback would be shown in the results file, because if you have sloleks_db, the script looks into this database to find the correct replacement.
To use this, sqlalchemy has to be installed as well.
PATH TO SLOLEKS DB
This parameter has to include information about database in following order:
<DB_USERNAME>:<DB_PASSWORD>:<DB_NAME>:<DB_URL>
### --collocation_sentence_map_dest
../data/collocation_sentence_mapper
If value for this parameter exists (it should be string path to directory), files will be generated that include links between collocation ids and sentence ids.
### --db
This is the path to a file which will contain an sqlite database with internal states. It is used to save internal states in case the code gets modified.

View File

@ -135,6 +135,7 @@ class Component:
# matches for every component in links from this component
to_ret = []
# need to get all links that match
for next, link, order in self.next_element:
next_links = word.get_links(link)
@ -146,6 +147,9 @@ class Component:
if not order.match(word, next_word):
continue
if word.lemma == 'aktivirati' and next_word.text == 'potomcih':
a = 0
match = next.match(next_word)
if match is not None:

View File

@ -82,7 +82,7 @@ class AllFormatter(Formatter):
word = words[idx]
return [word.id, word.text, word.lemma, word.msd]
def content_right(self, _freq):
def content_right(self, _freq, variable_word_order=None):
return []
def group(self):

View File

@ -8,7 +8,7 @@ def get_lemma_features(et):
result = {}
for pos in lf.iter('POS'):
rgx_list = MorphologyRegex(pos).rgx
rgx_list = MorphologyRegex(pos).rgxs[0]
rgx_str = ""
for position in rgx_list:
if position == ".":

View File

@ -10,7 +10,10 @@ class RestrictionType(Enum):
MatchAll = 2
def determine_ppb(rgx):
def determine_ppb(rgxs):
if len(rgxs) != 1:
return 0
rgx = rgxs[0]
if rgx[0] in ("A", "N", "R"):
return 0
elif rgx[0] == "V":
@ -27,7 +30,7 @@ def determine_ppb(rgx):
class MorphologyRegex:
def __init__(self, restriction):
self.min_msd_length = 1
# self.min_msd_length = 1
restr_dict = {}
for feature in restriction:
@ -44,45 +47,75 @@ class MorphologyRegex:
restr_dict[key] = (value, match_type)
assert 'POS' in restr_dict
category = restr_dict['POS'][0].capitalize()
cat_code = CODES[category]
rgx = [cat_code] + ['.'] * 10
# handle multiple word types
if '|' in restr_dict['POS'][0]:
categories = restr_dict['POS'][0].split('|')
else:
categories = [restr_dict['POS'][0]]
self.rgxs = []
self.re_objects = []
self.min_msd_lengths = []
del restr_dict['POS']
for attribute, (value, typ) in restr_dict.items():
index = TAGSET[cat_code].index(attribute.lower())
assert index >= 0
for category in categories:
min_msd_length = 1
category = category.capitalize()
cat_code = CODES[category]
rgx = [cat_code] + ['.'] * 10
if '|' in value:
match = "".join(CODES[val] for val in value.split('|'))
else:
match = CODES[value]
match = "[{}{}]".format("" if typ else "^", match)
rgx[index + 1] = match
if typ:
self.min_msd_length = max(index + 1, self.min_msd_length)
for attribute, (value, typ) in restr_dict.items():
if attribute.lower() not in TAGSET[cat_code]:
continue
index = TAGSET[cat_code].index(attribute.lower())
assert index >= 0
# strip rgx
for i in reversed(range(len(rgx))):
if rgx[i] == '.':
rgx = rgx[:-1]
else:
break
if '|' in value:
match = "".join(CODES[val] for val in value.split('|'))
else:
match = CODES[value]
self.re_objects = [re.compile(r) for r in rgx]
self.rgx = rgx
match = "[{}{}]".format("" if typ else "^", match)
rgx[index + 1] = match
if typ:
min_msd_length = max(index + 1, min_msd_length)
# strip rgx
for i in reversed(range(len(rgx))):
if rgx[i] == '.':
rgx = rgx[:-1]
else:
break
self.re_objects.append([re.compile(r) for r in rgx])
self.rgxs.append(rgx)
self.min_msd_lengths.append(min_msd_length)
# self.re_objects = [re.compile(r) for r in rgx]
# self.rgx = rgx
def __call__(self, text):
if len(text) <= self.min_msd_length:
return False
# if len(text) <= self.min_msd_length:
# return False
# if len(self.rgxs[0]) > 1 and len(self.rgxs) > 1:
# a = 1
for i, re_object in enumerate(self.re_objects):
if len(text) <= self.min_msd_lengths[i]:
continue
match = True
for c, r in zip(text, self.re_objects):
if not r.match(c):
return False
return True
for c, r in zip(text, re_object):
if not r.match(c):
match = False
break
if match:
return True
return False
class LexisRegex:
@ -111,7 +144,7 @@ class Restriction:
if restriction_type == "morphology":
self.type = RestrictionType.Morphology
self.matcher = MorphologyRegex(list(restriction_tag))
self.ppb = determine_ppb(self.matcher.rgx)
self.ppb = determine_ppb(self.matcher.rgxs)
elif restriction_type == "lexis":
self.type = RestrictionType.Lexis

View File

@ -134,7 +134,7 @@ if __name__ == '__main__':
action='store_true')
parser.add_argument('--load-sloleks',
help='Tells weather sloleks is loaded into memory at the beginning of processing or not.',
help='Tells weather sloleks is loaded into memory at the beginning of processing or not. Should be in',
action='store_true')
parser.add_argument('--sort-by',

View File

@ -36,7 +36,9 @@ class Word:
self.lemma = lemma
self.msd = MSD_TRANSLATE[msd] if do_msd_translate else msd
self.id = wid
self.idi = None
self.text = text
self.glue = ''
self.links = defaultdict(list)