Added some functions for compatibility with valency, fixed readme and fixed some minor bugs.

i2198
Luka 4 years ago
parent 1b0e6a27eb
commit 01b08667d2

@ -26,18 +26,22 @@ pip install -r requirements.txt
# Running
```bash
python3 wani.py <LOCATION TO STRUCTURES> <EXTRACTION TEXT> --out <RESULTS FILE> --sloleks_db <PATH TO SLOLEKS DB>
python3 wani.py ../data/Kolokacije_strukture_JOS-32-representation_3D_08.xml ../data/ssj500k-sl.body.small.xml --out ../data/izhod.csv --sloleks_db luka:akul:superdb_small:127.0.0.1 --collocation_sentence_map_dest ../data/collocation_sentence_mapper --db /mnt/tmp/mysql-wani-ssj500k
python3 wani.py <LOCATION TO STRUCTURES> <EXTRACTION TEXT> --out <RESULTS FILE>
```
## Most important optional parameters
### --sloleks_db
This parameter may be used if you have access to sloleks_db. The parameter is useful when lemma_fallback would be shown in the results file, because if you have sloleks_db the script looks into this database to find the correct replacement.
To use this, sqlalchemy has to be installed as well.
PATH TO SLOLEKS DB
This parameter has to include information about the database in the following order:
<DB_USERNAME>:<DB_PASSWORD>:<DB_NAME>:<DB_URL>
### --collocation_sentence_map_dest
../data/collocation_sentence_mapper
If a value for this parameter exists (it should be a string path to a directory), files will be generated that include links between collocation ids and sentence ids.
### --db
This is the path to a file which will contain an sqlite database with internal states. It is used to save internal states in case the code gets modified.

@ -135,6 +135,7 @@ class Component:
# matches for every component in links from this component
to_ret = []
# need to get all links that match
for next, link, order in self.next_element:
next_links = word.get_links(link)
@ -146,6 +147,9 @@ class Component:
if not order.match(word, next_word):
continue
if word.lemma == 'aktivirati' and next_word.text == 'potomcih':
a = 0
match = next.match(next_word)
if match is not None:

@ -82,7 +82,7 @@ class AllFormatter(Formatter):
word = words[idx]
return [word.id, word.text, word.lemma, word.msd]
def content_right(self, _freq):
def content_right(self, _freq, variable_word_order=None):
return []
def group(self):

@ -8,7 +8,7 @@ def get_lemma_features(et):
result = {}
for pos in lf.iter('POS'):
rgx_list = MorphologyRegex(pos).rgx
rgx_list = MorphologyRegex(pos).rgxs[0]
rgx_str = ""
for position in rgx_list:
if position == ".":

@ -10,7 +10,10 @@ class RestrictionType(Enum):
MatchAll = 2
def determine_ppb(rgx):
def determine_ppb(rgxs):
if len(rgxs) != 1:
return 0
rgx = rgxs[0]
if rgx[0] in ("A", "N", "R"):
return 0
elif rgx[0] == "V":
@ -27,7 +30,7 @@ def determine_ppb(rgx):
class MorphologyRegex:
def __init__(self, restriction):
self.min_msd_length = 1
# self.min_msd_length = 1
restr_dict = {}
for feature in restriction:
@ -44,45 +47,75 @@ class MorphologyRegex:
restr_dict[key] = (value, match_type)
assert 'POS' in restr_dict
category = restr_dict['POS'][0].capitalize()
cat_code = CODES[category]
rgx = [cat_code] + ['.'] * 10
# handle multiple word types
if '|' in restr_dict['POS'][0]:
categories = restr_dict['POS'][0].split('|')
else:
categories = [restr_dict['POS'][0]]
self.rgxs = []
self.re_objects = []
self.min_msd_lengths = []
del restr_dict['POS']
for attribute, (value, typ) in restr_dict.items():
index = TAGSET[cat_code].index(attribute.lower())
assert index >= 0
for category in categories:
min_msd_length = 1
category = category.capitalize()
cat_code = CODES[category]
rgx = [cat_code] + ['.'] * 10
if '|' in value:
match = "".join(CODES[val] for val in value.split('|'))
else:
match = CODES[value]
match = "[{}{}]".format("" if typ else "^", match)
rgx[index + 1] = match
for attribute, (value, typ) in restr_dict.items():
if attribute.lower() not in TAGSET[cat_code]:
continue
index = TAGSET[cat_code].index(attribute.lower())
assert index >= 0
if typ:
self.min_msd_length = max(index + 1, self.min_msd_length)
if '|' in value:
match = "".join(CODES[val] for val in value.split('|'))
else:
match = CODES[value]
# strip rgx
for i in reversed(range(len(rgx))):
if rgx[i] == '.':
rgx = rgx[:-1]
else:
break
match = "[{}{}]".format("" if typ else "^", match)
rgx[index + 1] = match
self.re_objects = [re.compile(r) for r in rgx]
self.rgx = rgx
if typ:
min_msd_length = max(index + 1, min_msd_length)
# strip rgx
for i in reversed(range(len(rgx))):
if rgx[i] == '.':
rgx = rgx[:-1]
else:
break
self.re_objects.append([re.compile(r) for r in rgx])
self.rgxs.append(rgx)
self.min_msd_lengths.append(min_msd_length)
# self.re_objects = [re.compile(r) for r in rgx]
# self.rgx = rgx
def __call__(self, text):
if len(text) <= self.min_msd_length:
return False
for c, r in zip(text, self.re_objects):
if not r.match(c):
return False
return True
# if len(text) <= self.min_msd_length:
# return False
# if len(self.rgxs[0]) > 1 and len(self.rgxs) > 1:
# a = 1
for i, re_object in enumerate(self.re_objects):
if len(text) <= self.min_msd_lengths[i]:
continue
match = True
for c, r in zip(text, re_object):
if not r.match(c):
match = False
break
if match:
return True
return False
class LexisRegex:
@ -111,7 +144,7 @@ class Restriction:
if restriction_type == "morphology":
self.type = RestrictionType.Morphology
self.matcher = MorphologyRegex(list(restriction_tag))
self.ppb = determine_ppb(self.matcher.rgx)
self.ppb = determine_ppb(self.matcher.rgxs)
elif restriction_type == "lexis":
self.type = RestrictionType.Lexis

@ -134,7 +134,7 @@ if __name__ == '__main__':
action='store_true')
parser.add_argument('--load-sloleks',
help='Tells whether sloleks is loaded into memory at the beginning of processing or not.',
help='Tells whether sloleks is loaded into memory at the beginning of processing or not. Should be in',
action='store_true')
parser.add_argument('--sort-by',

@ -36,7 +36,9 @@ class Word:
self.lemma = lemma
self.msd = MSD_TRANSLATE[msd] if do_msd_translate else msd
self.id = wid
self.idi = None
self.text = text
self.glue = ''
self.links = defaultdict(list)

Loading…
Cancel
Save