Added some functions for compatibility with valency, fixed readme and fixed some minor bugs.
This commit is contained in:
parent
1b0e6a27eb
commit
01b08667d2
12
README.md
12
README.md
|
@ -26,18 +26,22 @@ pip install -r requirements.txt
|
|||
# Running
|
||||
|
||||
```bash
|
||||
python3 wani.py <LOCATION TO STRUCTURES> <EXTRACTION TEXT> --out <RESULTS FILE> --sloleks_db <PATH TO SLOLEKS DB>
|
||||
python3 wani.py ../data/Kolokacije_strukture_JOS-32-representation_3D_08.xml ../data/ssj500k-sl.body.small.xml --out ../data/izhod.csv --sloleks_db luka:akul:superdb_small:127.0.0.1 --collocation_sentence_map_dest ../data/collocation_sentence_mapper --db /mnt/tmp/mysql-wani-ssj500k
|
||||
python3 wani.py <LOCATION TO STRUCTURES> <EXTRACTION TEXT> --out <RESULTS FILE>
|
||||
```
|
||||
|
||||
## Most important optional parameters
|
||||
|
||||
### --sloleks_db
|
||||
This parameter may be used if you have access to the sloleks database. It is useful when lemma_fallback would otherwise appear in the results file: if sloleks_db is provided, the script looks up the correct replacement in this database.
|
||||
|
||||
To use this, sqlalchemy has to be installed as well.
|
||||
PATH TO SLOLEKS DB
|
||||
|
||||
This parameter has to include information about the database in the following order:
|
||||
|
||||
<DB_USERNAME>:<DB_PASSWORD>:<DB_NAME>:<DB_URL>
|
||||
|
||||
### --collocation_sentence_map_dest
|
||||
../data/collocation_sentence_mapper
|
||||
If a value is given for this parameter (it should be a string path to a directory), files will be generated that link collocation ids to sentence ids.
|
||||
|
||||
### --db
|
||||
This is the path to a file which will contain an sqlite database with internal states. It is used to save internal states in case the code gets modified.
|
||||
|
|
|
@ -135,6 +135,7 @@ class Component:
|
|||
# matches for every component in links from this component
|
||||
to_ret = []
|
||||
|
||||
|
||||
# need to get all links that match
|
||||
for next, link, order in self.next_element:
|
||||
next_links = word.get_links(link)
|
||||
|
@ -146,6 +147,9 @@ class Component:
|
|||
if not order.match(word, next_word):
|
||||
continue
|
||||
|
||||
if word.lemma == 'aktivirati' and next_word.text == 'potomcih':
|
||||
a = 0
|
||||
|
||||
match = next.match(next_word)
|
||||
|
||||
if match is not None:
|
||||
|
|
|
@ -82,7 +82,7 @@ class AllFormatter(Formatter):
|
|||
word = words[idx]
|
||||
return [word.id, word.text, word.lemma, word.msd]
|
||||
|
||||
def content_right(self, _freq):
|
||||
def content_right(self, _freq, variable_word_order=None):
|
||||
return []
|
||||
|
||||
def group(self):
|
||||
|
|
|
@ -8,7 +8,7 @@ def get_lemma_features(et):
|
|||
|
||||
result = {}
|
||||
for pos in lf.iter('POS'):
|
||||
rgx_list = MorphologyRegex(pos).rgx
|
||||
rgx_list = MorphologyRegex(pos).rgxs[0]
|
||||
rgx_str = ""
|
||||
for position in rgx_list:
|
||||
if position == ".":
|
||||
|
|
|
@ -10,7 +10,10 @@ class RestrictionType(Enum):
|
|||
MatchAll = 2
|
||||
|
||||
|
||||
def determine_ppb(rgx):
|
||||
def determine_ppb(rgxs):
|
||||
if len(rgxs) != 1:
|
||||
return 0
|
||||
rgx = rgxs[0]
|
||||
if rgx[0] in ("A", "N", "R"):
|
||||
return 0
|
||||
elif rgx[0] == "V":
|
||||
|
@ -27,7 +30,7 @@ def determine_ppb(rgx):
|
|||
|
||||
class MorphologyRegex:
|
||||
def __init__(self, restriction):
|
||||
self.min_msd_length = 1
|
||||
# self.min_msd_length = 1
|
||||
|
||||
restr_dict = {}
|
||||
for feature in restriction:
|
||||
|
@ -44,45 +47,75 @@ class MorphologyRegex:
|
|||
restr_dict[key] = (value, match_type)
|
||||
|
||||
assert 'POS' in restr_dict
|
||||
category = restr_dict['POS'][0].capitalize()
|
||||
cat_code = CODES[category]
|
||||
rgx = [cat_code] + ['.'] * 10
|
||||
|
||||
# handle multiple word types
|
||||
if '|' in restr_dict['POS'][0]:
|
||||
categories = restr_dict['POS'][0].split('|')
|
||||
else:
|
||||
categories = [restr_dict['POS'][0]]
|
||||
|
||||
self.rgxs = []
|
||||
self.re_objects = []
|
||||
self.min_msd_lengths = []
|
||||
|
||||
del restr_dict['POS']
|
||||
|
||||
for attribute, (value, typ) in restr_dict.items():
|
||||
index = TAGSET[cat_code].index(attribute.lower())
|
||||
assert index >= 0
|
||||
for category in categories:
|
||||
min_msd_length = 1
|
||||
category = category.capitalize()
|
||||
cat_code = CODES[category]
|
||||
rgx = [cat_code] + ['.'] * 10
|
||||
|
||||
if '|' in value:
|
||||
match = "".join(CODES[val] for val in value.split('|'))
|
||||
else:
|
||||
match = CODES[value]
|
||||
|
||||
match = "[{}{}]".format("" if typ else "^", match)
|
||||
rgx[index + 1] = match
|
||||
|
||||
if typ:
|
||||
self.min_msd_length = max(index + 1, self.min_msd_length)
|
||||
for attribute, (value, typ) in restr_dict.items():
|
||||
if attribute.lower() not in TAGSET[cat_code]:
|
||||
continue
|
||||
index = TAGSET[cat_code].index(attribute.lower())
|
||||
assert index >= 0
|
||||
|
||||
# strip rgx
|
||||
for i in reversed(range(len(rgx))):
|
||||
if rgx[i] == '.':
|
||||
rgx = rgx[:-1]
|
||||
else:
|
||||
break
|
||||
if '|' in value:
|
||||
match = "".join(CODES[val] for val in value.split('|'))
|
||||
else:
|
||||
match = CODES[value]
|
||||
|
||||
self.re_objects = [re.compile(r) for r in rgx]
|
||||
self.rgx = rgx
|
||||
match = "[{}{}]".format("" if typ else "^", match)
|
||||
rgx[index + 1] = match
|
||||
|
||||
if typ:
|
||||
min_msd_length = max(index + 1, min_msd_length)
|
||||
|
||||
# strip rgx
|
||||
for i in reversed(range(len(rgx))):
|
||||
if rgx[i] == '.':
|
||||
rgx = rgx[:-1]
|
||||
else:
|
||||
break
|
||||
|
||||
self.re_objects.append([re.compile(r) for r in rgx])
|
||||
self.rgxs.append(rgx)
|
||||
self.min_msd_lengths.append(min_msd_length)
|
||||
|
||||
# self.re_objects = [re.compile(r) for r in rgx]
|
||||
# self.rgx = rgx
|
||||
|
||||
def __call__(self, text):
|
||||
if len(text) <= self.min_msd_length:
|
||||
return False
|
||||
# if len(text) <= self.min_msd_length:
|
||||
# return False
|
||||
# if len(self.rgxs[0]) > 1 and len(self.rgxs) > 1:
|
||||
# a = 1
|
||||
for i, re_object in enumerate(self.re_objects):
|
||||
if len(text) <= self.min_msd_lengths[i]:
|
||||
continue
|
||||
match = True
|
||||
|
||||
for c, r in zip(text, self.re_objects):
|
||||
if not r.match(c):
|
||||
return False
|
||||
return True
|
||||
for c, r in zip(text, re_object):
|
||||
if not r.match(c):
|
||||
match = False
|
||||
break
|
||||
if match:
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
class LexisRegex:
|
||||
|
@ -111,7 +144,7 @@ class Restriction:
|
|||
if restriction_type == "morphology":
|
||||
self.type = RestrictionType.Morphology
|
||||
self.matcher = MorphologyRegex(list(restriction_tag))
|
||||
self.ppb = determine_ppb(self.matcher.rgx)
|
||||
self.ppb = determine_ppb(self.matcher.rgxs)
|
||||
|
||||
elif restriction_type == "lexis":
|
||||
self.type = RestrictionType.Lexis
|
||||
|
|
|
@ -134,7 +134,7 @@ if __name__ == '__main__':
|
|||
action='store_true')
|
||||
|
||||
parser.add_argument('--load-sloleks',
|
||||
help='Tells weather sloleks is loaded into memory at the beginning of processing or not.',
|
||||
help='Tells weather sloleks is loaded into memory at the beginning of processing or not. Should be in',
|
||||
action='store_true')
|
||||
|
||||
parser.add_argument('--sort-by',
|
||||
|
|
|
@ -36,7 +36,9 @@ class Word:
|
|||
self.lemma = lemma
|
||||
self.msd = MSD_TRANSLATE[msd] if do_msd_translate else msd
|
||||
self.id = wid
|
||||
self.idi = None
|
||||
self.text = text
|
||||
self.glue = ''
|
||||
|
||||
self.links = defaultdict(list)
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user