Added some functions for compatibility with valency, fixed readme and fixed some minor bugs.

parent 1b0e6a27eb
commit 01b08667d2

README.md (12 changed lines)
@@ -26,18 +26,22 @@ pip install -r requirements.txt
 # Running
 
 ```bash
-python3 wani.py <LOCATION TO STRUCTURES> <EXTRACTION TEXT> --out <RESULTS FILE> --sloleks_db <PATH TO SLOLEKS DB>
-python3 wani.py ../data/Kolokacije_strukture_JOS-32-representation_3D_08.xml ../data/ssj500k-sl.body.small.xml --out ../data/izhod.csv --sloleks_db luka:akul:superdb_small:127.0.0.1 --collocation_sentence_map_dest ../data/collocation_sentence_mapper --db /mnt/tmp/mysql-wani-ssj500k
+python3 wani.py <LOCATION TO STRUCTURES> <EXTRACTION TEXT> --out <RESULTS FILE>
 ```
 
 ## Most important optional parameters
 
 ### --sloleks_db
+This parameter may be used if you have access to the Sloleks database. It is useful when lemma_fallback would otherwise appear in the results file: if the database is available, the script looks up the correct replacement in it.
+
 To use this, sqlalchemy has to be installed as well.
-PATH TO SLOLEKS DB
+The parameter has to include the database information in the following order:
+
+<DB_USERNAME>:<DB_PASSWORD>:<DB_NAME>:<DB_URL>
 
 ### --collocation_sentence_map_dest
-../data/collocation_sentence_mapper
+If a value is given for this parameter (it should be a string path to a directory), files are generated that link collocation ids to sentence ids.
 
 ### --db
 This is the path to a file that will contain an sqlite database with internal states. It is used to save internal states in case the code gets modified.
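The colon-separated value documented for `--sloleks_db` maps naturally onto an SQLAlchemy connection URL. The sketch below is only an illustration of that format, assuming a MySQL backend and the `pymysql` driver; the helper name and dialect are not taken from the repository:

```python
from sqlalchemy import create_engine


def sloleks_engine(arg):
    # arg follows <DB_USERNAME>:<DB_PASSWORD>:<DB_NAME>:<DB_URL>,
    # e.g. "luka:akul:superdb_small:127.0.0.1"
    username, password, db_name, host = arg.split(':')
    # dialect and driver are assumptions, not necessarily the project's choice
    return create_engine(f"mysql+pymysql://{username}:{password}@{host}/{db_name}")


engine = sloleks_engine("luka:akul:superdb_small:127.0.0.1")
```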
@@ -135,6 +135,7 @@ class Component:
         # matches for every component in links from this component
         to_ret = []
+
 
         # need to get all links that match
         for next, link, order in self.next_element:
             next_links = word.get_links(link)
@@ -146,6 +147,9 @@ class Component:
             if not order.match(word, next_word):
                 continue
 
+            if word.lemma == 'aktivirati' and next_word.text == 'potomcih':
+                a = 0
+
             match = next.match(next_word)
 
             if match is not None:
@@ -82,7 +82,7 @@ class AllFormatter(Formatter):
         word = words[idx]
         return [word.id, word.text, word.lemma, word.msd]
 
-    def content_right(self, _freq):
+    def content_right(self, _freq, variable_word_order=None):
         return []
 
     def group(self):
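Giving `content_right` a defaulted `variable_word_order=None` parameter keeps old call sites working while allowing newer callers to pass a word order. A standalone sketch of that pattern (class name and return values are illustrative, not the project's):

```python
class ExampleFormatter:
    def content_right(self, freq, variable_word_order=None):
        # old callers keep passing only the frequency
        if variable_word_order is None:
            return [str(freq)]
        # newer callers can additionally pass the order of the variable component
        return [str(freq), "-".join(str(i) for i in variable_word_order)]


print(ExampleFormatter().content_right(42))          # ['42']
print(ExampleFormatter().content_right(42, [2, 1]))  # ['42', '2-1']
```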
@@ -8,7 +8,7 @@ def get_lemma_features(et):
 
     result = {}
     for pos in lf.iter('POS'):
-        rgx_list = MorphologyRegex(pos).rgx
+        rgx_list = MorphologyRegex(pos).rgxs[0]
         rgx_str = ""
         for position in rgx_list:
             if position == ".":
@@ -10,7 +10,10 @@ class RestrictionType(Enum):
     MatchAll = 2
 
 
-def determine_ppb(rgx):
+def determine_ppb(rgxs):
+    if len(rgxs) != 1:
+        return 0
+    rgx = rgxs[0]
     if rgx[0] in ("A", "N", "R"):
         return 0
     elif rgx[0] == "V":
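With the list-based signature, `determine_ppb` only ranks restrictions that expand to a single POS alternative; anything else falls back to 0. A minimal self-contained illustration of that contract (the verb branch returns a placeholder value here, since its real return value lies outside this hunk):

```python
def determine_ppb_sketch(rgxs):
    if len(rgxs) != 1:
        return 0          # several (or zero) POS alternatives: no single POS to rank
    rgx = rgxs[0]
    if rgx[0] in ("A", "N", "R"):
        return 0
    elif rgx[0] == "V":
        return 1          # placeholder; the real value is not shown in the diff
    return 0


print(determine_ppb_sketch([["N", ".", "."]]))         # 0
print(determine_ppb_sketch([["N", "."], ["V", "."]]))  # 0, more than one alternative
```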
@@ -27,7 +30,7 @@ def determine_ppb(rgx):
 
 class MorphologyRegex:
     def __init__(self, restriction):
-        self.min_msd_length = 1
+        # self.min_msd_length = 1
 
         restr_dict = {}
         for feature in restriction:
@@ -44,45 +47,75 @@ class MorphologyRegex:
             restr_dict[key] = (value, match_type)
 
         assert 'POS' in restr_dict
-        category = restr_dict['POS'][0].capitalize()
-        cat_code = CODES[category]
-        rgx = [cat_code] + ['.'] * 10
+
+        # handle multiple word types
+        if '|' in restr_dict['POS'][0]:
+            categories = restr_dict['POS'][0].split('|')
+        else:
+            categories = [restr_dict['POS'][0]]
+
+        self.rgxs = []
+        self.re_objects = []
+        self.min_msd_lengths = []
 
         del restr_dict['POS']
 
-        for attribute, (value, typ) in restr_dict.items():
-            index = TAGSET[cat_code].index(attribute.lower())
-            assert index >= 0
-
-            if '|' in value:
-                match = "".join(CODES[val] for val in value.split('|'))
-            else:
-                match = CODES[value]
-
-            match = "[{}{}]".format("" if typ else "^", match)
-            rgx[index + 1] = match
-
-            if typ:
-                self.min_msd_length = max(index + 1, self.min_msd_length)
-
-        # strip rgx
-        for i in reversed(range(len(rgx))):
-            if rgx[i] == '.':
-                rgx = rgx[:-1]
-            else:
-                break
-
-        self.re_objects = [re.compile(r) for r in rgx]
-        self.rgx = rgx
+        for category in categories:
+            min_msd_length = 1
+            category = category.capitalize()
+            cat_code = CODES[category]
+            rgx = [cat_code] + ['.'] * 10
+
+            for attribute, (value, typ) in restr_dict.items():
+                if attribute.lower() not in TAGSET[cat_code]:
+                    continue
+                index = TAGSET[cat_code].index(attribute.lower())
+                assert index >= 0
+
+                if '|' in value:
+                    match = "".join(CODES[val] for val in value.split('|'))
+                else:
+                    match = CODES[value]
+
+                match = "[{}{}]".format("" if typ else "^", match)
+                rgx[index + 1] = match
+
+                if typ:
+                    min_msd_length = max(index + 1, min_msd_length)
+
+            # strip rgx
+            for i in reversed(range(len(rgx))):
+                if rgx[i] == '.':
+                    rgx = rgx[:-1]
+                else:
+                    break
+
+            self.re_objects.append([re.compile(r) for r in rgx])
+            self.rgxs.append(rgx)
+            self.min_msd_lengths.append(min_msd_length)
+
+        # self.re_objects = [re.compile(r) for r in rgx]
+        # self.rgx = rgx
 
     def __call__(self, text):
-        if len(text) <= self.min_msd_length:
-            return False
-
-        for c, r in zip(text, self.re_objects):
-            if not r.match(c):
-                return False
-        return True
+        # if len(text) <= self.min_msd_length:
+        #     return False
+        # if len(self.rgxs[0]) > 1 and len(self.rgxs) > 1:
+        #     a = 1
+        for i, re_object in enumerate(self.re_objects):
+            if len(text) <= self.min_msd_lengths[i]:
+                continue
+            match = True
+
+            for c, r in zip(text, re_object):
+                if not r.match(c):
+                    match = False
+                    break
+            if match:
+                return True
+        return False
 
 
 class LexisRegex:
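The reworked `MorphologyRegex` keeps one list of per-position regexes per POS alternative and reports a match as soon as any alternative fits the MSD tag. The standalone sketch below mirrors that matching loop with made-up codes and lengths; it does not use the project's CODES or TAGSET tables:

```python
import re

# one per-position regex list per POS alternative (codes are made up)
alternatives = [
    [re.compile(c) for c in ("N", ".", "s")],  # e.g. a noun alternative
    [re.compile(c) for c in ("V", ".")],       # e.g. a verb alternative
]
min_msd_lengths = [1, 1]


def msd_matches(msd):
    # a tag matches if every checked position of at least one alternative fits
    for i, alt in enumerate(alternatives):
        if len(msd) <= min_msd_lengths[i]:
            continue
        if all(r.match(c) for c, r in zip(msd, alt)):
            return True
    return False


print(msd_matches("Ncs"))  # True: the noun alternative fits
print(msd_matches("Afp"))  # False: no alternative fits
```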
@@ -111,7 +144,7 @@ class Restriction:
         if restriction_type == "morphology":
             self.type = RestrictionType.Morphology
             self.matcher = MorphologyRegex(list(restriction_tag))
-            self.ppb = determine_ppb(self.matcher.rgx)
+            self.ppb = determine_ppb(self.matcher.rgxs)
 
         elif restriction_type == "lexis":
             self.type = RestrictionType.Lexis
@@ -134,7 +134,7 @@ if __name__ == '__main__':
                         action='store_true')
 
     parser.add_argument('--load-sloleks',
-                        help='Tells weather sloleks is loaded into memory at the beginning of processing or not.',
+                        help='Tells whether sloleks is loaded into memory at the beginning of processing or not. Should be in',
                         action='store_true')
 
     parser.add_argument('--sort-by',
@@ -36,7 +36,9 @@ class Word:
         self.lemma = lemma
         self.msd = MSD_TRANSLATE[msd] if do_msd_translate else msd
         self.id = wid
+        self.idi = None
         self.text = text
+        self.glue = ''
 
         self.links = defaultdict(list)
 