Quick fix for missing dispersions

This commit is contained in:
lkrsnik 2020-07-24 10:06:54 +02:00
parent f330a37764
commit 49a8d5123e
4 changed files with 24 additions and 3 deletions

1
.gitignore vendored
View File

@ -7,6 +7,7 @@
.vscode
__pycache__
run.sh
prev
old
data

1
run.sh
View File

@ -1 +0,0 @@
pypy3 src/wani.py data/Kolokacije_strukture_JOS-32-representation_3D_08_1.xml data/input --out data/output --sloleks_db '<PUT DB CREDENTIALS HERE!>' --collocation_sentence_map_dest data/collocation_sentence_mapper --db /mnt/tmp/mysql-wani --multiple-output --load-sloleks

1
run.sh.example Executable file
View File

@ -0,0 +1 @@
pypy3 src/wani.py data/Kolokacije_strukture_JOS-32-representation_3D_08_1.xml data/input --out data/output --sloleks_db '<sloleks db data>' --collocation_sentence_map_dest data/collocation-sentence-mapper --db /mnt/tmp/mysql-wani --multiple-output --load-sloleks

View File

@ -151,7 +151,27 @@ class StatsFormatter(Formatter):
word = words[idx]
key = (sidx, idx, word.lemma)
# try to fix missing dispersions
if key not in self.colocation_ids.dispersions:
if word.lemma == 'k':
new_key = (sidx, idx, 'h')
elif word.lemma == 'h':
new_key = (sidx, idx, 'k')
elif word.lemma == 's':
new_key = (sidx, idx, 'z')
elif word.lemma == 'z':
new_key = (sidx, idx, 's')
else:
new_key = (sidx, idx, '')
if new_key in self.colocation_ids.dispersions:
key = new_key
print('Dispersions fixed.')
else:
print('Dispersions not fixed.')
if key in self.colocation_ids.dispersions:
distribution = self.colocation_ids.dispersions[key]
else:
distribution = 1
return [self.stat_str(distribution)]
def content_right(self, freq):