diff --git a/.gitignore b/.gitignore index 6ac65c6..3452d0f 100644 --- a/.gitignore +++ b/.gitignore @@ -7,6 +7,7 @@ .vscode __pycache__ +run.sh prev old data diff --git a/run.sh b/run.sh deleted file mode 100755 index 7137826..0000000 --- a/run.sh +++ /dev/null @@ -1 +0,0 @@ -pypy3 src/wani.py data/Kolokacije_strukture_JOS-32-representation_3D_08_1.xml data/input --out data/output --sloleks_db '' --collocation_sentence_map_dest data/collocation_sentence_mapper --db /mnt/tmp/mysql-wani --multiple-output --load-sloleks \ No newline at end of file diff --git a/run.sh.example b/run.sh.example new file mode 100755 index 0000000..85561d0 --- /dev/null +++ b/run.sh.example @@ -0,0 +1 @@ +pypy3 src/wani.py data/Kolokacije_strukture_JOS-32-representation_3D_08_1.xml data/input --out data/output --sloleks_db '' --collocation_sentence_map_dest data/collocation-sentence-mapper --db /mnt/tmp/mysql-wani --multiple-output --load-sloleks diff --git a/src/formatter.py b/src/formatter.py index 85adc6d..ad11c65 100644 --- a/src/formatter.py +++ b/src/formatter.py @@ -151,7 +151,27 @@ class StatsFormatter(Formatter): word = words[idx] key = (sidx, idx, word.lemma) - distribution = self.colocation_ids.dispersions[key] + # try to fix missing dispersions + if key not in self.colocation_ids.dispersions: + if word.lemma == 'k': + new_key = (sidx, idx, 'h') + elif word.lemma == 'h': + new_key = (sidx, idx, 'k') + elif word.lemma == 's': + new_key = (sidx, idx, 'z') + elif word.lemma == 'z': + new_key = (sidx, idx, 's') + else: + new_key = (sidx, idx, '') + if new_key in self.colocation_ids.dispersions: + key = new_key + print('Dispersions fixed.') + else: + print('Dispersions not fixed.') + if key in self.colocation_ids.dispersions: + distribution = self.colocation_ids.dispersions[key] + else: + distribution = 1 return [self.stat_str(distribution)] def content_right(self, freq): @@ -203,4 +223,4 @@ class OutFormatter(Formatter): self.f2.new_match(match) def __str__(self): - return "out" \ No newline at end of file + return "out"