From b2baedca52e89fa7fc8c03353f5daadc768520a7 Mon Sep 17 00:00:00 2001 From: Ozbolt Menegatti Date: Sat, 8 Jun 2019 11:18:49 +0200 Subject: [PATCH] determining dispersions --- wani.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/wani.py b/wani.py index a4c701c..985f3aa 100644 --- a/wani.py +++ b/wani.py @@ -1168,6 +1168,7 @@ class ColocationIds: def __init__(self): self.data = {} self.min_frequency = args.min_freq + self.dispersions = {} def _add_match(self, key, sid, match): if key not in self.data: @@ -1198,6 +1199,13 @@ class ColocationIds: for _1, sm in tqdm(self.data.items()): ComponentRendition.set_representations(sm, components_dict[sm.structure_id], word_renderer) idx += 1 + + def determine_colocation_dispersions(self): + dispersions = defaultdict(int) + for (structure_id, *word_tups) in self.data.keys(): + for component_id, lemma in word_tups: + dispersions[(structure_id, component_id, lemma)] += 1 + self.dispersions = dict(dispersions) def match_file(words, structures): @@ -1273,6 +1281,7 @@ def main(input_file, structures_file, args): # get word renders for lemma/msd word_renderer.generate_renders() + colocation_ids.determine_colocation_dispersions() if args.output: # figure out representations!