From 1d4c0238a66d0d5ef9750cb82e1d0028b9a8814c Mon Sep 17 00:00:00 2001 From: Ozbolt Menegatti Date: Wed, 6 Nov 2019 02:39:26 +0100 Subject: [PATCH] fixing how min_freq is used and more verbose writer --- src/match_store.py | 10 +++++++++- src/writer.py | 2 +- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/src/match_store.py b/src/match_store.py index 2eb189d..378278f 100644 --- a/src/match_store.py +++ b/src/match_store.py @@ -9,6 +9,7 @@ class MatchStore: def __init__(self, args, db): self.db = db self.dispersions = {} + self.min_freq = args.min_freq self.db.init("""CREATE TABLE Colocations ( @@ -117,6 +118,10 @@ class MatchStore: self.db.step_is_done(step_name) + def has_colocation_id_enough_frequency(self, colocation_id): + matches = self.db.execute("SELECT MIN(MAX(COUNT(*), ?), ?) FROM ColocationMatches WHERE mid_colocation_id=?", (self.min_freq - 1, self.min_freq, colocation_id)).fetchone()[0] + return matches >= self.min_freq + def determine_colocation_dispersions(self): step_name = 'dispersions' if self.db.is_step_done(step_name): @@ -124,7 +129,10 @@ class MatchStore: return dispersions = defaultdict(int) - for structure_id, word_tups_str in progress(self.db.execute("SELECT structure_id, key FROM Colocations"), "dispersion"): + for colocation_id, structure_id, word_tups_str in progress(self.db.execute("SELECT colocation_id, structure_id, key FROM Colocations"), "dispersion"): + if not self.has_colocation_id_enough_frequency(colocation_id): + continue + word_tups = literal_eval(word_tups_str) for component_id, lemma in word_tups: dispersions[(str(structure_id), component_id, lemma)] += 1 diff --git a/src/writer.py b/src/writer.py index 9e3e96d..8695de4 100644 --- a/src/writer.py +++ b/src/writer.py @@ -77,7 +77,7 @@ class Writer: rows = [] components = structure.components - for match in colocation_ids.get_matches_for(structure): + for match in progress(colocation_ids.get_matches_for(structure), "Writing matches: {}".format(structure.id)): if len(match) < self.min_frequency: continue