sql-join-test #1

Manually merged
ozbolt merged 4 commits from sql-join-test into master 2020-03-02 19:12:38 +00:00
2 changed files with 10 additions and 2 deletions
Showing only changes of commit 1d4c0238a6 - Show all commits

View File

@ -9,6 +9,7 @@ class MatchStore:
def __init__(self, args, db): def __init__(self, args, db):
self.db = db self.db = db
self.dispersions = {} self.dispersions = {}
self.min_freq = args.min_freq
self.db.init("""CREATE TABLE Colocations ( self.db.init("""CREATE TABLE Colocations (
@ -117,6 +118,10 @@ class MatchStore:
self.db.step_is_done(step_name) self.db.step_is_done(step_name)
def has_colocation_id_enough_frequency(self, colocation_id):
    """Return True when the colocation has at least ``self.min_freq`` matches.

    Counts the rows of ``ColocationMatches`` whose ``mid_colocation_id``
    equals ``colocation_id`` and compares that count with ``self.min_freq``.

    Args:
        colocation_id: value matched against ``mid_colocation_id``.

    Returns:
        bool: whether the number of matching rows reaches ``self.min_freq``.
    """
    # Only the comparison against the threshold matters, so the inner
    # SELECT is capped with LIMIT: the counted value is
    # min(actual count, min_freq), which gives the same answer as a full
    # COUNT(*) without scanning every match of a frequent colocation.
    # NOTE(review): assumes self.min_freq is an integer (it is bound to
    # LIMIT) -- confirm against the argument parser.
    capped_count = self.db.execute(
        "SELECT COUNT(*) FROM ("
        "SELECT 1 FROM ColocationMatches WHERE mid_colocation_id=? LIMIT ?"
        ")",
        (colocation_id, self.min_freq),
    ).fetchone()[0]
    return capped_count >= self.min_freq
def determine_colocation_dispersions(self): def determine_colocation_dispersions(self):
step_name = 'dispersions' step_name = 'dispersions'
if self.db.is_step_done(step_name): if self.db.is_step_done(step_name):
@ -124,7 +129,10 @@ class MatchStore:
return return
dispersions = defaultdict(int) dispersions = defaultdict(int)
for structure_id, word_tups_str in progress(self.db.execute("SELECT structure_id, key FROM Colocations"), "dispersion"): for colocation_id, structure_id, word_tups_str in progress(self.db.execute("SELECT colocation_id, structure_id, key FROM Colocations"), "dispersion"):
if not self.has_colocation_id_enough_frequency(colocation_id):
continue
word_tups = literal_eval(word_tups_str) word_tups = literal_eval(word_tups_str)
for component_id, lemma in word_tups: for component_id, lemma in word_tups:
dispersions[(str(structure_id), component_id, lemma)] += 1 dispersions[(str(structure_id), component_id, lemma)] += 1

View File

@ -77,7 +77,7 @@ class Writer:
rows = [] rows = []
components = structure.components components = structure.components
for match in colocation_ids.get_matches_for(structure): for match in progress(colocation_ids.get_matches_for(structure), "Writing matches: {}".format(structure.id)):
if len(match) < self.min_frequency: if len(match) < self.min_frequency:
continue continue