Adding dispersions to SQLite; finished moving to it.

This commit is contained in:
Ozbolt Menegatti 2019-06-27 22:04:33 +02:00
parent b5e281bdf4
commit 8c20295adf
2 changed files with 11 additions and 13 deletions

View File

@ -2,7 +2,7 @@ from word import Word
class StructureMatch: class StructureMatch:
def __init__(self, match_id, structure): def __init__(self, match_id, structure):
self.match_id = match_id self.match_id = str(match_id)
self.structure = structure self.structure = structure
self.matches = [] self.matches = []
@ -10,7 +10,7 @@ class StructureMatch:
@staticmethod @staticmethod
def from_db(db, colocation_id, structure): def from_db(db, colocation_id, structure):
result = StructureMatch(str(colocation_id), structure) result = StructureMatch(colocation_id, structure)
for match_id in db.execute("SELECT mid_match_id FROM ColocationMatches WHERE mid_colocation_id=?", (colocation_id,)): for match_id in db.execute("SELECT mid_match_id FROM ColocationMatches WHERE mid_colocation_id=?", (colocation_id,)):
to_add = {} to_add = {}

View File

@ -1,4 +1,5 @@
from collections import defaultdict from collections import defaultdict
from ast import literal_eval
from match import StructureMatch from match import StructureMatch
from representation_assigner import RepresentationAssigner from representation_assigner import RepresentationAssigner
@ -7,14 +8,13 @@ from progress_bar import progress
class MatchStore: class MatchStore:
def __init__(self, args, db): def __init__(self, args, db):
self.db = db self.db = db
self.data = {}
self.min_frequency = args.min_freq self.min_frequency = args.min_freq
self.dispersions = {} self.dispersions = {}
self.match_num = 0 self.match_num = 0
self.db.init("""CREATE TABLE Colocations ( self.db.init("""CREATE TABLE Colocations (
colocation_id INTEGER PRIMARY KEY, colocation_id INTEGER PRIMARY KEY,
structure_id INTEGER, structure_id varchar(8),
key varchar(256)) key varchar(256))
""") """)
self.db.init("""CREATE TABLE Matches ( self.db.init("""CREATE TABLE Matches (
@ -73,24 +73,20 @@ class MatchStore:
self.match_num += 1 self.match_num += 1
if key not in self.data:
self.data[key] = StructureMatch(str(len(self.data) + 1), structure)
self.data[key].append(match)
def add_matches(self, matches): def add_matches(self, matches):
for structure, nms in matches.items(): for structure, nms in progress(matches.items(), 'adding-matches'):
for nm in nms: for nm in nms:
self._add_match(nm[1], structure, nm[0]) self._add_match(nm[1], structure, nm[0])
def get_matches_for(self, structure): def get_matches_for(self, structure):
print(structure.id)
for cid in self.db.execute("SELECT colocation_id FROM Colocations WHERE structure_id=?", for cid in self.db.execute("SELECT colocation_id FROM Colocations WHERE structure_id=?",
(structure.id,)): (structure.id,)):
yield StructureMatch.from_db(self.db, cid[0], structure) yield StructureMatch.from_db(self.db, cid[0], structure)
def set_representations(self, word_renderer, structures): def set_representations(self, word_renderer, structures):
structures_dict = {s.id: s for s in structures}
for cid, sid in progress(self.db.execute("SELECT colocation_id, structure_id FROM Colocations"), "representations"): for cid, sid in progress(self.db.execute("SELECT colocation_id, structure_id FROM Colocations"), "representations"):
structure = structures[sid - 1] structure = structures_dict[sid]
match = StructureMatch.from_db(self.db, cid, structure) match = StructureMatch.from_db(self.db, cid, structure)
RepresentationAssigner.set_representations(match, word_renderer) RepresentationAssigner.set_representations(match, word_renderer)
for component_id, text in match.representations.items(): for component_id, text in match.representations.items():
@ -101,7 +97,9 @@ class MatchStore:
def determine_colocation_dispersions(self): def determine_colocation_dispersions(self):
dispersions = defaultdict(int) dispersions = defaultdict(int)
for (structure_id, *word_tups) in self.data.keys(): for structure_id, word_tups_str in self.db.execute("SELECT structure_id, key FROM Colocations"):
word_tups = literal_eval(word_tups_str)
for component_id, lemma in word_tups: for component_id, lemma in word_tups:
dispersions[(structure_id, component_id, lemma)] += 1 dispersions[(str(structure_id), component_id, lemma)] += 1
self.dispersions = dict(dispersions) self.dispersions = dict(dispersions)