2019-06-15 16:55:35 +00:00
from collections import defaultdict
2019-06-27 20:04:33 +00:00
from ast import literal_eval
2019-06-15 16:55:35 +00:00
from match import StructureMatch
from representation_assigner import RepresentationAssigner
2019-06-17 15:30:51 +00:00
from progress_bar import progress
2019-06-15 16:55:35 +00:00
class MatchStore:
    """Store structure matches, their representations and lemma dispersions.

    Everything is persisted through the database wrapper handed to the
    constructor.  The wrapper is expected to provide ``init(sql)``,
    ``execute(sql[, params])``, ``is_step_done(name)`` and
    ``step_is_done(name)``; this class relies on nothing else from it.
    """

    # Number of matches buffered before their representations are written out.
    _REPRESENTATION_BATCH_SIZE = 1000

    def __init__(self, args, db):
        """Declare the schema and pick the next free match id.

        Args:
            args: Parsed options; only ``args.min_freq`` is read.
            db: Database wrapper (see the class docstring).
        """
        self.db = db
        self.dispersions = {}
        self.min_freq = args.min_freq

        # NOTE(review): the schema goes through ``db.init`` rather than
        # ``db.execute``; presumably the wrapper skips these statements for an
        # already initialised database -- confirm against the wrapper.
        self.db.init("""CREATE TABLE Colocations (
            colocation_id INTEGER PRIMARY KEY,
            structure_id varchar(8),
            key varchar(256))
            """)
        self.db.init("""CREATE TABLE Matches (
            match_id INTEGER,
            component_id INTEGER NOT NULL,
            word_lemma varchar(32) NOT NULL,
            word_id varchar(32) NOT NULL,
            word_msd varchar(16) NOT NULL,
            word_text varchar(32) NOT NULL)
            """)
        self.db.init("""CREATE TABLE ColocationMatches (
            mid_match_id INTEGER,
            mid_colocation_id INTEGER,
            FOREIGN KEY(mid_colocation_id) REFERENCES Colocations(colocation_id),
            FOREIGN KEY(mid_match_id) REFERENCES Matches(match_id))
            """)
        self.db.init("""CREATE TABLE Representations (
            colocation_id INTEGER,
            component_id INTEGER,
            text varchar(32),
            FOREIGN KEY(colocation_id) REFERENCES Colocations(colocation_id))
            """)
        self.db.init("""CREATE TABLE Dispersions (
            structure_id varchar(64),
            component_id varchar(64),
            lemma varchar(128),
            dispersion INTEGER)
            """)

        self.db.init("CREATE INDEX key_sid_c ON Colocations (key, structure_id)")
        self.db.init("CREATE INDEX sid_c ON Colocations (structure_id)")
        self.db.init("CREATE INDEX mmid_cm ON ColocationMatches (mid_colocation_id)")
        self.db.init("CREATE INDEX mid_m ON Matches (match_id)")
        self.db.init("CREATE INDEX col_r ON Representations (colocation_id)")
        self.db.init("CREATE INDEX disp_key ON Dispersions (structure_id, component_id, lemma)")

        # Continue numbering after the highest stored match id, if any.
        match_num = self.db.execute("SELECT MAX(match_id) FROM Matches").fetchone()[0]
        self.match_num = 0 if match_num is None else match_num + 1

    def _add_match(self, key, structure, match):
        """Store one match and link it to its colocation.

        Args:
            key: Sequence whose first item is the structure id; the remaining
                items identify the colocation and are stored as ``str(key[1:])``.
            structure: Unused here; kept so the call signature stays stable.
            match: Mapping of component id to a word object exposing
                ``lemma``, ``msd``, ``text`` and ``id``.
        """
        structure_id, key_str = key[0], str(key[1:])
        lookup_sql = "SELECT colocation_id FROM Colocations WHERE key=? AND structure_id=?"
        lookup_params = (key_str, structure_id)

        cid = self.db.execute(lookup_sql, lookup_params).fetchone()
        if cid is None:
            # First match for this colocation: create it, then read back its id.
            self.db.execute("INSERT INTO Colocations (structure_id, key) VALUES (?,?)",
                            (structure_id, key_str))
            cid = self.db.execute(lookup_sql, lookup_params).fetchone()

        # One Matches row per component; all rows share the same match id.
        for component_id, word in match.items():
            self.db.execute("""
                INSERT INTO Matches (match_id, component_id, word_lemma, word_text, word_msd, word_id)
                VALUES (:match_id, :component_id, :word_lemma, :word_text, :word_msd, :word_id)""", {
                "component_id": component_id,
                "match_id": self.match_num,
                "word_lemma": word.lemma,
                "word_msd": word.msd,
                "word_text": word.text,
                "word_id": word.id,
            })

        self.db.execute("INSERT INTO ColocationMatches (mid_colocation_id, mid_match_id) VALUES (?,?)",
                        (cid[0], self.match_num))
        self.match_num += 1

    def add_matches(self, matches):
        """Store every match from ``matches``.

        Args:
            matches: Mapping of structure to an iterable of ``(match, key)``
                pairs, in that order.
        """
        for structure, nms in progress(matches.items(), 'adding-matches'):
            for nm in nms:
                self._add_match(nm[1], structure, nm[0])

    def get_matches_for(self, structure):
        """Yield a ``StructureMatch`` for each stored colocation of ``structure``."""
        rows = self.db.execute("SELECT colocation_id FROM Colocations WHERE structure_id=?",
                               (structure.id,))
        for cid in rows:
            yield StructureMatch.from_db(self.db, cid[0], structure)

    def _store_representations(self, matches):
        """Write the representations of the given matches to the database."""
        for match in matches:
            for component_id, text in match.representations.items():
                self.db.execute("""
                    INSERT INTO Representations (colocation_id, component_id, text)
                    VALUES (?,?,?)""", (match.match_id, component_id, text))

    def set_representations(self, word_renderer, structures):
        """Compute and store representations for all colocations.

        Does nothing except print a notice when the 'representation' step is
        already recorded as done.

        Args:
            word_renderer: Passed through to
                ``RepresentationAssigner.set_representations``.
            structures: Iterable of structures; each must expose ``id`` and
                every ``structure_id`` in ``Colocations`` must be among them.

        Raises:
            KeyError: If a stored colocation refers to an unknown structure id.
        """
        step_name = 'representation'
        if self.db.is_step_done(step_name):
            print("Representation step already done, skipping")
            return

        structures_by_id = {s.id: s for s in structures}
        total = int(self.db.execute("SELECT Count(*) FROM Colocations").fetchone()[0])
        rows = self.db.execute("SELECT colocation_id, structure_id FROM Colocations")

        pending = []
        for cid, sid in progress(rows, "representations", total=total):
            match = StructureMatch.from_db(self.db, cid, structures_by_id[sid])
            RepresentationAssigner.set_representations(match, word_renderer)

            pending.append(match)
            if len(pending) >= self._REPRESENTATION_BATCH_SIZE:
                self._store_representations(pending)
                pending = []

        # Write the last, partially filled batch; without this the trailing
        # matches (all of them for small inputs) would never be stored.
        self._store_representations(pending)

        self.db.step_is_done(step_name)

    def has_colocation_id_enough_frequency(self, colocation_id):
        """Return True if the colocation has at least ``min_freq`` matches."""
        # The query clamps the count into [min_freq - 1, min_freq], so the
        # result equals min_freq exactly when the real count reaches it.
        matches = self.db.execute(
            "SELECT MIN(MAX(COUNT(*), ?), ?) FROM ColocationMatches WHERE mid_colocation_id=?",
            (self.min_freq - 1, self.min_freq, colocation_id)).fetchone()[0]
        return matches >= self.min_freq

    def determine_colocation_dispersions(self):
        """Fill ``self.dispersions``, computing and storing them if needed.

        When the 'dispersions' step is already done the values are loaded from
        the database.  Otherwise, for each ``(structure_id, component_id,
        lemma)`` this counts the sufficiently frequent colocations containing
        it, stores the result and marks the step as done.
        """
        step_name = 'dispersions'
        if self.db.is_step_done(step_name):
            self.load_dispersions()
            return

        dispersions = defaultdict(int)
        rows = self.db.execute("SELECT colocation_id, structure_id, key FROM Colocations")
        for colocation_id, structure_id, word_tups_str in progress(rows, "dispersion"):
            if not self.has_colocation_id_enough_frequency(colocation_id):
                continue

            # The key column holds ``str(key[1:])`` as written by _add_match,
            # so literal_eval turns it back into the component/lemma pairs.
            word_tups = literal_eval(word_tups_str)
            for component_id, lemma in word_tups:
                dispersions[(str(structure_id), component_id, lemma)] += 1

        self.dispersions = dict(dispersions)

        print("Storing dispersions...")
        self.store_dispersions()

        self.db.step_is_done(step_name)

    def store_dispersions(self):
        """Insert every entry of ``self.dispersions`` into ``Dispersions``."""
        for (structure_id, component_id, lemma), disp in self.dispersions.items():
            self.db.execute(
                "INSERT INTO Dispersions (structure_id, component_id, lemma, dispersion) VALUES (?, ?, ?, ?)",
                (structure_id, component_id, lemma, disp))

    def load_dispersions(self):
        """Replace ``self.dispersions`` with the contents of ``Dispersions``."""
        # NOTE(review): component_id is declared varchar, so on SQLite a
        # non-string id would come back as text and differ from the freshly
        # computed key -- confirm that component ids are always strings.
        self.dispersions = {}
        # Columns are named explicitly so the unpacking below does not depend
        # on the physical column order of the table.
        rows = self.db.execute(
            "SELECT structure_id, component_id, lemma, dispersion FROM Dispersions")
        for structure_id, component_id, lemma, dispersion in progress(rows, "load-dispersions"):
            self.dispersions[structure_id, component_id, lemma] = dispersion