import gc
from collections import defaultdict
from ast import literal_eval
from time import time
import logging
from luscenje_struktur . match import StructureMatch
from luscenje_struktur . representation_assigner import RepresentationAssigner
from luscenje_struktur . progress_bar import progress
class MatchStore:
    """Database-backed store for structure matches and derived statistics.

    The store talks to a ``db`` wrapper object through four methods that are
    called in this class: ``init(sql)``, ``execute(sql, params)``,
    ``is_step_done(name)`` and ``step_is_done(name)``.  The result of
    ``execute`` is used like a DB-API cursor (``fetchone()`` and iteration).

    Tables created on construction:

    * ``Colocations``        - one row per (structure_id, key) pair
    * ``Matches``            - one row per component word of a match
    * ``ColocationMatches``  - link table between the two above
    * ``Representations``    - rendered text/msd per colocation component
    * ``Dispersions``        - per (structure, component, lemma) counters
    """

    # Schema statements, executed in this order by ``__init__``.  The SQL text
    # is assembled with explicit newlines so each statement stays one literal.
    _SCHEMA = (
        " CREATE TABLE Colocations (\n"
        "colocation_id INTEGER PRIMARY KEY ,\n"
        "structure_id varchar ( 8 ) ,\n"
        "key varchar ( 256 ) )\n",

        " CREATE TABLE Matches (\n"
        "match_id INTEGER ,\n"
        "component_id INTEGER NOT NULL ,\n"
        "word_lemma varchar ( 32 ) NOT NULL ,\n"
        "word_id varchar ( 32 ) NOT NULL ,\n"
        "word_msd varchar ( 16 ) NOT NULL ,\n"
        "word_text varchar ( 32 ) NOT NULL )\n",

        " CREATE TABLE ColocationMatches (\n"
        "mid_match_id INTEGER ,\n"
        "mid_colocation_id INTEGER ,\n"
        "FOREIGN KEY ( mid_colocation_id ) REFERENCES Colocations ( colocation_id ) ,\n"
        "FOREIGN KEY ( mid_match_id ) REFERENCES Matches ( match_id ) )\n",

        " CREATE TABLE Representations (\n"
        "colocation_id INTEGER ,\n"
        "component_id INTEGER ,\n"
        "text varchar ( 32 ) ,\n"
        "msd varchar ( 32 ) ,\n"
        "FOREIGN KEY ( colocation_id ) REFERENCES Colocations ( colocation_id ) )\n",

        " CREATE TABLE Dispersions (\n"
        "structure_id varchar ( 64 ) ,\n"
        "component_id varchar ( 64 ) ,\n"
        "lemma varchar ( 128 ) ,\n"
        "dispersion INTEGER )\n",

        " CREATE INDEX key_sid_c ON Colocations (key, structure_id) ",
        " CREATE INDEX sid_c ON Colocations (structure_id) ",
        " CREATE INDEX mmid_cm ON ColocationMatches (mid_colocation_id) ",
        " CREATE INDEX mid_m ON Matches (match_id) ",
        " CREATE INDEX col_r ON Representations (colocation_id) ",
        " CREATE INDEX disp_key ON Dispersions (structure_id, component_id, lemma) ",
    )

    def __init__(self, args, db):
        """Set up the schema and resume match numbering.

        Args:
            args: object providing ``min_freq``, the minimum number of matches
                a colocation needs to be counted in the dispersions.
            db: database wrapper (see class docstring).
        """
        self.db = db
        self.dispersions = {}
        self.min_freq = args.min_freq

        for statement in self._SCHEMA:
            self.db.init(statement)

        # Continue numbering after the highest stored match id, or start at 0
        # when the Matches table is empty (MAX() yields NULL / None).
        highest = self.db.execute(" SELECT MAX(match_id) FROM Matches ").fetchone()[0]
        self.match_num = 0 if highest is None else highest + 1

    def _find_colocation(self, structure_id, key_str):
        """Return the ``(colocation_id,)`` row for the pair, or ``None``."""
        return self.db.execute(
            " SELECT colocation_id FROM Colocations WHERE key=? AND structure_id=? ",
            (key_str, structure_id),
        ).fetchone()

    def _add_match(self, key, structure, match):
        """Store one match and link it to its colocation.

        Args:
            key: sequence whose first item is the structure id; the remaining
                items are stringified and used as the colocation key.
            structure: not used by this method; kept for the call signature.
            match: mapping ``component_id -> word`` where each word provides
                ``lemma``, ``msd``, ``text`` and ``id`` attributes.
        """
        structure_id, key_str = key[0], str(key[1:])

        cid = self._find_colocation(structure_id, key_str)
        if cid is None:
            self.db.execute(
                " INSERT INTO Colocations (structure_id, key) VALUES (?,?) ",
                (structure_id, key_str),
            )
            # Read the row back to learn the id that was assigned to it.
            cid = self._find_colocation(structure_id, key_str)

        # Named placeholders must be written ``:name`` (no whitespace after
        # the colon) and the binding keys must be exactly those names.
        insert_word = (
            "\n"
            "INSERT INTO Matches ( match_id , component_id , word_lemma , word_text , word_msd , word_id )\n"
            "VALUES ( :match_id , :component_id , :word_lemma , :word_text , :word_msd , :word_id ) "
        )
        for component_id, word in match.items():
            self.db.execute(insert_word, {
                "component_id": component_id,
                "match_id": self.match_num,
                "word_lemma": word.lemma,
                "word_msd": word.msd,
                "word_text": word.text,
                "word_id": word.id,
            })

        self.db.execute(
            " INSERT INTO ColocationMatches (mid_colocation_id, mid_match_id) VALUES (?,?) ",
            (cid[0], self.match_num),
        )
        self.match_num += 1

    def add_matches(self, matches):
        """Store every match from ``matches``.

        Args:
            matches: mapping ``structure -> iterable of pairs`` where each
                pair is ``(match, key)`` as accepted by ``_add_match``.
        """
        for structure, nms in progress(matches.items(), ' adding-matches '):
            for nm in nms:
                self._add_match(nm[1], structure, nm[0])

    def get_matches_for(self, structure):
        """Yield a ``StructureMatch`` for each colocation of ``structure``.

        Colocations are selected by ``structure.id``; each one is built with
        ``StructureMatch.from_db``.
        """
        rows = self.db.execute(
            " SELECT colocation_id FROM Colocations WHERE structure_id=? ",
            (structure.id,),
        )
        for cid in rows:
            yield StructureMatch.from_db(self.db, cid[0], structure)

    def add_inserts(self, inserts):
        """Write the representations of the given matches to the database.

        Args:
            inserts: iterable of objects with ``match_id`` and a
                ``representations`` mapping ``component_id -> (text, msd)``.
        """
        insert_representation = (
            "\n"
            "INSERT INTO Representations ( colocation_id , component_id , text , msd )\n"
            "VALUES ( ? , ? , ? , ? ) "
        )
        for match in inserts:
            for component_id, (text, msd) in match.representations.items():
                self.db.execute(
                    insert_representation,
                    (match.match_id, component_id, text, msd),
                )

    def set_representations(self, word_renderer, structures, sloleks_db=None):
        """Compute and store representations for all colocations.

        Does nothing if the database reports the step as already done.

        Args:
            word_renderer: passed through to
                ``RepresentationAssigner.set_representations``.
            structures: iterable of structures; looked up by their ``id``.
            sloleks_db: optional object passed through to the assigner.

        Raises:
            KeyError: if a stored structure id is not among ``structures``.
        """
        step_name = ' representation '
        if self.db.is_step_done(step_name):
            logging.info(" Representation step already done, skipping ")
            return

        num_inserts = 1000
        inserts = []
        structures_dict = {s.id: s for s in structures}
        num_representations = int(
            self.db.execute(" SELECT Count(*) FROM Colocations ").fetchone()[0]
        )
        start_time = time()

        rows = self.db.execute(" SELECT colocation_id, structure_id FROM Colocations ")
        for cid, sid in progress(rows, " representations ", total=num_representations):
            structure = structures_dict[sid]
            match = StructureMatch.from_db(self.db, cid, structure)
            RepresentationAssigner.set_representations(
                match, word_renderer, sloleks_db=sloleks_db
            )
            inserts.append(match)

            # Flush in batches so pending matches do not pile up in memory.
            if len(inserts) > num_inserts:
                self.add_inserts(inserts)
                inserts = []

            # Force a garbage collection at most once every five seconds.
            if time() - start_time > 5:
                start_time = time()
                gc.collect()

        # Write whatever is left from the last, partial batch.
        self.add_inserts(inserts)
        self.db.step_is_done(step_name)

    def has_colocation_id_enough_frequency(self, colocation_id):
        """Return True if the colocation has at least ``min_freq`` matches.

        The query clamps the match count to ``[min_freq - 1, min_freq]``, so
        the returned value only says whether the threshold was reached.
        """
        matches = self.db.execute(
            " SELECT MIN(MAX(COUNT(*), ?), ?) FROM ColocationMatches WHERE mid_colocation_id=? ",
            (self.min_freq - 1, self.min_freq, colocation_id),
        ).fetchone()[0]
        return matches >= self.min_freq

    def determine_colocation_dispersions(self):
        """Compute, keep and store the dispersion counters.

        If the step is already marked done, the counters are loaded from the
        database instead.  Otherwise every colocation with enough frequency
        contributes 1 to each ``(structure_id, component_id, lemma)`` found
        in its key.
        """
        step_name = ' dispersions '
        if self.db.is_step_done(step_name):
            self.load_dispersions()
            return

        dispersions = defaultdict(int)
        rows = self.db.execute(" SELECT colocation_id, structure_id, key FROM Colocations ")
        for colocation_id, structure_id, word_tups_str in progress(rows, " dispersion "):
            if not self.has_colocation_id_enough_frequency(colocation_id):
                continue

            # The key column holds the str() of a tuple of
            # (component_id, lemma) pairs, as written by _add_match.
            word_tups = literal_eval(word_tups_str)
            for component_id, lemma in word_tups:
                dispersions[(str(structure_id), component_id, lemma)] += 1

        self.dispersions = dict(dispersions)
        logging.info(" Storing dispersions... ")
        self.store_dispersions()
        self.db.step_is_done(step_name)

    def store_dispersions(self):
        """Insert every entry of ``self.dispersions`` into ``Dispersions``."""
        for (structure_id, component_id, lemma), disp in self.dispersions.items():
            self.db.execute(
                " INSERT INTO Dispersions (structure_id, component_id, lemma, dispersion) VALUES (?, ?, ?, ?) ",
                (structure_id, component_id, lemma, disp),
            )

    def load_dispersions(self):
        """Replace ``self.dispersions`` with the rows of ``Dispersions``."""
        self.dispersions = {}
        rows = self.db.execute(" SELECT * FROM Dispersions ")
        for structure_id, component_id, lemma, dispersion in progress(rows, " load-dispersions "):
            self.dispersions[structure_id, component_id, lemma] = dispersion