You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
luscenje_struktur/luscenje_struktur/match_store.py

166 lines
7.2 KiB

import gc
from collections import defaultdict
from ast import literal_eval
from time import time
import logging
from luscenje_struktur.match import StructureMatch
from luscenje_struktur.representation_assigner import RepresentationAssigner
from luscenje_struktur.progress_bar import progress
class MatchStore:
    """Persist structure matches, colocations, representations and dispersions.

    All state lives in the database object passed to the constructor; the only
    in-memory data are ``dispersions`` (a dict keyed by
    ``(structure_id, component_id, lemma)``) and ``match_num`` (the id that
    will be given to the next stored match).

    The ``db`` object must provide ``init``, ``execute``, ``is_step_done`` and
    ``step_is_done``. Their exact semantics are defined outside this file.
    """

    # Schema statements, passed to ``db.init`` in this order.
    _TABLE_STATEMENTS = (
        "CREATE TABLE Colocations (\n"
        "colocation_id INTEGER PRIMARY KEY,\n"
        "structure_id varchar(8),\n"
        "key varchar(256))\n",
        "CREATE TABLE Matches (\n"
        "match_id INTEGER,\n"
        "component_id INTEGER NOT NULL,\n"
        "word_lemma varchar(32) NOT NULL,\n"
        "word_id varchar(32) NOT NULL,\n"
        "word_msd varchar(16) NOT NULL,\n"
        "word_text varchar(32) NOT NULL)\n",
        "CREATE TABLE ColocationMatches (\n"
        "mid_match_id INTEGER,\n"
        "mid_colocation_id INTEGER,\n"
        "FOREIGN KEY(mid_colocation_id) REFERENCES Colocations(colocation_id),\n"
        "FOREIGN KEY(mid_match_id) REFERENCES Matches(match_id))\n",
        "CREATE TABLE Representations (\n"
        "colocation_id INTEGER,\n"
        "component_id INTEGER,\n"
        "text varchar(32),\n"
        "msd varchar(32),\n"
        "FOREIGN KEY(colocation_id) REFERENCES Colocations(colocation_id))\n",
        "CREATE TABLE Dispersions (\n"
        "structure_id varchar(64),\n"
        "component_id varchar(64),\n"
        "lemma varchar(128),\n"
        "dispersion INTEGER)\n",
    )

    _INDEX_STATEMENTS = (
        "CREATE INDEX key_sid_c ON Colocations (key, structure_id)",
        "CREATE INDEX sid_c ON Colocations (structure_id)",
        "CREATE INDEX mmid_cm ON ColocationMatches (mid_colocation_id)",
        "CREATE INDEX mid_m ON Matches (match_id)",
        "CREATE INDEX col_r ON Representations (colocation_id)",
        "CREATE INDEX disp_key ON Dispersions (structure_id, component_id, lemma)",
    )

    def __init__(self, args, db):
        """Set up the schema and continue match numbering from the database.

        Args:
            args: Object exposing ``min_freq``, the minimum number of matches
                a colocation needs to be counted in the dispersions.
            db: Database wrapper (see the class docstring).
        """
        self.db = db
        self.dispersions = {}
        self.min_freq = args.min_freq

        # NOTE(review): ``db.init`` presumably runs these only when the
        # database is freshly created -- confirm against the wrapper.
        for statement in self._TABLE_STATEMENTS:
            self.db.init(statement)
        for statement in self._INDEX_STATEMENTS:
            self.db.init(statement)

        # Continue after the highest stored match id; MAX() yields NULL
        # (None) when the Matches table is empty.
        match_num = self.db.execute("SELECT MAX(match_id) FROM Matches").fetchone()[0]
        self.match_num = 0 if match_num is None else match_num + 1

    def _get_or_create_colocation_id(self, structure_id, key_str):
        """Return the id of the colocation row, inserting the row if needed."""
        query = "SELECT colocation_id FROM Colocations WHERE key=? AND structure_id=?"
        params = (key_str, structure_id)

        row = self.db.execute(query, params).fetchone()
        if row is None:
            self.db.execute(
                "INSERT INTO Colocations (structure_id, key) VALUES (?,?)",
                (structure_id, key_str))
            # Read the generated id back through the same lookup.
            row = self.db.execute(query, params).fetchone()
        return row[0]

    def _add_match(self, key, structure, match):
        """Store one match and link it to its colocation.

        Args:
            key: Sequence whose first item is the structure id; the remaining
                items are stored as ``str(key[1:])`` in ``Colocations.key``.
            structure: Not used by this method; kept for interface stability.
            match: Mapping of component id to a word object exposing
                ``lemma``, ``msd``, ``text`` and ``id``.
        """
        structure_id, key_str = key[0], str(key[1:])
        colocation_id = self._get_or_create_colocation_id(structure_id, key_str)

        # One Matches row per component; all rows share the same match id.
        for component_id, word in match.items():
            self.db.execute(
                "\n"
                "INSERT INTO Matches (match_id, component_id, word_lemma, word_text, word_msd, word_id)\n"
                "VALUES (:match_id, :component_id, :word_lemma, :word_text, :word_msd, :word_id)",
                {
                    "component_id": component_id,
                    "match_id": self.match_num,
                    "word_lemma": word.lemma,
                    "word_msd": word.msd,
                    "word_text": word.text,
                    "word_id": word.id,
                })

        self.db.execute(
            "INSERT INTO ColocationMatches (mid_colocation_id, mid_match_id) VALUES (?,?)",
            (colocation_id, self.match_num))
        self.match_num += 1

    def add_matches(self, matches):
        """Store every match in ``matches``.

        Args:
            matches: Mapping of structure to an iterable of items ``nm`` where
                ``nm[0]`` is the match and ``nm[1]`` is its key.
        """
        for structure, nms in progress(matches.items(), 'adding-matches'):
            for nm in nms:
                self._add_match(nm[1], structure, nm[0])

    def get_matches_for(self, structure):
        """Yield a ``StructureMatch`` for each colocation of ``structure``."""
        rows = self.db.execute(
            "SELECT colocation_id FROM Colocations WHERE structure_id=?",
            (structure.id,))
        for row in rows:
            yield StructureMatch.from_db(self.db, row[0], structure)

    def add_inserts(self, inserts):
        """Write the representations of the given matches.

        Args:
            inserts: Iterable of match objects exposing ``match_id`` (stored
                in the ``colocation_id`` column) and ``representations``, a
                mapping of component id to a ``(text, msd)`` pair.
        """
        for match in inserts:
            for component_id, (text, msd) in match.representations.items():
                self.db.execute(
                    "\n"
                    "INSERT INTO Representations (colocation_id, component_id, text, msd)\n"
                    "VALUES (?,?,?,?)",
                    (match.match_id, component_id, text, msd))

    def set_representations(self, word_renderer, structures, sloleks_db=None):
        """Compute and store representations for all colocations.

        Does nothing when the database reports the 'representation' step as
        done; otherwise marks the step as done after all rows are written.

        Args:
            word_renderer: Passed through to ``RepresentationAssigner``.
            structures: Iterable of structures, looked up by their ``id``.
            sloleks_db: Passed through to ``RepresentationAssigner``.
        """
        step_name = 'representation'
        if self.db.is_step_done(step_name):
            logging.info("Representation step already done, skipping")
            return

        num_inserts = 1000
        inserts = []
        structures_dict = {s.id: s for s in structures}
        num_representations = int(
            self.db.execute("SELECT Count(*) FROM Colocations").fetchone()[0])
        start_time = time()

        rows = self.db.execute("SELECT colocation_id, structure_id FROM Colocations")
        for cid, sid in progress(rows, "representations", total=num_representations):
            structure = structures_dict[sid]
            match = StructureMatch.from_db(self.db, cid, structure)
            RepresentationAssigner.set_representations(
                match, word_renderer, sloleks_db=sloleks_db)
            inserts.append(match)

            # Flush in batches so that pending matches do not pile up.
            if len(inserts) > num_inserts:
                self.add_inserts(inserts)
                inserts = []

            # Run a garbage collection pass at most about every five seconds.
            if time() - start_time > 5:
                start_time = time()
                gc.collect()

        # Write whatever is left from the last, partial batch.
        self.add_inserts(inserts)
        self.db.step_is_done(step_name)

    def has_colocation_id_enough_frequency(self, colocation_id):
        """Return whether the colocation has at least ``min_freq`` matches."""
        count = self.db.execute(
            "SELECT COUNT(*) FROM ColocationMatches WHERE mid_colocation_id=?",
            (colocation_id,)).fetchone()[0]
        return count >= self.min_freq

    def determine_colocation_dispersions(self):
        """Compute, store and keep the dispersions of all frequent colocations.

        For every colocation with at least ``min_freq`` matches, each
        ``(component_id, lemma)`` pair of its key increments the counter for
        ``(structure_id, component_id, lemma)``. When the 'dispersions' step
        is already done the stored values are loaded instead.
        """
        step_name = 'dispersions'
        if self.db.is_step_done(step_name):
            self.load_dispersions()
            return

        dispersions = defaultdict(int)
        rows = self.db.execute("SELECT colocation_id, structure_id, key FROM Colocations")
        for colocation_id, structure_id, word_tups_str in progress(rows, "dispersion"):
            if not self.has_colocation_id_enough_frequency(colocation_id):
                continue

            # The key column holds the ``str()`` of a tuple of pairs, written
            # by ``_add_match``; literal_eval parses it back.
            word_tups = literal_eval(word_tups_str)
            for component_id, lemma in word_tups:
                dispersions[(str(structure_id), component_id, lemma)] += 1

        self.dispersions = dict(dispersions)

        logging.info("Storing dispersions...")
        self.store_dispersions()
        self.db.step_is_done(step_name)

    def store_dispersions(self):
        """Write ``self.dispersions`` to the Dispersions table."""
        for (structure_id, component_id, lemma), disp in self.dispersions.items():
            self.db.execute(
                "INSERT INTO Dispersions (structure_id, component_id, lemma, dispersion) VALUES (?, ?, ?, ?)",
                (structure_id, component_id, lemma, disp))

    def load_dispersions(self):
        """Replace ``self.dispersions`` with the rows of the Dispersions table."""
        self.dispersions = {}
        # Columns are listed explicitly so the unpacking below does not depend
        # on the table's column order.
        # NOTE(review): key values come back in whatever type the database
        # returns for the varchar columns; confirm they match the key types
        # produced by determine_colocation_dispersions.
        rows = self.db.execute(
            "SELECT structure_id, component_id, lemma, dispersion FROM Dispersions")
        for structure_id, component_id, lemma, dispersion in progress(rows, "load-dispersions"):
            self.dispersions[structure_id, component_id, lemma] = dispersion