import gc
from collections import defaultdict
from ast import literal_eval
from time import time
import logging

from luscenje_struktur.match import StructureMatch
from luscenje_struktur.representation_assigner import RepresentationAssigner
from luscenje_struktur.progress_bar import progress


class MatchStore:
    """Database-backed storage for structure matches and data derived from them.

    The constructor declares five tables (Colocations, Matches,
    ColocationMatches, Representations, Dispersions) plus their indexes
    through ``db.init``.  The remaining methods insert matches, attach
    rendered representations to colocations and compute per-lemma
    dispersion counts.
    """

    def __init__(self, args, db):
        """Declare the schema and pick the next free match id.

        :param args: object exposing ``min_freq``, the frequency threshold
            used by :meth:`has_colocation_id_enough_frequency`.
        :param db: project database wrapper; this class uses its ``init``,
            ``execute``, ``is_step_done`` and ``step_is_done`` methods.
        """
        self.db = db
        self.dispersions = {}
        self.min_freq = args.min_freq

        table_statements = (
            "CREATE TABLE Colocations ( "
            "colocation_id INTEGER PRIMARY KEY, "
            "structure_id varchar(8), "
            "key varchar(256)) ",

            "CREATE TABLE Matches ( "
            "match_id INTEGER, "
            "component_id INTEGER NOT NULL, "
            "word_lemma varchar(32) NOT NULL, "
            "word_id varchar(32) NOT NULL, "
            "word_msd varchar(16) NOT NULL, "
            "word_text varchar(32) NOT NULL) ",

            "CREATE TABLE ColocationMatches ( "
            "mid_match_id INTEGER, "
            "mid_colocation_id INTEGER, "
            "FOREIGN KEY(mid_colocation_id) REFERENCES Colocations(colocation_id), "
            "FOREIGN KEY(mid_match_id) REFERENCES Matches(match_id)) ",

            "CREATE TABLE Representations ( "
            "colocation_id INTEGER, "
            "component_id INTEGER, "
            "text varchar(32), "
            "msd varchar(32), "
            "FOREIGN KEY(colocation_id) REFERENCES Colocations(colocation_id)) ",

            "CREATE TABLE Dispersions ( "
            "structure_id varchar(64), "
            "component_id varchar(64), "
            "lemma varchar(128), "
            "dispersion INTEGER) ",
        )
        index_statements = (
            "CREATE INDEX key_sid_c ON Colocations (key, structure_id)",
            "CREATE INDEX sid_c ON Colocations (structure_id)",
            "CREATE INDEX mmid_cm ON ColocationMatches (mid_colocation_id)",
            "CREATE INDEX mid_m ON Matches (match_id)",
            "CREATE INDEX col_r ON Representations (colocation_id)",
            "CREATE INDEX disp_key ON Dispersions (structure_id, component_id, lemma)",
        )
        # Tables first, then indexes, in the same order as always.
        # NOTE(review): db.init presumably only runs these on a fresh
        # database -- confirm against the wrapper.
        for statement in table_statements + index_statements:
            self.db.init(statement)

        # Continue numbering after the highest stored match id; start at 0
        # when the Matches table has no rows (MAX yields NULL/None).
        highest_id = self.db.execute("SELECT MAX(match_id) FROM Matches").fetchone()[0]
        if highest_id is None:
            self.match_num = 0
        else:
            self.match_num = highest_id + 1

    def _add_match(self, key, structure, match):
        """Store one match and link it to its colocation.

        :param key: sequence whose first item is the structure id; the rest
            (``key[1:]``) is stringified and used as the colocation key.
        :param structure: accepted for interface compatibility; not used.
        :param match: mapping of component id to a word object exposing
            ``lemma``, ``msd``, ``text`` and ``id``.
        """
        structure_id = key[0]
        key_str = str(key[1:])

        lookup_sql = "SELECT colocation_id FROM Colocations WHERE key=? AND structure_id=?"
        lookup_params = (key_str, structure_id)

        cid = self.db.execute(lookup_sql, lookup_params).fetchone()
        if cid is None:
            # Unknown colocation: create it, then read back its generated id.
            self.db.execute(
                "INSERT INTO Colocations (structure_id, key) VALUES (?,?)",
                (structure_id, key_str))
            cid = self.db.execute(lookup_sql, lookup_params).fetchone()

        insert_word_sql = (
            " INSERT INTO Matches "
            "(match_id, component_id, word_lemma, word_text, word_msd, word_id) "
            "VALUES (:match_id, :component_id, :word_lemma, :word_text, :word_msd, :word_id)"
        )
        for component_id, word in match.items():
            row = {
                "component_id": component_id,
                "match_id": self.match_num,
                "word_lemma": word.lemma,
                "word_msd": word.msd,
                "word_text": word.text,
                "word_id": word.id,
            }
            self.db.execute(insert_word_sql, row)

        self.db.execute(
            "INSERT INTO ColocationMatches (mid_colocation_id, mid_match_id) VALUES (?,?)",
            (cid[0], self.match_num))
        self.match_num += 1

    def add_matches(self, matches):
        """Store every match of every structure.

        :param matches: mapping of structure to an iterable of entries where
            item 0 is the match and item 1 is its key.
        """
        for structure, entries in progress(matches.items(), 'adding-matches'):
            for entry in entries:
                match, key = entry[0], entry[1]
                self._add_match(key, structure, match)

    def get_matches_for(self, structure):
        """Yield a ``StructureMatch`` for each colocation of ``structure``.

        :param structure: object exposing ``id``, matched against
            ``Colocations.structure_id``.
        """
        rows = self.db.execute(
            "SELECT colocation_id FROM Colocations WHERE structure_id=?",
            (structure.id,))
        for row in rows:
            yield StructureMatch.from_db(self.db, row[0], structure)

    def add_inserts(self, inserts):
        """Write the representations of the given matches to the database.

        :param inserts: iterable of matches exposing ``match_id`` and a
            ``representations`` mapping of component id to ``(text, msd)``.
        """
        insert_sql = (
            " INSERT INTO Representations "
            "(colocation_id, component_id, text, msd) "
            "VALUES (?,?,?,?)"
        )
        for match in inserts:
            for component_id, (text, msd) in match.representations.items():
                # The match id is stored in the colocation_id column.
                self.db.execute(insert_sql, (match.match_id, component_id, text, msd))

    def set_representations(self, word_renderer, structures, sloleks_db=None):
        """Compute and store representations for all colocations.

        Does nothing beyond logging when the database reports the
        'representation' step as already done.

        :param word_renderer: passed through to ``RepresentationAssigner``.
        :param structures: iterable of structures exposing ``id``.
        :param sloleks_db: optional handle passed through to
            ``RepresentationAssigner``.
        """
        step_name = 'representation'
        if self.db.is_step_done(step_name):
            logging.info("Representation step already done, skipping")
            return

        num_inserts = 1000
        inserts = []
        structures_dict = {s.id: s for s in structures}

        count_row = self.db.execute("SELECT Count(*) FROM Colocations").fetchone()
        num_representations = int(count_row[0])

        start_time = time()
        colocation_rows = self.db.execute("SELECT colocation_id, structure_id FROM Colocations")
        for cid, sid in progress(colocation_rows, "representations", total=num_representations):
            match = StructureMatch.from_db(self.db, cid, structures_dict[sid])
            RepresentationAssigner.set_representations(match, word_renderer, sloleks_db=sloleks_db)
            inserts.append(match)

            # Flush the pending batch once it grows past the threshold.
            if len(inserts) > num_inserts:
                self.add_inserts(inserts)
                inserts = []

            # Force a garbage collection at most once every five seconds.
            if time() - start_time > 5:
                start_time = time()
                gc.collect()

        # Write whatever is left over from the last partial batch.
        self.add_inserts(inserts)
        self.db.step_is_done(step_name)

    def has_colocation_id_enough_frequency(self, colocation_id):
        """Return True when the colocation has at least ``min_freq`` matches.

        The query clamps the match count into the range
        ``[min_freq - 1, min_freq]`` before it is compared.
        """
        clamp_sql = (
            "SELECT MIN(MAX(COUNT(*), ?), ?) FROM ColocationMatches WHERE mid_colocation_id=?"
        )
        clamp_params = (self.min_freq - 1, self.min_freq, colocation_id)
        clamped_count = self.db.execute(clamp_sql, clamp_params).fetchone()[0]
        return clamped_count >= self.min_freq

    def determine_colocation_dispersions(self):
        """Compute dispersions, or load them if the step was already done.

        For every sufficiently frequent colocation, each
        ``(component_id, lemma)`` pair in its key adds one to the counter
        for ``(structure_id, component_id, lemma)``.  The result is kept in
        ``self.dispersions`` and stored in the Dispersions table.
        """
        step_name = 'dispersions'
        if self.db.is_step_done(step_name):
            self.load_dispersions()
            return

        counts = defaultdict(int)
        colocation_rows = self.db.execute("SELECT colocation_id, structure_id, key FROM Colocations")
        for colocation_id, structure_id, word_tups_str in progress(colocation_rows, "dispersion"):
            if self.has_colocation_id_enough_frequency(colocation_id):
                # The key column holds the str() of a tuple of pairs.
                for component_id, lemma in literal_eval(word_tups_str):
                    counts[(str(structure_id), component_id, lemma)] += 1

        self.dispersions = dict(counts)

        logging.info("Storing dispersions...")
        self.store_dispersions()
        self.db.step_is_done(step_name)

    def store_dispersions(self):
        """Insert every entry of ``self.dispersions`` into the Dispersions table."""
        insert_sql = (
            "INSERT INTO Dispersions (structure_id, component_id, lemma, dispersion) VALUES (?, ?, ?, ?)"
        )
        for (structure_id, component_id, lemma), disp in self.dispersions.items():
            self.db.execute(insert_sql, (structure_id, component_id, lemma, disp))

    def load_dispersions(self):
        """Replace ``self.dispersions`` with the contents of the Dispersions table."""
        self.dispersions = {}
        stored_rows = self.db.execute("SELECT * FROM Dispersions")
        for structure_id, component_id, lemma, dispersion in progress(stored_rows, "load-dispersions"):
            self.dispersions[structure_id, component_id, lemma] = dispersion