107 lines
4.8 KiB
Python
107 lines
4.8 KiB
Python
from collections import defaultdict
|
|
from ast import literal_eval
|
|
|
|
from match import StructureMatch
|
|
from representation_assigner import RepresentationAssigner
|
|
from progress_bar import progress
|
|
|
|
class MatchStore:
    """Persist structure matches, grouped into colocations, in a database.

    On construction the store creates the ``Colocations``, ``Matches``,
    ``ColocationMatches`` and ``Representations`` tables plus their indexes
    through the supplied database wrapper. Matches are then added with
    :meth:`add_matches` and read back with :meth:`get_matches_for`.

    Attributes are documented in ``__init__``.
    """

    def __init__(self, args, db):
        """Store the database handle and create the schema.

        Args:
            args: Options object; only ``args.min_freq`` is read here.
            db: Database wrapper providing ``init(sql)`` and
                ``execute(sql, params)``.
                NOTE(review): the SQL dialect looks like SQLite -- confirm.
        """
        self.db = db
        # Kept for callers; this class does not read it anywhere else.
        self.min_frequency = args.min_freq
        # Filled by determine_colocation_dispersions().
        self.dispersions = {}
        # Running id assigned to the next match written by _add_match().
        self.match_num = 0

        table_statements = (
            """CREATE TABLE Colocations (
            colocation_id INTEGER PRIMARY KEY,
            structure_id varchar(8),
            key varchar(256))
            """,
            """CREATE TABLE Matches (
            match_id INTEGER,
            component_id INTEGER NOT NULL,
            word_lemma varchar(32) NOT NULL,
            word_id varchar(32) NOT NULL,
            word_msd varchar(16) NOT NULL,
            word_text varchar(32) NOT NULL)
            """,
            """CREATE TABLE ColocationMatches (
            mid_match_id INTEGER,
            mid_colocation_id INTEGER,
            FOREIGN KEY(mid_colocation_id) REFERENCES Colocations(colocation_id),
            FOREIGN KEY(mid_match_id) REFERENCES Matches(match_id))
            """,
            """CREATE TABLE Representations (
            colocation_id INTEGER,
            component_id INTEGER,
            text varchar(32),
            FOREIGN KEY(colocation_id) REFERENCES Colocations(colocation_id))
            """,
        )
        index_statements = (
            "CREATE INDEX key_sid_c ON Colocations (key, structure_id)",
            "CREATE INDEX sid_c ON Colocations (structure_id)",
            "CREATE INDEX mmid_cm ON ColocationMatches (mid_colocation_id)",
            "CREATE INDEX mid_m ON Matches (match_id)",
            "CREATE INDEX col_r ON Representations (colocation_id)",
        )

        # Tables first, then indexes, in the fixed order listed above.
        for statement in table_statements:
            self.db.init(statement)
        for statement in index_statements:
            self.db.init(statement)

    def _add_match(self, key, structure, match):
        """Write one match and link it to its colocation.

        Args:
            key: Sequence whose first item is the structure id; the string
                form of the remaining items identifies the colocation.
            structure: Not used by this method.
            match: Mapping of component id to a word object exposing
                ``lemma``, ``msd``, ``text`` and ``id``.
        """
        structure_id, key_str = key[0], str(key[1:])
        lookup_sql = "SELECT colocation_id FROM Colocations WHERE key=? AND structure_id=?"
        lookup_params = (key_str, structure_id)

        colocation_row = self.db.execute(lookup_sql, lookup_params).fetchone()
        if colocation_row is None:
            # First match for this (key, structure): create the colocation
            # row, then read back the id it was given.
            self.db.execute("INSERT INTO Colocations (structure_id, key) VALUES (?,?)",
                            (structure_id, key_str))
            colocation_row = self.db.execute(lookup_sql, lookup_params).fetchone()

        # One Matches row per component; all rows share the current match id.
        for component_id, word in match.items():
            word_params = dict(
                component_id=component_id,
                match_id=self.match_num,
                word_lemma=word.lemma,
                word_msd=word.msd,
                word_text=word.text,
                word_id=word.id,
            )
            self.db.execute("""
                INSERT INTO Matches (match_id, component_id, word_lemma, word_text, word_msd, word_id)
                VALUES (:match_id, :component_id, :word_lemma, :word_text, :word_msd, :word_id)""",
                            word_params)

        self.db.execute("INSERT INTO ColocationMatches (mid_colocation_id, mid_match_id) VALUES (?,?)",
                        (colocation_row[0], self.match_num))

        self.match_num += 1

    def add_matches(self, matches):
        """Store every match from a ``{structure: entries}`` mapping.

        Each entry is indexed as ``entry[0]`` (the match mapping) and
        ``entry[1]`` (the colocation key).
        """
        for structure, entries in progress(matches.items(), 'adding-matches'):
            for entry in entries:
                colocation_key = entry[1]
                word_match = entry[0]
                self._add_match(colocation_key, structure, word_match)

    def get_matches_for(self, structure):
        """Yield a ``StructureMatch`` for each colocation stored for *structure*.

        The lookup uses ``structure.id``; each result is built with
        ``StructureMatch.from_db``.
        """
        colocation_rows = self.db.execute(
            "SELECT colocation_id FROM Colocations WHERE structure_id=?",
            (structure.id,))
        for row in colocation_rows:
            yield StructureMatch.from_db(self.db, row[0], structure)

    def set_representations(self, word_renderer, structures):
        """Compute and store representations for every stored colocation.

        Args:
            word_renderer: Passed through to
                ``RepresentationAssigner.set_representations``.
            structures: Iterable of structure objects with an ``id``; it must
                cover every structure id present in ``Colocations``.
        """
        structures_by_id = {}
        for candidate in structures:
            structures_by_id[candidate.id] = candidate

        count_row = self.db.execute("SELECT Count(*) FROM Colocations").fetchone()
        colocation_count = int(count_row[0])

        colocation_rows = self.db.execute("SELECT colocation_id, structure_id FROM Colocations")
        for cid, sid in progress(colocation_rows, "representations", total=colocation_count):
            match = StructureMatch.from_db(self.db, cid, structurures_by_id[sid]) if False else \
                StructureMatch.from_db(self.db, cid, structures_by_id[sid])
            RepresentationAssigner.set_representations(match, word_renderer)
            # NOTE(review): match.match_id is written into the colocation_id
            # column -- presumably from_db sets it to the colocation id; confirm.
            for component_id, text in match.representations.items():
                self.db.execute("""
                    INSERT INTO Representations (colocation_id, component_id, text)
                    VALUES (?,?,?)""", (match.match_id, component_id, text))

    def determine_colocation_dispersions(self):
        """Count colocations per ``(structure_id, component_id, lemma)``.

        Each stored colocation key is parsed back with ``literal_eval`` into
        ``(component_id, lemma)`` pairs; the tallies are stored as a plain
        dict in ``self.dispersions``.
        """
        tallies = {}
        colocation_rows = self.db.execute("SELECT structure_id, key FROM Colocations")
        for structure_id, word_tups_str in progress(colocation_rows, "dispersion"):
            for component_id, lemma in literal_eval(word_tups_str):
                bucket = (str(structure_id), component_id, lemma)
                tallies[bucket] = tallies.get(bucket, 0) + 1

        self.dispersions = tallies
|