Incorporate the database into MatchStore as well

This commit is contained in:
Ozbolt Menegatti 2019-06-27 16:51:58 +02:00
parent c25844a335
commit 188763c06a
3 changed files with 95 additions and 10 deletions

View File

@ -1,3 +1,5 @@
from word import Word
class StructureMatch:
def __init__(self, match_id, structure):
self.match_id = match_id
@ -5,6 +7,25 @@ class StructureMatch:
self.matches = []
self.representations = {}
@staticmethod
def from_db(db, colocation_id, structure):
    """Rebuild the StructureMatch of one colocation from database rows.

    Args:
        db: object whose ``execute(sql, params)`` returns an iterable of rows.
        colocation_id: id of the colocation to load; its string form becomes
            the ``match_id`` of the returned object.
        structure: structure passed through to the ``StructureMatch``
            constructor.

    Returns:
        A ``StructureMatch`` whose ``matches`` holds one
        ``{component_id (str): Word}`` dict per linked match and whose
        ``representations`` maps component id (str) to the stored text.
    """
    linked_matches_sql = "SELECT mid_match_id FROM ColocationMatches WHERE mid_colocation_id=?"
    match_words_sql = (
        "\nSELECT component_id, word_lemma, word_text, word_msd, word_id\n"
        "FROM Matches WHERE match_id=?"
    )
    representations_sql = "SELECT component_id, text FROM Representations WHERE colocation_id=?"

    loaded = StructureMatch(str(colocation_id), structure)

    # Every row of the first query is a one-element row; it is handed on
    # unchanged as the parameter sequence of the per-match word query.
    for match_row in db.execute(linked_matches_sql, (colocation_id,)):
        loaded.matches.append({
            # NOTE(review): meaning of the trailing ``False`` flag is defined
            # by Word's constructor, which is not visible here.
            str(component_id): Word(lemma, msd, word_id, word_text, False)
            for component_id, lemma, word_text, msd, word_id
            in db.execute(match_words_sql, match_row)
        })

    for component_id, text in db.execute(representations_sql, (colocation_id,)):
        loaded.representations[str(component_id)] = text

    return loaded
def distinct_forms(self):
dm = set()
@ -17,4 +38,4 @@ class StructureMatch:
self.matches.append(match)
def __len__(self):
return len(self.matches)
return len(self.matches)

View File

@ -5,34 +5,98 @@ from representation_assigner import RepresentationAssigner
from progress_bar import progress
class MatchStore:
def __init__(self, args):
def __init__(self, args, db):
self.db = db
self.data = {}
self.min_frequency = args.min_freq
self.dispersions = {}
self.db.init("""CREATE TABLE Colocations (
colocation_id INTEGER PRIMARY KEY,
structure_id INTEGER,
key varchar(256))
""")
self.db.init("""CREATE TABLE Matches (
match_id INTEGER,
component_id INTEGER NOT NULL,
word_lemma varchar(32) NOT NULL,
word_id varchar(32) NOT NULL,
word_msd varchar(16) NOT NULL,
word_text varchar(32) NOT NULL)
""")
self.db.init("""CREATE TABLE ColocationMatches (
mid_match_id INTEGER,
mid_colocation_id INTEGER,
FOREIGN KEY(mid_colocation_id) REFERENCES Colocations(colocation_id),
FOREIGN KEY(mid_match_id) REFERENCES Matches(match_id))
""")
self.db.init("""CREATE TABLE Representations (
colocation_id INTEGER,
component_id INTEGER,
text varchar(32),
FOREIGN KEY(colocation_id) REFERENCES Colocations(colocation_id))
""")
def _add_match(self, key, structure, match):
structure_id, key_str = key[0], str(key[1:])
cid = self.db.execute("SELECT colocation_id FROM Colocations WHERE key=? AND structure_id=?",
(key_str, structure_id)).fetchone()
if cid is None:
self.db.execute("INSERT INTO Colocations (structure_id, key) VALUES (?,?)",
(structure_id, key_str))
cid = self.db.execute("SELECT colocation_id FROM Colocations WHERE key=? AND structure_id=?",
(key_str, structure_id)).fetchone()
mid = self.db.execute("SELECT max(match_id) + 1 FROM Matches").fetchone()
mid = 0 if mid[0] is None else mid[0]
for component_id, word in match.items():
self.db.execute("""
INSERT INTO Matches (match_id, component_id, word_lemma, word_text, word_msd, word_id)
VALUES (:match_id, :component_id, :word_lemma, :word_text, :word_msd, :word_id)""", {
"component_id": component_id,
"match_id": mid,
"word_lemma": word.lemma,
"word_msd": word.msd,
"word_text": word.text,
"word_id": word.id,
})
self.db.execute("INSERT INTO ColocationMatches (mid_colocation_id, mid_match_id) VALUES (?,?)",
(cid[0], mid))
if key not in self.data:
self.data[key] = StructureMatch(str(len(self.data) + 1), structure)
self.data[key].append(match)
def get(self, key, n):
return self.data[key][n]
def add_matches(self, matches):
for structure, nms in matches.items():
for nm in nms:
self._add_match(nm[1], structure, nm[0])
def get_matches_for(self, structure):
for _cid_tup, sm in self.data.items():
if sm.structure != structure:
continue
print(structure.id)
for cid in self.db.execute("SELECT colocation_id FROM Colocations WHERE structure_id=?",
(structure.id,)):
yield StructureMatch.from_db(self.db, cid[0], structure)
yield sm
# for _cid_tup, sm in self.data.items():
# if sm.structure != structure:
# continue
# # print(sm.matches, sm.match_id, sm.representations)
# yield sm
def set_representations(self, word_renderer):
    """Assign representations to every in-memory entry and persist them.

    For each value in ``self.data`` the representations are computed by
    ``RepresentationAssigner.set_representations`` and then written, one
    row per component, to the ``Representations`` table. The entry's
    ``match_id`` is stored in the ``colocation_id`` column.

    Args:
        word_renderer: passed through to ``RepresentationAssigner``.
    """
    insert_sql = (
        "\nINSERT INTO Representations (colocation_id, component_id, text)\n"
        "VALUES (?,?,?)"
    )
    for _key, structure_match in progress(self.data.items(), "representations"):
        RepresentationAssigner.set_representations(structure_match, word_renderer)
        for component_id, text in structure_match.representations.items():
            row = (structure_match.match_id, component_id, text)
            self.db.execute(insert_sql, row)
def determine_colocation_dispersions(self):
dispersions = defaultdict(int)

View File

@ -39,7 +39,7 @@ def main(args):
structures, lemma_msds, max_num_components = build_structures(args)
database = Database(args)
match_store = MatchStore(args)
match_store = MatchStore(args, database)
word_stats = WordStats(lemma_msds, database)
if args.parallel: