Note: this patch was restored from a copy whose line breaks and indentation had been
flattened. The +/-/context lines are unchanged; 4-space indentation and the position of
blank context lines are inferred from the hunk line counts and should be confirmed
against the repository before applying.

Review notes on the change itself:
- MatchStore.get_matches_for still carries a debug print(structure.id) and a
  commented-out copy of the previous in-memory loop.
- MatchStore.set_representations writes sm.match_id (the in-memory counter
  str(len(self.data) + 1)) into Representations.colocation_id, while _add_match lets the
  database pick Colocations.colocation_id; presumably the two line up only when rows are
  inserted into an empty table in the same order -- TODO confirm.

diff --git a/src/match.py b/src/match.py
index b2b11e9..5e241d7 100644
--- a/src/match.py
+++ b/src/match.py
@@ -1,3 +1,5 @@
+from word import Word
+
 class StructureMatch:
     def __init__(self, match_id, structure):
         self.match_id = match_id
@@ -5,6 +7,25 @@ class StructureMatch:
         self.matches = []
         self.representations = {}
 
+
+    @staticmethod
+    def from_db(db, colocation_id, structure):
+        result = StructureMatch(str(colocation_id), structure)
+        for match_id in db.execute("SELECT mid_match_id FROM ColocationMatches WHERE mid_colocation_id=?", (colocation_id,)):
+            to_add = {}
+
+            for component_id, word_lemma, word_text, word_msd, word_id in db.execute("""
+                SELECT component_id, word_lemma, word_text, word_msd, word_id
+                FROM Matches WHERE match_id=?""", match_id):
+
+                to_add[str(component_id)] = Word(word_lemma, word_msd, word_id, word_text, False)
+
+            result.matches.append(to_add)
+
+        for component_id, text in db.execute("SELECT component_id, text FROM Representations WHERE colocation_id=?", (colocation_id,)):
+            result.representations[str(component_id)] = text
+
+        return result
 
     def distinct_forms(self):
         dm = set()
@@ -17,4 +38,4 @@ class StructureMatch:
         self.matches.append(match)
 
     def __len__(self):
-        return len(self.matches)
\ No newline at end of file
+        return len(self.matches)
diff --git a/src/match_store.py b/src/match_store.py
index fdacdd8..1194172 100644
--- a/src/match_store.py
+++ b/src/match_store.py
@@ -5,34 +5,98 @@ from representation_assigner import RepresentationAssigner
 from progress_bar import progress
 
 class MatchStore:
-    def __init__(self, args):
+    def __init__(self, args, db):
+        self.db = db
         self.data = {}
         self.min_frequency = args.min_freq
         self.dispersions = {}
 
+        self.db.init("""CREATE TABLE Colocations (
+            colocation_id INTEGER PRIMARY KEY,
+            structure_id INTEGER,
+            key varchar(256))
+            """)
+        self.db.init("""CREATE TABLE Matches (
+            match_id INTEGER,
+            component_id INTEGER NOT NULL,
+            word_lemma varchar(32) NOT NULL,
+            word_id varchar(32) NOT NULL,
+            word_msd varchar(16) NOT NULL,
+            word_text varchar(32) NOT NULL)
+            """)
+        self.db.init("""CREATE TABLE ColocationMatches (
+            mid_match_id INTEGER,
+            mid_colocation_id INTEGER,
+            FOREIGN KEY(mid_colocation_id) REFERENCES Colocations(colocation_id),
+            FOREIGN KEY(mid_match_id) REFERENCES Matches(match_id))
+            """)
+        self.db.init("""CREATE TABLE Representations (
+            colocation_id INTEGER,
+            component_id INTEGER,
+            text varchar(32),
+            FOREIGN KEY(colocation_id) REFERENCES Colocations(colocation_id))
+            """)
+
+
     def _add_match(self, key, structure, match):
+        structure_id, key_str = key[0], str(key[1:])
+        cid = self.db.execute("SELECT colocation_id FROM Colocations WHERE key=? AND structure_id=?",
+            (key_str, structure_id)).fetchone()
+
+        if cid is None:
+            self.db.execute("INSERT INTO Colocations (structure_id, key) VALUES (?,?)",
+                (structure_id, key_str))
+            cid = self.db.execute("SELECT colocation_id FROM Colocations WHERE key=? AND structure_id=?",
+                (key_str, structure_id)).fetchone()
+
+        mid = self.db.execute("SELECT max(match_id) + 1 FROM Matches").fetchone()
+        mid = 0 if mid[0] is None else mid[0]
+
+        for component_id, word in match.items():
+            self.db.execute("""
+            INSERT INTO Matches (match_id, component_id, word_lemma, word_text, word_msd, word_id)
+            VALUES (:match_id, :component_id, :word_lemma, :word_text, :word_msd, :word_id)""", {
+                "component_id": component_id,
+                "match_id": mid,
+                "word_lemma": word.lemma,
+                "word_msd": word.msd,
+                "word_text": word.text,
+                "word_id": word.id,
+            })
+
+        self.db.execute("INSERT INTO ColocationMatches (mid_colocation_id, mid_match_id) VALUES (?,?)",
+            (cid[0], mid))
+
         if key not in self.data:
             self.data[key] = StructureMatch(str(len(self.data) + 1), structure)
         self.data[key].append(match)
 
-    def get(self, key, n):
-        return self.data[key][n]
-
     def add_matches(self, matches):
         for structure, nms in matches.items():
             for nm in nms:
                 self._add_match(nm[1], structure, nm[0])
 
     def get_matches_for(self, structure):
-        for _cid_tup, sm in self.data.items():
-            if sm.structure != structure:
-                continue
+        print(structure.id)
+        for cid in self.db.execute("SELECT colocation_id FROM Colocations WHERE structure_id=?",
+            (structure.id,)):
+            yield StructureMatch.from_db(self.db, cid[0], structure)
 
-            yield sm
+        # for _cid_tup, sm in self.data.items():
+        #     if sm.structure != structure:
+        #         continue
+
+        #     # print(sm.matches, sm.match_id, sm.representations)
+        #     yield sm
 
     def set_representations(self, word_renderer):
         for _1, sm in progress(self.data.items(), "representations"):
             RepresentationAssigner.set_representations(sm, word_renderer)
+            for component_id, text in sm.representations.items():
+                self.db.execute("""
+                    INSERT INTO Representations (colocation_id, component_id, text)
+                    VALUES (?,?,?)""", (sm.match_id, component_id, text))
+
 
     def determine_colocation_dispersions(self):
         dispersions = defaultdict(int)
diff --git a/src/wani.py b/src/wani.py
index b3d1637..6a5e604 100644
--- a/src/wani.py
+++ b/src/wani.py
@@ -39,7 +39,7 @@ def main(args):
     structures, lemma_msds, max_num_components = build_structures(args)
 
     database = Database(args)
-    match_store = MatchStore(args)
+    match_store = MatchStore(args, database)
     word_stats = WordStats(lemma_msds, database)
 
     if args.parallel: