Incorporating database also in MatchStore
This commit is contained in:
parent
c25844a335
commit
188763c06a
21
src/match.py
21
src/match.py
|
@ -1,3 +1,5 @@
|
||||||
|
from word import Word
|
||||||
|
|
||||||
class StructureMatch:
|
class StructureMatch:
|
||||||
def __init__(self, match_id, structure):
|
def __init__(self, match_id, structure):
|
||||||
self.match_id = match_id
|
self.match_id = match_id
|
||||||
|
@ -6,6 +8,25 @@ class StructureMatch:
|
||||||
self.matches = []
|
self.matches = []
|
||||||
self.representations = {}
|
self.representations = {}
|
||||||
|
|
||||||
|
@staticmethod
def from_db(db, colocation_id, structure):
    """Rebuild a StructureMatch for one colocation from the database.

    Args:
        db: Database handle whose ``execute(sql, params)`` returns an
            iterable of rows.
        colocation_id: Value matched against
            ``ColocationMatches.mid_colocation_id`` and
            ``Representations.colocation_id``.
        structure: Passed through unchanged to the StructureMatch constructor.

    Returns:
        A StructureMatch whose ``match_id`` is ``str(colocation_id)``, whose
        ``matches`` holds one ``{str(component_id): Word}`` dict per stored
        match, and whose ``representations`` maps ``str(component_id)`` to
        the stored text.
    """
    result = StructureMatch(str(colocation_id), structure)

    # Read all match ids before running the per-match queries: if the handle
    # reuses a single cursor, a nested execute() would otherwise drop the
    # rows still pending in this outer result set.
    match_rows = list(db.execute(
        "SELECT mid_match_id FROM ColocationMatches WHERE mid_colocation_id=?",
        (colocation_id,)))

    for match_row in match_rows:
        # Index the single-column row explicitly instead of passing the row
        # object itself as the parameter sequence.
        word_rows = db.execute("""
            SELECT component_id, word_lemma, word_text, word_msd, word_id
            FROM Matches WHERE match_id=?""", (match_row[0],))
        result.matches.append({
            str(component_id): Word(word_lemma, word_msd, word_id, word_text, False)
            for component_id, word_lemma, word_text, word_msd, word_id in word_rows
        })

    representation_rows = db.execute(
        "SELECT component_id, text FROM Representations WHERE colocation_id=?",
        (colocation_id,))
    for component_id, text in representation_rows:
        result.representations[str(component_id)] = text

    return result
|
||||||
|
|
||||||
def distinct_forms(self):
|
def distinct_forms(self):
|
||||||
dm = set()
|
dm = set()
|
||||||
keys = list(self.matches[0].keys())
|
keys = list(self.matches[0].keys())
|
||||||
|
|
|
@ -5,34 +5,98 @@ from representation_assigner import RepresentationAssigner
|
||||||
from progress_bar import progress
|
from progress_bar import progress
|
||||||
|
|
||||||
class MatchStore:
|
class MatchStore:
|
||||||
def __init__(self, args, db):
    """Set up the in-memory store and declare the tables that back it.

    Args:
        args: Parsed options; only ``args.min_freq`` is read here.
        db: Database handle. It is kept as ``self.db`` and ``db.init`` is
            called once per CREATE TABLE statement, in the order below.
    """
    self.db = db
    self.data = {}
    self.min_frequency = args.min_freq
    self.dispersions = {}

    # NOTE(review): Matches.match_id is declared without UNIQUE/PRIMARY KEY,
    # yet ColocationMatches names it as a foreign-key parent. SQLite reports
    # a "foreign key mismatch" for such parents when enforcement is enabled;
    # confirm this is intended.
    table_definitions = (
        """CREATE TABLE Colocations (
            colocation_id INTEGER PRIMARY KEY,
            structure_id INTEGER,
            key varchar(256))
            """,
        """CREATE TABLE Matches (
            match_id INTEGER,
            component_id INTEGER NOT NULL,
            word_lemma varchar(32) NOT NULL,
            word_id varchar(32) NOT NULL,
            word_msd varchar(16) NOT NULL,
            word_text varchar(32) NOT NULL)
            """,
        """CREATE TABLE ColocationMatches (
            mid_match_id INTEGER,
            mid_colocation_id INTEGER,
            FOREIGN KEY(mid_colocation_id) REFERENCES Colocations(colocation_id),
            FOREIGN KEY(mid_match_id) REFERENCES Matches(match_id))
            """,
        """CREATE TABLE Representations (
            colocation_id INTEGER,
            component_id INTEGER,
            text varchar(32),
            FOREIGN KEY(colocation_id) REFERENCES Colocations(colocation_id))
            """,
    )
    for definition in table_definitions:
        self.db.init(definition)
|
||||||
|
|
||||||
|
|
||||||
def _add_match(self, key, structure, match):
    """Persist one match in the database and mirror it in ``self.data``.

    Args:
        key: Sequence whose first item is the structure id; the remaining
            items, rendered with ``str(key[1:])``, identify the colocation.
        structure: Passed to StructureMatch when ``key`` is new to ``self.data``.
        match: Mapping of component id -> word object exposing ``lemma``,
            ``msd``, ``text`` and ``id``.
    """
    structure_id = key[0]
    key_str = str(key[1:])
    find_colocation = "SELECT colocation_id FROM Colocations WHERE key=? AND structure_id=?"
    find_args = (key_str, structure_id)

    colocation_row = self.db.execute(find_colocation, find_args).fetchone()
    if colocation_row is None:
        # Unknown colocation: create its row, then read the new id back.
        self.db.execute("INSERT INTO Colocations (structure_id, key) VALUES (?,?)",
                        (structure_id, key_str))
        colocation_row = self.db.execute(find_colocation, find_args).fetchone()

    # Next match id is max+1; an empty Matches table yields NULL, so start at 0.
    next_id_row = self.db.execute("SELECT max(match_id) + 1 FROM Matches").fetchone()
    match_id = next_id_row[0] if next_id_row[0] is not None else 0

    insert_word = """
        INSERT INTO Matches (match_id, component_id, word_lemma, word_text, word_msd, word_id)
        VALUES (:match_id, :component_id, :word_lemma, :word_text, :word_msd, :word_id)"""
    for component_id, word in match.items():
        self.db.execute(insert_word, {
            "component_id": component_id,
            "match_id": match_id,
            "word_lemma": word.lemma,
            "word_msd": word.msd,
            "word_text": word.text,
            "word_id": word.id,
        })

    self.db.execute("INSERT INTO ColocationMatches (mid_colocation_id, mid_match_id) VALUES (?,?)",
                    (colocation_row[0], match_id))

    # Keep the in-memory view in step with what was just written.
    if key not in self.data:
        self.data[key] = StructureMatch(str(len(self.data) + 1), structure)
    self.data[key].append(match)
|
||||||
|
|
||||||
def get(self, key, n):
    """Return item ``n`` of the entry stored under ``key`` in ``self.data``."""
    entry = self.data[key]
    return entry[n]
|
|
||||||
|
|
||||||
def add_matches(self, matches):
    """Record every match in ``matches`` through ``_add_match``.

    Args:
        matches: Mapping of structure -> iterable of entries. Item 0 of each
            entry is the match and item 1 is its key.
    """
    for structure, entries in matches.items():
        for entry in entries:
            key, match = entry[1], entry[0]
            self._add_match(key, structure, match)
|
||||||
|
|
||||||
def get_matches_for(self, structure):
    """Yield a StructureMatch for every stored colocation of ``structure``.

    Args:
        structure: Object whose ``id`` selects rows in ``Colocations`` and
            which is handed on to ``StructureMatch.from_db``.

    Yields:
        One ``StructureMatch.from_db(self.db, colocation_id, structure)``
        result per matching ``Colocations`` row.
    """
    # Read the ids up front: from_db, and whatever the caller does while this
    # generator is suspended, issue further queries on the same handle, which
    # must not disturb an in-progress iteration over this result set.
    colocation_rows = list(self.db.execute(
        "SELECT colocation_id FROM Colocations WHERE structure_id=?",
        (structure.id,)))

    for colocation_row in colocation_rows:
        yield StructureMatch.from_db(self.db, colocation_row[0], structure)
|
||||||
|
|
||||||
def set_representations(self, word_renderer):
    """Assign representations to every stored match and save them.

    For each entry of ``self.data`` this calls
    ``RepresentationAssigner.set_representations`` and then inserts one
    ``Representations`` row per item of the entry's ``representations`` dict.

    Args:
        word_renderer: Passed unchanged to RepresentationAssigner.
    """
    # NOTE(review): the row's colocation_id is taken from sm.match_id —
    # confirm it matches the id stored in the Colocations table.
    insert_representation = """
        INSERT INTO Representations (colocation_id, component_id, text)
        VALUES (?,?,?)"""

    for _key, sm in progress(self.data.items(), "representations"):
        RepresentationAssigner.set_representations(sm, word_renderer)
        for component_id, text in sm.representations.items():
            row = (sm.match_id, component_id, text)
            self.db.execute(insert_representation, row)
|
||||||
|
|
||||||
|
|
||||||
def determine_colocation_dispersions(self):
|
def determine_colocation_dispersions(self):
|
||||||
dispersions = defaultdict(int)
|
dispersions = defaultdict(int)
|
||||||
|
|
|
@ -39,7 +39,7 @@ def main(args):
|
||||||
structures, lemma_msds, max_num_components = build_structures(args)
|
structures, lemma_msds, max_num_components = build_structures(args)
|
||||||
|
|
||||||
database = Database(args)
|
database = Database(args)
|
||||||
match_store = MatchStore(args)
|
match_store = MatchStore(args, database)
|
||||||
word_stats = WordStats(lemma_msds, database)
|
word_stats = WordStats(lemma_msds, database)
|
||||||
|
|
||||||
if args.parallel:
|
if args.parallel:
|
||||||
|
|
Loading…
Reference in New Issue
Block a user