and --keep-db deprecated in favour of --new-db (harder for me to fu*k up)
		
			
				
	
	
		
			139 lines
		
	
	
		
			6.2 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			139 lines
		
	
	
		
			6.2 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
from collections import defaultdict
 | 
						|
from ast import literal_eval
 | 
						|
 | 
						|
from match import StructureMatch
 | 
						|
from representation_assigner import RepresentationAssigner
 | 
						|
from progress_bar import progress
 | 
						|
 | 
						|
class MatchStore:
    """Database-backed store for structure matches and statistics derived from them.

    All persistence goes through the ``db`` object handed to the constructor;
    this class uses its ``init``, ``execute``, ``is_step_done`` and
    ``step_is_done`` methods.  Five tables are used:

    * ``Colocations`` - one row per distinct (structure_id, key) pair,
    * ``Matches`` - one row per matched word (component) of a match,
    * ``ColocationMatches`` - links match ids to colocation ids,
    * ``Representations`` - rendered text per colocation component,
    * ``Dispersions`` - number of colocations per (structure, component, lemma).
    """

    def __init__(self, args, db):
        """Create the schema through ``db.init`` and continue match numbering.

        Args:
            args: Not read by this class; kept so existing callers keep working.
            db: Database wrapper providing ``init(sql)`` and
                ``execute(sql[, params])``.
        """
        self.db = db
        # In-memory copy of the Dispersions table, keyed by
        # (structure_id, component_id, lemma).
        self.dispersions = {}

        # NOTE(review): presumably db.init only runs a statement when the
        # database is freshly created (the MAX(match_id) query below suggests
        # the tables may already hold data) - confirm against the db wrapper.
        schema_statements = (
            """CREATE TABLE Colocations (
            colocation_id INTEGER PRIMARY KEY,
            structure_id varchar(8),
            key varchar(256))
            """,
            """CREATE TABLE Matches (
            match_id INTEGER,
            component_id INTEGER NOT NULL,
            word_lemma varchar(32) NOT NULL,
            word_id varchar(32) NOT NULL,
            word_msd varchar(16) NOT NULL,
            word_text varchar(32) NOT NULL)
            """,
            """CREATE TABLE ColocationMatches (
            mid_match_id INTEGER,
            mid_colocation_id INTEGER,
            FOREIGN KEY(mid_colocation_id) REFERENCES Colocations(colocation_id),
            FOREIGN KEY(mid_match_id) REFERENCES Matches(match_id))
            """,
            """CREATE TABLE Representations (
            colocation_id INTEGER,
            component_id INTEGER,
            text varchar(32),
            FOREIGN KEY(colocation_id) REFERENCES Colocations(colocation_id))
            """,
            """CREATE TABLE Dispersions (
            structure_id varchar(64),
            component_id varchar(64),
            lemma varchar(128),
            dispersion INTEGER)
            """,
            "CREATE INDEX key_sid_c ON Colocations (key, structure_id)",
            "CREATE INDEX sid_c ON Colocations (structure_id)",
            "CREATE INDEX mmid_cm ON ColocationMatches (mid_colocation_id)",
            "CREATE INDEX mid_m ON Matches (match_id)",
            "CREATE INDEX col_r ON Representations (colocation_id)",
            "CREATE INDEX disp_key ON Dispersions (structure_id, component_id, lemma)",
        )
        # Tables come first in the tuple so every index has its table.
        for statement in schema_statements:
            self.db.init(statement)

        # Next free match id: one past the largest stored id, or 0 when the
        # Matches table is empty (MAX over no rows yields NULL / None).
        last_match_id = self.db.execute("SELECT MAX(match_id) FROM Matches").fetchone()[0]
        self.match_num = 0 if last_match_id is None else last_match_id + 1

    def _add_match(self, key, structure, match):
        """Store one match and link it to its colocation.

        Args:
            key: Sequence whose first item is the structure id; the remaining
                items are stringified with ``str(key[1:])`` and used as the
                colocation key.
            structure: Not read here; kept for interface compatibility.
            match: Mapping of component id to a word object exposing
                ``lemma``, ``msd``, ``text`` and ``id``.
        """
        structure_id, key_str = key[0], str(key[1:])
        find_colocation = "SELECT colocation_id FROM Colocations WHERE key=? AND structure_id=?"
        find_params = (key_str, structure_id)

        cid = self.db.execute(find_colocation, find_params).fetchone()
        if cid is None:
            # First occurrence of this colocation: create it, then read back
            # the id the database assigned.
            self.db.execute("INSERT INTO Colocations (structure_id, key) VALUES (?,?)",
                            (structure_id, key_str))
            cid = self.db.execute(find_colocation, find_params).fetchone()

        # One Matches row per component, all sharing the current match id.
        for component_id, word in match.items():
            self.db.execute("""
            INSERT INTO Matches (match_id, component_id, word_lemma, word_text, word_msd, word_id) 
            VALUES (:match_id, :component_id, :word_lemma, :word_text, :word_msd, :word_id)""", {
                "component_id": component_id,
                "match_id": self.match_num,
                "word_lemma": word.lemma,
                "word_msd": word.msd,
                "word_text": word.text,
                "word_id": word.id,
            })

        self.db.execute("INSERT INTO ColocationMatches (mid_colocation_id, mid_match_id) VALUES (?,?)",
                        (cid[0], self.match_num))

        self.match_num += 1

    def add_matches(self, matches):
        """Store every match in ``matches``.

        Args:
            matches: Mapping of structure to an iterable of pairs whose first
                item is the match mapping and whose second item is its key
                (see ``_add_match``).
        """
        for structure, nms in progress(matches.items(), 'adding-matches'):
            for nm in nms:
                self._add_match(nm[1], structure, nm[0])

    def get_matches_for(self, structure):
        """Yield a ``StructureMatch`` for each stored colocation of ``structure``.

        Colocations are selected by ``structure.id`` and each one is built
        with ``StructureMatch.from_db``.
        """
        rows = self.db.execute("SELECT colocation_id FROM Colocations WHERE structure_id=?",
                               (structure.id,))
        for row in rows:
            yield StructureMatch.from_db(self.db, row[0], structure)

    def set_representations(self, word_renderer, structures):
        """Compute and store the representation text of every colocation.

        Does nothing except print a notice when the ``'representation'`` step
        is already marked done in the database; otherwise marks it done at
        the end.

        Args:
            word_renderer: Passed through to
                ``RepresentationAssigner.set_representations``.
            structures: Iterable of structures; each must expose ``id``, and
                every structure id stored in Colocations must be present.
        """
        step_name = 'representation'
        if self.db.is_step_done(step_name):
            print("Representation step already done, skipping")
            return

        structures_dict = {s.id: s for s in structures}
        # Row count is only used as the total for the progress display.
        num_representations = int(self.db.execute("SELECT Count(*) FROM Colocations").fetchone()[0])
        colocations = self.db.execute("SELECT colocation_id, structure_id FROM Colocations")

        for cid, sid in progress(colocations, "representations", total=num_representations):
            structure = structures_dict[sid]
            match = StructureMatch.from_db(self.db, cid, structure)
            RepresentationAssigner.set_representations(match, word_renderer)
            # NOTE(review): match.match_id is written to the colocation_id
            # column; presumably from_db sets it to cid - confirm.
            for component_id, text in match.representations.items():
                self.db.execute("""
                    INSERT INTO Representations (colocation_id, component_id, text) 
                    VALUES (?,?,?)""", (match.match_id, component_id, text))

        self.db.step_is_done(step_name)

    def determine_colocation_dispersions(self):
        """Fill ``self.dispersions``, computing and storing them if necessary.

        When the ``'dispersions'`` step is already marked done the values are
        loaded from the Dispersions table.  Otherwise every Colocations row is
        read, its key is parsed back into (component_id, lemma) pairs, and the
        number of colocations per (structure_id, component_id, lemma) is
        counted, stored and the step marked done.
        """
        step_name = 'dispersions'
        if self.db.is_step_done(step_name):
            self.load_dispersions()
            return

        dispersions = defaultdict(int)
        rows = self.db.execute("SELECT structure_id, key FROM Colocations")
        for structure_id, word_tups_str in progress(rows, "dispersion"):
            # The key column holds the str() of a tuple written by _add_match;
            # literal_eval turns it back into Python literals without
            # executing code.
            word_tups = literal_eval(word_tups_str)
            # NOTE(review): component_id keeps the type found in the key,
            # while load_dispersions returns whatever type the database
            # stores - confirm both paths yield identical dictionary keys.
            for component_id, lemma in word_tups:
                dispersions[(str(structure_id), component_id, lemma)] += 1

        self.dispersions = dict(dispersions)
        print("Storing dispersions...")
        self.store_dispersions()

        self.db.step_is_done(step_name)

    def store_dispersions(self):
        """Insert every entry of ``self.dispersions`` into the Dispersions table."""
        for (structure_id, component_id, lemma), disp in self.dispersions.items():
            self.db.execute("INSERT INTO Dispersions (structure_id, component_id, lemma, dispersion) VALUES (?, ?, ?, ?)",
                            (structure_id, component_id, lemma, disp))

    def load_dispersions(self):
        """Replace ``self.dispersions`` with the contents of the Dispersions table."""
        self.dispersions = {}
        # Columns are named explicitly so the unpacking below does not depend
        # on the physical column order of the table.
        rows = self.db.execute("SELECT structure_id, component_id, lemma, dispersion FROM Dispersions")
        for structure_id, component_id, lemma, dispersion in progress(rows, "load-dispersions"):
            self.dispersions[structure_id, component_id, lemma] = dispersion
 |