adding separate database class

This commit is contained in:
Ozbolt Menegatti 2019-06-27 12:37:23 +02:00
parent fa8a5e55f8
commit c25844a335
3 changed files with 40 additions and 14 deletions

23
src/database.py Normal file
View File

@ -0,0 +1,23 @@
import sqlite3
import os
class Database:
    """Thin wrapper around a sqlite3 connection selected by CLI arguments.

    The wrapper remembers whether the underlying database was freshly
    created (``self.new``) so that schema-creation statements issued via
    :meth:`init` are only run once per database file.
    """

    def __init__(self, args):
        """Open (and optionally recreate) the database described by *args*.

        Args:
            args: Object with two attributes:
                ``db`` - path of the database file, or ``None`` to use an
                in-memory database;
                ``keep_db`` - when false, an existing database file is
                deleted before connecting so the run starts from scratch.
        """
        in_memory = args.db is None or args.db == ":memory:"
        filename = ":memory:" if in_memory else args.db

        if in_memory:
            # ":memory:" is a sqlite sentinel, not a path: never touch the
            # filesystem for it. An in-memory database always starts empty,
            # so the schema must always be created.
            self.new = True
        else:
            if not args.keep_db and os.path.exists(filename):
                os.remove(filename)
            # After the optional removal, the database is new exactly when
            # no file is left for sqlite to reopen.
            self.new = not os.path.exists(filename)

        self.db = sqlite3.connect(filename)

    def execute(self, *args, **kwargs):
        """Forward to the connection's ``execute`` and return its result."""
        return self.db.execute(*args, **kwargs)

    def init(self, *args, **kwargs):
        """Run a statement only when the database was freshly created.

        Same as :meth:`execute`, but skipped (returning ``None``) when an
        existing database file was reused.
        """
        if self.new:
            return self.execute(*args, **kwargs)
        return None

    def commit(self):
        """Commit the current transaction on the underlying connection."""
        self.db.commit()

    def close(self):
        """Close the underlying connection."""
        self.db.close()

View File

@ -16,6 +16,7 @@ from match_store import MatchStore
from word_stats import WordStats from word_stats import WordStats
from writer import Writer from writer import Writer
from loader import load_files from loader import load_files
from database import Database
def match_file(words, structures): def match_file(words, structures):
@ -37,8 +38,9 @@ def match_file(words, structures):
def main(args): def main(args):
structures, lemma_msds, max_num_components = build_structures(args) structures, lemma_msds, max_num_components = build_structures(args)
database = Database(args)
match_store = MatchStore(args) match_store = MatchStore(args)
word_stats = WordStats(lemma_msds) word_stats = WordStats(lemma_msds, database)
if args.parallel: if args.parallel:
num_parallel = int(args.parallel) num_parallel = int(args.parallel)
@ -139,6 +141,11 @@ if __name__ == '__main__':
parser.add_argument('--sort-reversed', parser.add_argument('--sort-reversed',
help="Sort in reversed ored", action='store_true') help="Sort in reversed ored", action='store_true')
parser.add_argument('--db',
help="Database file to use (instead of memory)", default=None)
parser.add_argument('--keep-db',
help="Does not recreate new database file", action='store_true')
parser.add_argument('--pc-tag', parser.add_argument('--pc-tag',
help='Tag for separators, usually pc or c', default="pc") help='Tag for separators, usually pc or c', default="pc")
parser.add_argument('--parallel', parser.add_argument('--parallel',

View File

@ -1,34 +1,30 @@
from collections import defaultdict, Counter from collections import defaultdict, Counter
from progress_bar import progress from progress_bar import progress
import sqlite3
class WordStats: class WordStats:
def __init__(self, lemma_features): def __init__(self, lemma_features, db):
self.lemma_features = lemma_features self.lemma_features = lemma_features
self.db = db
self.all_words = 0 self.all_words = 0
self.memoized_msd_merges = {} self.memoized_msd_merges = {}
with open("sqlite.db", 'w') as fp: self.db.init("""CREATE TABLE UniqWords (
fp.write("")
self.db = sqlite3.connect('sqlite.db')
self.db.execute("""CREATE TABLE UniqWords (
uw_id INTEGER PRIMARY KEY, uw_id INTEGER PRIMARY KEY,
lemma varchar(64), lemma varchar(64),
msd varchar(16), msd varchar(16),
text varchar(64), text varchar(64),
frequency int frequency int
)""") )""")
self.db.execute("CREATE TABLE CommonMsd (lemma varchar(64), msd varchar(16))") self.db.init("CREATE TABLE CommonMsd (lemma varchar(64), msd varchar(16))")
self.db.execute("CREATE TABLE WordCount (lemma varchar(64), msd0 char, frequency int)") self.db.init("CREATE TABLE WordCount (lemma varchar(64), msd0 char, frequency int)")
self.db.execute("CREATE INDEX lemma_msd_text_on_uw ON UniqWords (lemma, msd, text)") self.db.init("CREATE INDEX lemma_msd_text_on_uw ON UniqWords (lemma, msd, text)")
self.db.execute("CREATE INDEX lemma_on_uw ON UniqWords (lemma)") self.db.init("CREATE INDEX lemma_on_uw ON UniqWords (lemma)")
self.db.execute("CREATE INDEX lemma_on_cm ON CommonMsd (lemma)") self.db.init("CREATE INDEX lemma_on_cm ON CommonMsd (lemma)")
self.db.execute("CREATE INDEX lemma_msd0_on_wc ON WordCount (lemma, msd0)") self.db.init("CREATE INDEX lemma_msd0_on_wc ON WordCount (lemma, msd0)")
def add_words(self, words): def add_words(self, words):
for w in progress(words, "adding-words"): for w in progress(words, "adding-words"):