diff --git a/src/database.py b/src/database.py
new file mode 100644
index 0000000..32da2a2
--- /dev/null
+++ b/src/database.py
@@ -0,0 +1,48 @@
+import sqlite3
+import os
+
+
+class Database:
+    """Thin wrapper around a sqlite3 connection that knows if it is new.
+
+    ``args`` is the parsed command-line namespace. Only ``args.db`` (path of
+    the database file; None means an in-memory database) and ``args.keep_db``
+    (reuse an existing file instead of deleting it first) are read.
+    """
+
+    def __init__(self, args):
+        in_memory = args.db is None or args.db == ":memory:"
+        filename = ":memory:" if in_memory else args.db
+
+        # ":memory:" is not a path: never look for, or delete, a file with
+        # that name, and always treat an in-memory database as new.
+        if not in_memory and not args.keep_db:
+            try:
+                os.remove(filename)
+            except FileNotFoundError:
+                pass
+
+        # True when the schema still has to be created, see init().
+        self.new = in_memory or not os.path.exists(filename)
+        self.db = sqlite3.connect(filename)
+
+    def execute(self, *args, **kwargs):
+        """Forward all arguments to sqlite3's Connection.execute."""
+        return self.db.execute(*args, **kwargs)
+
+    def init(self, *args, **kwargs):
+        """Same as execute, only skipped if not a new database file.
+
+        Returns the result of execute, or None when skipped.
+        """
+        if self.new:
+            return self.execute(*args, **kwargs)
+        return None
+
+    def commit(self):
+        """Commit the current transaction."""
+        self.db.commit()
+
+    def close(self):
+        """Close the underlying connection."""
+        self.db.close()
diff --git a/src/wani.py b/src/wani.py
index 79785ca..b3d1637 100644
--- a/src/wani.py
+++ b/src/wani.py
@@ -16,6 +16,7 @@ from match_store import MatchStore
 from word_stats import WordStats
 from writer import Writer
 from loader import load_files
+from database import Database
 
 
 def match_file(words, structures):
@@ -37,8 +38,9 @@ def match_file(words, structures):
 
 def main(args):
     structures, lemma_msds, max_num_components = build_structures(args)
+    database = Database(args)
     match_store = MatchStore(args)
-    word_stats = WordStats(lemma_msds)
+    word_stats = WordStats(lemma_msds, database)
 
     if args.parallel:
         num_parallel = int(args.parallel)
@@ -139,6 +141,11 @@ if __name__ == '__main__':
     parser.add_argument('--sort-reversed',
                         help="Sort in reversed ored", action='store_true')
 
+    parser.add_argument('--db',
+                        help="Database file to use (instead of memory)", default=None)
+    parser.add_argument('--keep-db',
+                        help="Does not recreate new database file", action='store_true')
+
     parser.add_argument('--pc-tag',
                         help='Tag for separators, usually pc or c', default="pc")
     parser.add_argument('--parallel',
diff --git a/src/word_stats.py b/src/word_stats.py
index 46a4ca1..bb34b6c 100644
--- a/src/word_stats.py
+++ b/src/word_stats.py
@@ -1,34 +1,30 @@
 from collections import defaultdict, 
Counter from progress_bar import progress -import sqlite3 class WordStats: - def __init__(self, lemma_features): + def __init__(self, lemma_features, db): self.lemma_features = lemma_features + self.db = db self.all_words = 0 self.memoized_msd_merges = {} - with open("sqlite.db", 'w') as fp: - fp.write("") - - self.db = sqlite3.connect('sqlite.db') - self.db.execute("""CREATE TABLE UniqWords ( + self.db.init("""CREATE TABLE UniqWords ( uw_id INTEGER PRIMARY KEY, lemma varchar(64), msd varchar(16), text varchar(64), frequency int )""") - self.db.execute("CREATE TABLE CommonMsd (lemma varchar(64), msd varchar(16))") - self.db.execute("CREATE TABLE WordCount (lemma varchar(64), msd0 char, frequency int)") + self.db.init("CREATE TABLE CommonMsd (lemma varchar(64), msd varchar(16))") + self.db.init("CREATE TABLE WordCount (lemma varchar(64), msd0 char, frequency int)") - self.db.execute("CREATE INDEX lemma_msd_text_on_uw ON UniqWords (lemma, msd, text)") - self.db.execute("CREATE INDEX lemma_on_uw ON UniqWords (lemma)") - self.db.execute("CREATE INDEX lemma_on_cm ON CommonMsd (lemma)") - self.db.execute("CREATE INDEX lemma_msd0_on_wc ON WordCount (lemma, msd0)") + self.db.init("CREATE INDEX lemma_msd_text_on_uw ON UniqWords (lemma, msd, text)") + self.db.init("CREATE INDEX lemma_on_uw ON UniqWords (lemma)") + self.db.init("CREATE INDEX lemma_on_cm ON CommonMsd (lemma)") + self.db.init("CREATE INDEX lemma_msd0_on_wc ON WordCount (lemma, msd0)") def add_words(self, words): for w in progress(words, "adding-words"):