diff --git a/src/loader.py b/src/loader.py index 9664fc7..1876352 100644 --- a/src/loader.py +++ b/src/loader.py @@ -13,15 +13,23 @@ def is_root_id(id_): return len(id_.split('.')) == 3 -def load_files(args): +def load_files(args, database): filenames = args.input skip_id_check = args.skip_id_check do_msd_translate = not args.no_msd_translate + database.init("CREATE TABLE Files ( filename varchar(2048) )") + for idx, fname in enumerate(filenames): print("FILE ", fname, "{}/{}".format(idx, len(filenames))) extension = pathlib.Path(fname).suffix + # check if file with the same name already loaded... + loaded = database.execute("SELECT * FROM Files WHERE filename=?", (fname, )).fetchone() + if loaded is not None: + print("ALREADY LOADED") + continue + if extension == ".xml": et = load_xml(fname) yield file_sentence_generator(et, skip_id_check, do_msd_translate, args.pc_tag) @@ -32,6 +40,10 @@ def load_files(args): # else: # raise NotImplementedError("Unknown file extension: {}".format(extension)) + database.execute("INSERT INTO Files (filename) VALUES (?)", (fname, )) + database.commit() + + def lines_gz(filename): with gzip.open(filename, 'r') as fp: for line in progress(fp, 'load-gz'): diff --git a/src/wani.py b/src/wani.py index 364ccc4..5826ae0 100644 --- a/src/wani.py +++ b/src/wani.py @@ -43,7 +43,10 @@ def main(args): match_store = MatchStore(args, database) word_stats = WordStats(lemma_msds, database) - for words in load_files(args): + for words in load_files(args, database): + if words is None: + continue + matches = match_file(words, structures) match_store.add_matches(matches) word_stats.add_words(words)