Record loaded files in the database and skip files that were already loaded

This commit is contained in:
Ozbolt Menegatti 2019-08-21 11:12:38 +02:00
parent 8cca761b91
commit 2018745d52
2 changed files with 17 additions and 2 deletions

View File

@ -13,15 +13,23 @@ def is_root_id(id_):
return len(id_.split('.')) == 3
def load_files(args):
def load_files(args, database):
filenames = args.input
skip_id_check = args.skip_id_check
do_msd_translate = not args.no_msd_translate
database.init("CREATE TABLE Files ( filename varchar(2048) )")
for idx, fname in enumerate(filenames):
print("FILE ", fname, "{}/{}".format(idx, len(filenames)))
extension = pathlib.Path(fname).suffix
# check if file with the same name already loaded...
loaded = database.execute("SELECT * FROM Files WHERE filename=?", (fname, )).fetchone()
if loaded is not None:
print("ALREADY LOADED")
continue
if extension == ".xml":
et = load_xml(fname)
yield file_sentence_generator(et, skip_id_check, do_msd_translate, args.pc_tag)
@ -32,6 +40,10 @@ def load_files(args):
# else:
# raise NotImplementedError("Unknown file extension: {}".format(extension))
database.execute("INSERT INTO Files (filename) VALUES (?)", (fname, ))
database.commit()
def lines_gz(filename):
with gzip.open(filename, 'r') as fp:
for line in progress(fp, 'load-gz'):

View File

@ -43,7 +43,10 @@ def main(args):
match_store = MatchStore(args, database)
word_stats = WordStats(lemma_msds, database)
for words in load_files(args):
for words in load_files(args, database):
if words is None:
continue
matches = match_file(words, structures)
match_store.add_matches(matches)
word_stats.add_words(words)