Record loaded filenames in the database so already-loaded files are skipped
This commit is contained in:
parent
8cca761b91
commit
2018745d52
|
@ -13,15 +13,23 @@ def is_root_id(id_):
|
||||||
return len(id_.split('.')) == 3
|
return len(id_.split('.')) == 3
|
||||||
|
|
||||||
|
|
||||||
def load_files(args):
|
def load_files(args, database):
|
||||||
filenames = args.input
|
filenames = args.input
|
||||||
skip_id_check = args.skip_id_check
|
skip_id_check = args.skip_id_check
|
||||||
do_msd_translate = not args.no_msd_translate
|
do_msd_translate = not args.no_msd_translate
|
||||||
|
|
||||||
|
database.init("CREATE TABLE Files ( filename varchar(2048) )")
|
||||||
|
|
||||||
for idx, fname in enumerate(filenames):
|
for idx, fname in enumerate(filenames):
|
||||||
print("FILE ", fname, "{}/{}".format(idx, len(filenames)))
|
print("FILE ", fname, "{}/{}".format(idx, len(filenames)))
|
||||||
extension = pathlib.Path(fname).suffix
|
extension = pathlib.Path(fname).suffix
|
||||||
|
|
||||||
|
# check if file with the same name already loaded...
|
||||||
|
loaded = database.execute("SELECT * FROM Files WHERE filename=?", (fname, )).fetchone()
|
||||||
|
if loaded is not None:
|
||||||
|
print("ALREADY LOADED")
|
||||||
|
continue
|
||||||
|
|
||||||
if extension == ".xml":
|
if extension == ".xml":
|
||||||
et = load_xml(fname)
|
et = load_xml(fname)
|
||||||
yield file_sentence_generator(et, skip_id_check, do_msd_translate, args.pc_tag)
|
yield file_sentence_generator(et, skip_id_check, do_msd_translate, args.pc_tag)
|
||||||
|
@ -32,6 +40,10 @@ def load_files(args):
|
||||||
# else:
|
# else:
|
||||||
# raise NotImplementedError("Unknown file extension: {}".format(extension))
|
# raise NotImplementedError("Unknown file extension: {}".format(extension))
|
||||||
|
|
||||||
|
database.execute("INSERT INTO Files (filename) VALUES (?)", (fname, ))
|
||||||
|
database.commit()
|
||||||
|
|
||||||
|
|
||||||
def lines_gz(filename):
|
def lines_gz(filename):
|
||||||
with gzip.open(filename, 'r') as fp:
|
with gzip.open(filename, 'r') as fp:
|
||||||
for line in progress(fp, 'load-gz'):
|
for line in progress(fp, 'load-gz'):
|
||||||
|
|
|
@ -43,7 +43,10 @@ def main(args):
|
||||||
match_store = MatchStore(args, database)
|
match_store = MatchStore(args, database)
|
||||||
word_stats = WordStats(lemma_msds, database)
|
word_stats = WordStats(lemma_msds, database)
|
||||||
|
|
||||||
for words in load_files(args):
|
for words in load_files(args, database):
|
||||||
|
if words is None:
|
||||||
|
continue
|
||||||
|
|
||||||
matches = match_file(words, structures)
|
matches = match_file(words, structures)
|
||||||
match_store.add_matches(matches)
|
match_store.add_matches(matches)
|
||||||
word_stats.add_words(words)
|
word_stats.add_words(words)
|
||||||
|
|
Loading…
Reference in New Issue
Block a user