Record loaded files in the database so already-loaded files are skipped
This commit is contained in:
parent
8cca761b91
commit
2018745d52
|
@ -13,15 +13,23 @@ def is_root_id(id_):
|
|||
return len(id_.split('.')) == 3
|
||||
|
||||
|
||||
def load_files(args):
|
||||
def load_files(args, database):
|
||||
filenames = args.input
|
||||
skip_id_check = args.skip_id_check
|
||||
do_msd_translate = not args.no_msd_translate
|
||||
|
||||
database.init("CREATE TABLE Files ( filename varchar(2048) )")
|
||||
|
||||
for idx, fname in enumerate(filenames):
|
||||
print("FILE ", fname, "{}/{}".format(idx, len(filenames)))
|
||||
extension = pathlib.Path(fname).suffix
|
||||
|
||||
# check if file with the same name already loaded...
|
||||
loaded = database.execute("SELECT * FROM Files WHERE filename=?", (fname, )).fetchone()
|
||||
if loaded is not None:
|
||||
print("ALREADY LOADED")
|
||||
continue
|
||||
|
||||
if extension == ".xml":
|
||||
et = load_xml(fname)
|
||||
yield file_sentence_generator(et, skip_id_check, do_msd_translate, args.pc_tag)
|
||||
|
@ -32,6 +40,10 @@ def load_files(args):
|
|||
# else:
|
||||
# raise NotImplementedError("Unknown file extension: {}".format(extension))
|
||||
|
||||
database.execute("INSERT INTO Files (filename) VALUES (?)", (fname, ))
|
||||
database.commit()
|
||||
|
||||
|
||||
def lines_gz(filename):
|
||||
with gzip.open(filename, 'r') as fp:
|
||||
for line in progress(fp, 'load-gz'):
|
||||
|
|
|
@ -43,7 +43,10 @@ def main(args):
|
|||
match_store = MatchStore(args, database)
|
||||
word_stats = WordStats(lemma_msds, database)
|
||||
|
||||
for words in load_files(args):
|
||||
for words in load_files(args, database):
|
||||
if words is None:
|
||||
continue
|
||||
|
||||
matches = match_file(words, structures)
|
||||
match_store.add_matches(matches)
|
||||
word_stats.add_words(words)
|
||||
|
|
Loading…
Reference in New Issue
Block a user