From 1256a4de4008026d0cdb5acf89447c554739c779 Mon Sep 17 00:00:00 2001 From: Ozbolt Menegatti Date: Wed, 26 Jun 2019 13:06:43 +0200 Subject: [PATCH] Fixing loading bad gz files and progress showing --- src/loader.py | 33 +++++++++++++++++++-------------- src/progress_bar.py | 11 +++++------ src/wani.py | 2 +- 3 files changed, 25 insertions(+), 21 deletions(-) diff --git a/src/loader.py b/src/loader.py index 1652e47..14d118f 100644 --- a/src/loader.py +++ b/src/loader.py @@ -18,7 +18,7 @@ def load_files(args): skip_id_check = args.skip_id_check do_msd_translate = not args.no_msd_translate - for fname in filenames: + for fname in progress(filenames, "files", outfile=True): extension = pathlib.Path(fname).suffix if extension == ".xml": @@ -34,20 +34,28 @@ def load_gz(filename): result = [] bad_sentence = False + words = {} + links = [] + + def sentence_end(bad_sentence): + if bad_sentence: + return + + for lfrom, ldest, ana in links: + if lfrom not in words or ldest not in words: + logging.warning("Bad link in sentence: " + line_split[0]) + continue + words[lfrom].add_link(ana, words[ldest]) + result.extend(words.values()) + with gzip.open(filename, 'r') as fp: - words = {} - links = [] - for line in progress(fp, 'load-gz'): + for line in progress(fp, 'load-gz', infile=True): line_str = line.decode('utf8').strip() line_fixed = line_str.replace(',', '\t').replace('\t\t\t', '\t,\t') line_split = line_fixed.split("\t") if line_split[1] == "1" and len(words) > 0: - if not bad_sentence: - for lfrom, ldest, ana in links: - words[lfrom].add_link(ana, words[ldest]) - result.extend(words.values()) - + sentence_end(bad_sentence) bad_sentence = False links = [] words = {} @@ -62,10 +70,7 @@ def load_gz(filename): if link_src != '0': links.append((link_src, wid, link_type)) - for lfrom, ldest, ana in links: - words[lfrom].add_link(ana, words[ldest]) - result.extend(words.values()) - + sentence_end(bad_sentence) return result def load_xml(filename): @@ -114,4 +119,4 @@ def file_sentence_generator(et, skip_id_check, do_msd_translate, pc_tag): # strange errors, just skip... pass - return list(words.values()) \ No newline at end of file + return list(words.values()) diff --git a/src/progress_bar.py b/src/progress_bar.py index 75c7121..6f8ad48 100644 --- a/src/progress_bar.py +++ b/src/progress_bar.py @@ -6,15 +6,14 @@ except ImportError: class Progress: def __init__(self): - self.infile = False + self.hide_inner = False def __call__(self, iterable, description, infile=False, outfile=False): show_progress = True - if infile and not self.infile: - show_progress = False - elif outfile and self.infile: - show_progress = False + if True in (infile, outfile): + assert False in (infile, outfile) + show_progress = outfile == self.hide_inner if not show_progress: yield from iterable @@ -35,7 +34,7 @@ class Progress: def init(self, args): - self.infile = not args.hide_inner_progress + self.hide_inner = args.hide_inner_progress progress = Progress() diff --git a/src/wani.py b/src/wani.py index 85755eb..79785ca 100644 --- a/src/wani.py +++ b/src/wani.py @@ -72,7 +72,7 @@ def main(args): word_stats.add_words(words) else: - for words in progress(load_files(args), "files", outfile=True): + for words in load_files(args): matches = match_file(words, structures) # just save to temporary file, used for children of a parallel process # MUST NOT have more than one file