Fix loading of bad gz files and the progress display

2019-06-26 13:06:43 +02:00
parent 049f5ca3dc
commit 1256a4de40
3 changed files with 25 additions and 21 deletions
@@ -18,7 +18,7 @@ def load_files(args):
    skip_id_check = args.skip_id_check
    do_msd_translate = not args.no_msd_translate
-    for fname in filenames:
+    for fname in progress(filenames, "files", outfile=True):
        extension = pathlib.Path(fname).suffix
        if extension == ".xml":
@@ -34,20 +34,28 @@ def load_gz(filename):
    result = []
    bad_sentence = False
    words = {}
    links = []
    def sentence_end(bad_sentence):
        """Finish the current sentence: attach its links and collect its words.

        Reads ``words``, ``links``, ``result`` and ``line_split`` from the
        enclosing scope instead of taking them as arguments.

        bad_sentence: when truthy the sentence is dropped entirely -- no
            links are attached and nothing is appended to ``result``.
        """
        if bad_sentence:
            return
        for lfrom, ldest, ana in links:
            # A link whose source or destination id is not among this
            # sentence's words is logged and skipped rather than looked up.
            if lfrom not in words or ldest not in words:
                # NOTE(review): line_split holds the most recently parsed
                # line, which may already belong to the next sentence when
                # this runs -- confirm the logged value is the intended one.
                logging.warning("Bad link in sentence: " + line_split[0])
                continue
            words[lfrom].add_link(ana, words[ldest])
        # The words are collected even if some of their links were skipped.
        result.extend(words.values())
    with gzip.open(filename, 'r') as fp:
-        words = {}
+        for line in progress(fp, 'load-gz', infile=True):
        links = []
        for line in progress(fp, 'load-gz'):
            line_str = line.decode('utf8').strip()
            line_fixed = line_str.replace(',', '\t').replace('\t\t\t', '\t,\t')
            line_split = line_fixed.split("\t")
            if line_split[1] == "1" and len(words) > 0:
-                if not bad_sentence:
+                sentence_end(bad_sentence)
                    for lfrom, ldest, ana in links:
                        words[lfrom].add_link(ana, words[ldest])
                    result.extend(words.values())
                bad_sentence = False
                links = []
                words = {}
@@ -62,10 +70,7 @@ def load_gz(filename):
            if link_src != '0':
                links.append((link_src, wid, link_type))
-    for lfrom, ldest, ana in links:
+    sentence_end(bad_sentence)
        words[lfrom].add_link(ana, words[ldest])
    result.extend(words.values())
    return result
 def load_xml(filename):
@@ -6,15 +6,14 @@ except ImportError:
 class Progress:
    def __init__(self):
-        self.infile = False
+        self.hide_inner = False
    def __call__(self, iterable, description, infile=False, outfile=False):
        show_progress = True
-        if infile and not self.infile:
+        if True in (infile, outfile):
-            show_progress = False
+            assert False in (infile, outfile)
-        elif outfile and self.infile:
+            show_progress = outfile == self.hide_inner
            show_progress = False
        if not show_progress:
            yield from iterable
@@ -35,7 +34,7 @@ class Progress:
    def init(self, args):
-        self.infile = not args.hide_inner_progress
+        self.hide_inner = args.hide_inner_progress
 progress = Progress()
@@ -72,7 +72,7 @@ def main(args):
                    word_stats.add_words(words)
    else:
-        for words in progress(load_files(args), "files", outfile=True):
+        for words in load_files(args):
            matches = match_file(words, structures)
            # just save to temporary file, used for children of a parallel process
            # MUST NOT have more than one file