Fix loading of bad gz files and the progress display

This commit is contained in:
Ozbolt Menegatti 2019-06-26 13:06:43 +02:00
parent 049f5ca3dc
commit 1256a4de40
3 changed files with 25 additions and 21 deletions

View File

@ -18,7 +18,7 @@ def load_files(args):
skip_id_check = args.skip_id_check skip_id_check = args.skip_id_check
do_msd_translate = not args.no_msd_translate do_msd_translate = not args.no_msd_translate
for fname in filenames: for fname in progress(filenames, "files", outfile=True):
extension = pathlib.Path(fname).suffix extension = pathlib.Path(fname).suffix
if extension == ".xml": if extension == ".xml":
@ -34,20 +34,28 @@ def load_gz(filename):
result = [] result = []
bad_sentence = False bad_sentence = False
words = {}
links = []
def sentence_end(bad_sentence):
if bad_sentence:
return
for lfrom, ldest, ana in links:
if lfrom not in words or ldest not in words:
logging.warning("Bad link in sentence: " + line_split[0])
continue
words[lfrom].add_link(ana, words[ldest])
result.extend(words.values())
with gzip.open(filename, 'r') as fp: with gzip.open(filename, 'r') as fp:
words = {} for line in progress(fp, 'load-gz', infile=True):
links = []
for line in progress(fp, 'load-gz'):
line_str = line.decode('utf8').strip() line_str = line.decode('utf8').strip()
line_fixed = line_str.replace(',', '\t').replace('\t\t\t', '\t,\t') line_fixed = line_str.replace(',', '\t').replace('\t\t\t', '\t,\t')
line_split = line_fixed.split("\t") line_split = line_fixed.split("\t")
if line_split[1] == "1" and len(words) > 0: if line_split[1] == "1" and len(words) > 0:
if not bad_sentence: sentence_end(bad_sentence)
for lfrom, ldest, ana in links:
words[lfrom].add_link(ana, words[ldest])
result.extend(words.values())
bad_sentence = False bad_sentence = False
links = [] links = []
words = {} words = {}
@ -62,10 +70,7 @@ def load_gz(filename):
if link_src != '0': if link_src != '0':
links.append((link_src, wid, link_type)) links.append((link_src, wid, link_type))
for lfrom, ldest, ana in links: sentence_end(bad_sentence)
words[lfrom].add_link(ana, words[ldest])
result.extend(words.values())
return result return result
def load_xml(filename): def load_xml(filename):

View File

@ -6,15 +6,14 @@ except ImportError:
class Progress: class Progress:
def __init__(self): def __init__(self):
self.infile = False self.hide_inner = False
def __call__(self, iterable, description, infile=False, outfile=False): def __call__(self, iterable, description, infile=False, outfile=False):
show_progress = True show_progress = True
if infile and not self.infile: if True in (infile, outfile):
show_progress = False assert False in (infile, outfile)
elif outfile and self.infile: show_progress = outfile == self.hide_inner
show_progress = False
if not show_progress: if not show_progress:
yield from iterable yield from iterable
@ -35,7 +34,7 @@ class Progress:
def init(self, args): def init(self, args):
self.infile = not args.hide_inner_progress self.hide_inner = args.hide_inner_progress
progress = Progress() progress = Progress()

View File

@ -72,7 +72,7 @@ def main(args):
word_stats.add_words(words) word_stats.add_words(words)
else: else:
for words in progress(load_files(args), "files", outfile=True): for words in load_files(args):
matches = match_file(words, structures) matches = match_file(words, structures)
# just save to temporary file, used for children of a parallel process # just save to temporary file, used for children of a parallel process
# MUST NOT have more than one file # MUST NOT have more than one file