Fix loading of bad gz files and progress display

This commit is contained in:
Ozbolt Menegatti 2019-06-26 13:06:43 +02:00
parent 049f5ca3dc
commit 1256a4de40
3 changed files with 25 additions and 21 deletions

View File

@ -18,7 +18,7 @@ def load_files(args):
skip_id_check = args.skip_id_check
do_msd_translate = not args.no_msd_translate
for fname in filenames:
for fname in progress(filenames, "files", outfile=True):
extension = pathlib.Path(fname).suffix
if extension == ".xml":
@ -34,20 +34,28 @@ def load_gz(filename):
result = []
bad_sentence = False
words = {}
links = []
def sentence_end(bad_sentence):
if bad_sentence:
return
for lfrom, ldest, ana in links:
if lfrom not in words or ldest not in words:
logging.warning("Bad link in sentence: " + line_split[0])
continue
words[lfrom].add_link(ana, words[ldest])
result.extend(words.values())
with gzip.open(filename, 'r') as fp:
words = {}
links = []
for line in progress(fp, 'load-gz'):
for line in progress(fp, 'load-gz', infile=True):
line_str = line.decode('utf8').strip()
line_fixed = line_str.replace(',', '\t').replace('\t\t\t', '\t,\t')
line_split = line_fixed.split("\t")
if line_split[1] == "1" and len(words) > 0:
if not bad_sentence:
for lfrom, ldest, ana in links:
words[lfrom].add_link(ana, words[ldest])
result.extend(words.values())
sentence_end(bad_sentence)
bad_sentence = False
links = []
words = {}
@ -62,10 +70,7 @@ def load_gz(filename):
if link_src != '0':
links.append((link_src, wid, link_type))
for lfrom, ldest, ana in links:
words[lfrom].add_link(ana, words[ldest])
result.extend(words.values())
sentence_end(bad_sentence)
return result
def load_xml(filename):
@ -114,4 +119,4 @@ def file_sentence_generator(et, skip_id_check, do_msd_translate, pc_tag):
# strange errors, just skip...
pass
return list(words.values())
return list(words.values())

View File

@ -6,15 +6,14 @@ except ImportError:
class Progress:
def __init__(self):
self.infile = False
self.hide_inner = False
def __call__(self, iterable, description, infile=False, outfile=False):
show_progress = True
if infile and not self.infile:
show_progress = False
elif outfile and self.infile:
show_progress = False
if True in (infile, outfile):
assert False in (infile, outfile)
show_progress = outfile == self.hide_inner
if not show_progress:
yield from iterable
@ -35,7 +34,7 @@ class Progress:
def init(self, args):
self.infile = not args.hide_inner_progress
self.hide_inner = args.hide_inner_progress
progress = Progress()

View File

@ -72,7 +72,7 @@ def main(args):
word_stats.add_words(words)
else:
for words in progress(load_files(args), "files", outfile=True):
for words in load_files(args):
matches = match_file(words, structures)
# just save to temporary file, used for children of a parallel process
# MUST NOT have more than one file