Fix loading of bad gz files and progress display
This commit is contained in:
parent
049f5ca3dc
commit
1256a4de40
|
@ -18,7 +18,7 @@ def load_files(args):
|
|||
skip_id_check = args.skip_id_check
|
||||
do_msd_translate = not args.no_msd_translate
|
||||
|
||||
for fname in filenames:
|
||||
for fname in progress(filenames, "files", outfile=True):
|
||||
extension = pathlib.Path(fname).suffix
|
||||
|
||||
if extension == ".xml":
|
||||
|
@ -34,20 +34,28 @@ def load_gz(filename):
|
|||
result = []
|
||||
bad_sentence = False
|
||||
|
||||
words = {}
|
||||
links = []
|
||||
|
||||
def sentence_end(bad_sentence):
|
||||
if bad_sentence:
|
||||
return
|
||||
|
||||
for lfrom, ldest, ana in links:
|
||||
if lfrom not in words or ldest not in words:
|
||||
logging.warning("Bad link in sentence: " + line_split[0])
|
||||
continue
|
||||
words[lfrom].add_link(ana, words[ldest])
|
||||
result.extend(words.values())
|
||||
|
||||
with gzip.open(filename, 'r') as fp:
|
||||
words = {}
|
||||
links = []
|
||||
for line in progress(fp, 'load-gz'):
|
||||
for line in progress(fp, 'load-gz', infile=True):
|
||||
line_str = line.decode('utf8').strip()
|
||||
line_fixed = line_str.replace(',', '\t').replace('\t\t\t', '\t,\t')
|
||||
line_split = line_fixed.split("\t")
|
||||
|
||||
if line_split[1] == "1" and len(words) > 0:
|
||||
if not bad_sentence:
|
||||
for lfrom, ldest, ana in links:
|
||||
words[lfrom].add_link(ana, words[ldest])
|
||||
result.extend(words.values())
|
||||
|
||||
sentence_end(bad_sentence)
|
||||
bad_sentence = False
|
||||
links = []
|
||||
words = {}
|
||||
|
@ -62,10 +70,7 @@ def load_gz(filename):
|
|||
if link_src != '0':
|
||||
links.append((link_src, wid, link_type))
|
||||
|
||||
for lfrom, ldest, ana in links:
|
||||
words[lfrom].add_link(ana, words[ldest])
|
||||
result.extend(words.values())
|
||||
|
||||
sentence_end(bad_sentence)
|
||||
return result
|
||||
|
||||
def load_xml(filename):
|
||||
|
@ -114,4 +119,4 @@ def file_sentence_generator(et, skip_id_check, do_msd_translate, pc_tag):
|
|||
# strange errors, just skip...
|
||||
pass
|
||||
|
||||
return list(words.values())
|
||||
return list(words.values())
|
||||
|
|
|
@ -6,15 +6,14 @@ except ImportError:
|
|||
|
||||
class Progress:
|
||||
def __init__(self):
|
||||
self.infile = False
|
||||
self.hide_inner = False
|
||||
|
||||
|
||||
def __call__(self, iterable, description, infile=False, outfile=False):
|
||||
show_progress = True
|
||||
if infile and not self.infile:
|
||||
show_progress = False
|
||||
elif outfile and self.infile:
|
||||
show_progress = False
|
||||
if True in (infile, outfile):
|
||||
assert False in (infile, outfile)
|
||||
show_progress = outfile == self.hide_inner
|
||||
|
||||
if not show_progress:
|
||||
yield from iterable
|
||||
|
@ -35,7 +34,7 @@ class Progress:
|
|||
|
||||
|
||||
def init(self, args):
|
||||
self.infile = not args.hide_inner_progress
|
||||
self.hide_inner = args.hide_inner_progress
|
||||
|
||||
|
||||
progress = Progress()
|
||||
|
|
|
@ -72,7 +72,7 @@ def main(args):
|
|||
word_stats.add_words(words)
|
||||
|
||||
else:
|
||||
for words in progress(load_files(args), "files", outfile=True):
|
||||
for words in load_files(args):
|
||||
matches = match_file(words, structures)
|
||||
# just save to temporary file, used for children of a parallel process
|
||||
# MUST NOT have more than one file
|
||||
|
|
Loading…
Reference in New Issue
Block a user