Fix loading of bad gz files and progress display

This commit is contained in:
2019-06-26 13:06:43 +02:00
parent 049f5ca3dc
commit 1256a4de40
3 changed files with 25 additions and 21 deletions

View File

@@ -18,7 +18,7 @@ def load_files(args):
skip_id_check = args.skip_id_check
do_msd_translate = not args.no_msd_translate
for fname in filenames:
for fname in progress(filenames, "files", outfile=True):
extension = pathlib.Path(fname).suffix
if extension == ".xml":
@@ -34,20 +34,28 @@ def load_gz(filename):
result = []
bad_sentence = False
words = {}
links = []
def sentence_end(bad_sentence):
    """Flush the currently collected sentence into ``result``.

    Does nothing when ``bad_sentence`` is truthy. Otherwise every
    ``(source, destination, label)`` triple in ``links`` is applied by
    calling ``add_link(label, words[destination])`` on ``words[source]``;
    a triple whose source or destination key is missing from ``words``
    is reported with a warning and skipped. Finally all values of
    ``words`` are appended to ``result``.

    ``links``, ``words``, ``result`` and ``line_split`` are not
    parameters: they are read from the enclosing scope at call time.
    """
    if not bad_sentence:
        for src_id, dst_id, link_label in links:
            if src_id in words and dst_id in words:
                words[src_id].add_link(link_label, words[dst_id])
            else:
                # Link points at a word id that was never collected.
                logging.warning("Bad link in sentence: " + line_split[0])
        result.extend(words.values())
with gzip.open(filename, 'r') as fp:
words = {}
links = []
for line in progress(fp, 'load-gz'):
for line in progress(fp, 'load-gz', infile=True):
line_str = line.decode('utf8').strip()
line_fixed = line_str.replace(',', '\t').replace('\t\t\t', '\t,\t')
line_split = line_fixed.split("\t")
if line_split[1] == "1" and len(words) > 0:
if not bad_sentence:
for lfrom, ldest, ana in links:
words[lfrom].add_link(ana, words[ldest])
result.extend(words.values())
sentence_end(bad_sentence)
bad_sentence = False
links = []
words = {}
@@ -62,10 +70,7 @@ def load_gz(filename):
if link_src != '0':
links.append((link_src, wid, link_type))
for lfrom, ldest, ana in links:
words[lfrom].add_link(ana, words[ldest])
result.extend(words.values())
sentence_end(bad_sentence)
return result
def load_xml(filename):
@@ -114,4 +119,4 @@ def file_sentence_generator(et, skip_id_check, do_msd_translate, pc_tag):
# strange errors, just skip...
pass
return list(words.values())
return list(words.values())