Fix loading of bad gz files and the progress display
This commit is contained in:
parent
049f5ca3dc
commit
1256a4de40
|
@ -18,7 +18,7 @@ def load_files(args):
|
||||||
skip_id_check = args.skip_id_check
|
skip_id_check = args.skip_id_check
|
||||||
do_msd_translate = not args.no_msd_translate
|
do_msd_translate = not args.no_msd_translate
|
||||||
|
|
||||||
for fname in filenames:
|
for fname in progress(filenames, "files", outfile=True):
|
||||||
extension = pathlib.Path(fname).suffix
|
extension = pathlib.Path(fname).suffix
|
||||||
|
|
||||||
if extension == ".xml":
|
if extension == ".xml":
|
||||||
|
@ -34,20 +34,28 @@ def load_gz(filename):
|
||||||
result = []
|
result = []
|
||||||
bad_sentence = False
|
bad_sentence = False
|
||||||
|
|
||||||
|
words = {}
|
||||||
|
links = []
|
||||||
|
|
||||||
|
def sentence_end(bad_sentence):
|
||||||
|
if bad_sentence:
|
||||||
|
return
|
||||||
|
|
||||||
|
for lfrom, ldest, ana in links:
|
||||||
|
if lfrom not in words or ldest not in words:
|
||||||
|
logging.warning("Bad link in sentence: " + line_split[0])
|
||||||
|
continue
|
||||||
|
words[lfrom].add_link(ana, words[ldest])
|
||||||
|
result.extend(words.values())
|
||||||
|
|
||||||
with gzip.open(filename, 'r') as fp:
|
with gzip.open(filename, 'r') as fp:
|
||||||
words = {}
|
for line in progress(fp, 'load-gz', infile=True):
|
||||||
links = []
|
|
||||||
for line in progress(fp, 'load-gz'):
|
|
||||||
line_str = line.decode('utf8').strip()
|
line_str = line.decode('utf8').strip()
|
||||||
line_fixed = line_str.replace(',', '\t').replace('\t\t\t', '\t,\t')
|
line_fixed = line_str.replace(',', '\t').replace('\t\t\t', '\t,\t')
|
||||||
line_split = line_fixed.split("\t")
|
line_split = line_fixed.split("\t")
|
||||||
|
|
||||||
if line_split[1] == "1" and len(words) > 0:
|
if line_split[1] == "1" and len(words) > 0:
|
||||||
if not bad_sentence:
|
sentence_end(bad_sentence)
|
||||||
for lfrom, ldest, ana in links:
|
|
||||||
words[lfrom].add_link(ana, words[ldest])
|
|
||||||
result.extend(words.values())
|
|
||||||
|
|
||||||
bad_sentence = False
|
bad_sentence = False
|
||||||
links = []
|
links = []
|
||||||
words = {}
|
words = {}
|
||||||
|
@ -62,10 +70,7 @@ def load_gz(filename):
|
||||||
if link_src != '0':
|
if link_src != '0':
|
||||||
links.append((link_src, wid, link_type))
|
links.append((link_src, wid, link_type))
|
||||||
|
|
||||||
for lfrom, ldest, ana in links:
|
sentence_end(bad_sentence)
|
||||||
words[lfrom].add_link(ana, words[ldest])
|
|
||||||
result.extend(words.values())
|
|
||||||
|
|
||||||
return result
|
return result
|
||||||
|
|
||||||
def load_xml(filename):
|
def load_xml(filename):
|
||||||
|
|
|
@ -6,15 +6,14 @@ except ImportError:
|
||||||
|
|
||||||
class Progress:
|
class Progress:
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.infile = False
|
self.hide_inner = False
|
||||||
|
|
||||||
|
|
||||||
def __call__(self, iterable, description, infile=False, outfile=False):
|
def __call__(self, iterable, description, infile=False, outfile=False):
|
||||||
show_progress = True
|
show_progress = True
|
||||||
if infile and not self.infile:
|
if True in (infile, outfile):
|
||||||
show_progress = False
|
assert False in (infile, outfile)
|
||||||
elif outfile and self.infile:
|
show_progress = outfile == self.hide_inner
|
||||||
show_progress = False
|
|
||||||
|
|
||||||
if not show_progress:
|
if not show_progress:
|
||||||
yield from iterable
|
yield from iterable
|
||||||
|
@ -35,7 +34,7 @@ class Progress:
|
||||||
|
|
||||||
|
|
||||||
def init(self, args):
|
def init(self, args):
|
||||||
self.infile = not args.hide_inner_progress
|
self.hide_inner = args.hide_inner_progress
|
||||||
|
|
||||||
|
|
||||||
progress = Progress()
|
progress = Progress()
|
||||||
|
|
|
@ -72,7 +72,7 @@ def main(args):
|
||||||
word_stats.add_words(words)
|
word_stats.add_words(words)
|
||||||
|
|
||||||
else:
|
else:
|
||||||
for words in progress(load_files(args), "files", outfile=True):
|
for words in load_files(args):
|
||||||
matches = match_file(words, structures)
|
matches = match_file(words, structures)
|
||||||
# just save to temporary file, used for children of a parallel process
|
# just save to temporary file, used for children of a parallel process
|
||||||
# MUST NOT have more than one file
|
# MUST NOT have more than one file
|
||||||
|
|
Loading…
Reference in New Issue
Block a user