diff --git a/src/loader.py b/src/loader.py index 0215a9e..9664fc7 100644 --- a/src/loader.py +++ b/src/loader.py @@ -26,12 +26,25 @@ def load_files(args): et = load_xml(fname) yield file_sentence_generator(et, skip_id_check, do_msd_translate, args.pc_tag) elif extension == ".gz": - yield load_gz(fname) + yield load_csv(fname, True) else: - raise NotImplementedError("Unknown file extension: {}".format(extension)) + yield load_csv(fname, False) + # else: + # raise NotImplementedError("Unknown file extension: {}".format(extension)) + +def lines_gz(filename): + with gzip.open(filename, 'r') as fp: + for line in progress(fp, 'load-gz'): + yield line.decode('utf8') -def load_gz(filename): +def lines_csv(filename): + with open(filename, 'r') as fp: + for line in progress(fp, 'load-csv'): + yield line + + +def load_csv(filename, compressed): result = [] bad_sentence = False @@ -49,27 +62,27 @@ def load_gz(filename): words[lfrom].add_link(ana, words[ldest]) result.extend(words.values()) - with gzip.open(filename, 'r') as fp: - for line in progress(fp, 'load-gz'): - line_str = line.decode('utf8').strip() - line_fixed = line_str.replace(',', '\t').replace('\t\t\t', '\t,\t') - line_split = line_fixed.split("\t") + line_gen = lines_gz if compressed else lines_csv + for line in line_gen(filename): + line_str = line.strip() + line_fixed = line_str.replace(',', '\t').replace('\t\t\t', '\t,\t') + line_split = line_fixed.split("\t") - if line_split[1] == "1" and len(words) > 0: - sentence_end(bad_sentence) - bad_sentence = False - links = [] - words = {} + if line_split[1] == "1" and len(words) > 0: + sentence_end(bad_sentence) + bad_sentence = False + links = [] + words = {} - try: - sid, wid, text, msd, lemma, link_src, link_type = line_split - except ValueError: - bad_sentence = True - full_id = "{}.{}".format(sid, wid) - - words[wid] = Word(lemma, msd, full_id, text, True) - if link_src != '0': - links.append((link_src, wid, link_type)) + try: + sid, wid, text, msd, lemma, link_src, link_type = line_split + except ValueError: + bad_sentence = True + full_id = "{}.{}".format(sid, wid) + + words[wid] = Word(lemma, msd, full_id, text, True) + if link_src != '0': + links.append((link_src, wid, link_type)) sentence_end(bad_sentence) return result