Can now load CSV files

This commit is contained in:
Ozbolt Menegatti 2019-08-21 11:09:47 +02:00
parent d497749c78
commit 3f1c154705

View File

@ -26,12 +26,25 @@ def load_files(args):
et = load_xml(fname) et = load_xml(fname)
yield file_sentence_generator(et, skip_id_check, do_msd_translate, args.pc_tag) yield file_sentence_generator(et, skip_id_check, do_msd_translate, args.pc_tag)
elif extension == ".gz": elif extension == ".gz":
yield load_gz(fname) yield load_csv(fname, True)
else: else:
raise NotImplementedError("Unknown file extension: {}".format(extension)) yield load_csv(fname, False)
# else:
# raise NotImplementedError("Unknown file extension: {}".format(extension))
def lines_gz(filename):
with gzip.open(filename, 'r') as fp:
for line in progress(fp, 'load-gz'):
yield line.decode('utf8')
def load_gz(filename): def lines_csv(filename):
with open(filename, 'r') as fp:
for line in progress(fp, 'load-csv'):
yield line
def load_csv(filename, compressed):
result = [] result = []
bad_sentence = False bad_sentence = False
@ -49,27 +62,27 @@ def load_gz(filename):
words[lfrom].add_link(ana, words[ldest]) words[lfrom].add_link(ana, words[ldest])
result.extend(words.values()) result.extend(words.values())
with gzip.open(filename, 'r') as fp: line_gen = lines_gz if compressed else lines_csv
for line in progress(fp, 'load-gz'): for line in line_gen(filename):
line_str = line.decode('utf8').strip() line_str = line.strip()
line_fixed = line_str.replace(',', '\t').replace('\t\t\t', '\t,\t') line_fixed = line_str.replace(',', '\t').replace('\t\t\t', '\t,\t')
line_split = line_fixed.split("\t") line_split = line_fixed.split("\t")
if line_split[1] == "1" and len(words) > 0: if line_split[1] == "1" and len(words) > 0:
sentence_end(bad_sentence) sentence_end(bad_sentence)
bad_sentence = False bad_sentence = False
links = [] links = []
words = {} words = {}
try: try:
sid, wid, text, msd, lemma, link_src, link_type = line_split sid, wid, text, msd, lemma, link_src, link_type = line_split
except ValueError: except ValueError:
bad_sentence = True bad_sentence = True
full_id = "{}.{}".format(sid, wid) full_id = "{}.{}".format(sid, wid)
words[wid] = Word(lemma, msd, full_id, text, True) words[wid] = Word(lemma, msd, full_id, text, True)
if link_src != '0': if link_src != '0':
links.append((link_src, wid, link_type)) links.append((link_src, wid, link_type))
sentence_end(bad_sentence) sentence_end(bad_sentence)
return result return result