Can now load CSV files
This commit is contained in:
parent
d497749c78
commit
3f1c154705
|
@ -26,12 +26,25 @@ def load_files(args):
|
||||||
et = load_xml(fname)
|
et = load_xml(fname)
|
||||||
yield file_sentence_generator(et, skip_id_check, do_msd_translate, args.pc_tag)
|
yield file_sentence_generator(et, skip_id_check, do_msd_translate, args.pc_tag)
|
||||||
elif extension == ".gz":
|
elif extension == ".gz":
|
||||||
yield load_gz(fname)
|
yield load_csv(fname, True)
|
||||||
else:
|
else:
|
||||||
raise NotImplementedError("Unknown file extension: {}".format(extension))
|
yield load_csv(fname, False)
|
||||||
|
# else:
|
||||||
|
# raise NotImplementedError("Unknown file extension: {}".format(extension))
|
||||||
|
|
||||||
|
def lines_gz(filename):
    """Yield the lines of the gzip-compressed file *filename* as text.

    The archive is opened in binary mode and its raw lines are iterated
    through ``progress`` under the label 'load-gz' (presumably a
    progress-reporting wrapper defined elsewhere -- confirm). Each raw line
    is decoded from UTF-8 right before it is yielded; line terminators are
    kept as they appear in the file.
    """
    with gzip.open(filename, 'r') as archive:
        yield from (raw.decode('utf8') for raw in progress(archive, 'load-gz'))
|
||||||
|
|
||||||
|
|
||||||
def load_gz(filename):
|
def lines_csv(filename):
    """Yield the lines of the uncompressed CSV file *filename* as text.

    The lines are iterated through ``progress`` under the label 'load-csv'
    (presumably a progress-reporting wrapper defined elsewhere -- confirm)
    and yielded unchanged, including their line terminators.
    """
    # Decode explicitly as UTF-8, matching what lines_gz does for compressed
    # input, instead of relying on the platform's locale-dependent default
    # encoding (which would make the same data load differently per machine).
    with open(filename, 'r', encoding='utf8') as fp:
        for line in progress(fp, 'load-csv'):
            yield line
|
||||||
|
|
||||||
|
|
||||||
|
def load_csv(filename, compressed):
|
||||||
result = []
|
result = []
|
||||||
bad_sentence = False
|
bad_sentence = False
|
||||||
|
|
||||||
|
@ -49,27 +62,27 @@ def load_gz(filename):
|
||||||
words[lfrom].add_link(ana, words[ldest])
|
words[lfrom].add_link(ana, words[ldest])
|
||||||
result.extend(words.values())
|
result.extend(words.values())
|
||||||
|
|
||||||
with gzip.open(filename, 'r') as fp:
|
line_gen = lines_gz if compressed else lines_csv
|
||||||
for line in progress(fp, 'load-gz'):
|
for line in line_gen(filename):
|
||||||
line_str = line.decode('utf8').strip()
|
line_str = line.strip()
|
||||||
line_fixed = line_str.replace(',', '\t').replace('\t\t\t', '\t,\t')
|
line_fixed = line_str.replace(',', '\t').replace('\t\t\t', '\t,\t')
|
||||||
line_split = line_fixed.split("\t")
|
line_split = line_fixed.split("\t")
|
||||||
|
|
||||||
if line_split[1] == "1" and len(words) > 0:
|
if line_split[1] == "1" and len(words) > 0:
|
||||||
sentence_end(bad_sentence)
|
sentence_end(bad_sentence)
|
||||||
bad_sentence = False
|
bad_sentence = False
|
||||||
links = []
|
links = []
|
||||||
words = {}
|
words = {}
|
||||||
|
|
||||||
try:
|
try:
|
||||||
sid, wid, text, msd, lemma, link_src, link_type = line_split
|
sid, wid, text, msd, lemma, link_src, link_type = line_split
|
||||||
except ValueError:
|
except ValueError:
|
||||||
bad_sentence = True
|
bad_sentence = True
|
||||||
full_id = "{}.{}".format(sid, wid)
|
full_id = "{}.{}".format(sid, wid)
|
||||||
|
|
||||||
words[wid] = Word(lemma, msd, full_id, text, True)
|
words[wid] = Word(lemma, msd, full_id, text, True)
|
||||||
if link_src != '0':
|
if link_src != '0':
|
||||||
links.append((link_src, wid, link_type))
|
links.append((link_src, wid, link_type))
|
||||||
|
|
||||||
sentence_end(bad_sentence)
|
sentence_end(bad_sentence)
|
||||||
return result
|
return result
|
||||||
|
|
Loading…
Reference in New Issue
Block a user