Adding ability to load gz files.

This commit is contained in:
2019-06-17 20:41:11 +02:00
parent d2f6f8dac8
commit cfdb36b894
2 changed files with 65 additions and 10 deletions

View File

@@ -2,6 +2,8 @@ from xml.etree import ElementTree
import logging
import re
import sys
import gzip
import pathlib
from progress_bar import progress
from word import Word
@@ -17,9 +19,54 @@ def load_files(args):
do_msd_translate = not args.no_msd_translate
for fname in filenames:
et = load_xml(fname)
yield file_sentence_generator(et, skip_id_check, do_msd_translate, args.pc_tag)
extension = pathlib.Path(fname).suffix
if extension == ".xml":
et = load_xml(fname)
yield file_sentence_generator(et, skip_id_check, do_msd_translate, args.pc_tag)
elif extension == ".gz":
yield load_gz(fname)
else:
raise NotImplementedError("Unknown file extension: {}".format(extension))
def _link_gz_sentence(words, links, result):
    """Resolve one sentence's pending links and append its words to result."""
    for lfrom, ldest, ana in links:
        words[lfrom].add_link(ana, words[ldest])
    result.extend(words.values())


def load_gz(filename):
    """Load words from a gzip-compressed, delimiter-separated file.

    Each non-empty line is expected to hold seven fields: sentence id,
    word id, text, msd, lemma, link source and link type. A line whose
    second field is "1" starts a new sentence. A sentence containing at
    least one line that does not split into exactly seven fields is
    dropped as a whole.

    Args:
        filename: Path of the gzip file to read.

    Returns:
        A flat list of Word objects from all well-formed sentences, with
        links added between words of the same sentence.
    """
    result = []
    words = {}
    links = []
    bad_sentence = False

    with gzip.open(filename, 'r') as fp:
        for line in progress(fp, 'load-gz'):
            line_str = line.decode('utf8').strip()
            if not line_str:
                # Blank lines carry no fields; indexing them would fail.
                continue

            # Commas and tabs both act as separators. A run of three tabs
            # after the replacement is turned back into a "," field
            # (presumably a literal comma token -- confirm against data).
            line_fixed = line_str.replace(',', '\t').replace('\t\t\t', '\t,\t')
            line_split = line_fixed.split("\t")

            # Also fire when the previous sentence was bad but left no
            # words behind, otherwise the bad flag would leak into the
            # sentence that starts here.
            starts_sentence = len(line_split) > 1 and line_split[1] == "1"
            if starts_sentence and (words or bad_sentence):
                if not bad_sentence:
                    _link_gz_sentence(words, links, result)

                bad_sentence = False
                links = []
                words = {}

            try:
                sid, wid, text, msd, lemma, link_src, link_type = line_split
            except ValueError:
                # Do not fall through: the field variables would be
                # undefined or still hold the previous line's values.
                bad_sentence = True
                continue

            full_id = "{}.{}".format(sid, wid)
            words[wid] = Word(lemma, msd, full_id, text, True)
            if link_src != '0':
                links.append((link_src, wid, link_type))

    # Flush the last sentence, honouring the bad-sentence flag like the
    # in-loop flush does.
    if not bad_sentence:
        _link_gz_sentence(words, links, result)

    return result
def load_xml(filename):
with open(filename, 'r') as fp:
@@ -35,7 +82,7 @@ def file_sentence_generator(et, skip_id_check, do_msd_translate, pc_tag):
sentences = list(et.iter('s'))
for sentence in progress(sentences, "load-text", infile=True):
for w in sentence.iter("w"):
words[w.get('id')] = Word(w, do_msd_translate)
words[w.get('id')] = Word.from_xml(w, do_msd_translate)
for pc in sentence.iter(pc_tag):
words[pc.get('id')] = Word.pc_word(pc, do_msd_translate)