Add the ability to load gzip-compressed (.gz) corpus files.
This commit is contained in:
parent
d2f6f8dac8
commit
cfdb36b894
|
@ -2,6 +2,8 @@ from xml.etree import ElementTree
|
|||
import logging
|
||||
import re
|
||||
import sys
|
||||
import gzip
|
||||
import pathlib
|
||||
|
||||
from progress_bar import progress
|
||||
from word import Word
|
||||
|
@ -17,9 +19,54 @@ def load_files(args):
|
|||
do_msd_translate = not args.no_msd_translate
|
||||
|
||||
for fname in filenames:
|
||||
et = load_xml(fname)
|
||||
yield file_sentence_generator(et, skip_id_check, do_msd_translate, args.pc_tag)
|
||||
extension = pathlib.Path(fname).suffix
|
||||
|
||||
if extension == ".xml":
|
||||
et = load_xml(fname)
|
||||
yield file_sentence_generator(et, skip_id_check, do_msd_translate, args.pc_tag)
|
||||
elif extension == ".gz":
|
||||
yield load_gz(fname)
|
||||
else:
|
||||
raise NotImplementedError("Unknown file extension: {}".format(extension))
|
||||
|
||||
|
||||
def load_gz(filename):
    """Load words from a gzipped, delimiter-separated corpus dump.

    Each data line carries seven fields: sentence id, word id, text, msd,
    lemma, link source and link type.  Fields appear comma-separated in the
    raw file; after turning every comma into a tab, a field that was itself
    a literal comma shows up as "\t\t\t" and is restored by the second
    replace.  A word id of "1" marks the start of a new sentence.

    Sentences containing a malformed line are dropped entirely.

    Returns a flat list of Word objects with dependency links attached.
    """
    result = []
    bad_sentence = False

    with gzip.open(filename, 'r') as fp:
        words = {}   # word id -> Word, for the sentence being built
        links = []   # (source word id, target word id, link type)
        for line in progress(fp, 'load-gz'):
            line_str = line.decode('utf8').strip()
            if not line_str:
                # skip blank lines: line_split[1] below would raise IndexError
                continue
            # commas act as field separators; a literal comma token becomes
            # "\t\t\t" after the first replace and is restored as ","
            line_fixed = line_str.replace(',', '\t').replace('\t\t\t', '\t,\t')
            line_split = line_fixed.split("\t")

            # word id "1" starts a new sentence: flush the previous one
            # (unless it was poisoned by a malformed line) and reset state
            if line_split[1] == "1" and len(words) > 0:
                if not bad_sentence:
                    for lfrom, ldest, ana in links:
                        words[lfrom].add_link(ana, words[ldest])
                    result.extend(words.values())

                bad_sentence = False
                links = []
                words = {}

            try:
                sid, wid, text, msd, lemma, link_src, link_type = line_split
            except ValueError:
                # malformed line: poison the whole sentence and do NOT build
                # a Word from stale (or, on the first line, unbound) fields
                bad_sentence = True
                continue

            full_id = "{}.{}".format(sid, wid)

            # msd translation is always on for the gz format
            words[wid] = Word(lemma, msd, full_id, text, True)
            if link_src != '0':
                links.append((link_src, wid, link_type))

        # flush the final sentence, honouring bad_sentence exactly like the
        # in-loop flush does
        if words and not bad_sentence:
            for lfrom, ldest, ana in links:
                words[lfrom].add_link(ana, words[ldest])
            result.extend(words.values())

    return result
|
||||
|
||||
def load_xml(filename):
|
||||
with open(filename, 'r') as fp:
|
||||
|
sentences = list(et.iter('s'))
for sentence in progress(sentences, "load-text", infile=True):
    # regular words come from <w> elements via the from_xml constructor
    for w in sentence.iter("w"):
        words[w.get('id')] = Word.from_xml(w, do_msd_translate)
    # punctuation uses the dedicated pc_word constructor
    for pc in sentence.iter(pc_tag):
        words[pc.get('id')] = Word.pc_word(pc, do_msd_translate)
|
||||
|
||||
|
|
22
src/word.py
22
src/word.py
|
@ -15,12 +15,12 @@ class WordMsdOnly:
|
|||
|
||||
|
||||
class Word:
|
||||
def __init__(self, xml, do_msd_translate):
|
||||
self.lemma = xml.get('lemma')
|
||||
self.msd = Word.get_msd(xml)
|
||||
self.msd = MSD_TRANSLATE[self.msd] if do_msd_translate else self.msd
|
||||
self.id = xml.get('id')
|
||||
self.text = xml.text
|
||||
def __init__(self, lemma, msd, wid, text, do_msd_translate):
|
||||
self.lemma = lemma
|
||||
self.msd = MSD_TRANSLATE[msd] if do_msd_translate else msd
|
||||
self.id = wid
|
||||
self.text = text
|
||||
|
||||
self.links = defaultdict(list)
|
||||
|
||||
last_num = self.id.split('.')[-1]
|
||||
|
@ -29,6 +29,14 @@ class Word:
|
|||
self.int_id = int(last_num)
|
||||
|
||||
assert None not in (self.id, self.lemma, self.msd)
|
||||
|
||||
@staticmethod
def from_xml(xml, do_msd_translate):
    """Alternate constructor: build a Word from a <w> XML element.

    Reads lemma, msd and id from the element's attributes and uses the
    element's text as the word form.
    """
    return Word(
        xml.get('lemma'),
        Word.get_msd(xml),
        xml.get('id'),
        xml.text,
        do_msd_translate,
    )
|
||||
|
||||
@staticmethod
|
||||
def get_msd(comp):
|
||||
|
@ -45,7 +53,7 @@ class Word:
|
|||
def pc_word(pc, do_msd_translate):
    """Build a Word from a punctuation element, using its text as lemma.

    Punctuation gets a fixed msd: "N" when msd translation is enabled,
    "U" otherwise.
    """
    pc.set('lemma', pc.text)
    pc.set('msd', "N" if do_msd_translate else "U")
    return Word.from_xml(pc, do_msd_translate)
|
||||
|
||||
def add_link(self, link, to):
    """Record *to* as a target of this word under relation *link*."""
    targets = self.links[link]
    targets.append(to)
|
||||
|
|
Loading…
Reference in New Issue
Block a user