Adding ability to load gz files.

This commit is contained in:
Ozbolt Menegatti 2019-06-17 20:41:11 +02:00
parent d2f6f8dac8
commit cfdb36b894
2 changed files with 65 additions and 10 deletions

View File

@ -2,6 +2,8 @@ from xml.etree import ElementTree
import logging
import re
import sys
import gzip
import pathlib
from progress_bar import progress
from word import Word
@ -17,10 +19,55 @@ def load_files(args):
do_msd_translate = not args.no_msd_translate
for fname in filenames:
extension = pathlib.Path(fname).suffix
if extension == ".xml":
et = load_xml(fname)
yield file_sentence_generator(et, skip_id_check, do_msd_translate, args.pc_tag)
elif extension == ".gz":
yield load_gz(fname)
else:
raise NotImplementedError("Unknown file extension: {}".format(extension))
def _link_sentence_words(words, links):
    """Attach the links of one sentence and return its words as a list.

    Args:
        words: dict mapping a word id to its Word object.
        links: iterable of ``(source_id, target_id, link_type)`` tuples.

    Returns:
        The values of ``words`` as a list, after each link has been
        registered on its source word via ``add_link``.

    Raises:
        KeyError: if a link refers to a word id that is not in ``words``.
    """
    for link_from, link_to, link_type in links:
        words[link_from].add_link(link_type, words[link_to])
    return list(words.values())


def load_gz(filename):
    """Load words from a gzip-compressed, line-per-word text file.

    Each non-blank line is expected to split into seven fields: sentence id,
    word id, text, msd, lemma, link source id and link type. A line whose
    word id is "1" starts a new sentence. A sentence that contains a line
    with the wrong number of fields is dropped as a whole.

    Args:
        filename: path of the gzip file to read.

    Returns:
        A flat list of Word objects from all well-formed sentences, with
        the intra-sentence links already attached.

    Raises:
        KeyError: if a link refers to a word id missing from its sentence.
    """
    result = []
    words = {}
    links = []
    bad_sentence = False

    with gzip.open(filename, 'r') as fp:
        for line in progress(fp, 'load-gz'):
            line_str = line.decode('utf8').strip()
            if not line_str:
                # Blank lines carry no word; skipping them also keeps a
                # trailing newline from invalidating the last sentence.
                continue

            # Commas are treated as separators as well; a run of three
            # separators is turned back into a field holding a single comma
            # (presumably a literal comma token -- confirm against the data).
            line_fixed = line_str.replace(',', '\t').replace('\t\t\t', '\t,\t')
            fields = line_fixed.split("\t")

            # Word id "1" marks a sentence boundary: emit the finished
            # sentence (unless it was malformed) and start a fresh one.
            if len(fields) > 1 and fields[1] == "1":
                if not bad_sentence:
                    result.extend(_link_sentence_words(words, links))
                bad_sentence = False
                links = []
                words = {}

            try:
                sid, wid, text, msd, lemma, link_src, link_type = fields
            except ValueError:
                # Wrong field count: poison the whole sentence and skip the
                # line, so no Word is built from unbound or stale values.
                bad_sentence = True
                continue

            full_id = "{}.{}".format(sid, wid)
            words[wid] = Word(lemma, msd, full_id, text, True)
            # A link source of '0' means this word has no incoming link.
            if link_src != '0':
                links.append((link_src, wid, link_type))

    # Flush the final sentence under the same rule as the in-loop flush.
    if not bad_sentence:
        result.extend(_link_sentence_words(words, links))

    return result
def load_xml(filename):
with open(filename, 'r') as fp:
content = fp.read()
@ -35,7 +82,7 @@ def file_sentence_generator(et, skip_id_check, do_msd_translate, pc_tag):
sentences = list(et.iter('s'))
for sentence in progress(sentences, "load-text", infile=True):
for w in sentence.iter("w"):
words[w.get('id')] = Word(w, do_msd_translate)
words[w.get('id')] = Word.from_xml(w, do_msd_translate)
for pc in sentence.iter(pc_tag):
words[pc.get('id')] = Word.pc_word(pc, do_msd_translate)

View File

@ -15,12 +15,12 @@ class WordMsdOnly:
class Word:
def __init__(self, xml, do_msd_translate):
self.lemma = xml.get('lemma')
self.msd = Word.get_msd(xml)
self.msd = MSD_TRANSLATE[self.msd] if do_msd_translate else self.msd
self.id = xml.get('id')
self.text = xml.text
def __init__(self, lemma, msd, wid, text, do_msd_translate):
self.lemma = lemma
self.msd = MSD_TRANSLATE[msd] if do_msd_translate else msd
self.id = wid
self.text = text
self.links = defaultdict(list)
last_num = self.id.split('.')[-1]
@ -30,6 +30,14 @@ class Word:
assert None not in (self.id, self.lemma, self.msd)
@staticmethod
def from_xml(xml, do_msd_translate):
    """Build a Word from an XML element.

    The lemma and id are read from the element's ``lemma`` and ``id``
    attributes, the text from the element's text, and the msd through
    ``Word.get_msd``. ``do_msd_translate`` is passed on to the constructor.
    """
    return Word(
        xml.get('lemma'),
        Word.get_msd(xml),
        xml.get('id'),
        xml.text,
        do_msd_translate,
    )
@staticmethod
def get_msd(comp):
d = dict(comp.items())
@ -45,7 +53,7 @@ class Word:
def pc_word(pc, do_msd_translate):
pc.set('lemma', pc.text)
pc.set('msd', "N" if do_msd_translate else "U")
return Word(pc, do_msd_translate)
return Word.from_xml(pc, do_msd_translate)
def add_link(self, link, to):
    """Record ``to`` as a target of this word under the link type ``link``."""
    targets = self.links[link]
    targets.append(to)