From cfdb36b89435291077393f9a1f975e6ca917152d Mon Sep 17 00:00:00 2001
From: Ozbolt Menegatti
Date: Mon, 17 Jun 2019 20:41:11 +0200
Subject: [PATCH] Adding ability to load gz files.

---
 src/loader.py | 53 ++++++++++++++++++++++++++++++++++++++++++++++++++++---
 src/word.py   | 22 +++++++++++++++-------
 2 files changed, 65 insertions(+), 10 deletions(-)

diff --git a/src/loader.py b/src/loader.py
index ff1e662..1652e47 100644
--- a/src/loader.py
+++ b/src/loader.py
@@ -2,6 +2,8 @@ from xml.etree import ElementTree
 import logging
 import re
 import sys
+import gzip
+import pathlib
 
 from progress_bar import progress
 from word import Word
@@ -17,9 +19,54 @@ def load_files(args):
     do_msd_translate = not args.no_msd_translate
 
     for fname in filenames:
-        et = load_xml(fname)
-        yield file_sentence_generator(et, skip_id_check, do_msd_translate, args.pc_tag)
+        extension = pathlib.Path(fname).suffix
+        if extension == ".xml":
+            et = load_xml(fname)
+            yield file_sentence_generator(et, skip_id_check, do_msd_translate, args.pc_tag)
+        elif extension == ".gz":
+            yield load_gz(fname)
+        else:
+            raise NotImplementedError("Unknown file extension: {}".format(extension))
+
+
+def load_gz(filename):
+    result = []
+    bad_sentence = False
+
+    with gzip.open(filename, 'r') as fp:
+        words = {}
+        links = []
+        for line in progress(fp, 'load-gz'):
+            line_str = line.decode('utf8').strip()
+            line_fixed = line_str.replace(',', '\t').replace('\t\t\t', '\t,\t')
+            line_split = line_fixed.split("\t")
+
+            if line_split[1] == "1" and len(words) > 0:
+                if not bad_sentence:
+                    for lfrom, ldest, ana in links:
+                        words[lfrom].add_link(ana, words[ldest])
+                    result.extend(words.values())
+
+                bad_sentence = False
+                links = []
+                words = {}
+
+            try:
+                sid, wid, text, msd, lemma, link_src, link_type = line_split
+            except ValueError:
+                bad_sentence = True
+            full_id = "{}.{}".format(sid, wid)
+
+            words[wid] = Word(lemma, msd, full_id, text, True)
+            if link_src != '0':
+                links.append((link_src, wid, link_type))
+
+        for lfrom, ldest, ana in links:
+            words[lfrom].add_link(ana, words[ldest])
+        result.extend(words.values())
+
+    return result
 
 
 def load_xml(filename):
     with open(filename, 'r') as fp:
@@ -35,7 +82,7 @@ def file_sentence_generator(et, skip_id_check, do_msd_translate, pc_tag):
     sentences = list(et.iter('s'))
     for sentence in progress(sentences, "load-text", infile=True):
         for w in sentence.iter("w"):
-            words[w.get('id')] = Word(w, do_msd_translate)
+            words[w.get('id')] = Word.from_xml(w, do_msd_translate)
 
         for pc in sentence.iter(pc_tag):
             words[pc.get('id')] = Word.pc_word(pc, do_msd_translate)
diff --git a/src/word.py b/src/word.py
index 569d57c..bbf3889 100644
--- a/src/word.py
+++ b/src/word.py
@@ -15,12 +15,12 @@ class WordMsdOnly:
 
 
 class Word:
-    def __init__(self, xml, do_msd_translate):
-        self.lemma = xml.get('lemma')
-        self.msd = Word.get_msd(xml)
-        self.msd = MSD_TRANSLATE[self.msd] if do_msd_translate else self.msd
-        self.id = xml.get('id')
-        self.text = xml.text
+    def __init__(self, lemma, msd, wid, text, do_msd_translate):
+        self.lemma = lemma
+        self.msd = MSD_TRANSLATE[msd] if do_msd_translate else msd
+        self.id = wid
+        self.text = text
+
         self.links = defaultdict(list)
 
         last_num = self.id.split('.')[-1]
@@ -29,6 +29,14 @@ class Word:
             self.int_id = int(last_num)
 
         assert None not in (self.id, self.lemma, self.msd)
+
+    @staticmethod
+    def from_xml(xml, do_msd_translate):
+        lemma = xml.get('lemma')
+        msd = Word.get_msd(xml)
+        wid = xml.get('id')
+        text = xml.text
+        return Word(lemma, msd, wid, text, do_msd_translate)
 
     @staticmethod
     def get_msd(comp):
@@ -45,7 +53,7 @@ class Word:
     def pc_word(pc, do_msd_translate):
         pc.set('lemma', pc.text)
         pc.set('msd', "N" if do_msd_translate else "U")
-        return Word(pc, do_msd_translate)
+        return Word.from_xml(pc, do_msd_translate)
 
     def add_link(self, link, to):
         self.links[link].append(to)
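
Note (not part of the patch): the gz input format is not documented anywhere in the diff and is only implied by the tuple unpacking in load_gz(). The standalone sketch below uses the column names taken from that unpacking and invented sample rows; it shows the comma-separated layout the code appears to expect and the replace() trick that survives tokens whose text is itself a comma.

    # Sketch of load_gz()'s per-line parsing step, lifted out for illustration.
    def parse_line(line_str):
        # Rows are comma-separated. A token whose text is a literal comma
        # produces three consecutive tabs after the first replace(), which
        # the second replace() turns back into a single "," field.
        line_fixed = line_str.replace(',', '\t').replace('\t\t\t', '\t,\t')
        return line_fixed.split('\t')

    # Ordinary token row (values invented).
    sid, wid, text, msd, lemma, link_src, link_type = parse_line("s1,1,vino,Ncnsn,vino,0,root")
    assert (sid, wid, text) == ("s1", "1", "vino")

    # Punctuation row whose text and lemma are a comma (values invented).
    sid, wid, text, msd, lemma, link_src, link_type = parse_line("s1,4,,,U,,,0,punct")
    assert (text, lemma, link_src) == (",", ",", "0")

A new sentence starts whenever the word index in the second column resets to "1": load_gz() then attaches the collected dependency links via Word.add_link() and flushes the finished words into the result list, unless the sentence contained a row that failed to unpack.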
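A related note on the word.py half of the change: the Word constructor now takes plain fields, the XML-specific extraction moves into Word.from_xml() (used by pc_word() and file_sentence_generator()), and load_gz() calls the constructor directly with do_msd_translate hard-coded to True, so gz input always goes through MSD_TRANSLATE. The rough equivalence below uses an invented <w> element and assumes Word.get_msd(), whose body lies outside this diff, reads the MSD from such an element.

    from xml.etree import ElementTree

    # Invented sample token; the 'lemma' and 'id' attribute names come from
    # the patch, the 'msd' attribute is an assumption about Word.get_msd().
    w_el = ElementTree.fromstring('<w id="s1.1" lemma="vino" msd="Ncnsn">vino</w>')

    # Two equivalent construction paths after the refactor (left as comments,
    # since Word lives in the repository's src/word.py):
    # via_xml = Word.from_xml(w_el, True)
    # direct  = Word(w_el.get('lemma'), Word.get_msd(w_el), w_el.get('id'), w_el.text, True)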