From 3552f14b816fe8f6ba192bcb3e8ad0f72415fa0b Mon Sep 17 00:00:00 2001
From: Ozbolt Menegatti
Date: Mon, 17 Jun 2019 15:38:55 +0200
Subject: [PATCH] Loader to its own module

---
 src/loader.py | 72 +++++++++++++++++++++++++++++++++++++++++++++++++++
 src/wani.py   | 68 +-----------------------------------------------
 2 files changed, 73 insertions(+), 67 deletions(-)
 create mode 100644 src/loader.py

diff --git a/src/loader.py b/src/loader.py
new file mode 100644
index 0000000..267ff8d
--- /dev/null
+++ b/src/loader.py
@@ -0,0 +1,72 @@
+from xml.etree import ElementTree
+import logging
+import re
+import sys
+
+from tqdm import tqdm
+
+from word import Word
+
+
+def is_root_id(id_):
+    return len(id_.split('.')) == 3
+
+
+def load_files(args):
+    filenames = args.input
+    skip_id_check = args.skip_id_check
+    do_msd_translate = not args.no_msd_translate
+
+    for fname in filenames:
+        et = load_xml(fname)
+        yield file_sentence_generator(et, skip_id_check, do_msd_translate, args.pc_tag)
+
+
+def load_xml(filename):
+    logging.info("LOADING XML: {}".format(filename))
+    with open(filename, 'r') as fp:
+        content = fp.read()
+
+    xmlstring = re.sub(' xmlns="[^"]+"', '', content, count=1)
+    xmlstring = xmlstring.replace(' xml:', ' ')
+    return ElementTree.XML(xmlstring)
+
+
+def file_sentence_generator(et, skip_id_check, do_msd_translate, pc_tag):
+    words = {}
+    sentences = list(et.iter('s'))
+    for sentence in tqdm(sentences, desc="load-text"):
+        for w in sentence.iter("w"):
+            words[w.get('id')] = Word(w, do_msd_translate)
+        for pc in sentence.iter(pc_tag):
+            words[pc.get('id')] = Word.pc_word(pc, do_msd_translate)
+
+        for l in sentence.iter("link"):
+            if 'dep' in l.keys():
+                ana = l.get('afun')
+                lfrom = l.get('from')
+                dest = l.get('dep')
+            else:
+                ana = l.get('ana')
+                if ana[:4] != 'syn:':  # dont bother...
+                    continue
+                ana = ana[4:]
+                lfrom, dest = l.get('target').replace('#', '').split()
+
+            if lfrom in words:
+                if not skip_id_check and is_root_id(lfrom):
+                    logging.error("NOO: {}".format(lfrom))
+                    sys.exit(1)
+
+                if dest in words:
+                    next_word = words[dest]
+                    words[lfrom].add_link(ana, next_word)
+                else:
+                    logging.error("Unknown id: {}".format(dest))
+                    sys.exit(1)
+
+            else:
+                # strange errors, just skip...
+                pass
+
+    return list(words.values())
\ No newline at end of file
diff --git a/src/wani.py b/src/wani.py
index 12ef8e8..6a1b86a 100644
--- a/src/wani.py
+++ b/src/wani.py
@@ -19,73 +19,9 @@
 from syntactic_structure import build_structures
 from match_store import MatchStore
 from word_stats import WordStats
 from writer import Writer
+from loader import load_files
 
 
-def is_root_id(id_):
-    return len(id_.split('.')) == 3
-
-
-def load_files(args):
-    filenames = args.input
-    skip_id_check = args.skip_id_check
-    do_msd_translate = not args.no_msd_translate
-
-    for n, fname in enumerate(filenames):
-        et = load_xml(fname)
-        yield from file_sentence_generator(et, skip_id_check, do_msd_translate, args.pc_tag, args.chunk_size)
-
-
-def load_xml(filename):
-    logging.info("\rLOADING XML: {}".format(filename))
-    with open(filename, 'r') as fp:
-        content = fp.read()
-
-    xmlstring = re.sub(' xmlns="[^"]+"', '', content, count=1)
-    xmlstring = xmlstring.replace(' xml:', ' ')
-    return ElementTree.XML(xmlstring)
-
-
-def file_sentence_generator(et, skip_id_check, do_msd_translate, pc_tag, chunk_size):
-    words = {}
-    for sentence in et.iter('s'):
-        for w in sentence.iter("w"):
-            words[w.get('id')] = Word(w, do_msd_translate)
-        for pc in sentence.iter(pc_tag):
-            words[pc.get('id')] = Word.pc_word(pc, do_msd_translate)
-
-        for l in sentence.iter("link"):
-            if 'dep' in l.keys():
-                ana = l.get('afun')
-                lfrom = l.get('from')
-                dest = l.get('dep')
-            else:
-                ana = l.get('ana')
-                if ana[:4] != 'syn:':  # dont bother...
-                    continue
-                ana = ana[4:]
-                lfrom, dest = l.get('target').replace('#', '').split()
-
-            if lfrom in words:
-                if not skip_id_check and is_root_id(lfrom):
-                    logging.error("NOO: {}".format(lfrom))
-                    sys.exit(1)
-
-                if dest in words:
-                    next_word = words[dest]
-                    words[lfrom].add_link(ana, next_word)
-                else:
-                    logging.error("Unknown id: {}".format(dest))
-                    sys.exit(1)
-
-            else:
-                # strange errors, just skip...
-                pass
-
-        if chunk_size > 0 and len(words) > chunk_size:
-            yield list(words.values())
-            words = {}
-
-    yield list(words.values())
 def match_file(words, structures):
     matches = {s: [] for s in structures}
@@ -109,8 +45,6 @@ def main(args):
     match_store = MatchStore(args)
     word_stats = WordStats(lemma_msds)
 
-    args.chunk_size = 50000
-
     if args.parallel:
         num_parallel = int(args.parallel)
 