"""Load XML corpus files and build linked ``Word`` objects from them."""

from xml.etree import ElementTree
import logging
import re
import sys

from tqdm import tqdm

from word import Word


def is_root_id(id_):
    """Return True when ``id_`` consists of exactly three dot-separated parts."""
    return len(id_.split('.')) == 3


def load_files(args):
    """Yield the list of words of every input file, one list per file.

    Args:
        args: Options object providing ``input`` (iterable of file names),
            ``skip_id_check``, ``no_msd_translate`` and ``pc_tag``.

    Yields:
        The list returned by ``file_sentence_generator`` for each file, in
        the order the files appear in ``args.input``.
    """
    filenames = args.input
    skip_id_check = args.skip_id_check
    do_msd_translate = not args.no_msd_translate

    for fname in filenames:
        et = load_xml(fname)
        yield file_sentence_generator(
            et, skip_id_check, do_msd_translate, args.pc_tag)


def load_xml(filename):
    """Read ``filename`` and parse it into an ElementTree element.

    Before parsing, the first ``xmlns="..."`` declaration is removed and
    every `` xml:`` attribute prefix is dropped (so ``xml:id`` becomes
    ``id``), which lets the rest of the module look elements and attributes
    up by their plain names.

    Args:
        filename: Path of the XML file to load.

    Returns:
        The root element produced by ``ElementTree.XML``.
    """
    logging.info("LOADING XML: %s", filename)
    # Decode explicitly as UTF-8 (the XML default) instead of relying on the
    # platform's locale encoding, which breaks non-ASCII text on some systems.
    with open(filename, 'r', encoding='utf-8') as fp:
        content = fp.read()

    xmlstring = re.sub(r' xmlns="[^"]+"', '', content, count=1)
    # NOTE: this is a plain text replacement, so it affects every occurrence
    # of " xml:" in the document, not only attribute names.
    xmlstring = xmlstring.replace(' xml:', ' ')
    return ElementTree.XML(xmlstring)


def file_sentence_generator(et, skip_id_check, do_msd_translate, pc_tag):
    """Build all words of a parsed document and connect them via its links.

    For every ``s`` element, a ``Word`` is created for each ``w`` element and
    a ``Word.pc_word`` for each ``pc_tag`` element, keyed by their ``id``
    attribute. Each ``link`` element of the sentence is then turned into an
    ``add_link`` call on the source word. Two link layouts are understood:

    * links with a ``dep`` attribute: label in ``afun``, source id in
      ``from``, destination id in ``dep``;
    * otherwise: label in ``ana`` (only labels prefixed with ``syn:`` are
      used, with the prefix stripped) and both ids in ``target``, separated
      by whitespace, with any ``#`` characters removed.

    Despite its name, this function returns a list rather than a generator.

    Args:
        et: Root element of the parsed document.
        skip_id_check: When false, a link whose source id satisfies
            ``is_root_id`` is treated as a fatal error.
        do_msd_translate: Passed through to the ``Word`` constructors.
        pc_tag: Tag name of the elements loaded via ``Word.pc_word``.

    Returns:
        A list of all words collected from the document.

    Raises:
        SystemExit: If the id check fails or a link points to an unknown
            destination id.
    """
    words = {}
    # Materialised up front so the progress bar has a known length.
    sentences = list(et.iter('s'))
    for sentence in tqdm(sentences, desc="load-text"):
        for w in sentence.iter("w"):
            words[w.get('id')] = Word(w, do_msd_translate)
        for pc in sentence.iter(pc_tag):
            words[pc.get('id')] = Word.pc_word(pc, do_msd_translate)

        for link in sentence.iter("link"):
            if 'dep' in link.keys():
                ana = link.get('afun')
                lfrom = link.get('from')
                dest = link.get('dep')
            else:
                ana = link.get('ana')
                # Links without an ``ana`` attribute used to crash on the
                # slice below; treat them like any other non-"syn:" link.
                if ana is None or not ana.startswith('syn:'):
                    continue
                ana = ana[len('syn:'):]
                lfrom, dest = link.get('target').replace('#', '').split()

            if lfrom not in words:
                # strange errors, just skip...
                continue

            if not skip_id_check and is_root_id(lfrom):
                logging.error("NOO: %s", lfrom)
                sys.exit(1)

            if dest not in words:
                logging.error("Unknown id: %s", dest)
                sys.exit(1)

            words[lfrom].add_link(ana, words[dest])

    return list(words.values())