from xml.etree import ElementTree
import logging
import re
import sys
import gzip
import pathlib

from progress_bar import progress
from word import Word


def is_root_id(id_):
    """Return True when ``id_`` consists of exactly three dot-separated parts."""
    return len(id_.split('.')) == 3


def load_files(args):
    """Yield the list of loaded words for each input file, one file at a time.

    Reads ``args.input`` (iterable of file names), ``args.skip_id_check``,
    ``args.no_msd_translate`` and, for XML inputs, ``args.pc_tag``.

    Files with the suffix ``.xml`` are parsed as XML; every other file is
    read as CSV, and only the ``.gz`` suffix is decompressed first.
    """
    filenames = args.input
    skip_id_check = args.skip_id_check
    do_msd_translate = not args.no_msd_translate

    for idx, fname in enumerate(filenames):
        print("FILE ", fname, "{}/{}".format(idx, len(filenames)))
        extension = pathlib.Path(fname).suffix

        if extension == ".xml":
            et = load_xml(fname)
            yield file_sentence_generator(
                et, skip_id_check, do_msd_translate, args.pc_tag)
        else:
            yield load_csv(fname, extension == ".gz")


def lines_gz(filename):
    """Yield the lines of a gzip-compressed file, decoded as UTF-8."""
    with gzip.open(filename, 'r') as fp:
        for line in progress(fp, 'load-gz'):
            yield line.decode('utf8')


def lines_csv(filename):
    """Yield the lines of a plain text file, decoded as UTF-8."""
    # Explicit encoding keeps this consistent with lines_gz instead of
    # depending on the platform's locale default.
    with open(filename, 'r', encoding='utf8') as fp:
        for line in progress(fp, 'load-csv'):
            yield line


def _finish_sentence(words, links, sentence_id, result):
    """Attach the collected links to their words and append the words to ``result``."""
    for lfrom, ldest, ana in links:
        if lfrom not in words or ldest not in words:
            logging.warning("Bad link in sentence: %s", sentence_id)
            continue
        words[lfrom].add_link(ana, words[ldest])
    result.extend(words.values())


def load_csv(filename, compressed):
    """Load words and their links from a (optionally gzipped) CSV/TSV file.

    Each usable row must split into exactly seven fields:
    sentence id, word id, text, msd, lemma, link source id, link type.
    A row whose word id is ``"1"`` starts a new sentence. A sentence that
    contains any row with a different number of fields is dropped entirely.

    Args:
        filename: path of the file to read.
        compressed: when true the file is read through ``lines_gz``,
            otherwise through ``lines_csv``.

    Returns:
        A list with the ``Word`` objects of all well-formed sentences.
    """
    result = []
    words = {}
    links = []
    sentence_id = None
    bad_sentence = False

    line_gen = lines_gz if compressed else lines_csv
    for line in line_gen(filename):
        line_str = line.strip()
        if not line_str:
            # Blank lines carry no fields; indexing them would raise.
            continue

        # Commas become tabs; a run of three tabs is then turned back into
        # a field holding a literal comma (presumably a "," token -- confirm
        # against the input format).
        line_fixed = line_str.replace(',', '\t').replace('\t\t\t', '\t,\t')
        line_split = line_fixed.split("\t")

        starts_sentence = len(line_split) > 1 and line_split[1] == "1"
        # Also flush when the previous sentence produced no words at all but
        # was flagged bad, so the flag cannot leak into this sentence.
        if starts_sentence and (words or bad_sentence):
            if not bad_sentence:
                _finish_sentence(words, links, sentence_id, result)
            words = {}
            links = []
            bad_sentence = False

        try:
            sid, wid, text, msd, lemma, link_src, link_type = line_split
        except ValueError:
            # Malformed row: discard the whole sentence and do not reuse the
            # fields left over from the previous row.
            bad_sentence = True
            continue

        sentence_id = sid
        full_id = "{}.{}".format(sid, wid)
        words[wid] = Word(lemma, msd, full_id, text, True)
        # NOTE(review): '0' appears to mean "no incoming link" -- confirm.
        if link_src != '0':
            links.append((link_src, wid, link_type))

    if not bad_sentence:
        _finish_sentence(words, links, sentence_id, result)
    return result


def load_xml(filename):
    """Parse an XML file and return its root element.

    The first default-namespace declaration (`` xmlns="..."``) is removed and
    every `` xml:`` attribute prefix is replaced by a space, so elements and
    attributes can be looked up by their bare names.
    """
    with open(filename, 'r', encoding='utf8') as fp:
        content = fp.read()

    xmlstring = re.sub(r' xmlns="[^"]+"', '', content, count=1)
    xmlstring = xmlstring.replace(' xml:', ' ')
    return ElementTree.XML(xmlstring)


def file_sentence_generator(et, skip_id_check, do_msd_translate, pc_tag):
    """Build linked ``Word`` objects from every ``<s>`` element under ``et``.

    Despite its name this function returns a list, not a generator.

    Two link notations are understood on ``<link>`` elements:

    * with a ``dep`` attribute: type from ``afun``, source from ``from``,
      destination from ``dep``;
    * otherwise: type from ``ana`` (only values starting with ``syn:`` are
      used, with that prefix removed) and source/destination from the two
      whitespace-separated ids in ``target`` (``#`` characters removed).

    Links whose source id is unknown are skipped. The process exits with
    status 1 when the destination id is unknown, or when the source id
    satisfies ``is_root_id`` and ``skip_id_check`` is false.

    Args:
        et: root element to search for sentences.
        skip_id_check: disables the ``is_root_id`` check on link sources.
        do_msd_translate: forwarded to ``Word.from_xml`` / ``Word.pc_word``.
        pc_tag: tag name of the elements loaded through ``Word.pc_word``.

    Returns:
        A list of all ``Word`` objects created, across all sentences.
    """
    words = {}
    # Materialised before being handed to progress(), as in the original.
    sentences = list(et.iter('s'))

    for sentence in progress(sentences, "load-text"):
        for w in sentence.iter("w"):
            words[w.get('id')] = Word.from_xml(w, do_msd_translate)
        for pc in sentence.iter(pc_tag):
            words[pc.get('id')] = Word.pc_word(pc, do_msd_translate)

        for link in sentence.iter("link"):
            if 'dep' in link.attrib:
                ana = link.get('afun')
                lfrom = link.get('from')
                dest = link.get('dep')
            else:
                ana = link.get('ana')
                # Links without a "syn:" type (or without any type) are ignored.
                if ana is None or not ana.startswith('syn:'):
                    continue
                ana = ana[4:]
                lfrom, dest = link.get('target').replace('#', '').split()

            if lfrom not in words:
                # strange errors, just skip...
                continue

            if not skip_id_check and is_root_id(lfrom):
                logging.error("NOO: {}".format(lfrom))
                sys.exit(1)

            if dest not in words:
                logging.error("Unknown id: {}".format(dest))
                sys.exit(1)

            words[lfrom].add_link(ana, words[dest])

    return list(words.values())