"""Extract syntactic structures from a parsed XML corpus.

The script reads one or more corpus XML files, builds ``Word`` objects that
are linked by their dependency relations, matches them against the structure
definitions and hands the collected matches to the output writers.
"""
from xml.etree import ElementTree
import re
import sys
import logging
import argparse
import pickle
import time
import subprocess
import concurrent.futures
import tempfile

try:
    from tqdm import tqdm
except ImportError:
    def tqdm(iterable):
        """Fallback when tqdm is not installed: iterate without a progress bar."""
        return iterable

from word import Word
from syntactic_structure import build_structures
from match_store import MatchStore
from word_stats import WordStats
from writer import Writer

# Number of collected words after which the sentence generator emits a chunk.
DEFAULT_CHUNK_SIZE = 50000


def is_root_id(id_):
    """Return True when ``id_`` consists of exactly three dot-separated parts."""
    return len(id_.split('.')) == 3


def load_files(args):
    """Yield lists of linked words for every input file named in ``args``.

    Reads ``args.input``, ``args.skip_id_check``, ``args.no_msd_translate``,
    ``args.pc_tag``, ``args.count_files`` and, when present,
    ``args.chunk_size`` (a missing attribute or 0 disables chunking).
    """
    filenames = args.input
    skip_id_check = args.skip_id_check
    do_msd_translate = not args.no_msd_translate
    chunk_size = getattr(args, "chunk_size", 0)

    for n, fname in enumerate(filenames):
        if args.count_files:
            status = " :: {} / {}".format(n, len(filenames))
        else:
            status = ""

        # Parse each file exactly once and pass the parsed tree on; load_xml
        # needs the status suffix for its log line.
        et = load_xml(fname, status)
        yield from file_sentence_generator(
            et, skip_id_check, do_msd_translate, args.pc_tag, chunk_size)


def load_xml(filename, status):
    """Read ``filename`` and return its root element with namespaces stripped.

    ``status`` is only appended to the log message. The first ``xmlns``
    declaration and every `` xml:`` attribute prefix are removed so that
    elements and attributes can be addressed by their plain names.
    """
    logging.info("LOADING XML: {}{}".format(filename, status))
    with open(filename, 'r') as fp:
        content = fp.read()

    xmlstring = re.sub(' xmlns="[^"]+"', '', content, count=1)
    xmlstring = xmlstring.replace(' xml:', ' ')
    return ElementTree.XML(xmlstring)


def file_sentence_generator(et, skip_id_check, do_msd_translate, pc_tag, chunk_size):
    """Yield lists of ``Word`` objects built from the ``<s>`` elements of ``et``.

    Words come from ``<w>`` elements and from ``pc_tag`` elements; ``<link>``
    elements connect them through ``Word.add_link``. Words are accumulated
    across sentences and emitted as a list once more than ``chunk_size`` of
    them are collected (only when ``chunk_size > 0``); whatever remains is
    emitted at the end, so at least one (possibly empty) list is yielded.

    Exits the process with status 1 when a link source has a three-part id
    (unless ``skip_id_check`` is set) or when a link points to an unknown id.
    """
    words = {}
    for sentence in et.iter('s'):
        for w in sentence.iter("w"):
            words[w.get('id')] = Word(w, do_msd_translate)
        for pc in sentence.iter(pc_tag):
            words[pc.get('id')] = Word.pc_word(pc, do_msd_translate)

        for link in sentence.iter("link"):
            if 'dep' in link.keys():
                ana = link.get('afun')
                lfrom = link.get('from')
                dest = link.get('dep')
            else:
                ana = link.get('ana')
                # Only 'syn:' links are of interest; a link without any
                # 'ana' attribute is skipped as well instead of crashing.
                if ana is None or ana[:4] != 'syn:':
                    continue
                ana = ana[4:]
                lfrom, dest = link.get('target').replace('#', '').split()

            if lfrom not in words:
                # strange errors, just skip...
                continue

            if not skip_id_check and is_root_id(lfrom):
                logging.error("NOO: {}".format(lfrom))
                sys.exit(1)

            if dest not in words:
                logging.error("Unknown id: {}".format(dest))
                sys.exit(1)

            words[lfrom].add_link(ana, words[dest])

        if chunk_size > 0 and len(words) > chunk_size:
            yield list(words.values())
            words = {}

    yield list(words.values())


def match_file(words, structures):
    """Match every word against every structure.

    Returns a dict mapping each structure to a list of
    ``(match, colocation_id)`` pairs, where ``colocation_id`` is the tuple
    ``(structure.id, (idx, lemma), ...)`` with the pairs sorted by ``idx``.
    """
    matches = {s: [] for s in structures}

    for s in structures:
        for w in words:
            for match in s.match(w):
                components = sorted(
                    ((idx, mw.lemma) for idx, mw in match.items()),
                    key=lambda pair: pair[0])
                colocation_id = tuple([s.id] + components)
                matches[s].append((match, colocation_id))

    return matches


def _child_command(argv, inputs):
    """Return a copy of ``argv`` without the input files and ``--parallel X``.

    Works on a copy so that the process-wide ``sys.argv`` is left untouched.
    Raises ``ValueError`` when ``--parallel`` is not present as its own token.
    """
    cmd = list(argv)
    for inpt in inputs:
        if inpt in cmd:
            cmd.remove(inpt)

    # remove "--parallel X"
    pidx = cmd.index('--parallel')
    del cmd[pidx:pidx + 2]
    return cmd


def main(structures_file, args):
    """Run the extraction described by ``args`` and write the outputs.

    With ``args.parallel`` set, one child process per input file is started
    (at most that many at a time); each child is given ``--match-to-file``
    and pickles its result into a temporary directory, from which the parent
    collects it. Otherwise the inputs are processed in this process. When
    ``args.match_to_file`` is set (child mode) the result of the first input
    is pickled to that path and the function returns without writing output.
    """
    structures, lemma_msds, max_num_components = build_structures(structures_file)
    match_store = MatchStore(args)
    word_stats = WordStats(lemma_msds)

    # A child process pickles a single (words, matches) pair and returns, so
    # it must receive its whole file in one piece; chunking there would drop
    # everything after the first chunk.
    args.chunk_size = 0 if args.match_to_file is not None else DEFAULT_CHUNK_SIZE

    if args.parallel:
        num_parallel = int(args.parallel)

        # make temporary directory to hold temporary files
        with tempfile.TemporaryDirectory() as tmpdirname:
            cmd = _child_command(sys.argv, args.input)

            def func(n):
                cmdn = [sys.executable] + cmd + [
                    args.input[n], "--match-to-file", "{}/{}.p".format(tmpdirname, n)]
                subprocess.check_call(cmdn)
                return n

            # use ThreadPoolExecutor to run subprocesses in parallel using py threads
            with concurrent.futures.ThreadPoolExecutor(max_workers=num_parallel) as executor:
                # map() yields the indices in input order as the children finish
                for id_input in executor.map(func, range(len(args.input))):
                    # The pickle was written by our own child process into
                    # our own temporary directory.
                    with open("{}/{}.p".format(tmpdirname, id_input), "rb") as fp:
                        words, matches = pickle.load(fp)

                    match_store.add_matches(matches)
                    word_stats.add_words(words)

    else:
        for words in tqdm(load_files(args)):
            matches = match_file(words, structures)
            # just save to temporary file, used for children of a parallel process
            # MUST NOT have more than one file
            if args.match_to_file is not None:
                with open(args.match_to_file, "wb") as fp:
                    pickle.dump((words, matches), fp)
                return

            match_store.add_matches(matches)
            word_stats.add_words(words)

    # get word renders for lemma/msd
    word_stats.generate_renders()
    match_store.determine_colocation_dispersions()

    # figure out representations!
    if args.out or args.out_no_stat:
        match_store.set_representations(word_stats)

    Writer.make_output_writer(args, max_num_components, match_store, word_stats).write_out(
        structures, match_store)
    Writer.make_output_no_stat_writer(args, max_num_components, match_store, word_stats).write_out(
        structures, match_store)
    Writer.make_all_writer(args, max_num_components, match_store, word_stats).write_out(
        structures, match_store)
    Writer.make_stats_writer(args, max_num_components, match_store, word_stats).write_out(
        structures, match_store)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Extract structures from a parsed corpus.')
    parser.add_argument('structures',
                        help='Structures definitions in xml file')
    parser.add_argument('input',
                        help='input xml file in `ssj500k form`, can list more than one', nargs='+')
    parser.add_argument('--out',
                        help='Classic output file')
    parser.add_argument('--out-no-stat',
                        help='Output file, but without statistical columns')
    parser.add_argument('--all',
                        help='Additional output file, writes more data')
    parser.add_argument('--stats',
                        help='Output file for statistics')

    parser.add_argument('--no-msd-translate',
                        help='MSDs are translated from slovene to english by default',
                        action='store_true')
    parser.add_argument('--skip-id-check',
                        help='Skips checks for ids of and , if they are in correct format',
                        action='store_true')
    parser.add_argument('--min_freq', help='Minimal frequency in output',
                        type=int, default=0, const=1, nargs='?')
    parser.add_argument('--verbose', help='Enable verbose output to stderr',
                        choices=["warning", "info", "debug"], default="info",
                        const="info", nargs='?')
    parser.add_argument('--count-files',
                        help="Count files: more verbose output", action='store_true')
    parser.add_argument('--multiple-output',
                        help='Generate one output for each syntactic structure',
                        action='store_true')

    parser.add_argument('--sort-by',
                        help="Sort by a this column (index)", type=int, default=-1)
    parser.add_argument('--sort-reversed',
                        help="Sort in reversed ored", action='store_true')

    parser.add_argument('--pc-tag',
                        help='Tag for separators, usually pc or c', default="pc")
    parser.add_argument('--parallel',
                        help='Run in multiple processes, should speed things up')
    parser.add_argument('--match-to-file', help='Do not use!')

    args = parser.parse_args()
    logging.basicConfig(stream=sys.stderr, level=args.verbose.upper())

    start = time.time()
    main(args.structures, args)
    logging.info("TIME: {}".format(time.time() - start))