from xml.etree import ElementTree
import re
import sys
import logging
import argparse
import pickle
import time
import gc
import subprocess
import concurrent.futures
import tempfile

from luscenje_struktur.progress_bar import progress
from luscenje_struktur.sloleks_db import SloleksDatabase
from luscenje_struktur.word import Word
from luscenje_struktur.syntactic_structure import build_structures
from luscenje_struktur.match_store import MatchStore
from luscenje_struktur.word_stats import WordStats
from luscenje_struktur.writer import Writer
from luscenje_struktur.loader import load_files
from luscenje_struktur.database import Database
from luscenje_struktur.time_info import TimeInfo
from luscenje_struktur.postprocessor import Postprocessor


def match_file(words, structures, postprocessor):
    """Match every word of one input file against every syntactic structure.

    Args:
        words: iterable of Word objects parsed from a single corpus file.
        structures: iterable of syntactic structures (each with ``.match()``
            and an ``.id`` attribute).
        postprocessor: Postprocessor used to filter and transform raw matches.

    Returns:
        dict mapping each structure to a list of ``(match, colocation_id)``
        tuples, where ``colocation_id`` is a hashable tuple beginning with
        the structure id followed by ``[component_idx, lemma]`` pairs sorted
        by component index.
    """
    matches = {s: [] for s in structures}

    for s in progress(structures, "matching"):
        for w in words:
            for match in s.match(w):
                # Drop matches whose component order violates the (optional)
                # fixed-restriction-order constraint.
                if not postprocessor.is_fixed_restriction_order(match):
                    continue

                # Build a canonical, order-independent collocation key:
                # structure id first, then (component index, lemma) pairs
                # sorted by component index.
                colocation_id = [[idx, word.lemma] for idx, word in match.items()]
                colocation_id = [s.id] + sorted(colocation_id, key=lambda pair: pair[0])

                # BUG FIX: the processed id was previously assigned to a
                # misspelled name ("collocation_id", double-l), so the
                # postprocessor's transformation of the id was discarded.
                match, colocation_id = postprocessor.process(match, colocation_id)

                matches[s].append((match, tuple(colocation_id)))

    return matches


def main(args):
    """Run the full extraction pipeline driven by parsed CLI arguments.

    Loads structure definitions, streams input files, records matches and
    word statistics into the database, then (if any output file was
    requested) computes representations and writes all requested outputs.
    """
    structures, lemma_msds, max_num_components = build_structures(args)
    timeinfo = TimeInfo(len(args.input))

    database = Database(args)
    match_store = MatchStore(args, database)
    word_stats = WordStats(lemma_msds, database)
    postprocessor = Postprocessor(fixed_restriction_order=args.fixed_restriction_order)

    for words in load_files(args, database):
        # ``None`` signals a file that was skipped/already processed;
        # record a sentinel measurement so time estimates stay aligned.
        if words is None:
            timeinfo.add_measurement(-1)
            continue

        start_time = time.time()
        matches = match_file(words, structures, postprocessor)

        match_store.add_matches(matches)
        word_stats.add_words(words)
        database.commit()

        # Force a bit of garbage collection: per-file data can be large,
        # and releasing it before the next file keeps peak memory down.
        del words
        del matches
        gc.collect()

        timeinfo.add_measurement(time.time() - start_time)
        timeinfo.info()

    # If no output files were requested, just exit.
    if all(x is None for x in (args.out, args.out_no_stat, args.all, args.stats)):
        return

    # Get word renders for lemma/msd.
    word_stats.generate_renders()
    match_store.determine_colocation_dispersions()

    # Figure out representations! Only needed for the two "classic" outputs.
    if args.out or args.out_no_stat:
        if args.sloleks_db is not None:
            sloleks_db = SloleksDatabase(args.sloleks_db, args.load_sloleks)
        else:
            sloleks_db = None
        match_store.set_representations(word_stats, structures, sloleks_db=sloleks_db)
        if args.sloleks_db is not None:
            sloleks_db.close()

    Writer.make_output_writer(args, max_num_components, match_store, word_stats).write_out(
        structures, match_store)
    Writer.make_output_no_stat_writer(args, max_num_components, match_store, word_stats).write_out(
        structures, match_store)
    Writer.make_all_writer(args, max_num_components, match_store, word_stats).write_out(
        structures, match_store)
    Writer.make_stats_writer(args, max_num_components, match_store, word_stats).write_out(
        structures, match_store)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Extract structures from a parsed corpus.')
    parser.add_argument('structures',
                        help='Structures definitions in xml file')
    parser.add_argument('input',
                        help='input file in (gz or xml currently). If none, then just database is loaded',
                        nargs='*')
    parser.add_argument('--sloleks_db', type=str, default=None,
                        help='Sloleks database credentials')
    parser.add_argument('--out',
                        help='Classic output file')
    parser.add_argument('--out-no-stat',
                        help='Output file, but without statistical columns')
    parser.add_argument('--all',
                        help='Additional output file, writes more data')
    parser.add_argument('--stats',
                        help='Output file for statistics')
    # parser.add_argument('--no-msd-translate',
    #                     help='MSDs are translated from slovene to english by default',
    #                     action='store_true')
    # NOTE(review): the element names below were reconstructed (<s>, <p>) —
    # the original help text lost them to markup stripping; confirm.
    parser.add_argument('--skip-id-check',
                        help='Skips checks for ids of <s> and <p>, if they are in correct format',
                        action='store_true')
    parser.add_argument('--min_freq', help='Minimal frequency in output',
                        type=int, default=0, const=1, nargs='?')
    parser.add_argument('--verbose', help='Enable verbose output to stderr',
                        choices=["warning", "info", "debug"],
                        default="info", const="info", nargs='?')
    parser.add_argument('--count-files',
                        help="Count files: more verbose output",
                        action='store_true')
    parser.add_argument('--multiple-output',
                        help='Generate one output for each syntactic structure',
                        action='store_true')
    parser.add_argument('--load-sloleks',
                        help='Tells whether sloleks is loaded into memory at the beginning of processing or not.',
                        action='store_true')
    parser.add_argument('--sort-by',
                        help="Sort by a this column (index)",
                        type=int, default=-1)
    parser.add_argument('--sort-reversed',
                        help="Sort in reversed order",
                        action='store_true')
    parser.add_argument('--db',
                        help="Database file to use (instead of memory)",
                        default=None)
    parser.add_argument('--collocation_sentence_map_dest',
                        help="Destination to folder where collocation-sentence mapper (mappers in case of multiple-output).",
                        default=None)
    parser.add_argument('--new-db',
                        help="Writes over database file, if there exists one",
                        action='store_true')
    parser.add_argument('--pc-tag',
                        help='Tag for separators, usually pc or c',
                        default="pc")
    parser.add_argument('--separator',
                        help='Separator in output file',
                        default="\t")
    # NOTE(review): help text previously copy-pasted from --sort-reversed
    # ("Sort in reversed ored"); rewritten to describe the actual flag.
    parser.add_argument('--ignore-punctuations',
                        help="If set, punctuation tokens are ignored",
                        action='store_true')
    parser.add_argument('--fixed-restriction-order',
                        help='If used, words have to be in the same order as components.',
                        action='store_true')
    parser.add_argument('--new-tei',
                        help='Attribute to be used, when using new version of tei. (default=False)',
                        action='store_true')
    args = parser.parse_args()

    logging.basicConfig(stream=sys.stderr, level=args.verbose.upper())

    start = time.time()
    main(args)
    logging.info("TIME: {}".format(time.time() - start))