# luscenje_struktur/src/wani.py

from xml.etree import ElementTree
import re
import sys
import logging
import argparse
import pickle
import time
import gc
import subprocess
import concurrent.futures
import tempfile

from progress_bar import progress
from word import Word
from syntactic_structure import build_structures
from match_store import MatchStore
from word_stats import WordStats
from writer import Writer
from loader import load_files
from database import Database


def match_file(words, structures):
    """Match every structure against every word and collect the hits.

    Args:
        words: Iterable of word objects; it is traversed once per structure.
        structures: Iterable of structure objects providing ``match(word)``
            and ``id``. Each structure is used as a dictionary key.

    Returns:
        A dict mapping each structure to a list of
        ``(match, colocation_id)`` pairs. ``match`` is one mapping yielded by
        ``structure.match(word)``; ``colocation_id`` is a tuple that starts
        with the structure id and continues with ``(idx, lemma)`` pairs taken
        from the match, ordered by ``idx``.
    """
    found = {structure: [] for structure in structures}

    for structure in progress(structures, "matching"):
        for word in words:
            for match in structure.match(word):
                # Order the components by their index only, so the same
                # components always produce the same identifier.
                components = sorted(
                    ((idx, component.lemma) for idx, component in match.items()),
                    key=lambda pair: pair[0])
                colocation_id = tuple([structure.id] + components)

                found[structure].append((match, colocation_id))

    return found


def main(args):
    """Run the extraction pipeline for the parsed command-line ``args``.

    Builds the structures, matches them against every batch of words yielded
    by ``load_files``, accumulates the matches and word statistics, and then
    runs each of the four writers over the collected results.

    Args:
        args: Parsed argument namespace; it is forwarded to the structure
            builder, the database, the match store, the loader and the
            writers, and its ``out`` / ``out_no_stat`` attributes are read
            here.
    """
    structures, lemma_msds, max_num_components = build_structures(args)

    db = Database(args)
    store = MatchStore(args, db)
    stats = WordStats(lemma_msds, db)

    for batch in load_files(args):
        batch_matches = match_file(batch, structures)
        store.add_matches(batch_matches)
        stats.add_words(batch)

        # Drop the references to this batch and trigger a collection before
        # the next batch is loaded.
        del batch, batch_matches
        gc.collect()

    # Word renders for lemma/msd, then colocation dispersions.
    stats.generate_renders()
    store.determine_colocation_dispersions()

    # Representations are only computed when one of the two "classic"
    # outputs was requested.
    wants_representations = args.out or args.out_no_stat
    if wants_representations:
        store.set_representations(stats, structures)

    # Every writer is created and run unconditionally, in this order.
    writer_factories = (
        Writer.make_output_writer,
        Writer.make_output_no_stat_writer,
        Writer.make_all_writer,
        Writer.make_stats_writer,
    )
    for make_writer in writer_factories:
        writer = make_writer(args, max_num_components, store, stats)
        writer.write_out(structures, store)

if __name__ == '__main__':
    # Command-line entry point: declare the options, configure logging from
    # --verbose, run the pipeline and log how long it took.
    parser = argparse.ArgumentParser(
        description='Extract structures from a parsed corpus.')

    # Positional arguments: structure definitions, then one or more inputs.
    parser.add_argument('structures',
                        help='Structures definitions in xml file')
    parser.add_argument('input',
                        help='input xml file in `ssj500k form`, can list more than one', nargs='+')

    # Output destinations.
    parser.add_argument('--out',
                        help='Classic output file')
    parser.add_argument('--out-no-stat',
                        help='Output file, but without statistical columns')
    parser.add_argument('--all',
                        help='Additional output file, writes more data')
    parser.add_argument('--stats',
                        help='Output file for statistics')

    parser.add_argument('--no-msd-translate',
                        help='MSDs are translated from slovene to english by default',
                        action='store_true')
    parser.add_argument('--skip-id-check',
                        help='Skips checks for ids of <w> and <pc>, if they are in correct format',
                        action='store_true')
    # nargs='?' with const: omitted -> 0, bare `--min_freq` -> 1,
    # `--min_freq N` -> N.
    parser.add_argument('--min_freq', help='Minimal frequency in output',
                        type=int, default=0, const=1, nargs='?')
    # Both the omitted and the bare form resolve to "info".
    parser.add_argument('--verbose', help='Enable verbose output to stderr',
                        choices=["warning", "info", "debug"], default="info",
                        const="info", nargs='?')
    parser.add_argument('--count-files',
                        help="Count files: more verbose output", action='store_true')
    parser.add_argument('--multiple-output',
                        help='Generate one output for each syntactic structure',
                        action='store_true')

    # Sorting of the output.
    parser.add_argument('--sort-by',
                        help="Sort by this column (index)", type=int, default=-1)
    parser.add_argument('--sort-reversed',
                        help="Sort in reversed order", action='store_true')

    # Database backing.
    parser.add_argument('--db',
                        help="Database file to use (instead of memory)", default=None)
    parser.add_argument('--keep-db',
                        help="Does not recreate new database file", action='store_true')

    parser.add_argument('--pc-tag',
                        help='Tag for separators, usually pc or c', default="pc")


    args = parser.parse_args()
    # The level is passed by name; the upper-cased choices ("WARNING",
    # "INFO", "DEBUG") are the standard logging level names.
    logging.basicConfig(stream=sys.stderr, level=args.verbose.upper())

    # Use a monotonic clock for the elapsed time so that system clock
    # adjustments during a long run cannot distort (or negate) the result.
    start = time.perf_counter()
    main(args)
    logging.info("TIME: {}".format(time.perf_counter() - start))