luscenje_struktur/src/wani.py

165 lines
6.6 KiB
Python

from xml.etree import ElementTree
import re
import sys
import logging
import argparse
import pickle
import time
import subprocess
import concurrent.futures
import tempfile
from progress_bar import progress
from word import Word
from syntactic_structure import build_structures
from match_store import MatchStore
from word_stats import WordStats
from writer import Writer
from loader import load_files
from database import Database
def match_file(words, structures):
matches = {s: [] for s in structures}
for s in progress(structures, "matching", infile=True):
for w in words:
mhere = s.match(w)
for match in mhere:
colocation_id = [(idx, w.lemma) for idx, w in match.items()]
colocation_id = [s.id] + list(sorted(colocation_id, key=lambda x: x[0]))
colocation_id = tuple(colocation_id)
matches[s].append((match, colocation_id))
return matches
def main(args):
structures, lemma_msds, max_num_components = build_structures(args)
database = Database(args)
match_store = MatchStore(args, database)
word_stats = WordStats(lemma_msds, database)
if args.parallel:
num_parallel = int(args.parallel)
# make temporary directory to hold temporary files
with tempfile.TemporaryDirectory() as tmpdirname:
cmd = sys.argv
for inpt in args.input:
if inpt in cmd:
cmd.remove(inpt)
# remove "--parallel X"
pidx = cmd.index('--parallel')
del cmd[pidx]
del cmd[pidx]
def func(n):
cmdn = [sys.executable] + cmd + [args.input[n],
"--match-to-file", "{}/{}.p".format(tmpdirname, n)]
subprocess.check_call(cmdn)
return n
# use ThreadPoolExecuter to run subprocesses in parallel using py threads
with concurrent.futures.ThreadPoolExecutor(max_workers=num_parallel) as executor:
# fancy interface to wait for threads to finish
for id_input in executor.map(func, [i for i, _ in enumerate(args.input)]):
with open("{}/{}.p".format(tmpdirname, id_input), "rb") as fp:
words, matches = pickle.load(fp)
match_store.add_matches(matches)
word_stats.add_words(words)
else:
for words in load_files(args):
matches = match_file(words, structures)
# just save to temporary file, used for children of a parallel process
# MUST NOT have more than one file
if args.match_to_file is not None:
with open(args.match_to_file, "wb") as fp:
pickle.dump((words, matches), fp)
return
else:
match_store.add_matches(matches)
word_stats.add_words(words)
# get word renders for lemma/msd
word_stats.generate_renders()
match_store.determine_colocation_dispersions()
# figure out representations!
if args.out or args.out_no_stat:
match_store.set_representations(word_stats, structures)
Writer.make_output_writer(args, max_num_components, match_store, word_stats).write_out(
structures, match_store)
Writer.make_output_no_stat_writer(args, max_num_components, match_store, word_stats).write_out(
structures, match_store)
Writer.make_all_writer(args, max_num_components, match_store, word_stats).write_out(
structures, match_store)
Writer.make_stats_writer(args, max_num_components, match_store, word_stats).write_out(
structures, match_store)
if __name__ == '__main__':
parser = argparse.ArgumentParser(
description='Extract structures from a parsed corpus.')
parser.add_argument('structures',
help='Structures definitions in xml file')
parser.add_argument('input',
help='input xml file in `ssj500k form`, can list more than one', nargs='+')
parser.add_argument('--out',
help='Classic output file')
parser.add_argument('--out-no-stat',
help='Output file, but without statistical columns')
parser.add_argument('--all',
help='Additional output file, writes more data')
parser.add_argument('--stats',
help='Output file for statistics')
parser.add_argument('--no-msd-translate',
help='MSDs are translated from slovene to english by default',
action='store_true')
parser.add_argument('--skip-id-check',
help='Skips checks for ids of <w> and <pc>, if they are in correct format',
action='store_true')
parser.add_argument('--min_freq', help='Minimal frequency in output',
type=int, default=0, const=1, nargs='?')
parser.add_argument('--verbose', help='Enable verbose output to stderr',
choices=["warning", "info", "debug"], default="info",
const="info", nargs='?')
parser.add_argument('--count-files',
help="Count files: more verbose output", action='store_true')
parser.add_argument('--multiple-output',
help='Generate one output for each syntactic structure',
action='store_true')
parser.add_argument('--sort-by',
help="Sort by a this column (index)", type=int, default=-1)
parser.add_argument('--sort-reversed',
help="Sort in reversed ored", action='store_true')
parser.add_argument('--db',
help="Database file to use (instead of memory)", default=None)
parser.add_argument('--keep-db',
help="Does not recreate new database file", action='store_true')
parser.add_argument('--pc-tag',
help='Tag for separators, usually pc or c', default="pc")
parser.add_argument('--parallel',
help='Run in multiple processes, should speed things up')
parser.add_argument('--match-to-file', help='Do not use!')
parser.add_argument('--pickled-structures', help='Do not use!', action='store_true')
parser.add_argument('--hide-inner-progress', help='Do not use!', action='store_true')
args = parser.parse_args()
logging.basicConfig(stream=sys.stderr, level=args.verbose.upper())
progress.init(args)
start = time.time()
main(args)
logging.info("TIME: {}".format(time.time() - start))