# luscenje_struktur/src/wani.py
# (158 lines, 6.2 KiB, Python)

from xml.etree import ElementTree
import re
import sys
import logging
import argparse
import pickle
import time
import subprocess
import concurrent.futures
import tempfile
2019-06-17 15:30:51 +00:00
from progress_bar import progress
from word import Word
from syntactic_structure import build_structures
from match_store import MatchStore
from word_stats import WordStats
from writer import Writer
2019-06-17 13:38:55 +00:00
from loader import load_files
def match_file(words, structures):
    """Match every structure against every word and collect the hits.

    Returns a dict mapping each structure to a list of
    ``(match, colocation_id)`` pairs.  ``colocation_id`` is a tuple that
    starts with the structure's ``id`` and continues with
    ``(index, lemma)`` pairs of the matched components, ordered by index.
    """
    matches = {structure: [] for structure in structures}

    for structure in progress(structures, "matching", infile=True):
        for word in words:
            for match in structure.match(word):
                # identify the colocation by its components' lemmas,
                # ordered by the component index
                components = [(idx, component.lemma)
                              for idx, component in match.items()]
                components.sort(key=lambda pair: pair[0])
                colocation_id = tuple([structure.id] + components)

                matches[structure].append((match, colocation_id))

    return matches
2019-06-15 23:00:22 +00:00
def _strip_parallel_option(argv):
    """Return a copy of *argv* without the ``--parallel`` option and its value.

    Handles every spelling argparse accepts for the option defined in this
    file: ``--parallel N``, ``--parallel=N`` and unambiguous abbreviations
    (``--pa`` is the shortest prefix not shared with ``--pc-tag`` or
    ``--pickled-structures``).  All occurrences are removed, so a child
    process can never re-enter parallel mode.

    Raises:
        ValueError: if *argv* contains no ``--parallel`` option at all.
    """
    stripped = []
    found = False
    skip_value = False
    for arg in argv:
        if skip_value:
            # this argument is the value of the preceding "--parallel"
            skip_value = False
            continue
        name, has_value, _ = arg.partition('=')
        if len(name) >= len('--pa') and '--parallel'.startswith(name):
            found = True
            # "--parallel N" carries its value in the next argument,
            # "--parallel=N" carries it inline
            skip_value = not has_value
            continue
        stripped.append(arg)
    if not found:
        raise ValueError("'--parallel' is not in list")
    return stripped


def main(args):
    """Run the extraction: match structures, gather statistics, write output.

    With ``args.parallel`` set, one child process of this very script is
    started per input file (at most ``int(args.parallel)`` at a time).  Each
    child is invoked with ``--match-to-file`` and pickles its
    ``(words, matches)`` pair into a temporary directory; the parent loads
    those pickles and merges them.  Without ``args.parallel`` the input is
    loaded and matched in this process.

    When ``args.match_to_file`` is set (i.e. this process is such a child),
    the first loaded ``(words, matches)`` pair is pickled to that path and
    the function returns without writing any other output.

    Raises:
        ValueError: in parallel mode, if no ``--parallel`` option can be
            found on the command line.
        subprocess.CalledProcessError: if a child process exits with a
            non-zero status.
    """
    structures, lemma_msds, max_num_components = build_structures(args)
    match_store = MatchStore(args)
    word_stats = WordStats(lemma_msds)

    if args.parallel:
        num_parallel = int(args.parallel)

        # make temporary directory to hold temporary files
        with tempfile.TemporaryDirectory() as tmpdirname:
            # work on a copy so the global sys.argv is left untouched
            cmd = list(sys.argv)
            for inpt in args.input:
                if inpt in cmd:
                    cmd.remove(inpt)

            # remove "--parallel X" (in any spelling argparse accepts)
            cmd = _strip_parallel_option(cmd)

            def func(n):
                # re-run this script on the n-th input only
                cmdn = [sys.executable] + cmd + [
                    args.input[n],
                    "--match-to-file", "{}/{}.p".format(tmpdirname, n)]
                subprocess.check_call(cmdn)
                return n

            # use ThreadPoolExecutor to run subprocesses in parallel using py threads
            with concurrent.futures.ThreadPoolExecutor(max_workers=num_parallel) as executor:
                # map() yields results in input order as children finish
                for id_input in executor.map(func, range(len(args.input))):
                    # the pickle was written by our own child process
                    with open("{}/{}.p".format(tmpdirname, id_input), "rb") as fp:
                        words, matches = pickle.load(fp)

                    match_store.add_matches(matches)
                    word_stats.add_words(words)

    else:
        for words in load_files(args):
            matches = match_file(words, structures)
            # just save to temporary file, used for children of a parallel process
            # MUST NOT have more than one file
            if args.match_to_file is not None:
                with open(args.match_to_file, "wb") as fp:
                    pickle.dump((words, matches), fp)
                return
            else:
                match_store.add_matches(matches)
                word_stats.add_words(words)

    # get word renders for lemma/msd
    word_stats.generate_renders()
    match_store.determine_colocation_dispersions()

    # figure out representations!
    if args.out or args.out_no_stat:
        match_store.set_representations(word_stats)

    # NOTE(review): the indentation of this file was lost; the writers below
    # are assumed to run unconditionally, with only set_representations()
    # guarded by the condition above -- confirm against the repository.
    Writer.make_output_writer(args, max_num_components, match_store, word_stats).write_out(
        structures, match_store)
    Writer.make_output_no_stat_writer(args, max_num_components, match_store, word_stats).write_out(
        structures, match_store)
    Writer.make_all_writer(args, max_num_components, match_store, word_stats).write_out(
        structures, match_store)
    Writer.make_stats_writer(args, max_num_components, match_store, word_stats).write_out(
        structures, match_store)
if __name__ == '__main__':
    # Command-line entry point: parse the arguments, configure logging and
    # the progress bar, run main() and log the elapsed wall-clock time.
    parser = argparse.ArgumentParser(
        description='Extract structures from a parsed corpus.')
    parser.add_argument('structures',
                        help='Structures definitions in xml file')
    parser.add_argument('input',
                        help='input xml file in `ssj500k form`, can list more than one', nargs='+')
    parser.add_argument('--out',
                        help='Classic output file')
    parser.add_argument('--out-no-stat',
                        help='Output file, but without statistical columns')
    parser.add_argument('--all',
                        help='Additional output file, writes more data')
    parser.add_argument('--stats',
                        help='Output file for statistics')

    parser.add_argument('--no-msd-translate',
                        help='MSDs are translated from slovene to english by default',
                        action='store_true')
    parser.add_argument('--skip-id-check',
                        help='Skips checks for ids of <w> and <pc>, if they are in correct format',
                        action='store_true')
    parser.add_argument('--min_freq', help='Minimal frequency in output',
                        type=int, default=0, const=1, nargs='?')
    parser.add_argument('--verbose', help='Enable verbose output to stderr',
                        choices=["warning", "info", "debug"], default="info",
                        const="info", nargs='?')
    parser.add_argument('--count-files',
                        help="Count files: more verbose output", action='store_true')
    parser.add_argument('--multiple-output',
                        help='Generate one output for each syntactic structure',
                        action='store_true')

    parser.add_argument('--sort-by',
                        help="Sort by this column (index)", type=int, default=-1)
    parser.add_argument('--sort-reversed',
                        help="Sort in reversed order", action='store_true')

    parser.add_argument('--pc-tag',
                        help='Tag for separators, usually pc or c', default="pc")
    # type=int makes argparse reject a non-numeric value with a usage error
    # instead of a traceback from int() inside main()
    parser.add_argument('--parallel', type=int,
                        help='Run in multiple processes, should speed things up')

    # Internal options.  --match-to-file is passed by main() to the child
    # processes it spawns in parallel mode; the other two are not read in
    # this file -- presumably consumed by imported modules (verify).
    parser.add_argument('--match-to-file', help='Do not use!')
    parser.add_argument('--pickled-structures', help='Do not use!', action='store_true')
    parser.add_argument('--hide-inner-progress', help='Do not use!', action='store_true')

    args = parser.parse_args()
    # logging accepts the level as an upper-case name ("INFO", "DEBUG", ...)
    logging.basicConfig(stream=sys.stderr, level=args.verbose.upper())
    progress.init(args)

    start = time.time()
    main(args)
    logging.info("TIME: %s", time.time() - start)