diff --git a/src/wani.py b/src/wani.py index 16c9cc0..6ea7819 100644 --- a/src/wani.py +++ b/src/wani.py @@ -5,6 +5,7 @@ import logging import argparse import pickle import time +import gc import subprocess import concurrent.futures import tempfile @@ -42,49 +43,15 @@ def main(args): match_store = MatchStore(args, database) word_stats = WordStats(lemma_msds, database) - if args.parallel: - num_parallel = int(args.parallel) + for words in load_files(args): + matches = match_file(words, structures) + match_store.add_matches(matches) + word_stats.add_words(words) - # make temporary directory to hold temporary files - with tempfile.TemporaryDirectory() as tmpdirname: - cmd = sys.argv - for inpt in args.input: - if inpt in cmd: - cmd.remove(inpt) - - # remove "--parallel X" - pidx = cmd.index('--parallel') - del cmd[pidx] - del cmd[pidx] - - def func(n): - cmdn = [sys.executable] + cmd + [args.input[n], - "--match-to-file", "{}/{}.p".format(tmpdirname, n)] - subprocess.check_call(cmdn) - return n - - # use ThreadPoolExecuter to run subprocesses in parallel using py threads - with concurrent.futures.ThreadPoolExecutor(max_workers=num_parallel) as executor: - # fancy interface to wait for threads to finish - for id_input in executor.map(func, [i for i, _ in enumerate(args.input)]): - with open("{}/{}.p".format(tmpdirname, id_input), "rb") as fp: - words, matches = pickle.load(fp) - - match_store.add_matches(matches) - word_stats.add_words(words) - - else: - for words in load_files(args): - matches = match_file(words, structures) - # just save to temporary file, used for children of a parallel process - # MUST NOT have more than one file - if args.match_to_file is not None: - with open(args.match_to_file, "wb") as fp: - pickle.dump((words, matches), fp) - return - else: - match_store.add_matches(matches) - word_stats.add_words(words) + # force a bit of garbage collection + del words + del matches + gc.collect() # get word renders for lemma/msd word_stats.generate_renders() @@ -148,10 +115,7 @@ if __name__ == '__main__': parser.add_argument('--pc-tag', help='Tag for separators, usually pc or c', default="pc") - parser.add_argument('--parallel', - help='Run in multiple processes, should speed things up') - parser.add_argument('--match-to-file', help='Do not use!') args = parser.parse_args() logging.basicConfig(stream=sys.stderr, level=args.verbose.upper())