Moved wani.py + Added ignore of .zstd files for valency

2020-10-01 16:20:52 +02:00
parent 412d0c0f62
commit d5668c8b68
3 changed files with 5 additions and 1 deletions
--- a/wani.py
+++ b/wani.py
@@ -0,0 +1,160 @@
+from xml.etree import ElementTree
+import re
+import sys
+import logging
+import argparse
+import pickle
+import time
+import gc
+import subprocess
+import concurrent.futures
+import tempfile
+
+from luscenje_struktur.progress_bar import progress
+from luscenje_struktur.sloleks_db import SloleksDatabase
+from luscenje_struktur.word import Word
+from luscenje_struktur.syntactic_structure import build_structures
+from luscenje_struktur.match_store import MatchStore
+from luscenje_struktur.word_stats import WordStats
+from luscenje_struktur.writer import Writer
+from luscenje_struktur.loader import load_files
+from luscenje_struktur.database import Database
+from luscenje_struktur.time_info import TimeInfo
+
+from luscenje_struktur.postprocessor import Postprocessor
+
+
+def match_file(words, structures, postprocessor):
+    matches = {s: [] for s in structures}
+
+    for s in progress(structures, "matching"):
+        for w in words:
+            mhere = s.match(w)
+            for match in mhere:
+                colocation_id = [[idx, w.lemma] for idx, w in match.items()]
+                colocation_id = [s.id] + list(sorted(colocation_id, key=lambda x: x[0]))
+                match, collocation_id = postprocessor.process(match, colocation_id)
+                colocation_id = tuple(colocation_id)
+
+                matches[s].append((match, colocation_id))
+
+    return matches
+
+
+def main(args):
+    structures, lemma_msds, max_num_components = build_structures(args)
+    timeinfo = TimeInfo(len(args.input))
+
+    database = Database(args)
+    match_store = MatchStore(args, database)
+    word_stats = WordStats(lemma_msds, database)
+
+    for words in load_files(args, database):
+        if words is None:
+            timeinfo.add_measurement(-1)
+            continue
+
+        start_time = time.time()
+        postprocessor = Postprocessor()
+        matches = match_file(words, structures, postprocessor)
+
+        match_store.add_matches(matches)
+        word_stats.add_words(words)
+        database.commit()
+
+        # force a bit of garbage collection
+        del words
+        del matches
+        gc.collect()
+
+        timeinfo.add_measurement(time.time() - start_time)
+        timeinfo.info()
+
+    # if no output files, just exit
+    if all([x == None for x in [args.out, args.out_no_stat, args.all, args.stats]]):
+        return
+
+    # get word renders for lemma/msd
+    word_stats.generate_renders()
+    match_store.determine_colocation_dispersions()
+
+    # figure out representations!
+    if args.out or args.out_no_stat:
+        if args.sloleks_db is not None:
+            sloleks_db = SloleksDatabase(args.sloleks_db, args.load_sloleks)
+        else:
+            sloleks_db = None
+        match_store.set_representations(word_stats, structures, sloleks_db=sloleks_db)
+        if args.sloleks_db is not None:
+            sloleks_db.close()
+
+    Writer.make_output_writer(args, max_num_components, match_store, word_stats).write_out(
+        structures, match_store)
+    Writer.make_output_no_stat_writer(args, max_num_components, match_store, word_stats).write_out(
+        structures, match_store)
+    Writer.make_all_writer(args, max_num_components, match_store, word_stats).write_out(
+        structures, match_store)
+    Writer.make_stats_writer(args, max_num_components, match_store, word_stats).write_out(
+        structures, match_store)
+
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(
+        description='Extract structures from a parsed corpus.')
+    parser.add_argument('structures',
+                        help='Structures definitions in xml file')
+    parser.add_argument('input',
+                        help='input file in (gz or xml currently). If none, then just database is loaded', nargs='*')
+    parser.add_argument('--sloleks_db', type=str, default=None, help='Sloleks database credentials')
+    parser.add_argument('--out',
+                        help='Classic output file')
+    parser.add_argument('--out-no-stat',
+                        help='Output file, but without statistical columns')
+    parser.add_argument('--all',
+                        help='Additional output file, writes more data')
+    parser.add_argument('--stats',
+                        help='Output file for statistics')
+#
+    parser.add_argument('--no-msd-translate',
+                        help='MSDs are translated from slovene to english by default',
+                        action='store_true')
+    parser.add_argument('--skip-id-check',
+                        help='Skips checks for ids of <w> and <pc>, if they are in correct format',
+                        action='store_true')
+    parser.add_argument('--min_freq', help='Minimal frequency in output',
+                        type=int, default=0, const=1, nargs='?')
+    parser.add_argument('--verbose', help='Enable verbose output to stderr',
+                        choices=["warning", "info", "debug"], default="info",
+                        const="info", nargs='?')
+    parser.add_argument('--count-files',
+                        help="Count files: more verbose output", action='store_true')
+    parser.add_argument('--multiple-output',
+                        help='Generate one output for each syntactic structure',
+                        action='store_true')
+
+    parser.add_argument('--load-sloleks',
+                        help='Tells weather sloleks is loaded into memory at the beginning of processing or not. Should be in',
+                        action='store_true')
+
+    parser.add_argument('--sort-by',
+                        help="Sort by a this column (index)", type=int, default=-1)
+    parser.add_argument('--sort-reversed',
+                        help="Sort in reversed ored", action='store_true')
+
+    parser.add_argument('--db',
+                        help="Database file to use (instead of memory)", default=None)
+    parser.add_argument('--collocation_sentence_map_dest',
+                        help="Destination to folder where collocation-sentence mapper (mappers in case of multiple-output).", default=None)
+    parser.add_argument('--new-db',
+                        help="Writes over database file, if there exists one", action='store_true')
+
+    parser.add_argument('--pc-tag',
+                        help='Tag for separators, usually pc or c', default="pc")
+
+    args = parser.parse_args()
+    logging.basicConfig(stream=sys.stderr, level=args.verbose.upper())
+
+    start = time.time()
+    main(args)
+    logging.info("TIME: {}".format(time.time() - start))