# Source: luscenje_struktur/src/wani.py (Python, 126 lines, 4.7 KiB)

from xml.etree import ElementTree
import re
import sys
import logging
import argparse
import pickle
import time
import gc
import subprocess
import concurrent.futures
import tempfile
from progress_bar import progress
from word import Word
from syntactic_structure import build_structures
from match_store import MatchStore
from word_stats import WordStats
from writer import Writer
from loader import load_files
from database import Database
def match_file(words, structures):
    """Match every structure against every word and collect the results.

    Args:
        words: iterable of word objects; each one is passed to
            ``structure.match``.
        structures: iterable of hashable structure objects exposing ``id``
            and ``match(word)``.

    Returns:
        A dict mapping each structure to a list of ``(match, colocation_id)``
        pairs. ``match`` is the mapping returned by ``structure.match`` (its
        items are ``(idx, word)`` pairs) and ``colocation_id`` is the tuple
        ``(structure.id, (idx, lemma), ...)`` with the ``(idx, lemma)`` pairs
        ordered by ``idx``.
    """
    matches = {structure: [] for structure in structures}

    for structure in progress(structures, "matching"):
        found = matches[structure]
        for word in words:
            for match in structure.match(word):
                # Order the components by index so the same set of components
                # always yields the same identifier.
                components = sorted(
                    ((idx, component.lemma) for idx, component in match.items()),
                    key=lambda pair: pair[0])
                colocation_id = tuple([structure.id] + components)
                found.append((match, colocation_id))

    return matches
def main(args):
    """Run the extraction pipeline for the parsed command-line ``args``.

    Builds the structures, feeds every word collection yielded by
    ``load_files`` through ``match_file`` into the match store and the word
    statistics, and finally hands the results to each of the output writers.

    Args:
        args: the ``argparse`` namespace produced by the command-line parser;
            it is passed through to the structure builder, database, stores,
            loader and writers, and ``args.out`` / ``args.out_no_stat`` are
            read here directly.
    """
    structures, lemma_msds, max_num_components = build_structures(args)

    database = Database(args)
    match_store = MatchStore(args, database)
    word_stats = WordStats(lemma_msds, database)

    for words in load_files(args):
        matches = match_file(words, structures)
        match_store.add_matches(matches)
        word_stats.add_words(words)

        # Release this round's data before the loader produces the next one,
        # and force a collection so the memory is reclaimed right away.
        del words
        del matches
        gc.collect()

    # Word renders for lemma/msd must exist before anything is written.
    word_stats.generate_renders()
    match_store.determine_colocation_dispersions()

    # Representations are only computed when one of the two "classic" outputs
    # was requested.
    if args.out or args.out_no_stat:
        match_store.set_representations(word_stats, structures)

    # Every writer is built and run in turn, in this order.
    # NOTE(review): presumably a writer whose output option was not given
    # writes nothing -- confirm in Writer.
    writer_factories = (
        Writer.make_output_writer,
        Writer.make_output_no_stat_writer,
        Writer.make_all_writer,
        Writer.make_stats_writer,
    )
    for make_writer in writer_factories:
        writer = make_writer(args, max_num_components, match_store, word_stats)
        writer.write_out(structures, match_store)
if __name__ == '__main__':
    # Command-line entry point: parse the options, configure logging, run the
    # pipeline and log how long it took.
    parser = argparse.ArgumentParser(
        description='Extract structures from a parsed corpus.')

    # Positional arguments.
    parser.add_argument('structures',
                        help='Structures definitions in xml file')
    parser.add_argument('input',
                        help='input xml file in `ssj500k form`, can list more than one', nargs='+')

    # Output destinations.
    parser.add_argument('--out',
                        help='Classic output file')
    parser.add_argument('--out-no-stat',
                        help='Output file, but without statistical columns')
    parser.add_argument('--all',
                        help='Additional output file, writes more data')
    parser.add_argument('--stats',
                        help='Output file for statistics')

    # Input handling.
    parser.add_argument('--no-msd-translate',
                        help='MSDs are translated from slovene to english by default',
                        action='store_true')
    parser.add_argument('--skip-id-check',
                        help='Skips checks for ids of <w> and <pc>, if they are in correct format',
                        action='store_true')

    # A bare `--min_freq` (no value) means 1; omitting the option means 0.
    parser.add_argument('--min_freq', help='Minimal frequency in output',
                        type=int, default=0, const=1, nargs='?')

    # A bare `--verbose` (no value) means "info", which is also the default.
    parser.add_argument('--verbose', help='Enable verbose output to stderr',
                        choices=["warning", "info", "debug"], default="info",
                        const="info", nargs='?')
    parser.add_argument('--count-files',
                        help="Count files: more verbose output", action='store_true')

    # Output shaping.
    parser.add_argument('--multiple-output',
                        help='Generate one output for each syntactic structure',
                        action='store_true')
    parser.add_argument('--sort-by',
                        help="Sort by this column (index)", type=int, default=-1)
    parser.add_argument('--sort-reversed',
                        help="Sort in reversed order", action='store_true')

    # Storage.
    parser.add_argument('--db',
                        help="Database file to use (instead of memory)", default=None)
    parser.add_argument('--keep-db',
                        help="Does not recreate new database file", action='store_true')

    parser.add_argument('--pc-tag',
                        help='Tag for separators, usually pc or c', default="pc")

    args = parser.parse_args()

    # The chosen verbosity name is upper-cased and passed as the logging level.
    logging.basicConfig(stream=sys.stderr, level=args.verbose.upper())

    start = time.time()
    main(args)
    logging.info("TIME: %s", time.time() - start)