# luscenje_struktur/luscenje_struktur/wani.py
#
# 161 lines
# 6.1 KiB
# Python

from xml.etree import ElementTree
import re
import sys
import logging
import argparse
import pickle
import time
import gc
import subprocess
import concurrent.futures
import tempfile
from progress_bar import progress
from sloleks_db import SloleksDatabase
from word import Word
from syntactic_structure import build_structures
from match_store import MatchStore
from word_stats import WordStats
from writer import Writer
from loader import load_files
from database import Database
from time_info import TimeInfo
from postprocessor import Postprocessor
def match_file(words, structures, postprocessor):
    """Match every structure against every word and collect the results.

    For each structure ``s`` and each word ``w``, ``s.match(w)`` is iterated;
    every item it yields is treated as a mapping from a component index to a
    matched word (it is read through ``.items()`` and each value's ``.lemma``).
    A collocation id is assembled as ``[s.id, [idx, lemma], ...]`` with the
    ``[idx, lemma]`` pairs sorted by index, and is then passed, together with
    the match, through ``postprocessor.process``.

    Args:
        words: Re-iterable collection of word objects; it is walked once per
            structure.
        structures: Re-iterable collection of hashable structure objects
            exposing ``id`` and ``match(word)``.
        postprocessor: Object whose ``process(match, collocation_id)`` returns
            a ``(match, collocation_id)`` pair.

    Returns:
        dict mapping each structure to a list of ``(match, collocation_id)``
        tuples, where ``collocation_id`` is the postprocessed id converted to
        a tuple.
    """
    matches = {s: [] for s in structures}

    for s in progress(structures, "matching"):
        for w in words:
            for match in s.match(w):
                # Pairs stay lists (not tuples) as before; presumably the
                # postprocessor edits them in place -- TODO confirm.
                pairs = sorted(
                    ([idx, matched.lemma] for idx, matched in match.items()),
                    key=lambda pair: pair[0])
                collocation_id = [s.id] + pairs

                # Keep the id *returned* by the postprocessor. Previously the
                # result was bound to a differently spelled name and dropped,
                # so only in-place mutations ever took effect.
                match, collocation_id = postprocessor.process(match, collocation_id)

                matches[s].append((match, tuple(collocation_id)))

    return matches
def main(args):
    """Run the extraction pipeline: match structures, then write outputs.

    Steps, in order:

    1. Build the structures and set up the database, match store and word
       statistics.
    2. For every batch of words yielded by ``load_files`` (presumably one per
       input file -- the timer is sized with ``len(args.input)``), match all
       structures, store the matches and word statistics, and commit the
       database. A ``None`` batch is recorded with a ``-1`` measurement and
       skipped.
    3. If none of ``args.out``, ``args.out_no_stat``, ``args.all`` and
       ``args.stats`` is set, return without producing output.
    4. Otherwise generate word renders and collocation dispersions, compute
       representations when ``args.out`` or ``args.out_no_stat`` is set
       (optionally backed by a Sloleks database), and run all four writers.

    Args:
        args: Parsed command-line namespace; see the argument parser in the
            ``__main__`` section for the available options.
    """
    structures, lemma_msds, max_num_components = build_structures(args)
    timeinfo = TimeInfo(len(args.input))

    database = Database(args)
    match_store = MatchStore(args, database)
    word_stats = WordStats(lemma_msds, database)

    for words in load_files(args, database):
        if words is None:
            timeinfo.add_measurement(-1)
            continue

        start_time = time.time()
        postprocessor = Postprocessor()
        matches = match_file(words, structures, postprocessor)

        match_store.add_matches(matches)
        word_stats.add_words(words)
        database.commit()

        # Drop the per-batch data before loading the next batch so that peak
        # memory stays bounded.
        del words
        del matches
        gc.collect()

        timeinfo.add_measurement(time.time() - start_time)
        timeinfo.info()

    # If no output file was requested, there is nothing left to do.
    requested_outputs = (args.out, args.out_no_stat, args.all, args.stats)
    if all(target is None for target in requested_outputs):
        return

    # Get word renders for lemma/msd.
    word_stats.generate_renders()
    match_store.determine_colocation_dispersions()

    # Representations are only computed for the two "classic" outputs.
    if args.out or args.out_no_stat:
        sloleks_db = None
        if args.sloleks_db is not None:
            sloleks_db = SloleksDatabase(args.sloleks_db, args.load_sloleks)
        try:
            match_store.set_representations(word_stats, structures, sloleks_db=sloleks_db)
        finally:
            # Close the Sloleks connection even when set_representations
            # raises; previously it leaked on error.
            if sloleks_db is not None:
                sloleks_db.close()

    # Every writer is built and run unconditionally, in this fixed order.
    writer_factories = (
        Writer.make_output_writer,
        Writer.make_output_no_stat_writer,
        Writer.make_all_writer,
        Writer.make_stats_writer,
    )
    for make_writer in writer_factories:
        writer = make_writer(args, max_num_components, match_store, word_stats)
        writer.write_out(structures, match_store)
if __name__ == '__main__':
    # Command-line entry point: declare the options, configure logging,
    # run main() and log the total wall-clock time.
    # NOTE(review): several help strings below contain typos or look
    # truncated; they are kept verbatim here because they are user-facing text.
    cli_parser = argparse.ArgumentParser(
        description='Extract structures from a parsed corpus.',
    )

    # --- positional arguments ---
    cli_parser.add_argument(
        'structures',
        help='Structures definitions in xml file',
    )
    cli_parser.add_argument(
        'input',
        nargs='*',
        help='input file in (gz or xml currently). If none, then just database is loaded',
    )

    # --- Sloleks database and output destinations ---
    cli_parser.add_argument(
        '--sloleks_db',
        type=str,
        default=None,
        help='Sloleks database credentials',
    )
    cli_parser.add_argument(
        '--out',
        help='Classic output file',
    )
    cli_parser.add_argument(
        '--out-no-stat',
        help='Output file, but without statistical columns',
    )
    cli_parser.add_argument(
        '--all',
        help='Additional output file, writes more data',
    )
    cli_parser.add_argument(
        '--stats',
        help='Output file for statistics',
    )

    # --- processing switches ---
    cli_parser.add_argument(
        '--no-msd-translate',
        action='store_true',
        help='MSDs are translated from slovene to english by default',
    )
    cli_parser.add_argument(
        '--skip-id-check',
        action='store_true',
        help='Skips checks for ids of <w> and <pc>, if they are in correct format',
    )
    cli_parser.add_argument(
        '--min_freq',
        type=int,
        default=0,
        const=1,
        nargs='?',
        help='Minimal frequency in output',
    )
    cli_parser.add_argument(
        '--verbose',
        choices=["warning", "info", "debug"],
        default="info",
        const="info",
        nargs='?',
        help='Enable verbose output to stderr',
    )
    cli_parser.add_argument(
        '--count-files',
        action='store_true',
        help="Count files: more verbose output",
    )
    cli_parser.add_argument(
        '--multiple-output',
        action='store_true',
        help='Generate one output for each syntactic structure',
    )
    cli_parser.add_argument(
        '--load-sloleks',
        action='store_true',
        help='Tells weather sloleks is loaded into memory at the beginning of processing or not. Should be in',
    )

    # --- sorting of the output ---
    cli_parser.add_argument(
        '--sort-by',
        type=int,
        default=-1,
        help="Sort by a this column (index)",
    )
    cli_parser.add_argument(
        '--sort-reversed',
        action='store_true',
        help="Sort in reversed ored",
    )

    # --- storage ---
    cli_parser.add_argument(
        '--db',
        default=None,
        help="Database file to use (instead of memory)",
    )
    cli_parser.add_argument(
        '--collocation_sentence_map_dest',
        default=None,
        help="Destination to folder where collocation-sentence mapper (mappers in case of multiple-output).",
    )
    cli_parser.add_argument(
        '--new-db',
        action='store_true',
        help="Writes over database file, if there exists one",
    )

    # --- input format ---
    cli_parser.add_argument(
        '--pc-tag',
        default="pc",
        help='Tag for separators, usually pc or c',
    )

    args = cli_parser.parse_args()

    # The --verbose choice is upper-cased and passed as the logging level name.
    log_level = args.verbose.upper()
    logging.basicConfig(stream=sys.stderr, level=log_level)

    started_at = time.time()
    main(args)
    elapsed = time.time() - started_at
    logging.info("TIME: {}".format(elapsed))