2019-06-15 16:55:35 +00:00
|
|
|
from xml.etree import ElementTree
|
|
|
|
import re
|
|
|
|
import sys
|
|
|
|
import logging
|
|
|
|
import argparse
|
|
|
|
import pickle
|
|
|
|
import time
|
2019-07-03 11:06:59 +00:00
|
|
|
import gc
|
2019-06-15 16:55:35 +00:00
|
|
|
import subprocess
|
|
|
|
import concurrent.futures
|
|
|
|
import tempfile
|
|
|
|
|
2020-09-17 12:17:40 +00:00
|
|
|
from luscenje_struktur.progress_bar import progress
|
|
|
|
from luscenje_struktur.sloleks_db import SloleksDatabase
|
|
|
|
from luscenje_struktur.word import Word
|
|
|
|
from luscenje_struktur.syntactic_structure import build_structures
|
|
|
|
from luscenje_struktur.match_store import MatchStore
|
|
|
|
from luscenje_struktur.word_stats import WordStats
|
|
|
|
from luscenje_struktur.writer import Writer
|
|
|
|
from luscenje_struktur.loader import load_files
|
|
|
|
from luscenje_struktur.database import Database
|
|
|
|
from luscenje_struktur.time_info import TimeInfo
|
2019-06-15 16:55:35 +00:00
|
|
|
|
2020-09-17 12:17:40 +00:00
|
|
|
from luscenje_struktur.postprocessor import Postprocessor
|
2019-06-15 16:55:35 +00:00
|
|
|
|
2020-07-08 17:23:56 +00:00
|
|
|
|
|
|
|
def match_file(words, structures, postprocessor):
    """Match every structure against every word of one loaded input.

    Args:
        words: re-iterable collection of word objects; it is walked once per
            structure and each word is handed to ``structure.match``.
        structures: collection of structures exposing ``match(word)`` and
            ``id``; it is iterated twice (result setup, then matching).
        postprocessor: object providing ``is_fixed_restriction_order(match)``
            and ``process(match, colocation_id)``.

    Returns:
        dict mapping each structure to a list of ``(match, colocation_id)``
        pairs. ``colocation_id`` is a tuple whose first item is the structure
        id, followed by ``[key, lemma]`` entries ordered by the match key.
    """
    # Every structure gets an entry, even when nothing matches it.
    matches = {structure: [] for structure in structures}

    for structure in progress(structures, "matching"):
        found = matches[structure]
        for word in words:
            for match in structure.match(word):
                # Drop matches rejected by the restriction-order check.
                if not postprocessor.is_fixed_restriction_order(match):
                    continue

                components = sorted(
                    ([idx, matched.lemma] for idx, matched in match.items()),
                    key=lambda entry: entry[0],
                )
                colocation_id = [structure.id] + components

                # NOTE(review): the original bound the second return value to
                # `collocation_id` (double "l"), a name that is never read, so
                # the id returned by `process` is unused; only in-place edits
                # of `colocation_id` can take effect. Confirm whether the
                # returned id was meant to be used instead.
                match, _ = postprocessor.process(match, colocation_id)

                found.append((match, tuple(colocation_id)))

    return matches
|
|
|
|
|
|
|
|
|
2019-06-15 23:00:22 +00:00
|
|
|
def main(args):
    """Run the extraction: match all inputs, then write the requested outputs.

    Args:
        args: parsed command-line namespace. Attributes read directly here are
            ``input``, ``fixed_restriction_order``, ``out``, ``out_no_stat``,
            ``all``, ``stats``, ``sloleks_db`` and ``load_sloleks``. The whole
            namespace is also passed on to ``build_structures``, ``Database``,
            ``MatchStore``, ``load_files`` and the ``Writer`` factories.

    Returns:
        None. Returns early, before any rendering, when none of the four
        output options is set.
    """
    structures, lemma_msds, max_num_components = build_structures(args)
    timeinfo = TimeInfo(len(args.input))

    database = Database(args)
    match_store = MatchStore(args, database)
    word_stats = WordStats(lemma_msds, database)
    postprocessor = Postprocessor(fixed_restriction_order=args.fixed_restriction_order)

    for words in load_files(args, database):
        if words is None:
            # Nothing to match for this step; record -1 instead of a duration.
            timeinfo.add_measurement(-1)
            continue

        start_time = time.time()
        matches = match_file(words, structures, postprocessor)

        match_store.add_matches(matches)
        word_stats.add_words(words)
        database.commit()

        # force a bit of garbage collection
        del words
        del matches
        gc.collect()

        timeinfo.add_measurement(time.time() - start_time)
        timeinfo.info()

    # if no output files, just exit
    outputs = (args.out, args.out_no_stat, args.all, args.stats)
    if all(target is None for target in outputs):
        return

    # get word renders for lemma/msd
    word_stats.generate_renders()
    match_store.determine_colocation_dispersions()

    # figure out representations!
    if args.out or args.out_no_stat:
        sloleks_db = None
        if args.sloleks_db is not None:
            sloleks_db = SloleksDatabase(args.sloleks_db, args.load_sloleks)
        try:
            match_store.set_representations(word_stats, structures, sloleks_db=sloleks_db)
        finally:
            # Close the connection even if building representations fails.
            if sloleks_db is not None:
                sloleks_db.close()

    Writer.make_output_writer(args, max_num_components, match_store, word_stats).write_out(
        structures, match_store)
    Writer.make_output_no_stat_writer(args, max_num_components, match_store, word_stats).write_out(
        structures, match_store)
    Writer.make_all_writer(args, max_num_components, match_store, word_stats).write_out(
        structures, match_store)
    Writer.make_stats_writer(args, max_num_components, match_store, word_stats).write_out(
        structures, match_store)
|
|
|
|
|
2020-07-08 17:23:56 +00:00
|
|
|
|
|
|
|
|
2019-06-15 16:55:35 +00:00
|
|
|
if __name__ == '__main__':
    # Command-line entry point: declare the options, configure logging to
    # stderr, run main() and log the total wall-clock time.
    parser = argparse.ArgumentParser(
        description='Extract structures from a parsed corpus.')
    parser.add_argument('structures',
                        help='Structures definitions in xml file')
    parser.add_argument('input',
                        help='input file in (gz or xml currently). If none, then just database is loaded', nargs='*')
    parser.add_argument('--sloleks_db', type=str, default=None, help='Sloleks database credentials')
    parser.add_argument('--out',
                        help='Classic output file')
    parser.add_argument('--out-no-stat',
                        help='Output file, but without statistical columns')
    parser.add_argument('--all',
                        help='Additional output file, writes more data')
    parser.add_argument('--stats',
                        help='Output file for statistics')

    parser.add_argument('--no-msd-translate',
                        help='MSDs are translated from slovene to english by default',
                        action='store_true')
    parser.add_argument('--skip-id-check',
                        help='Skips checks for ids of <w> and <pc>, if they are in correct format',
                        action='store_true')
    # Bare `--min_freq` means 1 (const); omitting the option means 0 (default).
    parser.add_argument('--min_freq', help='Minimal frequency in output',
                        type=int, default=0, const=1, nargs='?')
    parser.add_argument('--verbose', help='Enable verbose output to stderr',
                        choices=["warning", "info", "debug"], default="info",
                        const="info", nargs='?')
    parser.add_argument('--count-files',
                        help="Count files: more verbose output", action='store_true')
    parser.add_argument('--multiple-output',
                        help='Generate one output for each syntactic structure',
                        action='store_true')

    parser.add_argument('--load-sloleks',
                        help='Tells whether sloleks is loaded into memory at the beginning of processing or not.',
                        action='store_true')

    parser.add_argument('--sort-by',
                        help="Sort by this column (index)", type=int, default=-1)
    parser.add_argument('--sort-reversed',
                        help="Sort in reversed order", action='store_true')

    parser.add_argument('--db',
                        help="Database file to use (instead of memory)", default=None)
    parser.add_argument('--collocation_sentence_map_dest',
                        help="Destination to folder where collocation-sentence mapper (mappers in case of multiple-output).", default=None)
    parser.add_argument('--new-db',
                        help="Writes over database file, if there exists one", action='store_true')

    parser.add_argument('--pc-tag',
                        help='Tag for separators, usually pc or c', default="pc")
    parser.add_argument('--separator',
                        help='Separator in output file', default="\t")
    # The help text here used to be a copy of the --sort-reversed one.
    parser.add_argument('--ignore-punctuations',
                        help="Ignore punctuations", action='store_true')
    parser.add_argument('--fixed-restriction-order',
                        help='If used, words have to be in the same order as components.',
                        action='store_true')

    args = parser.parse_args()
    # --verbose choices are lowercase level names; logging expects them uppercased.
    logging.basicConfig(stream=sys.stderr, level=args.verbose.upper())

    start = time.time()
    main(args)
    logging.info("TIME: {}".format(time.time() - start))
|