luscenje_struktur/src/wani.py

168 lines
6.8 KiB
Python
Raw Normal View History

from xml.etree import ElementTree
import re
import sys
import logging
import argparse
import pickle
import time
2019-07-03 11:06:59 +00:00
import gc
import subprocess
import concurrent.futures
import tempfile
2019-06-17 15:30:51 +00:00
from progress_bar import progress
from sloleks_db import SloleksDatabase
from word import Word
from syntactic_structure import build_structures
from match_store import MatchStore
from word_stats import WordStats
from writer import Writer
2019-06-17 13:38:55 +00:00
from loader import load_files
2019-06-27 10:37:23 +00:00
from database import Database
2019-08-21 09:13:23 +00:00
from time_info import TimeInfo
2020-07-20 15:36:44 +00:00
from postprocessor import Postprocessor
def match_file(words, structures, postprocessor):
    """Match every structure against every word of one loaded input.

    Args:
        words: Word objects of the current input. Iterated once per
            structure, so it must be re-iterable (e.g. a list).
        structures: Structures to match. Each one is used as a dict key and
            must provide ``id`` and ``match(word)``; iterated twice.
        postprocessor: Object whose ``process(match, collocation_id)`` returns
            a ``(match, collocation_id)`` pair.

    Returns:
        Dict mapping each structure to a list of ``(match, collocation_id)``
        tuples. ``collocation_id`` is a tuple whose first element is the
        structure id, followed by the per-component ``[index, lemma]`` entries
        ordered by index, as returned by the postprocessor.
    """
    matches = {structure: [] for structure in structures}

    for structure in progress(structures, "matching"):
        for word in words:
            for match in structure.match(word):
                # Components are built as lists rather than tuples; presumably
                # so the postprocessor can adjust them in place -- confirm
                # against Postprocessor.process.
                components = [[idx, matched.lemma] for idx, matched in match.items()]
                collocation_id = [structure.id] + sorted(components, key=lambda x: x[0])

                # Keep the id returned by the postprocessor. The previous code
                # unpacked it into a differently spelled variable, so the
                # returned value was silently thrown away.
                match, collocation_id = postprocessor.process(match, collocation_id)

                matches[structure].append((match, tuple(collocation_id)))

    return matches
2019-06-15 23:00:22 +00:00
def main(args):
    """Run the whole extraction: load inputs, match, aggregate, write output.

    Args:
        args: Parsed command-line namespace. This function reads
            ``sloleks_db``, ``input``, ``out``, ``out_no_stat``, ``all`` and
            ``stats`` directly and hands the whole namespace to the helpers
            it constructs.

    The Sloleks database handle is closed on every exit path, including the
    early return when no output file was requested and when an exception
    propagates.
    """
    sloleks_db = SloleksDatabase(args.sloleks_db)
    try:
        structures, lemma_msds, max_num_components = build_structures(args)
        timeinfo = TimeInfo(len(args.input))

        database = Database(args)
        match_store = MatchStore(args, database)
        word_stats = WordStats(lemma_msds, database)

        for words in load_files(args, database):
            if words is None:
                # The loader produced nothing for this input; record a -1
                # measurement (its meaning is defined by TimeInfo) and move on.
                timeinfo.add_measurement(-1)
                continue

            start_time = time.time()
            postprocessor = Postprocessor()
            matches = match_file(words, structures, postprocessor)

            match_store.add_matches(matches)
            word_stats.add_words(words)
            database.commit()

            # force a bit of garbage collection
            del words
            del matches
            gc.collect()

            timeinfo.add_measurement(time.time() - start_time)
            timeinfo.info()

        # if no output files, just exit
        requested_outputs = (args.out, args.out_no_stat, args.all, args.stats)
        if all(target is None for target in requested_outputs):
            return

        # get word renders for lemma/msd
        word_stats.generate_renders()

        match_store.determine_colocation_dispersions()

        # figure out representations!
        if args.out or args.out_no_stat:
            match_store.set_representations(word_stats, structures, sloleks_db=sloleks_db)

        # Every writer is invoked unconditionally; each factory receives args
        # and is expected to decide itself whether its output was requested.
        Writer.make_output_writer(args, max_num_components, match_store, word_stats).write_out(
            structures, match_store)
        Writer.make_output_no_stat_writer(args, max_num_components, match_store, word_stats).write_out(
            structures, match_store)
        Writer.make_all_writer(args, max_num_components, match_store, word_stats).write_out(
            structures, match_store)
        Writer.make_stats_writer(args, max_num_components, match_store, word_stats).write_out(
            structures, match_store)
    finally:
        # Previously close() was only reached on the full-output path.
        sloleks_db.close()
if __name__ == '__main__':
    # Script entry point: declare the command-line interface, configure
    # logging, run main() and log the total wall-clock time.
    parser = argparse.ArgumentParser(description='Extract structures from a parsed corpus.')

    # Positional arguments.
    parser.add_argument('structures', help='Structures definitions in xml file')
    parser.add_argument(
        'input',
        nargs='*',
        help='input file in (gz or xml currently). If none, then just database is loaded',
    )

    # External lexicon database.
    parser.add_argument('--sloleks_db', type=str, help='Sloleks database credentials')

    # Output destinations; main() exits early when none of them is given.
    parser.add_argument('--out', help='Classic output file')
    parser.add_argument('--out-no-stat', help='Output file, but without statistical columns')
    parser.add_argument('--all', help='Additional output file, writes more data')
    parser.add_argument('--stats', help='Output file for statistics')

    # Processing and reporting switches.
    parser.add_argument(
        '--no-msd-translate',
        action='store_true',
        help='MSDs are translated from slovene to english by default',
    )
    parser.add_argument(
        '--skip-id-check',
        action='store_true',
        help='Skips checks for ids of <w> and <pc>, if they are in correct format',
    )
    parser.add_argument(
        '--min_freq',
        type=int,
        nargs='?',
        default=0,
        const=1,
        help='Minimal frequency in output',
    )
    parser.add_argument(
        '--verbose',
        nargs='?',
        choices=["warning", "info", "debug"],
        default="info",
        const="info",
        help='Enable verbose output to stderr',
    )
    parser.add_argument(
        '--count-files',
        action='store_true',
        help="Count files: more verbose output",
    )
    parser.add_argument(
        '--multiple-output',
        action='store_true',
        help='Generate one output for each syntactic structure',
    )

    # Sorting of the output.
    parser.add_argument(
        '--sort-by',
        type=int,
        default=-1,
        help="Sort by a this column (index)",
    )
    parser.add_argument(
        '--sort-reversed',
        action='store_true',
        help="Sort in reversed ored",
    )

    # Persistence options.
    parser.add_argument(
        '--db',
        default=None,
        help="Database file to use (instead of memory)",
    )
    parser.add_argument(
        '--collocation_sentence_map_dest',
        default=None,
        help="Destination to folder where collocation-sentence mapper (mappers in case of multiple-output).",
    )
    parser.add_argument(
        '--new-db',
        action='store_true',
        help="Writes over database file, if there exists one",
    )

    # Input format tweak.
    parser.add_argument(
        '--pc-tag',
        default="pc",
        help='Tag for separators, usually pc or c',
    )

    args = parser.parse_args()

    # The --verbose choice is upper-cased and passed as the logging level name.
    log_level = args.verbose.upper()
    logging.basicConfig(stream=sys.stderr, level=log_level)

    started_at = time.time()
    main(args)
    elapsed = time.time() - started_at
    logging.info("TIME: {}".format(elapsed))