161 lines
		
	
	
		
			6.3 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			161 lines
		
	
	
		
			6.3 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| from xml.etree import ElementTree
 | |
| import re
 | |
| import sys
 | |
| import logging
 | |
| import argparse
 | |
| import pickle
 | |
| import time
 | |
| import gc
 | |
| import subprocess
 | |
| import concurrent.futures
 | |
| import tempfile
 | |
| 
 | |
| from luscenje_struktur.progress_bar import progress
 | |
| from luscenje_struktur.sloleks_db import SloleksDatabase
 | |
| from luscenje_struktur.word import Word
 | |
| from luscenje_struktur.syntactic_structure import build_structures
 | |
| from luscenje_struktur.match_store import MatchStore
 | |
| from luscenje_struktur.word_stats import WordStats
 | |
| from luscenje_struktur.writer import Writer
 | |
| from luscenje_struktur.loader import load_files
 | |
| from luscenje_struktur.database import Database
 | |
| from luscenje_struktur.time_info import TimeInfo
 | |
| 
 | |
| from luscenje_struktur.postprocessor import Postprocessor
 | |
| 
 | |
| 
 | |
def match_file(words, structures, postprocessor):
    """Match every word of a file against every syntactic structure.

    Args:
        words: iterable of Word-like objects (must expose ``.lemma``).
        structures: syntactic structures; each must provide ``.id`` and
            ``.match(word)`` returning an iterable of matches, where a
            match maps component index -> matched word.
        postprocessor: object with ``process(match, colocation_id)``
            returning the (possibly transformed) pair.

    Returns:
        dict mapping each structure to a list of ``(match, colocation_id)``
        tuples; ``colocation_id`` is a hashable tuple of the structure id
        followed by ``[component index, lemma]`` pairs sorted by index.
    """
    matches = {s: [] for s in structures}

    for s in progress(structures, "matching"):
        for w in words:
            mhere = s.match(w)
            for match in mhere:
                # Build the raw id: structure id, then matched components
                # sorted by their index within the structure.
                # (renamed the inner loop variable so it no longer shadows
                # the outer `w`)
                colocation_id = [[idx, comp.lemma] for idx, comp in match.items()]
                colocation_id = [s.id] + list(sorted(colocation_id, key=lambda x: x[0]))
                # BUG FIX: the postprocessed id was previously assigned to a
                # misspelled name (`collocation_id`) and silently discarded,
                # so the postprocessor's id transformation never took effect.
                match, colocation_id = postprocessor.process(match, colocation_id)
                colocation_id = tuple(colocation_id)

                matches[s].append((match, colocation_id))

    return matches
 | |
| 
 | |
| 
 | |
def main(args):
    """Run the extraction pipeline end to end.

    Builds the structure definitions, matches every input file against
    them (persisting matches and word statistics to the database), and
    finally writes whichever output files were requested on the command
    line. Returns None; all results are side effects (database + files).
    """
    structures, lemma_msds, max_num_components = build_structures(args)
    timeinfo = TimeInfo(len(args.input))

    database = Database(args)
    match_store = MatchStore(args, database)
    word_stats = WordStats(lemma_msds, database)

    for words in load_files(args, database):
        # load_files yields None for inputs that were skipped; record a
        # sentinel measurement so the ETA stays in sync with file count.
        if words is None:
            timeinfo.add_measurement(-1)
            continue

        start_time = time.time()
        postprocessor = Postprocessor()
        matches = match_file(words, structures, postprocessor)

        match_store.add_matches(matches)
        word_stats.add_words(words)
        database.commit()

        # force a bit of garbage collection — word lists for large corpora
        # are big, so release them before loading the next file
        del words
        del matches
        gc.collect()

        timeinfo.add_measurement(time.time() - start_time)
        timeinfo.info()

    # if no output files were requested, we are done after populating
    # the database (use identity comparison with None, not ==)
    if all(x is None for x in (args.out, args.out_no_stat, args.all, args.stats)):
        return

    # get word renders for lemma/msd
    word_stats.generate_renders()
    match_store.determine_colocation_dispersions()

    # figure out representations!
    if args.out or args.out_no_stat:
        sloleks_db = None
        if args.sloleks_db is not None:
            sloleks_db = SloleksDatabase(args.sloleks_db, args.load_sloleks)
        try:
            match_store.set_representations(word_stats, structures, sloleks_db=sloleks_db)
        finally:
            # BUG FIX: previously the database was only closed on the happy
            # path; close it even if set_representations raises.
            if sloleks_db is not None:
                sloleks_db.close()

    Writer.make_output_writer(args, max_num_components, match_store, word_stats).write_out(
        structures, match_store)
    Writer.make_output_no_stat_writer(args, max_num_components, match_store, word_stats).write_out(
        structures, match_store)
    Writer.make_all_writer(args, max_num_components, match_store, word_stats).write_out(
        structures, match_store)
    Writer.make_stats_writer(args, max_num_components, match_store, word_stats).write_out(
        structures, match_store)
 | |
| 
 | |
| 
 | |
| 
 | |
if __name__ == '__main__':
    # Command-line entry point: parse arguments, configure logging,
    # run the pipeline, and report total wall-clock time.
    parser = argparse.ArgumentParser(
        description='Extract structures from a parsed corpus.')
    parser.add_argument('structures',
                        help='Structures definitions in xml file')
    parser.add_argument('input',
                        help='input file in (gz or xml currently). If none, then just database is loaded', nargs='*')
    parser.add_argument('--sloleks_db', type=str, default=None, help='Sloleks database credentials')
    parser.add_argument('--out',
                        help='Classic output file')
    parser.add_argument('--out-no-stat',
                        help='Output file, but without statistical columns')
    parser.add_argument('--all',
                        help='Additional output file, writes more data')
    parser.add_argument('--stats',
                        help='Output file for statistics')

    parser.add_argument('--no-msd-translate',
                        help='MSDs are translated from slovene to english by default',
                        action='store_true')
    parser.add_argument('--skip-id-check',
                        help='Skips checks for ids of <w> and <pc>, if they are in correct format',
                        action='store_true')
    parser.add_argument('--min_freq', help='Minimal frequency in output',
                        type=int, default=0, const=1, nargs='?')
    parser.add_argument('--verbose', help='Enable verbose output to stderr',
                        choices=["warning", "info", "debug"], default="info",
                        const="info", nargs='?')
    parser.add_argument('--count-files',
                        help="Count files: more verbose output", action='store_true')
    parser.add_argument('--multiple-output',
                        help='Generate one output for each syntactic structure',
                        action='store_true')

    # FIX: help text previously read "Tells weather ... Should be in"
    # (typo + truncated sentence).
    parser.add_argument('--load-sloleks',
                        help='Tells whether sloleks is loaded into memory at the beginning of processing or not.',
                        action='store_true')

    # FIX: help text typos ("Sort by a this column", "reversed ored").
    parser.add_argument('--sort-by',
                        help="Sort by this column (index)", type=int, default=-1)
    parser.add_argument('--sort-reversed',
                        help="Sort in reversed order", action='store_true')

    parser.add_argument('--db',
                        help="Database file to use (instead of memory)", default=None)
    # FIX: help text was garbled ("Destination to folder where ... mapper.").
    parser.add_argument('--collocation_sentence_map_dest',
                        help="Destination folder for the collocation-sentence mapper (mappers in case of multiple-output).", default=None)
    parser.add_argument('--new-db',
                        help="Writes over database file, if there exists one", action='store_true')

    parser.add_argument('--pc-tag',
                        help='Tag for separators, usually pc or c', default="pc")

    args = parser.parse_args()
    # --verbose choices map directly onto logging level names.
    logging.basicConfig(stream=sys.stderr, level=args.verbose.upper())

    start = time.time()
    main(args)
    logging.info("TIME: {}".format(time.time() - start))
 |