Added s/z, k/h + fixed bug 90 + connected with sloleks on lemma_fallback
This commit is contained in:
25
src/wani.py
25
src/wani.py
@@ -11,6 +11,7 @@ import concurrent.futures
|
||||
import tempfile
|
||||
|
||||
from progress_bar import progress
|
||||
from sloleks_db import SloleksDatabase
|
||||
from word import Word
|
||||
from syntactic_structure import build_structures
|
||||
from match_store import MatchStore
|
||||
@@ -20,16 +21,20 @@ from loader import load_files
|
||||
from database import Database
|
||||
from time_info import TimeInfo
|
||||
|
||||
from src.postprocessor import Postprocessor
|
||||
|
||||
def match_file(words, structures):
|
||||
|
||||
def match_file(words, structures, postprocessor):
|
||||
matches = {s: [] for s in structures}
|
||||
|
||||
for s in progress(structures, "matching"):
|
||||
for w in words:
|
||||
mhere = s.match(w)
|
||||
for match in mhere:
|
||||
colocation_id = [(idx, w.lemma) for idx, w in match.items()]
|
||||
# colocation_id = [(idx, w.lemma) for idx, w in match.items()]
|
||||
colocation_id = [[idx, w.lemma] for idx, w in match.items()]
|
||||
colocation_id = [s.id] + list(sorted(colocation_id, key=lambda x: x[0]))
|
||||
match, collocation_id = postprocessor.process(match, colocation_id)
|
||||
colocation_id = tuple(colocation_id)
|
||||
|
||||
matches[s].append((match, colocation_id))
|
||||
@@ -38,6 +43,7 @@ def match_file(words, structures):
|
||||
|
||||
|
||||
def main(args):
|
||||
sloleks_db = SloleksDatabase(args.sloleks_db)
|
||||
structures, lemma_msds, max_num_components = build_structures(args)
|
||||
timeinfo = TimeInfo(len(args.input))
|
||||
|
||||
@@ -51,7 +57,11 @@ def main(args):
|
||||
continue
|
||||
|
||||
start_time = time.time()
|
||||
matches = match_file(words, structures)
|
||||
postprocessor = Postprocessor()
|
||||
matches = match_file(words, structures, postprocessor)
|
||||
|
||||
# matches = .process()
|
||||
# TODO Add postprocessing here or inside previous function!
|
||||
match_store.add_matches(matches)
|
||||
word_stats.add_words(words)
|
||||
database.commit()
|
||||
@@ -74,7 +84,7 @@ def main(args):
|
||||
|
||||
# figure out representations!
|
||||
if args.out or args.out_no_stat:
|
||||
match_store.set_representations(word_stats, structures)
|
||||
match_store.set_representations(word_stats, structures, sloleks_db=sloleks_db)
|
||||
|
||||
Writer.make_output_writer(args, max_num_components, match_store, word_stats).write_out(
|
||||
structures, match_store)
|
||||
@@ -85,6 +95,10 @@ def main(args):
|
||||
Writer.make_stats_writer(args, max_num_components, match_store, word_stats).write_out(
|
||||
structures, match_store)
|
||||
|
||||
# sloleks_db.get_word_form(lemma, gender, number, case)
|
||||
sloleks_db.close()
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
parser = argparse.ArgumentParser(
|
||||
description='Extract structures from a parsed corpus.')
|
||||
@@ -92,6 +106,7 @@ if __name__ == '__main__':
|
||||
help='Structures definitions in xml file')
|
||||
parser.add_argument('input',
|
||||
help='input file in (gz or xml currently). If none, then just database is loaded', nargs='*')
|
||||
parser.add_argument('--sloleks_db', type=str, help='Sloleks database credentials')
|
||||
parser.add_argument('--out',
|
||||
help='Classic output file')
|
||||
parser.add_argument('--out-no-stat',
|
||||
@@ -100,7 +115,7 @@ if __name__ == '__main__':
|
||||
help='Additional output file, writes more data')
|
||||
parser.add_argument('--stats',
|
||||
help='Output file for statistics')
|
||||
|
||||
#
|
||||
parser.add_argument('--no-msd-translate',
|
||||
help='MSDs are translated from slovene to english by default',
|
||||
action='store_true')
|
||||
|
||||
Reference in New Issue
Block a user