Added s/z, k/h + fixed bug 90 + connected with Sloleks on lemma_fallback

This commit is contained in:
2020-07-08 19:23:56 +02:00
parent ec113f9cd2
commit 777791ad1e
12 changed files with 443 additions and 32 deletions

View File

@@ -11,6 +11,7 @@ import concurrent.futures
import tempfile
from progress_bar import progress
from sloleks_db import SloleksDatabase
from word import Word
from syntactic_structure import build_structures
from match_store import MatchStore
@@ -20,16 +21,20 @@ from loader import load_files
from database import Database
from time_info import TimeInfo
from src.postprocessor import Postprocessor
def match_file(words, structures):
def match_file(words, structures, postprocessor):
matches = {s: [] for s in structures}
for s in progress(structures, "matching"):
for w in words:
mhere = s.match(w)
for match in mhere:
colocation_id = [(idx, w.lemma) for idx, w in match.items()]
# colocation_id = [(idx, w.lemma) for idx, w in match.items()]
colocation_id = [[idx, w.lemma] for idx, w in match.items()]
colocation_id = [s.id] + list(sorted(colocation_id, key=lambda x: x[0]))
match, collocation_id = postprocessor.process(match, colocation_id)
colocation_id = tuple(colocation_id)
matches[s].append((match, colocation_id))
@@ -38,6 +43,7 @@ def match_file(words, structures):
def main(args):
sloleks_db = SloleksDatabase(args.sloleks_db)
structures, lemma_msds, max_num_components = build_structures(args)
timeinfo = TimeInfo(len(args.input))
@@ -51,7 +57,11 @@ def main(args):
continue
start_time = time.time()
matches = match_file(words, structures)
postprocessor = Postprocessor()
matches = match_file(words, structures, postprocessor)
# matches = .process()
# TODO Add postprocessing here or inside previous function!
match_store.add_matches(matches)
word_stats.add_words(words)
database.commit()
@@ -74,7 +84,7 @@ def main(args):
# figure out representations!
if args.out or args.out_no_stat:
match_store.set_representations(word_stats, structures)
match_store.set_representations(word_stats, structures, sloleks_db=sloleks_db)
Writer.make_output_writer(args, max_num_components, match_store, word_stats).write_out(
structures, match_store)
@@ -85,6 +95,10 @@ def main(args):
Writer.make_stats_writer(args, max_num_components, match_store, word_stats).write_out(
structures, match_store)
# sloleks_db.get_word_form(lemma, gender, number, case)
sloleks_db.close()
if __name__ == '__main__':
parser = argparse.ArgumentParser(
description='Extract structures from a parsed corpus.')
@@ -92,6 +106,7 @@ if __name__ == '__main__':
help='Structures definitions in xml file')
parser.add_argument('input',
help='input file in (gz or xml currently). If none, then just database is loaded', nargs='*')
parser.add_argument('--sloleks_db', type=str, help='Sloleks database credentials')
parser.add_argument('--out',
help='Classic output file')
parser.add_argument('--out-no-stat',
@@ -100,7 +115,7 @@ if __name__ == '__main__':
help='Additional output file, writes more data')
parser.add_argument('--stats',
help='Output file for statistics')
#
parser.add_argument('--no-msd-translate',
help='MSDs are translated from slovene to english by default',
action='store_true')