|
|
|
@ -1,3 +1,5 @@
|
|
|
|
|
import gc
|
|
|
|
|
|
|
|
|
|
from psycopg2cffi import compat
|
|
|
|
|
compat.register()
|
|
|
|
|
|
|
|
|
@ -5,11 +7,11 @@ from sqlalchemy.ext.declarative import declarative_base
|
|
|
|
|
from sqlalchemy.orm import Session, aliased
|
|
|
|
|
from sqlalchemy import create_engine
|
|
|
|
|
|
|
|
|
|
from codes_tagset import TAGSET, CODES, CODES_TRANSLATION
|
|
|
|
|
from codes_tagset import TAGSET, CODES, CODES_TRANSLATION, POSSIBLE_WORD_FORM_FEATURE_VALUES
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class SloleksDatabase:
|
|
|
|
|
def __init__(self, db):
|
|
|
|
|
def __init__(self, db, load_sloleks):
|
|
|
|
|
global Lexeme, LexemeFeature, SyntacticStructure, StructureComponent, Feature, LexicalUnitLexeme, LexicalUnit, LexicalUnitType, Category, Sense, Measure, LexicalUnitMeasure, Corpus, Definition, WordForm, WordFormFeature, FormRepresentation
|
|
|
|
|
[db_user, db_password, db_database, db_host] = db.split(':')
|
|
|
|
|
|
|
|
|
@ -71,12 +73,65 @@ class SloleksDatabase:
|
|
|
|
|
|
|
|
|
|
self.session = Session(engine)
|
|
|
|
|
|
|
|
|
|
self.load_sloleks = load_sloleks
|
|
|
|
|
if self.load_sloleks:
|
|
|
|
|
self.init_load_sloleks()
|
|
|
|
|
|
|
|
|
|
def init_load_sloleks(self):
    """Preload word forms from the database into ``self.connected_lemmas``.

    Runs four bulk queries (word-form features, form representations,
    word forms, lexemes) and joins them in memory. On return,
    ``self.connected_lemmas`` maps each lemma string to a list of
    ``(feature_values, form)`` tuples, one per word form, where

    * ``feature_values`` is the set of ``WordFormFeature.value`` entries of
      that word form that occur in ``POSSIBLE_WORD_FORM_FEATURE_VALUES``;
    * ``form`` is the ``FormRepresentation.form`` stored for that word form
      (if a word form has several representations, the last row returned
      by the query wins).

    Word forms with no retained feature value or with no form
    representation are skipped, and a lemma whose word forms are all
    skipped gets no entry at all.

    The intermediate lookup tables are kept in local variables only, so
    nothing half-built is left on the instance if a query fails, and they
    are dropped before the explicit garbage collection at the end.
    """
    session = self.session

    # word_form_id -> set of feature values that are relevant for matching.
    features_by_word_form = {}
    feature_rows = session.query(WordFormFeature.word_form_id, WordFormFeature.value).all()
    for row in feature_rows:
        if row.value not in POSSIBLE_WORD_FORM_FEATURE_VALUES:
            continue
        features_by_word_form.setdefault(row.word_form_id, set()).add(row.value)
    # Release the raw rows as soon as they are consumed to keep peak memory down.
    del feature_rows

    # word_form_id -> surface form (a later row overrides an earlier one).
    form_rows = session.query(FormRepresentation.word_form_id, FormRepresentation.form).all()
    form_by_word_form = {row.word_form_id: row.form for row in form_rows}
    del form_rows

    # lexeme_id -> ids of its word forms, in query order.
    word_form_ids_by_lexeme = {}
    word_form_rows = session.query(WordForm.id, WordForm.lexeme_id).all()
    for row in word_form_rows:
        word_form_ids_by_lexeme.setdefault(row.lexeme_id, []).append(row.id)
    del word_form_rows

    # lemma -> ids of all lexemes sharing that lemma, in query order.
    lexeme_ids_by_lemma = {}
    lexeme_rows = session.query(Lexeme.id, Lexeme.lemma).all()
    for row in lexeme_rows:
        lexeme_ids_by_lemma.setdefault(row.lemma, []).append(row.id)
    del lexeme_rows

    # Join the four tables: lemma -> [(feature value set, surface form), ...].
    connected_lemmas = {}
    for lemma, lexeme_ids in lexeme_ids_by_lemma.items():
        for lexeme_id in lexeme_ids:
            for word_form_id in word_form_ids_by_lexeme.get(lexeme_id, ()):
                feature_values = features_by_word_form.get(word_form_id)
                if feature_values is None or word_form_id not in form_by_word_form:
                    continue
                connected_lemmas.setdefault(lemma, []).append(
                    (feature_values, form_by_word_form[word_form_id])
                )
    self.connected_lemmas = connected_lemmas

    # Drop the lookup tables before collecting so their memory can be reclaimed.
    del features_by_word_form, form_by_word_form, word_form_ids_by_lexeme, lexeme_ids_by_lemma
    gc.collect()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def close(self):
    """Close the database session held in ``self.session``."""
    session = self.session
    session.close()
|
|
|
|
|
|
|
|
|
|
def decypher_msd(self, msd):
|
|
|
|
|
t = msd[0]
|
|
|
|
|
decypher = []
|
|
|
|
|
# IF ADDING OR CHANGING ATTRIBUTES HERE ALSO FIX POSSIBLE_WORD_FORM_FEATURE_VALUES
|
|
|
|
|
if t == 'N':
|
|
|
|
|
# gender = CODES_TRANSLATION[t][2][msd[2]]
|
|
|
|
|
number = CODES_TRANSLATION[t][3][msd[3]]
|
|
|
|
@ -118,7 +173,6 @@ class SloleksDatabase:
|
|
|
|
|
# fix for verbs with short msds
|
|
|
|
|
if v + 1 >= len(msd):
|
|
|
|
|
msd = msd + ['-' for _ in range(v - len(msd) + 2)]
|
|
|
|
|
# return None, None, None
|
|
|
|
|
|
|
|
|
|
msd[v + 1] = align_msd[v_align_msd + 1]
|
|
|
|
|
|
|
|
|
@ -127,20 +181,31 @@ class SloleksDatabase:
|
|
|
|
|
if not decypher_msd:
|
|
|
|
|
return None, None, None
|
|
|
|
|
|
|
|
|
|
wfs = [aliased(WordFormFeature) for _ in decypher_msd]
|
|
|
|
|
query_preposition = self.session.query(FormRepresentation.form) \
|
|
|
|
|
.join(WordForm, WordForm.id == FormRepresentation.word_form_id) \
|
|
|
|
|
.join(Lexeme, Lexeme.id == WordForm.lexeme_id)
|
|
|
|
|
|
|
|
|
|
for wf in wfs:
|
|
|
|
|
query_preposition = query_preposition.join(wf, wf.word_form_id == WordForm.id)
|
|
|
|
|
|
|
|
|
|
query_preposition = query_preposition.filter(Lexeme.lemma == lemma)
|
|
|
|
|
|
|
|
|
|
for wf, msd_el in zip(wfs, decypher_msd):
|
|
|
|
|
query_preposition = query_preposition.filter(wf.value == msd_el)
|
|
|
|
|
|
|
|
|
|
pattern_translation_hws = query_preposition.all()
|
|
|
|
|
if len(pattern_translation_hws) > 0:
|
|
|
|
|
return ''.join(msd), lemma, pattern_translation_hws[0][0]
|
|
|
|
|
if self.load_sloleks and lemma in self.connected_lemmas:
|
|
|
|
|
for (word_form_features, form_representations) in self.connected_lemmas[lemma]:
|
|
|
|
|
fits = True
|
|
|
|
|
for d_m in decypher_msd:
|
|
|
|
|
if d_m not in word_form_features:
|
|
|
|
|
fits = False
|
|
|
|
|
break
|
|
|
|
|
if fits:
|
|
|
|
|
break
|
|
|
|
|
return ''.join(msd), lemma, form_representations
|
|
|
|
|
else:
|
|
|
|
|
wfs = [aliased(WordFormFeature) for _ in decypher_msd]
|
|
|
|
|
query_preposition = self.session.query(FormRepresentation.form) \
|
|
|
|
|
.join(WordForm, WordForm.id == FormRepresentation.word_form_id) \
|
|
|
|
|
.join(Lexeme, Lexeme.id == WordForm.lexeme_id)
|
|
|
|
|
|
|
|
|
|
for wf in wfs:
|
|
|
|
|
query_preposition = query_preposition.join(wf, wf.word_form_id == WordForm.id)
|
|
|
|
|
|
|
|
|
|
query_preposition = query_preposition.filter(Lexeme.lemma == lemma)
|
|
|
|
|
|
|
|
|
|
for wf, msd_el in zip(wfs, decypher_msd):
|
|
|
|
|
query_preposition = query_preposition.filter(wf.value == msd_el)
|
|
|
|
|
|
|
|
|
|
pattern_translation_hws = query_preposition.limit(1).all()
|
|
|
|
|
if len(pattern_translation_hws) > 0:
|
|
|
|
|
return ''.join(msd), lemma, pattern_translation_hws[0][0]
|
|
|
|
|
return None, None, None
|
|
|
|
|