parent
ec113f9cd2
commit
777791ad1e
@ -0,0 +1,38 @@
|
||||
|
||||
class Postprocessor:
    """Fixes one-letter Slovene prepositions (s/z, k/h) inside collocations.

    The correct variant of these prepositions depends on the first letter of
    the word that follows, so they can only be resolved once the whole
    collocation is assembled.
    """

    def __init__(self, fix_one_letter_words=True):
        # NOTE(review): this flag is stored but currently never consulted in
        # process() — kept for interface compatibility; confirm intent.
        self.fix_one_letter_words = fix_one_letter_words

    @staticmethod
    def fix_sz(next_word):
        """Return 's' or 'z' depending on the first letter of *next_word*."""
        # 's' is used before the unvoiced consonants listed below, 'z' otherwise.
        if next_word[0] in ('c', 'č', 'f', 'h', 'k', 'p', 's', 'š', 't'):
            return 's'
        return 'z'

    @staticmethod
    def fix_kh(next_word):
        """Return 'h' or 'k' depending on the first letter of *next_word*."""
        # 'h' is used before words starting with 'g' or 'k', 'k' otherwise.
        if next_word[0] in ('g', 'k'):
            return 'h'
        return 'k'

    def process(self, match, collocation_id):
        """Normalize s/z and k/h entries in *collocation_id*, mirroring each fix in *match*.

        :param match: mapping from component id to token objects exposing a
            mutable ``text`` attribute.
        :param collocation_id: list whose first element is an identifier and
            whose remaining elements are mutable ``[component_id, word]`` pairs.
        :return: ``(match, collocation_id)`` where the word pairs have been
            converted to tuples.
        """
        if len(collocation_id) > 2:
            # The last pair is never rewritten — it only supplies the
            # following word for the pair before it.
            for idx, (col_id, word) in enumerate(collocation_id[1:-1]):
                if word in ('s', 'z'):
                    correct_letter = self.fix_sz(collocation_id[idx + 2][1])
                    collocation_id[idx + 1][1] = correct_letter
                    match[col_id].text = correct_letter
                elif word in ('k', 'h'):
                    correct_letter = self.fix_kh(collocation_id[idx + 2][1])
                    collocation_id[idx + 1][1] = correct_letter
                    match[col_id].text = correct_letter
        # Freeze the (possibly updated) pairs into tuples.
        collocation_id = [collocation_id[0]] + [tuple(pair) for pair in collocation_id[1:]]
        return match, collocation_id
|
@ -0,0 +1,191 @@
|
||||
from collections import defaultdict
|
||||
from ast import literal_eval
|
||||
|
||||
from sqlalchemy.ext.declarative import declarative_base
|
||||
from sqlalchemy.orm import Session, aliased
|
||||
from sqlalchemy import create_engine
|
||||
from sqlalchemy import func
|
||||
|
||||
from match import StructureMatch
|
||||
from representation_assigner import RepresentationAssigner
|
||||
from progress_bar import progress
|
||||
|
||||
# Lexeme = None
|
||||
# LexemeFeature = None
|
||||
# SyntacticStructure = None
|
||||
# StructureComponent = None
|
||||
# Feature = None
|
||||
# LexicalUnitLexeme = None
|
||||
# LexicalUnit = None
|
||||
# LexicalUnitType = None
|
||||
# Category = None
|
||||
# Sense = None
|
||||
# Measure = None
|
||||
# LexicalUnitMeasure = None
|
||||
# Corpus = None
|
||||
# Definition = None
|
||||
# WordForm = None
|
||||
# WordFormFeature = None
|
||||
# FormRepresentation = None
|
||||
from src.codes_tagset import TAGSET, CODES, CODES_TRANSLATION
|
||||
|
||||
|
||||
class SloleksDatabase:
    """Access to a Sloleks lexicon stored in a PostgreSQL database.

    The schema is reflected at connection time with SQLAlchemy, and one thin
    mapped class per table is published as a module-level global so that the
    query methods (and any sibling code in this module) can reference them.
    """

    def __init__(self, db):
        """Connect to the database and reflect the ORM mapping.

        :param db: connection string in the form
            ``user:password:database:host``.
        """
        # The reflected ORM classes can only be created once the engine
        # exists, so they are defined here and exported as globals.
        global Lexeme, LexemeFeature, SyntacticStructure, StructureComponent, Feature, LexicalUnitLexeme, LexicalUnit, LexicalUnitType, Category, Sense, Measure, LexicalUnitMeasure, Corpus, Definition, WordForm, WordFormFeature, FormRepresentation
        [db_user, db_password, db_database, db_host] = db.split(':')

        engine = create_engine('postgresql://' + db_user + ':' + db_password + '@' + db_host + '/' + db_database,
                               pool_recycle=14400)
        Base = declarative_base()
        Base.metadata.reflect(engine)

        # One mapped class per reflected table; bodies are intentionally empty
        # because all columns come from reflection.
        class Lexeme(Base):
            __table__ = Base.metadata.tables['jedro_lexeme']

        class LexemeFeature(Base):
            __table__ = Base.metadata.tables['jedro_lexeme_feature']

        class SyntacticStructure(Base):
            __table__ = Base.metadata.tables['jedro_syntacticstructure']

        class StructureComponent(Base):
            __table__ = Base.metadata.tables['jedro_structurecomponent']

        class Feature(Base):
            __table__ = Base.metadata.tables['jedro_feature']

        class LexicalUnitLexeme(Base):
            __table__ = Base.metadata.tables['jedro_lexicalunit_lexeme']

        class LexicalUnit(Base):
            __table__ = Base.metadata.tables['jedro_lexicalunit']

        class LexicalUnitType(Base):
            __table__ = Base.metadata.tables['jedro_lexicalunittype']

        class Category(Base):
            __table__ = Base.metadata.tables['jedro_category']

        class Sense(Base):
            __table__ = Base.metadata.tables['jedro_sense']

        class Measure(Base):
            __table__ = Base.metadata.tables['jedro_measure']

        class LexicalUnitMeasure(Base):
            __table__ = Base.metadata.tables['jedro_lexicalunitmeasure']

        class Corpus(Base):
            __table__ = Base.metadata.tables['jedro_corpus']

        class Definition(Base):
            __table__ = Base.metadata.tables['jedro_definition']

        class WordForm(Base):
            __table__ = Base.metadata.tables['jedro_wordform']

        class WordFormFeature(Base):
            __table__ = Base.metadata.tables['jedro_wordform_feature']

        class FormRepresentation(Base):
            __table__ = Base.metadata.tables['jedro_formrepresentation']

        self.session = Session(engine)

    def close(self):
        """Close the underlying SQLAlchemy session."""
        self.session.close()

    def decypher_msd(self, msd):
        """Decode an MSD tag into the word-form feature values to query for.

        :param msd: morphosyntactic descriptor as an indexable sequence of
            characters; the first character selects the part of speech.
        :return: list of decoded feature values — ``[number, case]`` for
            nouns, ``[vform, number, person]`` for verbs,
            ``[gender, number, case]`` for adjectives; empty list for any
            other part of speech.
        """
        t = msd[0]
        decypher = []
        if t == 'N':
            # Nouns: number and case (gender is intentionally ignored).
            number = CODES_TRANSLATION[t][3][msd[3]]
            case = CODES_TRANSLATION[t][4][msd[4]]
            decypher = [number, case]
        elif t == 'V':
            # Verbs: form and number; person is hard-wired to third.
            vform = CODES_TRANSLATION[t][3][msd[3]]
            number = CODES_TRANSLATION[t][5][msd[5]]
            person = 'third'
            decypher = [vform, number, person]
        elif t == 'A':
            # Adjectives: gender, number and case.
            gender = CODES_TRANSLATION[t][3][msd[3]]
            number = CODES_TRANSLATION[t][4][msd[4]]
            case = CODES_TRANSLATION[t][5][msd[5]]
            decypher = [gender, number, case]

        return decypher

    def get_word_form(self, lemma, msd, data, align_msd=False):
        """Look up the concrete word form of *lemma* matching *msd*.

        :param lemma: lemma to look up.
        :param msd: base MSD tag; modified in place according to *data*.
        :param data: dict that may carry an explicit ``'msd'`` feature
            override, or an ``'agreement'`` list of attribute names to copy
            over from *align_msd*.
        :param align_msd: MSD of the word to agree with (used with
            ``'agreement'``); also acts as a mode switch for the override.
        :return: ``(msd_string, lemma, form)`` on success, otherwise
            ``(None, None, None)``.
        """
        # modify msd as required
        msd = list(msd)

        if not align_msd and 'msd' in data:
            # Explicit per-feature override, e.g. {'case': 'genitive'}.
            for key, value in data['msd'].items():
                t = msd[0]
                v = TAGSET[t].index(key.lower())
                msd[v + 1] = CODES[value]

        elif 'agreement' in data:
            # Copy the listed attributes from the aligned word's MSD.
            align_msd = list(align_msd)
            t_align_msd = align_msd[0]
            t = msd[0]

            for att in data['agreement']:
                v_align_msd = TAGSET[t_align_msd].index(att.lower())
                v = TAGSET[t].index(att.lower())
                # fix for verbs with short msds; guard is on v + 1 because
                # that is the position written below (v >= len(msd) alone
                # still allowed an IndexError when v == len(msd) - 1)
                if v + 1 >= len(msd):
                    return None, None, None
                msd[v + 1] = align_msd[v_align_msd + 1]

        decypher_msd = self.decypher_msd(msd)

        if not decypher_msd:
            return None, None, None

        # One aliased feature-table join per decoded feature value.
        wfs = [aliased(WordFormFeature) for _ in decypher_msd]
        query_preposition = self.session.query(FormRepresentation.form) \
            .join(WordForm, WordForm.id == FormRepresentation.word_form_id) \
            .join(Lexeme, Lexeme.id == WordForm.lexeme_id)

        for wf in wfs:
            query_preposition = query_preposition.join(wf, wf.word_form_id == WordForm.id)

        query_preposition = query_preposition.filter(Lexeme.lemma == lemma)

        for wf, msd_el in zip(wfs, decypher_msd):
            query_preposition = query_preposition.filter(wf.value == msd_el)

        pattern_translation_hws = query_preposition.all()
        if len(pattern_translation_hws) > 0:
            # Arbitrarily take the first matching form.
            return ''.join(msd), lemma, pattern_translation_hws[0][0]
        return None, None, None
|
Loading…
Reference in new issue