Created new column "Joint_representative_form_variable" + Fixed collocation structures + Fixed bug with wrong lemma_fallback msds

This commit is contained in:
Luka 2020-07-16 20:53:59 +02:00
parent de3e52c57c
commit 9a9d344510
9 changed files with 55 additions and 103 deletions

View File

@ -40,26 +40,28 @@ class OutNoStatFormatter(Formatter):
return ["Lemma", "Representative_form", "RF_msd", "RF_scenario"] return ["Lemma", "Representative_form", "RF_msd", "RF_scenario"]
# Column names for the cells that follow the repeated per-component columns.
# NOTE: each line shows the previous revision followed by the updated one; the
# updated revision emits the joint representative form twice (fixed and
# variable word order) before the frequency column.
def header_right(self): def header_right(self):
return ["Joint_representative_form", "Frequency"] return ["Joint_representative_form_fixed", "Joint_representative_form_variable", "Frequency"]
def content_repeat(self, words, representations, idx, _sidx):
    """Return the four repeated cells for the component at ``idx``.

    The cells line up with the header columns ``Lemma``,
    ``Representative_form``, ``RF_msd`` and ``RF_scenario``.

    Args:
        words: mapping from component index to the matched word; the word
            must expose a ``lemma`` attribute.
        representations: mapping from component index to a
            ``(text, msd)`` pair; ``text`` is ``None`` when no form could
            be rendered for the component.
        idx: index of the component being written.
        _sidx: unused.

    Returns:
        A list of exactly four strings.

    Side effects:
        Stores the chosen text in ``self.representation[idx]`` whenever a
        representation entry exists, so that ``content_right`` can build
        the joint representative form later.
    """
    word = words[idx]

    if idx not in representations:
        # Nothing was computed for this component.  Emit blanks, but keep the
        # row as wide as the four header columns (the previous version
        # returned only three cells here, shifting every following column).
        return [word.lemma, "", "", ""]

    rep_text, rep_msd = representations[idx]

    if rep_text is None:
        # No form could be rendered: fall back to the lemma, without an MSD.
        self.representation[idx] = word.lemma
        return [word.lemma, word.lemma, "", "lemma_fallback"]

    self.representation[idx] = rep_text
    return [word.lemma, rep_text, rep_msd, "ok"]
def content_right(self, freq, variable_word_order=None):
    """Return the right-hand cells for a match and reset per-match state.

    Args:
        freq: frequency of the match; written as ``str(freq)``.
        variable_word_order: optional sequence of component indices giving
            the order in which the words should be joined for the
            "variable" column.  Indices without a stored representation
            are skipped.  Defaults to the fixed order.

    Returns:
        ``[joint form in fixed order, joint form in variable order, freq]``.

    Side effects:
        Clears ``self.representation`` so the next match starts empty.
    """
    def component_key(component_idx):
        # Component indices appear to be numeric strings ("1", "2", ...), so
        # a plain sort would put "10" before "2".  Order numeric indices by
        # value and keep any non-numeric ones, sorted as text, after them.
        text = str(component_idx)
        if text.isdecimal():
            return (0, int(text), "")
        return (1, 0, text)

    fixed_word_order = sorted(self.representation, key=component_key)
    if variable_word_order is None:
        variable_word_order = fixed_word_order

    rep_fixed_word_order = ' '.join(
        self.representation[o] for o in fixed_word_order)
    # The caller-supplied order may name components that have no stored
    # representation; those are skipped.
    rep_variable_word_order = ' '.join(
        self.representation[o]
        for o in variable_word_order
        if o in self.representation)

    self.representation = {}
    return [rep_fixed_word_order, rep_variable_word_order, str(freq)]
@ -183,13 +185,13 @@ class OutFormatter(Formatter):
# Right-hand header of the combined formatter: the right-hand columns of the
# first wrapped formatter (f1) followed by those of the second (f2).
def header_right(self): def header_right(self):
return self.f1.header_right() + self.f2.header_right() return self.f1.header_right() + self.f2.header_right()
# Repeated per-component cells of the combined formatter: both wrapped
# formatters are called with the same arguments and their cells concatenated
# (f1 first).  The word-order parameter (renamed from best_word_order to
# variable_word_order in the updated revision) is accepted but not forwarded.
def content_repeat(self, words, representations, idx, sidx, best_word_order=None): def content_repeat(self, words, representations, idx, sidx, variable_word_order=None):
cr1 = self.f1.content_repeat(words, representations, idx, sidx) cr1 = self.f1.content_repeat(words, representations, idx, sidx)
cr2 = self.f2.content_repeat(words, representations, idx, sidx) cr2 = self.f2.content_repeat(words, representations, idx, sidx)
return cr1 + cr2 return cr1 + cr2
# Right-hand cells of the combined formatter.  Only f1 receives the word
# order; f2 is called with the frequency alone.
def content_right(self, freq, best_word_order=None): def content_right(self, freq, variable_word_order=None):
return self.f1.content_right(freq, best_word_order) + self.f2.content_right(freq) return self.f1.content_right(freq, variable_word_order) + self.f2.content_right(freq)
# Combines the group() results of both wrapped formatters with `and`, so
# f2.group() is only consulted when f1.group() is truthy.
def group(self): def group(self):
return self.f1.group() and self.f2.group() return self.f1.group() and self.f2.group()

View File

@ -28,8 +28,8 @@ class StructureMatch:
result.matches[-1][str(component_id)] = Word(word_lemma, word_msd, word_id, word_text, False) result.matches[-1][str(component_id)] = Word(word_lemma, word_msd, word_id, word_text, False)
for component_id, text in db.execute("SELECT component_id, text FROM Representations WHERE colocation_id=?", (colocation_id,)): for component_id, text, msd in db.execute("SELECT component_id, text, msd FROM Representations WHERE colocation_id=?", (colocation_id,)):
result.representations[str(component_id)] = text result.representations[str(component_id)] = (text, msd)
return result return result

View File

@ -35,6 +35,7 @@ class MatchStore:
colocation_id INTEGER, colocation_id INTEGER,
component_id INTEGER, component_id INTEGER,
text varchar(32), text varchar(32),
msd varchar(32),
FOREIGN KEY(colocation_id) REFERENCES Colocations(colocation_id)) FOREIGN KEY(colocation_id) REFERENCES Colocations(colocation_id))
""") """)
self.db.init("""CREATE TABLE Dispersions ( self.db.init("""CREATE TABLE Dispersions (
@ -93,10 +94,10 @@ class MatchStore:
# Writes one Representations row per (match, component) pair, passing all
# values as bound parameters.  In the updated revision every value stored in
# match.representations is unpacked as a (text, msd) pair and the msd is
# written to the new msd column.
def add_inserts(self, inserts): def add_inserts(self, inserts):
for match in inserts: for match in inserts:
for component_id, text in match.representations.items(): for component_id, (text, msd) in match.representations.items():
self.db.execute(""" self.db.execute("""
INSERT INTO Representations (colocation_id, component_id, text) INSERT INTO Representations (colocation_id, component_id, text, msd)
VALUES (?,?,?)""", (match.match_id, component_id, text)) VALUES (?,?,?,?)""", (match.match_id, component_id, text, msd))
def set_representations(self, word_renderer, structures, sloleks_db=None): def set_representations(self, word_renderer, structures, sloleks_db=None):
step_name = 'representation' step_name = 'representation'

View File

@ -16,15 +16,7 @@ class Postprocessor:
return 'k' return 'k'
def process(self, match, collocation_id): def process(self, match, collocation_id):
# self.matches = matches
# if self.fix_one_letter_words:
# for syn_structure_key, syn_structure_value in self.matches.items():
# for match, collocation_id in syn_structure_value:
if len(collocation_id) > 2: if len(collocation_id) > 2:
# a = collocation_id[1:-1]
# b = enumerate(collocation_id[1:-1])
# for a, c in b:
# print('here')
for idx, (col_id, word) in enumerate(collocation_id[1:-1]): for idx, (col_id, word) in enumerate(collocation_id[1:-1]):
if word in ['s', 'z']: if word in ['s', 'z']:
correct_letter = self.fix_sz(collocation_id[idx + 2][1]) correct_letter = self.fix_sz(collocation_id[idx + 2][1])

View File

@ -14,6 +14,7 @@ class ComponentRepresentation:
self.words = [] self.words = []
self.rendition_text = None self.rendition_text = None
self.rendition_msd = None
self.agreement = [] self.agreement = []
def get_agreement(self): def get_agreement(self):
@ -24,18 +25,22 @@ class ComponentRepresentation:
def render(self, sloleks_db=None):
    """Compute and cache this component's rendition, if not already set.

    Calls ``self._render`` and stores its ``(text, msd)`` result in
    ``self.rendition_text`` and ``self.rendition_msd``.  Nothing happens
    when ``rendition_text`` is already set.

    Args:
        sloleks_db: optional handle forwarded unchanged to ``_render``.
    """
    if self.rendition_text is not None:
        return

    rendered = self._render(sloleks_db=sloleks_db)
    if rendered is None:
        # WordFormAllCR._render still returns a bare None on one path;
        # unpacking that directly would raise TypeError, so treat it as
        # "nothing rendered".
        rendered = (None, None)

    self.rendition_text, self.rendition_msd = rendered
# Rendering hook meant to be overridden by subclasses; this base version
# always raises NotImplementedError naming the concrete type.
def _render(self, sloleks_db=None): def _render(self, sloleks_db=None):
raise NotImplementedError("Not implemented for class: {}".format(type(self))) raise NotImplementedError("Not implemented for class: {}".format(type(self)))
# Renders a component from its first collected word.  The previous revision
# returned only the lemma (or None); the updated revision returns the pair
# (lemma, msd) of the first word, or (None, None) when no word was collected.
# NOTE(review): the TODO suggests the returned msd should become the lemma's
# msd rather than that of the matched word - confirm the intended value.
class LemmaCR(ComponentRepresentation): class LemmaCR(ComponentRepresentation):
def _render(self, sloleks_db=None): def _render(self, sloleks_db=None):
return self.words[0].lemma if len(self.words) > 0 else None # TODO FIX THIS TO LEMMA MSD
if len(self.words) > 0:
return self.words[0].lemma, self.words[0].msd
else:
return None, None
# Renders a component as the text stored under self.data['lexis'].  The
# updated revision pairs it with the hard-coded msd 'Q'.
# NOTE(review): presumably 'Q' is the tag used for such fixed words - verify
# against the tagset.
class LexisCR(ComponentRepresentation): class LexisCR(ComponentRepresentation):
def _render(self, sloleks_db=None): def _render(self, sloleks_db=None):
return self.data['lexis'] return self.data['lexis'], 'Q'
class WordFormAllCR(ComponentRepresentation): class WordFormAllCR(ComponentRepresentation):
def _render(self, sloleks_db=None): def _render(self, sloleks_db=None):
@ -43,7 +48,9 @@ class WordFormAllCR(ComponentRepresentation):
return None return None
else: else:
forms = [w.text.lower() for w in self.words] forms = [w.text.lower() for w in self.words]
return "/".join(set(forms)) msds = [w.msd for w in self.words]
return "/".join(set(forms)), "/".join(set(msds))
class WordFormAnyCR(ComponentRepresentation): class WordFormAnyCR(ComponentRepresentation):
def _render(self, sloleks_db=None): def _render(self, sloleks_db=None):
@ -86,14 +93,14 @@ class WordFormAnyCR(ComponentRepresentation):
for agr, matched in zip(self.agreement, agreements_matched): for agr, matched in zip(self.agreement, agreements_matched):
if matched: if matched:
agr.confirm_match() agr.confirm_match()
return None return None, None
# if all agreements match, we win! # if all agreements match, we win!
if all(agreements_matched): if all(agreements_matched):
for agr in self.agreement: for agr in self.agreement:
agr.confirm_match() agr.confirm_match()
return text_forms[(word_msd, word_lemma)] return text_forms[(word_msd, word_lemma)], word_msd
class WordFormMsdCR(WordFormAnyCR): class WordFormMsdCR(WordFormAnyCR):
@ -154,6 +161,7 @@ class WordFormAgreementCR(WordFormMsdCR):
# Delegates to the parent initialiser, then starts with no pending candidate.
# The updated revision also tracks the candidate's msd next to its text.
def __init__(self, data, word_renderer): def __init__(self, data, word_renderer):
super().__init__(data, word_renderer) super().__init__(data, word_renderer)
self.rendition_candidate = None self.rendition_candidate = None
self.rendition_msd_candidate = None
# Returns the value stored under self.data['other'].
# NOTE(review): presumably this identifies the component this one must agree
# with - confirm against the code that builds `data`.
def get_agreement(self): def get_agreement(self):
return self.data['other'] return self.data['other']
@ -169,12 +177,14 @@ class WordFormAgreementCR(WordFormMsdCR):
if WordFormAgreementCR.check_agreement(word_msd, candidate_msd, self.data['agreement']): if WordFormAgreementCR.check_agreement(word_msd, candidate_msd, self.data['agreement']):
if self.check_msd(candidate_msd): if self.check_msd(candidate_msd):
self.rendition_candidate = candidate_text self.rendition_candidate = candidate_text
self.rendition_msd_candidate = candidate_msd
return True return True
return False return False
# Promotes the pending candidate to the final rendition: the text in both
# revisions, and the msd as well in the updated revision.
def confirm_match(self): def confirm_match(self):
self.rendition_text = self.rendition_candidate self.rendition_text = self.rendition_candidate
self.rendition_msd = self.rendition_msd_candidate
@staticmethod @staticmethod
def check_agreement(msd1, msd2, agreements): def check_agreement(msd1, msd2, agreements):

View File

@ -73,10 +73,11 @@ class RepresentationAssigner:
rep.render(sloleks_db=sloleks_db) rep.render(sloleks_db=sloleks_db)
for cid, reps in representations.items(): for cid, reps in representations.items():
reps = [rep.rendition_text for rep in reps] reps_text = [rep.rendition_text for rep in reps]
if reps == []: reps_msd = [rep.rendition_msd for rep in reps]
if reps_text == []:
pass pass
elif all(r is None for r in reps): elif all(r is None for r in reps_text):
match.representations[cid] = None match.representations[cid] = (None, None)
else: else:
match.representations[cid] = " ".join(("" if r is None else r) for r in reps) match.representations[cid] = (" ".join(("" if r is None else r) for r in reps_text), " ".join(("" if r is None else r) for r in reps_msd))

View File

@ -1,46 +1,12 @@
from collections import defaultdict
from ast import literal_eval
from sqlalchemy.ext.declarative import declarative_base from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import Session, aliased from sqlalchemy.orm import Session, aliased
from sqlalchemy import create_engine from sqlalchemy import create_engine
from sqlalchemy import func
from match import StructureMatch
from representation_assigner import RepresentationAssigner
from progress_bar import progress
# Lexeme = None
# LexemeFeature = None
# SyntacticStructure = None
# StructureComponent = None
# Feature = None
# LexicalUnitLexeme = None
# LexicalUnit = None
# LexicalUnitType = None
# Category = None
# Sense = None
# Measure = None
# LexicalUnitMeasure = None
# Corpus = None
# Definition = None
# WordForm = None
# WordFormFeature = None
# FormRepresentation = None
from src.codes_tagset import TAGSET, CODES, CODES_TRANSLATION from src.codes_tagset import TAGSET, CODES, CODES_TRANSLATION
class SloleksDatabase: class SloleksDatabase:
def __init__(self, db): def __init__(self, db):
# self.db = db
# self.dispersions = {}
# self.min_freq = args.min_freq
# self.db.init("""CREATE TABLE Colocations (
# colocation_id INTEGER PRIMARY KEY,
# structure_id varchar(8),
# key varchar(256))
# """)
global Lexeme, LexemeFeature, SyntacticStructure, StructureComponent, Feature, LexicalUnitLexeme, LexicalUnit, LexicalUnitType, Category, Sense, Measure, LexicalUnitMeasure, Corpus, Definition, WordForm, WordFormFeature, FormRepresentation global Lexeme, LexemeFeature, SyntacticStructure, StructureComponent, Feature, LexicalUnitLexeme, LexicalUnit, LexicalUnitType, Category, Sense, Measure, LexicalUnitMeasure, Corpus, Definition, WordForm, WordFormFeature, FormRepresentation
[db_user, db_password, db_database, db_host] = db.split(':') [db_user, db_password, db_database, db_host] = db.split(':')
@ -130,14 +96,15 @@ class SloleksDatabase:
def get_word_form(self, lemma, msd, data, align_msd=False): def get_word_form(self, lemma, msd, data, align_msd=False):
# modify msd as required # modify msd as required
msd = list(msd) msd = list(msd)
if 'msd' in data:
if not align_msd and 'msd' in data:
for key, value in data['msd'].items(): for key, value in data['msd'].items():
t = msd[0] t = msd[0]
v = TAGSET[t].index(key.lower()) v = TAGSET[t].index(key.lower())
if v + 1 >= len(msd):
msd = msd + ['-' for _ in range(v - len(msd) + 2)]
msd[v + 1] = CODES[value] msd[v + 1] = CODES[value]
elif 'agreement' in data: if align_msd and 'agreement' in data:
align_msd = list(align_msd) align_msd = list(align_msd)
t_align_msd = align_msd[0] t_align_msd = align_msd[0]
t = msd[0] t = msd[0]
@ -146,37 +113,24 @@ class SloleksDatabase:
v_align_msd = TAGSET[t_align_msd].index(att.lower()) v_align_msd = TAGSET[t_align_msd].index(att.lower())
v = TAGSET[t].index(att.lower()) v = TAGSET[t].index(att.lower())
# fix for verbs with short msds # fix for verbs with short msds
if v >= len(msd): if v + 1 >= len(msd):
return None, None, None msd = msd + ['-' for _ in range(v - len(msd) + 2)]
# if v >= len(msd) and t == 'V' and att == 'number': # return None, None, None
# if len(msd) == 4:
# msd += ['3'] msd[v + 1] = align_msd[v_align_msd + 1]
# if len(msd) == 5:
# msd += ['_']
# try:
msd[v + 1] = align_msd[v_align_msd + 1]
# except:
# print('here')
# msd = list(msd)
decypher_msd = self.decypher_msd(msd) decypher_msd = self.decypher_msd(msd)
if not decypher_msd: if not decypher_msd:
return None, None, None return None, None, None
wfs = [aliased(WordFormFeature) for _ in decypher_msd] wfs = [aliased(WordFormFeature) for _ in decypher_msd]
# wf1 = aliased(WordFormFeature)
# wf2 = aliased(WordFormFeature)
# wf3 = aliased(WordFormFeature)
query_preposition = self.session.query(FormRepresentation.form) \ query_preposition = self.session.query(FormRepresentation.form) \
.join(WordForm, WordForm.id == FormRepresentation.word_form_id) \ .join(WordForm, WordForm.id == FormRepresentation.word_form_id) \
.join(Lexeme, Lexeme.id == WordForm.lexeme_id) .join(Lexeme, Lexeme.id == WordForm.lexeme_id)
for wf in wfs: for wf in wfs:
query_preposition = query_preposition.join(wf, wf.word_form_id == WordForm.id) query_preposition = query_preposition.join(wf, wf.word_form_id == WordForm.id)
# .join(wf1, wf1.word_form_id == WordForm.id) \
# .join(wf2, wf2.word_form_id == WordForm.id) \
# .join(wf3, wf3.word_form_id == WordForm.id) \
query_preposition = query_preposition.filter(Lexeme.lemma == lemma) query_preposition = query_preposition.filter(Lexeme.lemma == lemma)
@ -186,6 +140,4 @@ class SloleksDatabase:
pattern_translation_hws = query_preposition.all() pattern_translation_hws = query_preposition.all()
if len(pattern_translation_hws) > 0: if len(pattern_translation_hws) > 0:
return ''.join(msd), lemma, pattern_translation_hws[0][0] return ''.join(msd), lemma, pattern_translation_hws[0][0]
# pattern_translation_hws = [el[0] for el in query_preposition.all()]
return None, None, None return None, None, None
# return pattern_translation_hws

View File

@ -73,8 +73,6 @@ def main(args):
postprocessor = Postprocessor() postprocessor = Postprocessor()
matches = match_file(words, structures, postprocessor) matches = match_file(words, structures, postprocessor)
# matches = .process()
# TODO Add postprocessing here or inside previous function!
match_store.add_matches(matches) match_store.add_matches(matches)
word_stats.add_words(words) word_stats.add_words(words)
database.commit() database.commit()

View File

@ -82,15 +82,11 @@ class Writer:
self.formatter.new_match(match) self.formatter.new_match(match)
best_word_order = self.find_best_word_order(match.matches) variable_word_order = self.find_variable_word_order(match.matches)
for words in match.matches: for words in match.matches:
to_write = [] to_write = []
# TODO instead of enumerate in bottom components first iterate over all words in match.matches, compare
# word.int_id and return most popular order and append to it remaining numbers to len(components)
for idx, _comp in enumerate(components): for idx, _comp in enumerate(components):
idx = str(idx + 1) idx = str(idx + 1)
if idx not in words: if idx not in words:
@ -105,7 +101,7 @@ class Writer:
to_write = [structure.id] + to_write + [match.match_id] to_write = [structure.id] + to_write + [match.match_id]
# header_right # header_right
to_write.extend(self.formatter.content_right(len(match), best_word_order)) to_write.extend(self.formatter.content_right(len(match), variable_word_order))
rows.append(to_write) rows.append(to_write)
if self.formatter.group(): if self.formatter.group():
@ -148,7 +144,7 @@ class Writer:
fp_close(fp) fp_close(fp)
@staticmethod @staticmethod
def find_best_word_order(matches): def find_variable_word_order(matches):
orders = {} orders = {}
for words in matches: for words in matches:
order = tuple([tup[0] for tup in sorted(words.items(), key=lambda x: x[1].int_id)]) order = tuple([tup[0] for tup in sorted(words.items(), key=lambda x: x[1].int_id)])