Created new column "Joint_representative_form_variable"; fixed collocation structures; fixed bug with wrong lemma_fallback MSDs

This commit is contained in:
Luka 2020-07-16 20:53:59 +02:00
parent de3e52c57c
commit 9a9d344510
9 changed files with 55 additions and 103 deletions

View File

@ -40,26 +40,28 @@ class OutNoStatFormatter(Formatter):
return ["Lemma", "Representative_form", "RF_msd", "RF_scenario"]
def header_right(self):
return ["Joint_representative_form", "Frequency"]
return ["Joint_representative_form_fixed", "Joint_representative_form_variable", "Frequency"]
def content_repeat(self, words, representations, idx, _sidx):
word = words[idx]
if idx not in representations:
return [word.lemma, "", ""]
rep = representations[idx]
if rep is None:
rep_text, rep_msd = representations[idx]
if rep_text is None:
self.representation[idx] = word.lemma
return [word.lemma, word.lemma, "", "lemma_fallback"]
else:
self.representation[idx] = rep
return [word.lemma, rep, word.msd, "ok"]
self.representation[idx] = rep_text
return [word.lemma, rep_text, rep_msd, "ok"]
def content_right(self, freq, best_word_order=None):
if best_word_order is None:
best_word_order = sorted(self.representation.keys())
rep = ' '.join([self.representation[o] for o in best_word_order if o in self.representation])
result = [rep, str(freq)]
def content_right(self, freq, variable_word_order=None):
fixed_word_order = sorted(self.representation.keys())
if variable_word_order is None:
variable_word_order = fixed_word_order
rep_fixed_word_order = ' '.join([self.representation[o] for o in fixed_word_order if o in self.representation])
rep_variable_word_order = ' '.join([self.representation[o] for o in variable_word_order if o in self.representation])
result = [rep_fixed_word_order, rep_variable_word_order, str(freq)]
self.representation = {}
return result
@ -183,13 +185,13 @@ class OutFormatter(Formatter):
def header_right(self):
return self.f1.header_right() + self.f2.header_right()
def content_repeat(self, words, representations, idx, sidx, best_word_order=None):
def content_repeat(self, words, representations, idx, sidx, variable_word_order=None):
cr1 = self.f1.content_repeat(words, representations, idx, sidx)
cr2 = self.f2.content_repeat(words, representations, idx, sidx)
return cr1 + cr2
def content_right(self, freq, best_word_order=None):
return self.f1.content_right(freq, best_word_order) + self.f2.content_right(freq)
def content_right(self, freq, variable_word_order=None):
return self.f1.content_right(freq, variable_word_order) + self.f2.content_right(freq)
def group(self):
return self.f1.group() and self.f2.group()

View File

@ -28,8 +28,8 @@ class StructureMatch:
result.matches[-1][str(component_id)] = Word(word_lemma, word_msd, word_id, word_text, False)
for component_id, text in db.execute("SELECT component_id, text FROM Representations WHERE colocation_id=?", (colocation_id,)):
result.representations[str(component_id)] = text
for component_id, text, msd in db.execute("SELECT component_id, text, msd FROM Representations WHERE colocation_id=?", (colocation_id,)):
result.representations[str(component_id)] = (text, msd)
return result

View File

@ -35,6 +35,7 @@ class MatchStore:
colocation_id INTEGER,
component_id INTEGER,
text varchar(32),
msd varchar(32),
FOREIGN KEY(colocation_id) REFERENCES Colocations(colocation_id))
""")
self.db.init("""CREATE TABLE Dispersions (
@ -93,10 +94,10 @@ class MatchStore:
def add_inserts(self, inserts):
for match in inserts:
for component_id, text in match.representations.items():
for component_id, (text, msd) in match.representations.items():
self.db.execute("""
INSERT INTO Representations (colocation_id, component_id, text)
VALUES (?,?,?)""", (match.match_id, component_id, text))
INSERT INTO Representations (colocation_id, component_id, text, msd)
VALUES (?,?,?,?)""", (match.match_id, component_id, text, msd))
def set_representations(self, word_renderer, structures, sloleks_db=None):
step_name = 'representation'

View File

@ -16,15 +16,7 @@ class Postprocessor:
return 'k'
def process(self, match, collocation_id):
# self.matches = matches
# if self.fix_one_letter_words:
# for syn_structure_key, syn_structure_value in self.matches.items():
# for match, collocation_id in syn_structure_value:
if len(collocation_id) > 2:
# a = collocation_id[1:-1]
# b = enumerate(collocation_id[1:-1])
# for a, c in b:
# print('here')
for idx, (col_id, word) in enumerate(collocation_id[1:-1]):
if word in ['s', 'z']:
correct_letter = self.fix_sz(collocation_id[idx + 2][1])

View File

@ -14,6 +14,7 @@ class ComponentRepresentation:
self.words = []
self.rendition_text = None
self.rendition_msd = None
self.agreement = []
def get_agreement(self):
@ -24,18 +25,22 @@ class ComponentRepresentation:
def render(self, sloleks_db=None):
if self.rendition_text is None:
self.rendition_text = self._render(sloleks_db=sloleks_db)
self.rendition_text, self.rendition_msd = self._render(sloleks_db=sloleks_db)
def _render(self, sloleks_db=None):
raise NotImplementedError("Not implemented for class: {}".format(type(self)))
class LemmaCR(ComponentRepresentation):
def _render(self, sloleks_db=None):
return self.words[0].lemma if len(self.words) > 0 else None
# TODO FIX THIS TO LEMMA MSD
if len(self.words) > 0:
return self.words[0].lemma, self.words[0].msd
else:
return None, None
class LexisCR(ComponentRepresentation):
def _render(self, sloleks_db=None):
return self.data['lexis']
return self.data['lexis'], 'Q'
class WordFormAllCR(ComponentRepresentation):
def _render(self, sloleks_db=None):
@ -43,7 +48,9 @@ class WordFormAllCR(ComponentRepresentation):
return None
else:
forms = [w.text.lower() for w in self.words]
return "/".join(set(forms))
msds = [w.msd for w in self.words]
return "/".join(set(forms)), "/".join(set(msds))
class WordFormAnyCR(ComponentRepresentation):
def _render(self, sloleks_db=None):
@ -86,14 +93,14 @@ class WordFormAnyCR(ComponentRepresentation):
for agr, matched in zip(self.agreement, agreements_matched):
if matched:
agr.confirm_match()
return None
return None, None
# if all agreements match, we win!
if all(agreements_matched):
for agr in self.agreement:
agr.confirm_match()
return text_forms[(word_msd, word_lemma)]
return text_forms[(word_msd, word_lemma)], word_msd
class WordFormMsdCR(WordFormAnyCR):
@ -154,6 +161,7 @@ class WordFormAgreementCR(WordFormMsdCR):
def __init__(self, data, word_renderer):
super().__init__(data, word_renderer)
self.rendition_candidate = None
self.rendition_msd_candidate = None
def get_agreement(self):
return self.data['other']
@ -169,12 +177,14 @@ class WordFormAgreementCR(WordFormMsdCR):
if WordFormAgreementCR.check_agreement(word_msd, candidate_msd, self.data['agreement']):
if self.check_msd(candidate_msd):
self.rendition_candidate = candidate_text
self.rendition_msd_candidate = candidate_msd
return True
return False
def confirm_match(self):
self.rendition_text = self.rendition_candidate
self.rendition_msd = self.rendition_msd_candidate
@staticmethod
def check_agreement(msd1, msd2, agreements):

View File

@ -73,10 +73,11 @@ class RepresentationAssigner:
rep.render(sloleks_db=sloleks_db)
for cid, reps in representations.items():
reps = [rep.rendition_text for rep in reps]
if reps == []:
reps_text = [rep.rendition_text for rep in reps]
reps_msd = [rep.rendition_msd for rep in reps]
if reps_text == []:
pass
elif all(r is None for r in reps):
match.representations[cid] = None
elif all(r is None for r in reps_text):
match.representations[cid] = (None, None)
else:
match.representations[cid] = " ".join(("" if r is None else r) for r in reps)
match.representations[cid] = (" ".join(("" if r is None else r) for r in reps_text), " ".join(("" if r is None else r) for r in reps_msd))

View File

@ -1,46 +1,12 @@
from collections import defaultdict
from ast import literal_eval
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import Session, aliased
from sqlalchemy import create_engine
from sqlalchemy import func
from match import StructureMatch
from representation_assigner import RepresentationAssigner
from progress_bar import progress
# Lexeme = None
# LexemeFeature = None
# SyntacticStructure = None
# StructureComponent = None
# Feature = None
# LexicalUnitLexeme = None
# LexicalUnit = None
# LexicalUnitType = None
# Category = None
# Sense = None
# Measure = None
# LexicalUnitMeasure = None
# Corpus = None
# Definition = None
# WordForm = None
# WordFormFeature = None
# FormRepresentation = None
from src.codes_tagset import TAGSET, CODES, CODES_TRANSLATION
class SloleksDatabase:
def __init__(self, db):
# self.db = db
# self.dispersions = {}
# self.min_freq = args.min_freq
# self.db.init("""CREATE TABLE Colocations (
# colocation_id INTEGER PRIMARY KEY,
# structure_id varchar(8),
# key varchar(256))
# """)
global Lexeme, LexemeFeature, SyntacticStructure, StructureComponent, Feature, LexicalUnitLexeme, LexicalUnit, LexicalUnitType, Category, Sense, Measure, LexicalUnitMeasure, Corpus, Definition, WordForm, WordFormFeature, FormRepresentation
[db_user, db_password, db_database, db_host] = db.split(':')
@ -130,14 +96,15 @@ class SloleksDatabase:
def get_word_form(self, lemma, msd, data, align_msd=False):
# modify msd as required
msd = list(msd)
if not align_msd and 'msd' in data:
if 'msd' in data:
for key, value in data['msd'].items():
t = msd[0]
v = TAGSET[t].index(key.lower())
if v + 1 >= len(msd):
msd = msd + ['-' for _ in range(v - len(msd) + 2)]
msd[v + 1] = CODES[value]
elif 'agreement' in data:
if align_msd and 'agreement' in data:
align_msd = list(align_msd)
t_align_msd = align_msd[0]
t = msd[0]
@ -146,37 +113,24 @@ class SloleksDatabase:
v_align_msd = TAGSET[t_align_msd].index(att.lower())
v = TAGSET[t].index(att.lower())
# fix for verbs with short msds
if v >= len(msd):
return None, None, None
# if v >= len(msd) and t == 'V' and att == 'number':
# if len(msd) == 4:
# msd += ['3']
# if len(msd) == 5:
# msd += ['_']
# try:
msd[v + 1] = align_msd[v_align_msd + 1]
# except:
# print('here')
if v + 1 >= len(msd):
msd = msd + ['-' for _ in range(v - len(msd) + 2)]
# return None, None, None
msd[v + 1] = align_msd[v_align_msd + 1]
# msd = list(msd)
decypher_msd = self.decypher_msd(msd)
if not decypher_msd:
return None, None, None
wfs = [aliased(WordFormFeature) for _ in decypher_msd]
# wf1 = aliased(WordFormFeature)
# wf2 = aliased(WordFormFeature)
# wf3 = aliased(WordFormFeature)
query_preposition = self.session.query(FormRepresentation.form) \
.join(WordForm, WordForm.id == FormRepresentation.word_form_id) \
.join(Lexeme, Lexeme.id == WordForm.lexeme_id)
for wf in wfs:
query_preposition = query_preposition.join(wf, wf.word_form_id == WordForm.id)
# .join(wf1, wf1.word_form_id == WordForm.id) \
# .join(wf2, wf2.word_form_id == WordForm.id) \
# .join(wf3, wf3.word_form_id == WordForm.id) \
query_preposition = query_preposition.filter(Lexeme.lemma == lemma)
@ -186,6 +140,4 @@ class SloleksDatabase:
pattern_translation_hws = query_preposition.all()
if len(pattern_translation_hws) > 0:
return ''.join(msd), lemma, pattern_translation_hws[0][0]
# pattern_translation_hws = [el[0] for el in query_preposition.all()]
return None, None, None
# return pattern_translation_hws

View File

@ -73,8 +73,6 @@ def main(args):
postprocessor = Postprocessor()
matches = match_file(words, structures, postprocessor)
# matches = .process()
# TODO Add postprocessing here or inside previous function!
match_store.add_matches(matches)
word_stats.add_words(words)
database.commit()

View File

@ -82,15 +82,11 @@ class Writer:
self.formatter.new_match(match)
best_word_order = self.find_best_word_order(match.matches)
variable_word_order = self.find_variable_word_order(match.matches)
for words in match.matches:
to_write = []
# TODO instead of enumerate in bottom components first iterate over all words in match.matches, compare
# word.int_id and return most popular order and append to it remaining numbers to len(components)
for idx, _comp in enumerate(components):
idx = str(idx + 1)
if idx not in words:
@ -105,7 +101,7 @@ class Writer:
to_write = [structure.id] + to_write + [match.match_id]
# header_right
to_write.extend(self.formatter.content_right(len(match), best_word_order))
to_write.extend(self.formatter.content_right(len(match), variable_word_order))
rows.append(to_write)
if self.formatter.group():
@ -148,7 +144,7 @@ class Writer:
fp_close(fp)
@staticmethod
def find_best_word_order(matches):
def find_variable_word_order(matches):
orders = {}
for words in matches:
order = tuple([tup[0] for tup in sorted(words.items(), key=lambda x: x[1].int_id)])