From 9a9d344510f31eae56c1e2fe35a77c2bf344ae64 Mon Sep 17 00:00:00 2001 From: Luka Date: Thu, 16 Jul 2020 20:53:59 +0200 Subject: [PATCH] Created new column "Joint_representative_form_variable" + Fixed collocation structures + Fixed bug with wrong lemma_fallback msds --- src/formatter.py | 28 ++++++++------- src/match.py | 4 +-- src/match_store.py | 7 ++-- src/postprocessor.py | 8 ----- src/representation.py | 22 ++++++++---- src/representation_assigner.py | 11 +++--- src/sloleks_db.py | 66 +++++----------------------------- src/wani.py | 2 -- src/writer.py | 10 ++---- 9 files changed, 55 insertions(+), 103 deletions(-) diff --git a/src/formatter.py b/src/formatter.py index 363ad21..85adc6d 100644 --- a/src/formatter.py +++ b/src/formatter.py @@ -40,26 +40,28 @@ class OutNoStatFormatter(Formatter): return ["Lemma", "Representative_form", "RF_msd", "RF_scenario"] def header_right(self): - return ["Joint_representative_form", "Frequency"] + return ["Joint_representative_form_fixed", "Joint_representative_form_variable", "Frequency"] def content_repeat(self, words, representations, idx, _sidx): word = words[idx] if idx not in representations: return [word.lemma, "", ""] - rep = representations[idx] - if rep is None: + rep_text, rep_msd = representations[idx] + if rep_text is None: self.representation[idx] = word.lemma return [word.lemma, word.lemma, "", "lemma_fallback"] else: - self.representation[idx] = rep - return [word.lemma, rep, word.msd, "ok"] + self.representation[idx] = rep_text + return [word.lemma, rep_text, rep_msd, "ok"] - def content_right(self, freq, best_word_order=None): - if best_word_order is None: - best_word_order = sorted(self.representation.keys()) - rep = ' '.join([self.representation[o] for o in best_word_order if o in self.representation]) - result = [rep, str(freq)] + def content_right(self, freq, variable_word_order=None): + fixed_word_order = sorted(self.representation.keys()) + if variable_word_order is None: + variable_word_order = fixed_word_order + rep_fixed_word_order = ' '.join([self.representation[o] for o in fixed_word_order if o in self.representation]) + rep_variable_word_order = ' '.join([self.representation[o] for o in variable_word_order if o in self.representation]) + result = [rep_fixed_word_order, rep_variable_word_order, str(freq)] self.representation = {} return result @@ -183,13 +185,13 @@ class OutFormatter(Formatter): def header_right(self): return self.f1.header_right() + self.f2.header_right() - def content_repeat(self, words, representations, idx, sidx, best_word_order=None): + def content_repeat(self, words, representations, idx, sidx, variable_word_order=None): cr1 = self.f1.content_repeat(words, representations, idx, sidx) cr2 = self.f2.content_repeat(words, representations, idx, sidx) return cr1 + cr2 - def content_right(self, freq, best_word_order=None): - return self.f1.content_right(freq, best_word_order) + self.f2.content_right(freq) + def content_right(self, freq, variable_word_order=None): + return self.f1.content_right(freq, variable_word_order) + self.f2.content_right(freq) def group(self): return self.f1.group() and self.f2.group() diff --git a/src/match.py b/src/match.py index 1d65db0..d3f82ef 100644 --- a/src/match.py +++ b/src/match.py @@ -28,8 +28,8 @@ class StructureMatch: result.matches[-1][str(component_id)] = Word(word_lemma, word_msd, word_id, word_text, False) - for component_id, text in db.execute("SELECT component_id, text FROM Representations WHERE colocation_id=?", (colocation_id,)): - result.representations[str(component_id)] = text + for component_id, text, msd in db.execute("SELECT component_id, text, msd FROM Representations WHERE colocation_id=?", (colocation_id,)): + result.representations[str(component_id)] = (text, msd) return result diff --git a/src/match_store.py b/src/match_store.py index ff200ce..6025200 100644 --- a/src/match_store.py +++ b/src/match_store.py @@ -35,6 +35,7 @@ class MatchStore: colocation_id INTEGER, component_id INTEGER, text varchar(32), + msd varchar(32), FOREIGN KEY(colocation_id) REFERENCES Colocations(colocation_id)) """) self.db.init("""CREATE TABLE Dispersions ( @@ -93,10 +94,10 @@ class MatchStore: def add_inserts(self, inserts): for match in inserts: - for component_id, text in match.representations.items(): + for component_id, (text, msd) in match.representations.items(): self.db.execute(""" - INSERT INTO Representations (colocation_id, component_id, text) - VALUES (?,?,?)""", (match.match_id, component_id, text)) + INSERT INTO Representations (colocation_id, component_id, text, msd) + VALUES (?,?,?,?)""", (match.match_id, component_id, text, msd)) def set_representations(self, word_renderer, structures, sloleks_db=None): step_name = 'representation' diff --git a/src/postprocessor.py b/src/postprocessor.py index 375cb3f..60d15cc 100644 --- a/src/postprocessor.py +++ b/src/postprocessor.py @@ -16,15 +16,7 @@ class Postprocessor: return 'k' def process(self, match, collocation_id): - # self.matches = matches - # if self.fix_one_letter_words: - # for syn_structure_key, syn_structure_value in self.matches.items(): - # for match, collocation_id in syn_structure_value: if len(collocation_id) > 2: - # a = collocation_id[1:-1] - # b = enumerate(collocation_id[1:-1]) - # for a, c in b: - # print('here') for idx, (col_id, word) in enumerate(collocation_id[1:-1]): if word in ['s', 'z']: correct_letter = self.fix_sz(collocation_id[idx + 2][1]) diff --git a/src/representation.py b/src/representation.py index 31bdeff..c059415 100644 --- a/src/representation.py +++ b/src/representation.py @@ -14,6 +14,7 @@ class ComponentRepresentation: self.words = [] self.rendition_text = None + self.rendition_msd = None self.agreement = [] def get_agreement(self): @@ -24,18 +25,22 @@ class ComponentRepresentation: def render(self, sloleks_db=None): if self.rendition_text is None: - self.rendition_text = self._render(sloleks_db=sloleks_db) + self.rendition_text, self.rendition_msd = self._render(sloleks_db=sloleks_db) def _render(self, sloleks_db=None): raise NotImplementedError("Not implemented for class: {}".format(type(self))) class LemmaCR(ComponentRepresentation): def _render(self, sloleks_db=None): - return self.words[0].lemma if len(self.words) > 0 else None + # TODO FIX THIS TO LEMMA MSD + if len(self.words) > 0: + return self.words[0].lemma, self.words[0].msd + else: + return None, None class LexisCR(ComponentRepresentation): def _render(self, sloleks_db=None): - return self.data['lexis'] + return self.data['lexis'], 'Q' class WordFormAllCR(ComponentRepresentation): def _render(self, sloleks_db=None): @@ -43,7 +48,9 @@ class WordFormAllCR(ComponentRepresentation): return None else: forms = [w.text.lower() for w in self.words] - return "/".join(set(forms)) + msds = [w.msd for w in self.words] + + return "/".join(set(forms)), "/".join(set(msds)) class WordFormAnyCR(ComponentRepresentation): def _render(self, sloleks_db=None): @@ -86,14 +93,14 @@ class WordFormAnyCR(ComponentRepresentation): for agr, matched in zip(self.agreement, agreements_matched): if matched: agr.confirm_match() - return None + return None, None # if all agreements match, we win! if all(agreements_matched): for agr in self.agreement: agr.confirm_match() - return text_forms[(word_msd, word_lemma)] + return text_forms[(word_msd, word_lemma)], word_msd class WordFormMsdCR(WordFormAnyCR): @@ -154,6 +161,7 @@ class WordFormAgreementCR(WordFormMsdCR): def __init__(self, data, word_renderer): super().__init__(data, word_renderer) self.rendition_candidate = None + self.rendition_msd_candidate = None def get_agreement(self): return self.data['other'] @@ -169,12 +177,14 @@ class WordFormAgreementCR(WordFormMsdCR): if WordFormAgreementCR.check_agreement(word_msd, candidate_msd, self.data['agreement']): if self.check_msd(candidate_msd): self.rendition_candidate = candidate_text + self.rendition_msd_candidate = candidate_msd return True return False def confirm_match(self): self.rendition_text = self.rendition_candidate + self.rendition_msd = self.rendition_msd_candidate @staticmethod def check_agreement(msd1, msd2, agreements): diff --git a/src/representation_assigner.py b/src/representation_assigner.py index 3c8ca52..822a1b1 100644 --- a/src/representation_assigner.py +++ b/src/representation_assigner.py @@ -73,10 +73,11 @@ class RepresentationAssigner: rep.render(sloleks_db=sloleks_db) for cid, reps in representations.items(): - reps = [rep.rendition_text for rep in reps] - if reps == []: + reps_text = [rep.rendition_text for rep in reps] + reps_msd = [rep.rendition_msd for rep in reps] + if reps_text == []: pass - elif all(r is None for r in reps): - match.representations[cid] = None + elif all(r is None for r in reps_text): + match.representations[cid] = (None, None) else: - match.representations[cid] = " ".join(("" if r is None else r) for r in reps) + match.representations[cid] = (" ".join(("" if r is None else r) for r in reps_text), " ".join(("" if r is None else r) for r in reps_msd)) diff --git a/src/sloleks_db.py b/src/sloleks_db.py index 953048d..7f3897f 100644 --- a/src/sloleks_db.py +++ b/src/sloleks_db.py @@ -1,46 +1,12 @@ -from collections import defaultdict -from ast import literal_eval - from sqlalchemy.ext.declarative import declarative_base from sqlalchemy.orm import Session, aliased from sqlalchemy import create_engine -from sqlalchemy import func -from match import StructureMatch -from representation_assigner import RepresentationAssigner -from progress_bar import progress - -# Lexeme = None -# LexemeFeature = None -# SyntacticStructure = None -# StructureComponent = None -# Feature = None -# LexicalUnitLexeme = None -# LexicalUnit = None -# LexicalUnitType = None -# Category = None -# Sense = None -# Measure = None -# LexicalUnitMeasure = None -# Corpus = None -# Definition = None -# WordForm = None -# WordFormFeature = None -# FormRepresentation = None from src.codes_tagset import TAGSET, CODES, CODES_TRANSLATION class SloleksDatabase: def __init__(self, db): - # self.db = db - # self.dispersions = {} - # self.min_freq = args.min_freq - - # self.db.init("""CREATE TABLE Colocations ( - # colocation_id INTEGER PRIMARY KEY, - # structure_id varchar(8), - # key varchar(256)) - # """) global Lexeme, LexemeFeature, SyntacticStructure, StructureComponent, Feature, LexicalUnitLexeme, LexicalUnit, LexicalUnitType, Category, Sense, Measure, LexicalUnitMeasure, Corpus, Definition, WordForm, WordFormFeature, FormRepresentation [db_user, db_password, db_database, db_host] = db.split(':') @@ -130,14 +96,15 @@ class SloleksDatabase: def get_word_form(self, lemma, msd, data, align_msd=False): # modify msd as required msd = list(msd) - - if not align_msd and 'msd' in data: + if 'msd' in data: for key, value in data['msd'].items(): t = msd[0] v = TAGSET[t].index(key.lower()) + if v + 1 >= len(msd): + msd = msd + ['-' for _ in range(v - len(msd) + 2)] msd[v + 1] = CODES[value] - elif 'agreement' in data: + if align_msd and 'agreement' in data: align_msd = list(align_msd) t_align_msd = align_msd[0] t = msd[0] @@ -146,37 +113,24 @@ class SloleksDatabase: v_align_msd = TAGSET[t_align_msd].index(att.lower()) v = TAGSET[t].index(att.lower()) # fix for verbs with short msds - if v >= len(msd): - return None, None, None - # if v >= len(msd) and t == 'V' and att == 'number': - # if len(msd) == 4: - # msd += ['3'] - # if len(msd) == 5: - # msd += ['_'] - # try: - msd[v + 1] = align_msd[v_align_msd + 1] - # except: - # print('here') + if v + 1 >= len(msd): + msd = msd + ['-' for _ in range(v - len(msd) + 2)] + # return None, None, None + + msd[v + 1] = align_msd[v_align_msd + 1] - # msd = list(msd) decypher_msd = self.decypher_msd(msd) if not decypher_msd: return None, None, None wfs = [aliased(WordFormFeature) for _ in decypher_msd] - # wf1 = aliased(WordFormFeature) - # wf2 = aliased(WordFormFeature) - # wf3 = aliased(WordFormFeature) query_preposition = self.session.query(FormRepresentation.form) \ .join(WordForm, WordForm.id == FormRepresentation.word_form_id) \ .join(Lexeme, Lexeme.id == WordForm.lexeme_id) for wf in wfs: query_preposition = query_preposition.join(wf, wf.word_form_id == WordForm.id) - # .join(wf1, wf1.word_form_id == WordForm.id) \ - # .join(wf2, wf2.word_form_id == WordForm.id) \ - # .join(wf3, wf3.word_form_id == WordForm.id) \ query_preposition = query_preposition.filter(Lexeme.lemma == lemma) @@ -186,6 +140,4 @@ class SloleksDatabase: pattern_translation_hws = query_preposition.all() if len(pattern_translation_hws) > 0: return ''.join(msd), lemma, pattern_translation_hws[0][0] - # pattern_translation_hws = [el[0] for el in query_preposition.all()] return None, None, None - # return pattern_translation_hws diff --git a/src/wani.py b/src/wani.py index b386697..cde8dc5 100644 --- a/src/wani.py +++ b/src/wani.py @@ -73,8 +73,6 @@ def main(args): postprocessor = Postprocessor() matches = match_file(words, structures, postprocessor) - # matches = .process() - # TODO Add postprocessing here or inside previous function! match_store.add_matches(matches) word_stats.add_words(words) database.commit() diff --git a/src/writer.py b/src/writer.py index 613cc18..bda8c23 100644 --- a/src/writer.py +++ b/src/writer.py @@ -82,15 +82,11 @@ class Writer: self.formatter.new_match(match) - best_word_order = self.find_best_word_order(match.matches) + variable_word_order = self.find_variable_word_order(match.matches) for words in match.matches: to_write = [] - # TODO instead of enumerate in bottom components first iterate over all words in match.matches, compare - # word.int_id and return most popular order and append to it remaining numbers to len(components) - - for idx, _comp in enumerate(components): idx = str(idx + 1) if idx not in words: @@ -105,7 +101,7 @@ class Writer: to_write = [structure.id] + to_write + [match.match_id] # header_right - to_write.extend(self.formatter.content_right(len(match), best_word_order)) + to_write.extend(self.formatter.content_right(len(match), variable_word_order)) rows.append(to_write) if self.formatter.group(): @@ -148,7 +144,7 @@ class Writer: fp_close(fp) @staticmethod - def find_best_word_order(matches): + def find_variable_word_order(matches): orders = {} for words in matches: order = tuple([tup[0] for tup in sorted(words.items(), key=lambda x: x[1].int_id)])