Created new column "Joint_representative_form_variable" + Fixed collocation structures + Fixed bug with wrong lemma_fallback msds
This commit is contained in:
parent
de3e52c57c
commit
9a9d344510
|
@ -40,26 +40,28 @@ class OutNoStatFormatter(Formatter):
|
|||
return ["Lemma", "Representative_form", "RF_msd", "RF_scenario"]
|
||||
|
||||
def header_right(self):
|
||||
return ["Joint_representative_form", "Frequency"]
|
||||
return ["Joint_representative_form_fixed", "Joint_representative_form_variable", "Frequency"]
|
||||
|
||||
def content_repeat(self, words, representations, idx, _sidx):
|
||||
word = words[idx]
|
||||
if idx not in representations:
|
||||
return [word.lemma, "", ""]
|
||||
|
||||
rep = representations[idx]
|
||||
if rep is None:
|
||||
rep_text, rep_msd = representations[idx]
|
||||
if rep_text is None:
|
||||
self.representation[idx] = word.lemma
|
||||
return [word.lemma, word.lemma, "", "lemma_fallback"]
|
||||
else:
|
||||
self.representation[idx] = rep
|
||||
return [word.lemma, rep, word.msd, "ok"]
|
||||
self.representation[idx] = rep_text
|
||||
return [word.lemma, rep_text, rep_msd, "ok"]
|
||||
|
||||
def content_right(self, freq, best_word_order=None):
|
||||
if best_word_order is None:
|
||||
best_word_order = sorted(self.representation.keys())
|
||||
rep = ' '.join([self.representation[o] for o in best_word_order if o in self.representation])
|
||||
result = [rep, str(freq)]
|
||||
def content_right(self, freq, variable_word_order=None):
|
||||
fixed_word_order = sorted(self.representation.keys())
|
||||
if variable_word_order is None:
|
||||
variable_word_order = fixed_word_order
|
||||
rep_fixed_word_order = ' '.join([self.representation[o] for o in fixed_word_order if o in self.representation])
|
||||
rep_variable_word_order = ' '.join([self.representation[o] for o in variable_word_order if o in self.representation])
|
||||
result = [rep_fixed_word_order, rep_variable_word_order, str(freq)]
|
||||
self.representation = {}
|
||||
return result
|
||||
|
||||
|
@ -183,13 +185,13 @@ class OutFormatter(Formatter):
|
|||
def header_right(self):
|
||||
return self.f1.header_right() + self.f2.header_right()
|
||||
|
||||
def content_repeat(self, words, representations, idx, sidx, best_word_order=None):
|
||||
def content_repeat(self, words, representations, idx, sidx, variable_word_order=None):
|
||||
cr1 = self.f1.content_repeat(words, representations, idx, sidx)
|
||||
cr2 = self.f2.content_repeat(words, representations, idx, sidx)
|
||||
return cr1 + cr2
|
||||
|
||||
def content_right(self, freq, best_word_order=None):
|
||||
return self.f1.content_right(freq, best_word_order) + self.f2.content_right(freq)
|
||||
def content_right(self, freq, variable_word_order=None):
|
||||
return self.f1.content_right(freq, variable_word_order) + self.f2.content_right(freq)
|
||||
|
||||
def group(self):
|
||||
return self.f1.group() and self.f2.group()
|
||||
|
|
|
@ -28,8 +28,8 @@ class StructureMatch:
|
|||
|
||||
result.matches[-1][str(component_id)] = Word(word_lemma, word_msd, word_id, word_text, False)
|
||||
|
||||
for component_id, text in db.execute("SELECT component_id, text FROM Representations WHERE colocation_id=?", (colocation_id,)):
|
||||
result.representations[str(component_id)] = text
|
||||
for component_id, text, msd in db.execute("SELECT component_id, text, msd FROM Representations WHERE colocation_id=?", (colocation_id,)):
|
||||
result.representations[str(component_id)] = (text, msd)
|
||||
|
||||
return result
|
||||
|
||||
|
|
|
@ -35,6 +35,7 @@ class MatchStore:
|
|||
colocation_id INTEGER,
|
||||
component_id INTEGER,
|
||||
text varchar(32),
|
||||
msd varchar(32),
|
||||
FOREIGN KEY(colocation_id) REFERENCES Colocations(colocation_id))
|
||||
""")
|
||||
self.db.init("""CREATE TABLE Dispersions (
|
||||
|
@ -93,10 +94,10 @@ class MatchStore:
|
|||
|
||||
def add_inserts(self, inserts):
|
||||
for match in inserts:
|
||||
for component_id, text in match.representations.items():
|
||||
for component_id, (text, msd) in match.representations.items():
|
||||
self.db.execute("""
|
||||
INSERT INTO Representations (colocation_id, component_id, text)
|
||||
VALUES (?,?,?)""", (match.match_id, component_id, text))
|
||||
INSERT INTO Representations (colocation_id, component_id, text, msd)
|
||||
VALUES (?,?,?,?)""", (match.match_id, component_id, text, msd))
|
||||
|
||||
def set_representations(self, word_renderer, structures, sloleks_db=None):
|
||||
step_name = 'representation'
|
||||
|
|
|
@ -16,15 +16,7 @@ class Postprocessor:
|
|||
return 'k'
|
||||
|
||||
def process(self, match, collocation_id):
|
||||
# self.matches = matches
|
||||
# if self.fix_one_letter_words:
|
||||
# for syn_structure_key, syn_structure_value in self.matches.items():
|
||||
# for match, collocation_id in syn_structure_value:
|
||||
if len(collocation_id) > 2:
|
||||
# a = collocation_id[1:-1]
|
||||
# b = enumerate(collocation_id[1:-1])
|
||||
# for a, c in b:
|
||||
# print('here')
|
||||
for idx, (col_id, word) in enumerate(collocation_id[1:-1]):
|
||||
if word in ['s', 'z']:
|
||||
correct_letter = self.fix_sz(collocation_id[idx + 2][1])
|
||||
|
|
|
@ -14,6 +14,7 @@ class ComponentRepresentation:
|
|||
|
||||
self.words = []
|
||||
self.rendition_text = None
|
||||
self.rendition_msd = None
|
||||
self.agreement = []
|
||||
|
||||
def get_agreement(self):
|
||||
|
@ -24,18 +25,22 @@ class ComponentRepresentation:
|
|||
|
||||
def render(self, sloleks_db=None):
|
||||
if self.rendition_text is None:
|
||||
self.rendition_text = self._render(sloleks_db=sloleks_db)
|
||||
self.rendition_text, self.rendition_msd = self._render(sloleks_db=sloleks_db)
|
||||
|
||||
def _render(self, sloleks_db=None):
|
||||
raise NotImplementedError("Not implemented for class: {}".format(type(self)))
|
||||
|
||||
class LemmaCR(ComponentRepresentation):
|
||||
def _render(self, sloleks_db=None):
|
||||
return self.words[0].lemma if len(self.words) > 0 else None
|
||||
# TODO FIX THIS TO LEMMA MSD
|
||||
if len(self.words) > 0:
|
||||
return self.words[0].lemma, self.words[0].msd
|
||||
else:
|
||||
return None, None
|
||||
|
||||
class LexisCR(ComponentRepresentation):
|
||||
def _render(self, sloleks_db=None):
|
||||
return self.data['lexis']
|
||||
return self.data['lexis'], 'Q'
|
||||
|
||||
class WordFormAllCR(ComponentRepresentation):
|
||||
def _render(self, sloleks_db=None):
|
||||
|
@ -43,7 +48,9 @@ class WordFormAllCR(ComponentRepresentation):
|
|||
return None
|
||||
else:
|
||||
forms = [w.text.lower() for w in self.words]
|
||||
return "/".join(set(forms))
|
||||
msds = [w.msd for w in self.words]
|
||||
|
||||
return "/".join(set(forms)), "/".join(set(msds))
|
||||
|
||||
class WordFormAnyCR(ComponentRepresentation):
|
||||
def _render(self, sloleks_db=None):
|
||||
|
@ -86,14 +93,14 @@ class WordFormAnyCR(ComponentRepresentation):
|
|||
for agr, matched in zip(self.agreement, agreements_matched):
|
||||
if matched:
|
||||
agr.confirm_match()
|
||||
return None
|
||||
return None, None
|
||||
|
||||
# if all agreements match, we win!
|
||||
if all(agreements_matched):
|
||||
for agr in self.agreement:
|
||||
agr.confirm_match()
|
||||
|
||||
return text_forms[(word_msd, word_lemma)]
|
||||
return text_forms[(word_msd, word_lemma)], word_msd
|
||||
|
||||
|
||||
class WordFormMsdCR(WordFormAnyCR):
|
||||
|
@ -154,6 +161,7 @@ class WordFormAgreementCR(WordFormMsdCR):
|
|||
def __init__(self, data, word_renderer):
|
||||
super().__init__(data, word_renderer)
|
||||
self.rendition_candidate = None
|
||||
self.rendition_msd_candidate = None
|
||||
|
||||
def get_agreement(self):
|
||||
return self.data['other']
|
||||
|
@ -169,12 +177,14 @@ class WordFormAgreementCR(WordFormMsdCR):
|
|||
if WordFormAgreementCR.check_agreement(word_msd, candidate_msd, self.data['agreement']):
|
||||
if self.check_msd(candidate_msd):
|
||||
self.rendition_candidate = candidate_text
|
||||
self.rendition_msd_candidate = candidate_msd
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
def confirm_match(self):
|
||||
self.rendition_text = self.rendition_candidate
|
||||
self.rendition_msd = self.rendition_msd_candidate
|
||||
|
||||
@staticmethod
|
||||
def check_agreement(msd1, msd2, agreements):
|
||||
|
|
|
@ -73,10 +73,11 @@ class RepresentationAssigner:
|
|||
rep.render(sloleks_db=sloleks_db)
|
||||
|
||||
for cid, reps in representations.items():
|
||||
reps = [rep.rendition_text for rep in reps]
|
||||
if reps == []:
|
||||
reps_text = [rep.rendition_text for rep in reps]
|
||||
reps_msd = [rep.rendition_msd for rep in reps]
|
||||
if reps_text == []:
|
||||
pass
|
||||
elif all(r is None for r in reps):
|
||||
match.representations[cid] = None
|
||||
elif all(r is None for r in reps_text):
|
||||
match.representations[cid] = (None, None)
|
||||
else:
|
||||
match.representations[cid] = " ".join(("" if r is None else r) for r in reps)
|
||||
match.representations[cid] = (" ".join(("" if r is None else r) for r in reps_text), " ".join(("" if r is None else r) for r in reps_msd))
|
||||
|
|
|
@ -1,46 +1,12 @@
|
|||
from collections import defaultdict
|
||||
from ast import literal_eval
|
||||
|
||||
from sqlalchemy.ext.declarative import declarative_base
|
||||
from sqlalchemy.orm import Session, aliased
|
||||
from sqlalchemy import create_engine
|
||||
from sqlalchemy import func
|
||||
|
||||
from match import StructureMatch
|
||||
from representation_assigner import RepresentationAssigner
|
||||
from progress_bar import progress
|
||||
|
||||
# Lexeme = None
|
||||
# LexemeFeature = None
|
||||
# SyntacticStructure = None
|
||||
# StructureComponent = None
|
||||
# Feature = None
|
||||
# LexicalUnitLexeme = None
|
||||
# LexicalUnit = None
|
||||
# LexicalUnitType = None
|
||||
# Category = None
|
||||
# Sense = None
|
||||
# Measure = None
|
||||
# LexicalUnitMeasure = None
|
||||
# Corpus = None
|
||||
# Definition = None
|
||||
# WordForm = None
|
||||
# WordFormFeature = None
|
||||
# FormRepresentation = None
|
||||
from src.codes_tagset import TAGSET, CODES, CODES_TRANSLATION
|
||||
|
||||
|
||||
class SloleksDatabase:
|
||||
def __init__(self, db):
|
||||
# self.db = db
|
||||
# self.dispersions = {}
|
||||
# self.min_freq = args.min_freq
|
||||
|
||||
# self.db.init("""CREATE TABLE Colocations (
|
||||
# colocation_id INTEGER PRIMARY KEY,
|
||||
# structure_id varchar(8),
|
||||
# key varchar(256))
|
||||
# """)
|
||||
global Lexeme, LexemeFeature, SyntacticStructure, StructureComponent, Feature, LexicalUnitLexeme, LexicalUnit, LexicalUnitType, Category, Sense, Measure, LexicalUnitMeasure, Corpus, Definition, WordForm, WordFormFeature, FormRepresentation
|
||||
[db_user, db_password, db_database, db_host] = db.split(':')
|
||||
|
||||
|
@ -130,14 +96,15 @@ class SloleksDatabase:
|
|||
def get_word_form(self, lemma, msd, data, align_msd=False):
|
||||
# modify msd as required
|
||||
msd = list(msd)
|
||||
|
||||
if not align_msd and 'msd' in data:
|
||||
if 'msd' in data:
|
||||
for key, value in data['msd'].items():
|
||||
t = msd[0]
|
||||
v = TAGSET[t].index(key.lower())
|
||||
if v + 1 >= len(msd):
|
||||
msd = msd + ['-' for _ in range(v - len(msd) + 2)]
|
||||
msd[v + 1] = CODES[value]
|
||||
|
||||
elif 'agreement' in data:
|
||||
if align_msd and 'agreement' in data:
|
||||
align_msd = list(align_msd)
|
||||
t_align_msd = align_msd[0]
|
||||
t = msd[0]
|
||||
|
@ -146,37 +113,24 @@ class SloleksDatabase:
|
|||
v_align_msd = TAGSET[t_align_msd].index(att.lower())
|
||||
v = TAGSET[t].index(att.lower())
|
||||
# fix for verbs with short msds
|
||||
if v >= len(msd):
|
||||
return None, None, None
|
||||
# if v >= len(msd) and t == 'V' and att == 'number':
|
||||
# if len(msd) == 4:
|
||||
# msd += ['3']
|
||||
# if len(msd) == 5:
|
||||
# msd += ['_']
|
||||
# try:
|
||||
msd[v + 1] = align_msd[v_align_msd + 1]
|
||||
# except:
|
||||
# print('here')
|
||||
if v + 1 >= len(msd):
|
||||
msd = msd + ['-' for _ in range(v - len(msd) + 2)]
|
||||
# return None, None, None
|
||||
|
||||
msd[v + 1] = align_msd[v_align_msd + 1]
|
||||
|
||||
# msd = list(msd)
|
||||
decypher_msd = self.decypher_msd(msd)
|
||||
|
||||
if not decypher_msd:
|
||||
return None, None, None
|
||||
|
||||
wfs = [aliased(WordFormFeature) for _ in decypher_msd]
|
||||
# wf1 = aliased(WordFormFeature)
|
||||
# wf2 = aliased(WordFormFeature)
|
||||
# wf3 = aliased(WordFormFeature)
|
||||
query_preposition = self.session.query(FormRepresentation.form) \
|
||||
.join(WordForm, WordForm.id == FormRepresentation.word_form_id) \
|
||||
.join(Lexeme, Lexeme.id == WordForm.lexeme_id)
|
||||
|
||||
for wf in wfs:
|
||||
query_preposition = query_preposition.join(wf, wf.word_form_id == WordForm.id)
|
||||
# .join(wf1, wf1.word_form_id == WordForm.id) \
|
||||
# .join(wf2, wf2.word_form_id == WordForm.id) \
|
||||
# .join(wf3, wf3.word_form_id == WordForm.id) \
|
||||
|
||||
query_preposition = query_preposition.filter(Lexeme.lemma == lemma)
|
||||
|
||||
|
@ -186,6 +140,4 @@ class SloleksDatabase:
|
|||
pattern_translation_hws = query_preposition.all()
|
||||
if len(pattern_translation_hws) > 0:
|
||||
return ''.join(msd), lemma, pattern_translation_hws[0][0]
|
||||
# pattern_translation_hws = [el[0] for el in query_preposition.all()]
|
||||
return None, None, None
|
||||
# return pattern_translation_hws
|
||||
|
|
|
@ -73,8 +73,6 @@ def main(args):
|
|||
postprocessor = Postprocessor()
|
||||
matches = match_file(words, structures, postprocessor)
|
||||
|
||||
# matches = .process()
|
||||
# TODO Add postprocessing here or inside previous function!
|
||||
match_store.add_matches(matches)
|
||||
word_stats.add_words(words)
|
||||
database.commit()
|
||||
|
|
|
@ -82,15 +82,11 @@ class Writer:
|
|||
|
||||
self.formatter.new_match(match)
|
||||
|
||||
best_word_order = self.find_best_word_order(match.matches)
|
||||
variable_word_order = self.find_variable_word_order(match.matches)
|
||||
|
||||
for words in match.matches:
|
||||
to_write = []
|
||||
|
||||
# TODO instead of enumerate in bottom components first iterate over all words in match.matches, compare
|
||||
# word.int_id and return most popular order and append to it remaining numbers to len(components)
|
||||
|
||||
|
||||
for idx, _comp in enumerate(components):
|
||||
idx = str(idx + 1)
|
||||
if idx not in words:
|
||||
|
@ -105,7 +101,7 @@ class Writer:
|
|||
to_write = [structure.id] + to_write + [match.match_id]
|
||||
|
||||
# header_right
|
||||
to_write.extend(self.formatter.content_right(len(match), best_word_order))
|
||||
to_write.extend(self.formatter.content_right(len(match), variable_word_order))
|
||||
rows.append(to_write)
|
||||
|
||||
if self.formatter.group():
|
||||
|
@ -148,7 +144,7 @@ class Writer:
|
|||
fp_close(fp)
|
||||
|
||||
@staticmethod
|
||||
def find_best_word_order(matches):
|
||||
def find_variable_word_order(matches):
|
||||
orders = {}
|
||||
for words in matches:
|
||||
order = tuple([tup[0] for tup in sorted(words.items(), key=lambda x: x[1].int_id)])
|
||||
|
|
Loading…
Reference in New Issue
Block a user