From 48795c6227b02fb3218f0057524b139b6241e03a Mon Sep 17 00:00:00 2001 From: Ozbolt Menegatti Date: Mon, 1 Jul 2019 17:21:28 +0200 Subject: [PATCH] common msd now calculated per colocation id and not for whole corpus --- src/representation.py | 23 +++++++++++++++++------ src/word_stats.py | 37 ------------------------------------- 2 files changed, 17 insertions(+), 43 deletions(-) diff --git a/src/representation.py b/src/representation.py index 93fe828..8c21aa2 100644 --- a/src/representation.py +++ b/src/representation.py @@ -75,11 +75,15 @@ class WordFormAnyCR(ComponentRepresentation): return text_forms[(word_msd, word_lemma)] + class WordFormMsdCR(WordFormAnyCR): def __init__(self, *args): super().__init__(*args) self.lemma = None - self.msd = None + self.msds = [] + + def msd(self): + return self.msds[0] def check_msd(self, word_msd): if 'msd' not in self.data: @@ -100,16 +104,23 @@ class WordFormMsdCR(WordFormAnyCR): def add_word(self, word): if self.lemma is None: self.lemma = word.lemma - self.msd = word.msd + self.msds.append(word.msd) if self.check_msd(word.msd): super().add_word(word) def _render(self): - msd = self.word_renderer.get_lemma_msd(self.lemma, self.msd) - self.words.append(WordMsdOnly(msd)) - + self.words.append(WordMsdOnly(self._common_msd())) return super()._render() + + def _common_msd(self): + msds = sorted(self.msds, key=len) + common_msd = ["-" if not all(msds[j][idx] == msds[0][idx] for j in range(1, len(self.msds))) + else msds[0][idx] for idx in range(len(msds[0]))] + common_msd = "".join(common_msd) + iommon_msd = "".join(common_msd) + return self.word_renderer.common_lemma_msd(self.lemma, common_msd) + class WordFormAgreementCR(WordFormMsdCR): def __init__(self, data, word_renderer): @@ -124,7 +135,7 @@ class WordFormAgreementCR(WordFormMsdCR): lemma_available_words = self.word_renderer.available_words(self.lemma, existing) for candidate_msd, candidate_text in lemma_available_words: - if self.msd[0] != candidate_msd[0]: + if self.msd()[0] != candidate_msd[0]: continue if WordFormAgreementCR.check_agreement(word_msd, candidate_msd, self.data['agreement']): diff --git a/src/word_stats.py b/src/word_stats.py index 33a8ca2..6a4e612 100644 --- a/src/word_stats.py +++ b/src/word_stats.py @@ -18,12 +18,10 @@ class WordStats: text varchar(64), frequency int )""") - self.db.init("CREATE TABLE CommonMsd (lemma varchar(64), msd0 char, msd varchar(16))") self.db.init("CREATE TABLE WordCount (lemma varchar(64), msd0 char, frequency int)") self.db.init("CREATE INDEX lemma_msd_text_on_uw ON UniqWords (lemma, msd, text)") self.db.init("CREATE INDEX lemma_on_uw ON UniqWords (lemma)") - self.db.init("CREATE INDEX lemma_on_cm ON CommonMsd (lemma, msd0)") self.db.init("CREATE INDEX lemma_msd0_on_wc ON WordCount (lemma, msd0)") def add_words(self, words): @@ -44,24 +42,6 @@ class WordStats: def generate_renders(self): lemmas = [lemma for (lemma, ) in self.db.execute("SELECT DISTINCT lemma FROM UniqWords")] - - for lemma in progress(lemmas, 'common-msd'): - common_msds = defaultdict(lambda: "*" * 10) - - for msd in self.db.execute("SELECT DISTINCT msd FROM UniqWords WHERE lemma=?", (lemma,)): - msd = msd[0] - current_msd = common_msds[msd[0]] - new_msd = self.merge_msd(current_msd, msd) - common_msds[msd[0]] = new_msd - - for msd0, common_msd in common_msds.items(): - common_msd = self.common_lemma_msd(lemma, common_msd) - self.db.execute("INSERT INTO CommonMsd (lemma, msd0, msd) VALUES (?, ?, ?)", - (lemma, msd0, common_msd)) - - - self.db.commit() - for lemma in progress(lemmas, 'word-count'): num_words = defaultdict(int) for (msd, freq) in self.db.execute("SELECT msd, frequency FROM UniqWords WHERE lemma=?", (lemma,)): @@ -81,23 +61,6 @@ class WordStats: else: return msd - def merge_msd(self, common_msd, new_msd): - key = (common_msd, new_msd) - if key in self.memoized_msd_merges: - return self.memoized_msd_merges[key] - - def merge_letter(l1, l2): - if l1 == "*": - return l2 - elif l1 != l2: - return "-" - else: - return l1 - - value = "".join(merge_letter(l1, l2) for l1, l2 in zip(common_msd, new_msd)) - self.memoized_msd_merges[key] = value - return value - def render(self, lemma, msd): statement = """SELECT msd, frequency FROM UniqWords WHERE lemma=:lemma AND msd=:msd ORDER BY frequency DESC"""