Common MSD is now calculated per collocation ID rather than for the whole corpus

This commit is contained in:
Ozbolt Menegatti 2019-07-01 17:21:28 +02:00
parent 2f789e6550
commit 48795c6227
2 changed files with 17 additions and 43 deletions

View File

@ -75,11 +75,15 @@ class WordFormAnyCR(ComponentRepresentation):
return text_forms[(word_msd, word_lemma)] return text_forms[(word_msd, word_lemma)]
class WordFormMsdCR(WordFormAnyCR): class WordFormMsdCR(WordFormAnyCR):
def __init__(self, *args): def __init__(self, *args):
super().__init__(*args) super().__init__(*args)
self.lemma = None self.lemma = None
self.msd = None self.msds = []
def msd(self):
    """Return the first entry of ``self.msds``.

    Indexing an empty ``self.msds`` raises ``IndexError``.
    """
    first_msd = self.msds[0]
    return first_msd
def check_msd(self, word_msd): def check_msd(self, word_msd):
if 'msd' not in self.data: if 'msd' not in self.data:
@ -100,17 +104,24 @@ class WordFormMsdCR(WordFormAnyCR):
def add_word(self, word): def add_word(self, word):
if self.lemma is None: if self.lemma is None:
self.lemma = word.lemma self.lemma = word.lemma
self.msd = word.msd
self.msds.append(word.msd)
if self.check_msd(word.msd): if self.check_msd(word.msd):
super().add_word(word) super().add_word(word)
def _render(self): def _render(self):
msd = self.word_renderer.get_lemma_msd(self.lemma, self.msd) self.words.append(WordMsdOnly(self._common_msd()))
self.words.append(WordMsdOnly(msd))
return super()._render() return super()._render()
def _common_msd(self):
    """Collapse all collected MSDs into one common MSD pattern.

    The MSDs in ``self.msds`` are compared position by position. A position
    keeps its character when every MSD has the same character there and
    becomes ``"-"`` otherwise. Only positions present in every MSD are
    compared, so the pattern is as long as the shortest MSD.

    Returns:
        Whatever ``self.word_renderer.common_lemma_msd`` returns for
        ``self.lemma`` and the computed pattern.

    Raises:
        IndexError: if ``self.msds`` is empty.
    """
    if not self.msds:
        # An empty list used to fail on ``msds[0]``; keep the same exception
        # type, but raise it explicitly with a useful message.
        raise IndexError("cannot compute a common msd: no msds were collected")

    # zip() stops at the shortest MSD, so no sorting by length is needed.
    common_msd = "".join(
        column[0] if len(set(column)) == 1 else "-"
        for column in zip(*self.msds)
    )
    return self.word_renderer.common_lemma_msd(self.lemma, common_msd)
class WordFormAgreementCR(WordFormMsdCR): class WordFormAgreementCR(WordFormMsdCR):
def __init__(self, data, word_renderer): def __init__(self, data, word_renderer):
super().__init__(data, word_renderer) super().__init__(data, word_renderer)
@ -124,7 +135,7 @@ class WordFormAgreementCR(WordFormMsdCR):
lemma_available_words = self.word_renderer.available_words(self.lemma, existing) lemma_available_words = self.word_renderer.available_words(self.lemma, existing)
for candidate_msd, candidate_text in lemma_available_words: for candidate_msd, candidate_text in lemma_available_words:
if self.msd[0] != candidate_msd[0]: if self.msd()[0] != candidate_msd[0]:
continue continue
if WordFormAgreementCR.check_agreement(word_msd, candidate_msd, self.data['agreement']): if WordFormAgreementCR.check_agreement(word_msd, candidate_msd, self.data['agreement']):

View File

@ -18,12 +18,10 @@ class WordStats:
text varchar(64), text varchar(64),
frequency int frequency int
)""") )""")
self.db.init("CREATE TABLE CommonMsd (lemma varchar(64), msd0 char, msd varchar(16))")
self.db.init("CREATE TABLE WordCount (lemma varchar(64), msd0 char, frequency int)") self.db.init("CREATE TABLE WordCount (lemma varchar(64), msd0 char, frequency int)")
self.db.init("CREATE INDEX lemma_msd_text_on_uw ON UniqWords (lemma, msd, text)") self.db.init("CREATE INDEX lemma_msd_text_on_uw ON UniqWords (lemma, msd, text)")
self.db.init("CREATE INDEX lemma_on_uw ON UniqWords (lemma)") self.db.init("CREATE INDEX lemma_on_uw ON UniqWords (lemma)")
self.db.init("CREATE INDEX lemma_on_cm ON CommonMsd (lemma, msd0)")
self.db.init("CREATE INDEX lemma_msd0_on_wc ON WordCount (lemma, msd0)") self.db.init("CREATE INDEX lemma_msd0_on_wc ON WordCount (lemma, msd0)")
def add_words(self, words): def add_words(self, words):
@ -44,24 +42,6 @@ class WordStats:
def generate_renders(self): def generate_renders(self):
lemmas = [lemma for (lemma, ) in self.db.execute("SELECT DISTINCT lemma FROM UniqWords")] lemmas = [lemma for (lemma, ) in self.db.execute("SELECT DISTINCT lemma FROM UniqWords")]
for lemma in progress(lemmas, 'common-msd'):
common_msds = defaultdict(lambda: "*" * 10)
for msd in self.db.execute("SELECT DISTINCT msd FROM UniqWords WHERE lemma=?", (lemma,)):
msd = msd[0]
current_msd = common_msds[msd[0]]
new_msd = self.merge_msd(current_msd, msd)
common_msds[msd[0]] = new_msd
for msd0, common_msd in common_msds.items():
common_msd = self.common_lemma_msd(lemma, common_msd)
self.db.execute("INSERT INTO CommonMsd (lemma, msd0, msd) VALUES (?, ?, ?)",
(lemma, msd0, common_msd))
self.db.commit()
for lemma in progress(lemmas, 'word-count'): for lemma in progress(lemmas, 'word-count'):
num_words = defaultdict(int) num_words = defaultdict(int)
for (msd, freq) in self.db.execute("SELECT msd, frequency FROM UniqWords WHERE lemma=?", (lemma,)): for (msd, freq) in self.db.execute("SELECT msd, frequency FROM UniqWords WHERE lemma=?", (lemma,)):
@ -81,23 +61,6 @@ class WordStats:
else: else:
return msd return msd
def merge_msd(self, common_msd, new_msd):
    """Merge ``new_msd`` into ``common_msd`` character by character.

    For each pair of characters, taken up to the length of the shorter
    string:
      * ``"*"`` in ``common_msd`` is replaced by the character of ``new_msd``;
      * differing characters produce ``"-"``;
      * equal characters are kept.

    Results are cached in ``self.memoized_msd_merges``, keyed by the
    ``(common_msd, new_msd)`` pair.
    """
    cache = self.memoized_msd_merges
    pair = (common_msd, new_msd)
    if pair in cache:
        return cache[pair]

    merged_chars = []
    for known, incoming in zip(common_msd, new_msd):
        if known == "*":
            merged_chars.append(incoming)
        elif known == incoming:
            merged_chars.append(known)
        else:
            merged_chars.append("-")

    merged = "".join(merged_chars)
    cache[pair] = merged
    return merged
def render(self, lemma, msd): def render(self, lemma, msd):
statement = """SELECT msd, frequency FROM UniqWords WHERE statement = """SELECT msd, frequency FROM UniqWords WHERE
lemma=:lemma AND msd=:msd ORDER BY frequency DESC""" lemma=:lemma AND msd=:msd ORDER BY frequency DESC"""