Common MSD is now calculated per colocation ID instead of for the whole corpus

This commit is contained in:
Ozbolt Menegatti 2019-07-01 17:21:28 +02:00
parent 2f789e6550
commit 48795c6227
2 changed files with 17 additions and 43 deletions

View File

@ -75,11 +75,15 @@ class WordFormAnyCR(ComponentRepresentation):
return text_forms[(word_msd, word_lemma)]
class WordFormMsdCR(WordFormAnyCR):
def __init__(self, *args):
super().__init__(*args)
self.lemma = None
self.msd = None
self.msds = []
def msd(self):
    """Return the first MSD stored in ``self.msds``.

    Raises:
        IndexError: if ``self.msds`` is empty.
    """
    first_msd = self.msds[0]
    return first_msd
def check_msd(self, word_msd):
if 'msd' not in self.data:
@ -100,17 +104,24 @@ class WordFormMsdCR(WordFormAnyCR):
def add_word(self, word):
if self.lemma is None:
self.lemma = word.lemma
self.msd = word.msd
self.msds.append(word.msd)
if self.check_msd(word.msd):
super().add_word(word)
def _render(self):
msd = self.word_renderer.get_lemma_msd(self.lemma, self.msd)
self.words.append(WordMsdOnly(msd))
self.words.append(WordMsdOnly(self._common_msd()))
return super()._render()
def _common_msd(self):
    """Merge every MSD recorded in ``self.msds`` into one common MSD.

    The tags are compared position by position: a position keeps its
    character when all tags agree on it and becomes ``"-"`` otherwise.
    Only positions that exist in every tag are compared, so the merged
    string is as long as the shortest tag.  The merged string is passed,
    together with ``self.lemma``, to
    ``self.word_renderer.common_lemma_msd`` and the result of that call is
    returned unchanged.

    NOTE(review): assumes each entry of ``self.msds`` is a ``str`` --
    confirm against the callers that append to it.

    Raises:
        IndexError: if ``self.msds`` is empty.
    """
    if not self.msds:
        # Same exception type the positional lookup raised before, but with
        # a message that says what went wrong.
        raise IndexError("cannot compute a common MSD: no MSDs were recorded")

    # zip() stops at the shortest tag, which limits the comparison to the
    # positions present in every MSD.
    common_msd = "".join(
        column[0] if all(char == column[0] for char in column) else "-"
        for column in zip(*self.msds)
    )
    return self.word_renderer.common_lemma_msd(self.lemma, common_msd)
class WordFormAgreementCR(WordFormMsdCR):
def __init__(self, data, word_renderer):
super().__init__(data, word_renderer)
@ -124,7 +135,7 @@ class WordFormAgreementCR(WordFormMsdCR):
lemma_available_words = self.word_renderer.available_words(self.lemma, existing)
for candidate_msd, candidate_text in lemma_available_words:
if self.msd[0] != candidate_msd[0]:
if self.msd()[0] != candidate_msd[0]:
continue
if WordFormAgreementCR.check_agreement(word_msd, candidate_msd, self.data['agreement']):

View File

@ -18,12 +18,10 @@ class WordStats:
text varchar(64),
frequency int
)""")
self.db.init("CREATE TABLE CommonMsd (lemma varchar(64), msd0 char, msd varchar(16))")
self.db.init("CREATE TABLE WordCount (lemma varchar(64), msd0 char, frequency int)")
self.db.init("CREATE INDEX lemma_msd_text_on_uw ON UniqWords (lemma, msd, text)")
self.db.init("CREATE INDEX lemma_on_uw ON UniqWords (lemma)")
self.db.init("CREATE INDEX lemma_on_cm ON CommonMsd (lemma, msd0)")
self.db.init("CREATE INDEX lemma_msd0_on_wc ON WordCount (lemma, msd0)")
def add_words(self, words):
@ -44,24 +42,6 @@ class WordStats:
def generate_renders(self):
lemmas = [lemma for (lemma, ) in self.db.execute("SELECT DISTINCT lemma FROM UniqWords")]
for lemma in progress(lemmas, 'common-msd'):
common_msds = defaultdict(lambda: "*" * 10)
for msd in self.db.execute("SELECT DISTINCT msd FROM UniqWords WHERE lemma=?", (lemma,)):
msd = msd[0]
current_msd = common_msds[msd[0]]
new_msd = self.merge_msd(current_msd, msd)
common_msds[msd[0]] = new_msd
for msd0, common_msd in common_msds.items():
common_msd = self.common_lemma_msd(lemma, common_msd)
self.db.execute("INSERT INTO CommonMsd (lemma, msd0, msd) VALUES (?, ?, ?)",
(lemma, msd0, common_msd))
self.db.commit()
for lemma in progress(lemmas, 'word-count'):
num_words = defaultdict(int)
for (msd, freq) in self.db.execute("SELECT msd, frequency FROM UniqWords WHERE lemma=?", (lemma,)):
@ -81,23 +61,6 @@ class WordStats:
else:
return msd
def merge_msd(self, common_msd, new_msd):
    """Fold ``new_msd`` into ``common_msd`` position by position.

    For each pair of characters taken in lockstep from the two strings:
    a ``"*"`` in ``common_msd`` is replaced by the character from
    ``new_msd``; equal characters are kept; differing characters become
    ``"-"``.  Pairing stops at the end of the shorter string.

    Results are stored in ``self.memoized_msd_merges`` under the key
    ``(common_msd, new_msd)`` and served from there on repeated calls.
    """
    cache = self.memoized_msd_merges
    pair = (common_msd, new_msd)
    if pair in cache:
        return cache[pair]

    merged_chars = []
    for known, incoming in zip(common_msd, new_msd):
        if known == "*":
            # Nothing recorded for this position yet: adopt the new character.
            merged_chars.append(incoming)
        elif known == incoming:
            merged_chars.append(known)
        else:
            # Disagreement at this position.
            merged_chars.append("-")

    merged = "".join(merged_chars)
    cache[pair] = merged
    return merged
def render(self, lemma, msd):
statement = """SELECT msd, frequency FROM UniqWords WHERE
lemma=:lemma AND msd=:msd ORDER BY frequency DESC"""