Common MSD is now calculated per collocation ID instead of for the whole corpus
This commit is contained in:
parent
2f789e6550
commit
48795c6227
|
@ -75,11 +75,15 @@ class WordFormAnyCR(ComponentRepresentation):
|
|||
|
||||
return text_forms[(word_msd, word_lemma)]
|
||||
|
||||
|
||||
class WordFormMsdCR(WordFormAnyCR):
|
||||
def __init__(self, *args):
|
||||
super().__init__(*args)
|
||||
self.lemma = None
|
||||
self.msd = None
|
||||
self.msds = []
|
||||
|
||||
def msd(self):
|
||||
return self.msds[0]
|
||||
|
||||
def check_msd(self, word_msd):
|
||||
if 'msd' not in self.data:
|
||||
|
@ -100,17 +104,24 @@ class WordFormMsdCR(WordFormAnyCR):
|
|||
def add_word(self, word):
|
||||
if self.lemma is None:
|
||||
self.lemma = word.lemma
|
||||
self.msd = word.msd
|
||||
|
||||
self.msds.append(word.msd)
|
||||
if self.check_msd(word.msd):
|
||||
super().add_word(word)
|
||||
|
||||
def _render(self):
|
||||
msd = self.word_renderer.get_lemma_msd(self.lemma, self.msd)
|
||||
self.words.append(WordMsdOnly(msd))
|
||||
|
||||
self.words.append(WordMsdOnly(self._common_msd()))
|
||||
return super()._render()
|
||||
|
||||
def _common_msd(self):
    """Return the rendered MSD shared by all words added to this component.

    The tags collected in ``self.msds`` are compared position by position
    over the length of the shortest tag.  A position keeps its character
    when every tag agrees on it and becomes ``"-"`` otherwise.  The merged
    tag is handed, together with ``self.lemma``, to
    ``self.word_renderer.common_lemma_msd`` and that call's result is
    returned unchanged.

    Raises:
        IndexError: if no MSD has been collected yet.
    """
    if not self.msds:
        raise IndexError("no MSDs collected for this component")

    # Only positions that exist in the shortest tag exist in every tag.
    shortest = min(self.msds, key=len)
    common_msd = "".join(
        char if all(msd[idx] == char for msd in self.msds) else "-"
        for idx, char in enumerate(shortest)
    )
    return self.word_renderer.common_lemma_msd(self.lemma, common_msd)
|
||||
|
||||
|
||||
class WordFormAgreementCR(WordFormMsdCR):
|
||||
def __init__(self, data, word_renderer):
|
||||
super().__init__(data, word_renderer)
|
||||
|
@ -124,7 +135,7 @@ class WordFormAgreementCR(WordFormMsdCR):
|
|||
|
||||
lemma_available_words = self.word_renderer.available_words(self.lemma, existing)
|
||||
for candidate_msd, candidate_text in lemma_available_words:
|
||||
if self.msd[0] != candidate_msd[0]:
|
||||
if self.msd()[0] != candidate_msd[0]:
|
||||
continue
|
||||
|
||||
if WordFormAgreementCR.check_agreement(word_msd, candidate_msd, self.data['agreement']):
|
||||
|
|
|
@ -18,12 +18,10 @@ class WordStats:
|
|||
text varchar(64),
|
||||
frequency int
|
||||
)""")
|
||||
self.db.init("CREATE TABLE CommonMsd (lemma varchar(64), msd0 char, msd varchar(16))")
|
||||
self.db.init("CREATE TABLE WordCount (lemma varchar(64), msd0 char, frequency int)")
|
||||
|
||||
self.db.init("CREATE INDEX lemma_msd_text_on_uw ON UniqWords (lemma, msd, text)")
|
||||
self.db.init("CREATE INDEX lemma_on_uw ON UniqWords (lemma)")
|
||||
self.db.init("CREATE INDEX lemma_on_cm ON CommonMsd (lemma, msd0)")
|
||||
self.db.init("CREATE INDEX lemma_msd0_on_wc ON WordCount (lemma, msd0)")
|
||||
|
||||
def add_words(self, words):
|
||||
|
@ -44,24 +42,6 @@ class WordStats:
|
|||
|
||||
def generate_renders(self):
|
||||
lemmas = [lemma for (lemma, ) in self.db.execute("SELECT DISTINCT lemma FROM UniqWords")]
|
||||
|
||||
for lemma in progress(lemmas, 'common-msd'):
|
||||
common_msds = defaultdict(lambda: "*" * 10)
|
||||
|
||||
for msd in self.db.execute("SELECT DISTINCT msd FROM UniqWords WHERE lemma=?", (lemma,)):
|
||||
msd = msd[0]
|
||||
current_msd = common_msds[msd[0]]
|
||||
new_msd = self.merge_msd(current_msd, msd)
|
||||
common_msds[msd[0]] = new_msd
|
||||
|
||||
for msd0, common_msd in common_msds.items():
|
||||
common_msd = self.common_lemma_msd(lemma, common_msd)
|
||||
self.db.execute("INSERT INTO CommonMsd (lemma, msd0, msd) VALUES (?, ?, ?)",
|
||||
(lemma, msd0, common_msd))
|
||||
|
||||
|
||||
self.db.commit()
|
||||
|
||||
for lemma in progress(lemmas, 'word-count'):
|
||||
num_words = defaultdict(int)
|
||||
for (msd, freq) in self.db.execute("SELECT msd, frequency FROM UniqWords WHERE lemma=?", (lemma,)):
|
||||
|
@ -81,23 +61,6 @@ class WordStats:
|
|||
else:
|
||||
return msd
|
||||
|
||||
def merge_msd(self, common_msd, new_msd):
    """Fold ``new_msd`` into the running ``common_msd`` and return the result.

    The two tags are combined character by character, stopping at the end
    of the shorter one:

    * ``"*"`` in ``common_msd`` is a placeholder and takes the character
      from ``new_msd``;
    * characters that differ become ``"-"``;
    * characters that match are kept.

    Results are cached in ``self.memoized_msd_merges`` under the
    ``(common_msd, new_msd)`` pair, so a repeated merge is a single lookup.
    """
    key = (common_msd, new_msd)
    # Cached values are always strings, so None reliably means "not cached".
    cached = self.memoized_msd_merges.get(key)
    if cached is not None:
        return cached

    value = "".join(
        new if old == "*" else (old if old == new else "-")
        for old, new in zip(common_msd, new_msd)
    )
    self.memoized_msd_merges[key] = value
    return value
|
||||
|
||||
def render(self, lemma, msd):
|
||||
statement = """SELECT msd, frequency FROM UniqWords WHERE
|
||||
lemma=:lemma AND msd=:msd ORDER BY frequency DESC"""
|
||||
|
|
Loading…
Reference in New Issue
Block a user