Common MSD is now calculated per collocation id and not for the whole corpus
This commit is contained in:
parent
2f789e6550
commit
48795c6227
|
@ -75,11 +75,15 @@ class WordFormAnyCR(ComponentRepresentation):
|
||||||
|
|
||||||
return text_forms[(word_msd, word_lemma)]
|
return text_forms[(word_msd, word_lemma)]
|
||||||
|
|
||||||
|
|
||||||
class WordFormMsdCR(WordFormAnyCR):
|
class WordFormMsdCR(WordFormAnyCR):
|
||||||
def __init__(self, *args):
|
def __init__(self, *args):
|
||||||
super().__init__(*args)
|
super().__init__(*args)
|
||||||
self.lemma = None
|
self.lemma = None
|
||||||
self.msd = None
|
self.msds = []
|
||||||
|
|
||||||
|
def msd(self):
|
||||||
|
return self.msds[0]
|
||||||
|
|
||||||
def check_msd(self, word_msd):
|
def check_msd(self, word_msd):
|
||||||
if 'msd' not in self.data:
|
if 'msd' not in self.data:
|
||||||
|
@ -100,17 +104,24 @@ class WordFormMsdCR(WordFormAnyCR):
|
||||||
def add_word(self, word):
|
def add_word(self, word):
|
||||||
if self.lemma is None:
|
if self.lemma is None:
|
||||||
self.lemma = word.lemma
|
self.lemma = word.lemma
|
||||||
self.msd = word.msd
|
|
||||||
|
|
||||||
|
self.msds.append(word.msd)
|
||||||
if self.check_msd(word.msd):
|
if self.check_msd(word.msd):
|
||||||
super().add_word(word)
|
super().add_word(word)
|
||||||
|
|
||||||
def _render(self):
|
def _render(self):
|
||||||
msd = self.word_renderer.get_lemma_msd(self.lemma, self.msd)
|
self.words.append(WordMsdOnly(self._common_msd()))
|
||||||
self.words.append(WordMsdOnly(msd))
|
|
||||||
|
|
||||||
return super()._render()
|
return super()._render()
|
||||||
|
|
||||||
|
def _common_msd(self):
|
||||||
|
msds = sorted(self.msds, key=len)
|
||||||
|
common_msd = ["-" if not all(msds[j][idx] == msds[0][idx] for j in range(1, len(self.msds)))
|
||||||
|
else msds[0][idx] for idx in range(len(msds[0]))]
|
||||||
|
common_msd = "".join(common_msd)
|
||||||
|
iommon_msd = "".join(common_msd)
|
||||||
|
return self.word_renderer.common_lemma_msd(self.lemma, common_msd)
|
||||||
|
|
||||||
|
|
||||||
class WordFormAgreementCR(WordFormMsdCR):
|
class WordFormAgreementCR(WordFormMsdCR):
|
||||||
def __init__(self, data, word_renderer):
|
def __init__(self, data, word_renderer):
|
||||||
super().__init__(data, word_renderer)
|
super().__init__(data, word_renderer)
|
||||||
|
@ -124,7 +135,7 @@ class WordFormAgreementCR(WordFormMsdCR):
|
||||||
|
|
||||||
lemma_available_words = self.word_renderer.available_words(self.lemma, existing)
|
lemma_available_words = self.word_renderer.available_words(self.lemma, existing)
|
||||||
for candidate_msd, candidate_text in lemma_available_words:
|
for candidate_msd, candidate_text in lemma_available_words:
|
||||||
if self.msd[0] != candidate_msd[0]:
|
if self.msd()[0] != candidate_msd[0]:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if WordFormAgreementCR.check_agreement(word_msd, candidate_msd, self.data['agreement']):
|
if WordFormAgreementCR.check_agreement(word_msd, candidate_msd, self.data['agreement']):
|
||||||
|
|
|
@ -18,12 +18,10 @@ class WordStats:
|
||||||
text varchar(64),
|
text varchar(64),
|
||||||
frequency int
|
frequency int
|
||||||
)""")
|
)""")
|
||||||
self.db.init("CREATE TABLE CommonMsd (lemma varchar(64), msd0 char, msd varchar(16))")
|
|
||||||
self.db.init("CREATE TABLE WordCount (lemma varchar(64), msd0 char, frequency int)")
|
self.db.init("CREATE TABLE WordCount (lemma varchar(64), msd0 char, frequency int)")
|
||||||
|
|
||||||
self.db.init("CREATE INDEX lemma_msd_text_on_uw ON UniqWords (lemma, msd, text)")
|
self.db.init("CREATE INDEX lemma_msd_text_on_uw ON UniqWords (lemma, msd, text)")
|
||||||
self.db.init("CREATE INDEX lemma_on_uw ON UniqWords (lemma)")
|
self.db.init("CREATE INDEX lemma_on_uw ON UniqWords (lemma)")
|
||||||
self.db.init("CREATE INDEX lemma_on_cm ON CommonMsd (lemma, msd0)")
|
|
||||||
self.db.init("CREATE INDEX lemma_msd0_on_wc ON WordCount (lemma, msd0)")
|
self.db.init("CREATE INDEX lemma_msd0_on_wc ON WordCount (lemma, msd0)")
|
||||||
|
|
||||||
def add_words(self, words):
|
def add_words(self, words):
|
||||||
|
@ -44,24 +42,6 @@ class WordStats:
|
||||||
|
|
||||||
def generate_renders(self):
|
def generate_renders(self):
|
||||||
lemmas = [lemma for (lemma, ) in self.db.execute("SELECT DISTINCT lemma FROM UniqWords")]
|
lemmas = [lemma for (lemma, ) in self.db.execute("SELECT DISTINCT lemma FROM UniqWords")]
|
||||||
|
|
||||||
for lemma in progress(lemmas, 'common-msd'):
|
|
||||||
common_msds = defaultdict(lambda: "*" * 10)
|
|
||||||
|
|
||||||
for msd in self.db.execute("SELECT DISTINCT msd FROM UniqWords WHERE lemma=?", (lemma,)):
|
|
||||||
msd = msd[0]
|
|
||||||
current_msd = common_msds[msd[0]]
|
|
||||||
new_msd = self.merge_msd(current_msd, msd)
|
|
||||||
common_msds[msd[0]] = new_msd
|
|
||||||
|
|
||||||
for msd0, common_msd in common_msds.items():
|
|
||||||
common_msd = self.common_lemma_msd(lemma, common_msd)
|
|
||||||
self.db.execute("INSERT INTO CommonMsd (lemma, msd0, msd) VALUES (?, ?, ?)",
|
|
||||||
(lemma, msd0, common_msd))
|
|
||||||
|
|
||||||
|
|
||||||
self.db.commit()
|
|
||||||
|
|
||||||
for lemma in progress(lemmas, 'word-count'):
|
for lemma in progress(lemmas, 'word-count'):
|
||||||
num_words = defaultdict(int)
|
num_words = defaultdict(int)
|
||||||
for (msd, freq) in self.db.execute("SELECT msd, frequency FROM UniqWords WHERE lemma=?", (lemma,)):
|
for (msd, freq) in self.db.execute("SELECT msd, frequency FROM UniqWords WHERE lemma=?", (lemma,)):
|
||||||
|
@ -81,23 +61,6 @@ class WordStats:
|
||||||
else:
|
else:
|
||||||
return msd
|
return msd
|
||||||
|
|
||||||
def merge_msd(self, common_msd, new_msd):
|
|
||||||
key = (common_msd, new_msd)
|
|
||||||
if key in self.memoized_msd_merges:
|
|
||||||
return self.memoized_msd_merges[key]
|
|
||||||
|
|
||||||
def merge_letter(l1, l2):
|
|
||||||
if l1 == "*":
|
|
||||||
return l2
|
|
||||||
elif l1 != l2:
|
|
||||||
return "-"
|
|
||||||
else:
|
|
||||||
return l1
|
|
||||||
|
|
||||||
value = "".join(merge_letter(l1, l2) for l1, l2 in zip(common_msd, new_msd))
|
|
||||||
self.memoized_msd_merges[key] = value
|
|
||||||
return value
|
|
||||||
|
|
||||||
def render(self, lemma, msd):
|
def render(self, lemma, msd):
|
||||||
statement = """SELECT msd, frequency FROM UniqWords WHERE
|
statement = """SELECT msd, frequency FROM UniqWords WHERE
|
||||||
lemma=:lemma AND msd=:msd ORDER BY frequency DESC"""
|
lemma=:lemma AND msd=:msd ORDER BY frequency DESC"""
|
||||||
|
|
Loading…
Reference in New Issue
Block a user