looking for agreements from the whole corpus
This commit is contained in:
parent
e99ba59908
commit
3c669c7901
89
wani.py
89
wani.py
|
@ -222,30 +222,44 @@ class ComponentRendition:
|
||||||
c.idx: [[], None] if c.representation.isit(Rendition.WordForm) else [True, ""]
|
c.idx: [[], None] if c.representation.isit(Rendition.WordForm) else [True, ""]
|
||||||
for c in structure.components
|
for c in structure.components
|
||||||
}
|
}
|
||||||
representations_to_check = []
|
found_agreements = {}
|
||||||
word_component_id = {}
|
word_component_id = {}
|
||||||
|
|
||||||
# doprint = structure.id == '1' and matches[0]['1'].text.startswith('evrop') and matches[0]['2'].text.startswith('prv')
|
# doprint = structure.id == '1' and matches[0]['1'].text.startswith('evrop') and matches[0]['2'].text.startswith('prv')
|
||||||
doprint = False
|
doprint = False
|
||||||
|
|
||||||
def render_all(lst):
|
def render_all(component_id, lst):
|
||||||
return "/".join([w.text for w in set(lst)])
|
matches.representations[component_id] = "/".join([w.text for w in set(lst)])
|
||||||
|
|
||||||
def render_form(lst):
|
def render_form(component_id, lst):
|
||||||
sorted_lst = sorted(set(lst), key=lst.count)
|
sorted_lst = sorted(set(lst), key=lst.count)
|
||||||
for word in sorted_lst:
|
for word in sorted_lst:
|
||||||
othw = are_agreements_ok(word, representations_to_check)
|
if component_id in found_agreements:
|
||||||
if othw is not None:
|
other_component_id, other_word, agreements = found_agreements[component_id]
|
||||||
matches.representations[word_component_id[othw.id]] = othw.most_frequent_text(word_renderer)
|
print(word.lemma, other_word.lemma, component_id, other_component_id, word.msd, word.msd)
|
||||||
matches.representations[word_component_id[word.id]] = word.most_frequent_text(word_renderer)
|
agr = are_agreements_ok(word.msd, other_word.lemma, other_word.msd, agreements)
|
||||||
return
|
if agr is None:
|
||||||
|
continue
|
||||||
|
matches.representations[other_component_id] = agr
|
||||||
|
|
||||||
def are_agreements_ok(word, words_to_try):
|
matches.representations[word_component_id[word.id]] = word.most_frequent_text(word_renderer)
|
||||||
for w_id, other_word, agreements in words_to_try:
|
break
|
||||||
if check_agreement(word, other_word, agreements):
|
|
||||||
|
# othw = are_agreements_ok(word, found_agreements)
|
||||||
|
# if othw is not None:
|
||||||
|
# matches.representations[word_component_id[othw.id]] = othw.most_frequent_text(word_renderer)
|
||||||
|
# return
|
||||||
|
|
||||||
|
def are_agreements_ok(w1_msd, ow_lemma, ow_msd, agreements):
|
||||||
|
for w2_msd, w2_txt in word_renderer.available_words(ow_lemma):
|
||||||
|
if ow_msd[0] != w2_msd[0]:
|
||||||
|
continue
|
||||||
|
|
||||||
|
print(w1_msd, w2_msd)
|
||||||
|
if check_agreement(w1_msd, w2_msd, agreements):
|
||||||
if doprint:
|
if doprint:
|
||||||
print("GOOD :)")
|
print("GOOD :)")
|
||||||
return other_word
|
return w2_txt
|
||||||
|
|
||||||
def check_msd(word, selectors):
|
def check_msd(word, selectors):
|
||||||
for key, value in selectors.items():
|
for key, value in selectors.items():
|
||||||
|
@ -259,27 +273,32 @@ class ComponentRendition:
|
||||||
|
|
||||||
return True
|
return True
|
||||||
|
|
||||||
def check_agreement(w1, w2, agreements):
|
def check_agreement(msd1, msd2, agreements):
|
||||||
if doprint:
|
|
||||||
print("CHECK", w1.text, w1, w2.text, w2)
|
|
||||||
|
|
||||||
for agr_case in agreements:
|
for agr_case in agreements:
|
||||||
t1 = w1.msd[0]
|
t1 = msd1[0]
|
||||||
|
# if not in msd, some strange msd was tried, skipping...
|
||||||
|
if agr_case not in TAGSET[t1]:
|
||||||
|
logging.warning("Cannot do agreement: {} for msd {} not found!".format(agr_case, msd1))
|
||||||
|
print("BAAAD")
|
||||||
|
return False
|
||||||
|
|
||||||
v1 = TAGSET[t1].index(agr_case)
|
v1 = TAGSET[t1].index(agr_case)
|
||||||
assert(v1 >= 0)
|
|
||||||
# if none specified: nedolocnik, always agrees
|
# if none specified: nedolocnik, always agrees
|
||||||
if v1 + 1 >= len(w1.msd):
|
if v1 + 1 >= len(msd1):
|
||||||
continue
|
continue
|
||||||
# first is uppercase, not in TAGSET
|
# first is uppercase, not in TAGSET
|
||||||
m1 = w1.msd[v1 + 1]
|
m1 = msd1[v1 + 1]
|
||||||
|
|
||||||
# REPEAT (not DRY!)
|
# REPEAT (not DRY!)
|
||||||
t2 = w2.msd[0]
|
t2 = msd2[0]
|
||||||
|
if agr_case not in TAGSET[t2]:
|
||||||
|
logging.warning("Cannot do agreement: {} for msd {} not found!".format(agr_case, msd2))
|
||||||
|
print("BAAAD")
|
||||||
|
return False
|
||||||
v2 = TAGSET[t2].index(agr_case)
|
v2 = TAGSET[t2].index(agr_case)
|
||||||
assert(v2 >= 0)
|
if v2 + 1 >= len(msd2):
|
||||||
if v2 + 1 >= len(w2.msd):
|
|
||||||
continue
|
continue
|
||||||
m2 = w2.msd[v2 + 1]
|
m2 = msd2[v2 + 1]
|
||||||
|
|
||||||
# match!
|
# match!
|
||||||
if '-' not in [m1, m2] and m1 != m2:
|
if '-' not in [m1, m2] and m1 != m2:
|
||||||
|
@ -321,18 +340,18 @@ class ComponentRendition:
|
||||||
else:
|
else:
|
||||||
assert(wf_type is WordFormSelection.Agreement)
|
assert(wf_type is WordFormSelection.Agreement)
|
||||||
other_w, agreements = more
|
other_w, agreements = more
|
||||||
representations_to_check.append((other_w, w, agreements))
|
found_agreements[other_w] = (w_id, w.lemma, agreements)
|
||||||
add = True
|
add = True
|
||||||
func = lambda x: None
|
func = lambda *x: None
|
||||||
|
|
||||||
if add:
|
if add:
|
||||||
representations[w_id][0].append(w)
|
representations[w_id][0].append(w)
|
||||||
representations[w_id][1] = func
|
representations[w_id][1] = func
|
||||||
|
|
||||||
if doprint:
|
if doprint:
|
||||||
print(len(matches), len(representations_to_check))
|
print(len(matches), len(found_agreements))
|
||||||
|
|
||||||
# for w1i, w2i, agreements in representations_to_check:
|
# for w1i, w2i, agreements in found_agreements:
|
||||||
# w1, w2 = words[w1i], words[w2i]
|
# w1, w2 = words[w1i], words[w2i]
|
||||||
# if doprint:
|
# if doprint:
|
||||||
# print("? ", w1.msd, w2.msd, end="")
|
# print("? ", w1.msd, w2.msd, end="")
|
||||||
|
@ -368,7 +387,7 @@ class ComponentRendition:
|
||||||
elif len(data[0]) == 0:
|
elif len(data[0]) == 0:
|
||||||
matches.representations[w_id] = None
|
matches.representations[w_id] = None
|
||||||
else:
|
else:
|
||||||
data[1](data[0])
|
data[1](str(w_id), data[0])
|
||||||
|
|
||||||
if doprint:
|
if doprint:
|
||||||
print(matches.representations)
|
print(matches.representations)
|
||||||
|
@ -883,6 +902,11 @@ class WordMsdRenderer:
|
||||||
if msd in self.rendered_words[lemma]:
|
if msd in self.rendered_words[lemma]:
|
||||||
return self.rendered_words[lemma][msd]
|
return self.rendered_words[lemma][msd]
|
||||||
|
|
||||||
|
def available_words(self, lemma):
|
||||||
|
if lemma in self.rendered_words:
|
||||||
|
for msd in self.rendered_words[lemma].keys():
|
||||||
|
yield (msd, self.rendered_words[lemma][msd])
|
||||||
|
|
||||||
def is_root_id(id_):
|
def is_root_id(id_):
|
||||||
return len(id_.split('.')) == 3
|
return len(id_.split('.')) == 3
|
||||||
|
|
||||||
|
@ -928,7 +952,7 @@ def load_tei_file(filename, skip_id_check, do_msd_translate, pc_tag, status):
|
||||||
|
|
||||||
if lfrom in words:
|
if lfrom in words:
|
||||||
if not skip_id_check and is_root_id(lfrom):
|
if not skip_id_check and is_root_id(lfrom):
|
||||||
logging.error("NOO: ", lfrom)
|
logging.error("NOO: {}".format(lfrom))
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
if dest in words:
|
if dest in words:
|
||||||
|
@ -986,7 +1010,7 @@ class Writer:
|
||||||
elif self.all:
|
elif self.all:
|
||||||
return [word.id, word.text, word.lemma, word.msd]
|
return [word.id, word.text, word.lemma, word.msd]
|
||||||
else:
|
else:
|
||||||
print("1", word)
|
# print("1", word)
|
||||||
if representation is None:
|
if representation is None:
|
||||||
return [word.lemma, word.lemma, "lemma_fallback"]
|
return [word.lemma, word.lemma, "lemma_fallback"]
|
||||||
else:
|
else:
|
||||||
|
@ -1021,7 +1045,6 @@ class Writer:
|
||||||
for idx, _comp in enumerate(components):
|
for idx, _comp in enumerate(components):
|
||||||
idx = str(idx + 1)
|
idx = str(idx + 1)
|
||||||
word = m[idx] if idx in m else None
|
word = m[idx] if idx in m else None
|
||||||
print(rprsnt)
|
|
||||||
rep = rprsnt[idx] if idx in rprsnt else None
|
rep = rprsnt[idx] if idx in rprsnt else None
|
||||||
to_write.extend(self.from_word(word, rep))
|
to_write.extend(self.from_word(word, rep))
|
||||||
representation += " " + to_write[-2]
|
representation += " " + to_write[-2]
|
||||||
|
|
Loading…
Reference in New Issue
Block a user