Intermediate UGLY CODE commit. Working more on representations
This commit is contained in:
parent
dce55d04a3
commit
d14efff709
203
wani.py
203
wani.py
|
@ -136,6 +136,7 @@ class WordFormSelection(Enum):
|
||||||
All = 0
|
All = 0
|
||||||
Msd = 1
|
Msd = 1
|
||||||
Agreement = 2
|
Agreement = 2
|
||||||
|
Any = 3
|
||||||
|
|
||||||
class Order(Enum):
|
class Order(Enum):
|
||||||
FromTo = 0
|
FromTo = 0
|
||||||
|
@ -179,7 +180,6 @@ class ComponentRendition:
|
||||||
self.rendition = r
|
self.rendition = r
|
||||||
|
|
||||||
def _set_more(self, m):
|
def _set_more(self, m):
|
||||||
assert(self.more is None and m is not None)
|
|
||||||
self.more = m
|
self.more = m
|
||||||
|
|
||||||
def add_feature(self, feature):
|
def add_feature(self, feature):
|
||||||
|
@ -188,6 +188,7 @@ class ComponentRendition:
|
||||||
self._set_rendition(Rendition.Lemma)
|
self._set_rendition(Rendition.Lemma)
|
||||||
elif feature['rendition'] == "word_form":
|
elif feature['rendition'] == "word_form":
|
||||||
self._set_rendition(Rendition.WordForm)
|
self._set_rendition(Rendition.WordForm)
|
||||||
|
self._set_more((WordFormSelection.Any, None))
|
||||||
elif feature['rendition'] == "lexis":
|
elif feature['rendition'] == "lexis":
|
||||||
self._set_rendition(Rendition.Lexis)
|
self._set_rendition(Rendition.Lexis)
|
||||||
self._set_more(feature['string'])
|
self._set_more(feature['string'])
|
||||||
|
@ -217,14 +218,37 @@ class ComponentRendition:
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def set_representations(matches, structure):
|
def set_representations(matches, structure):
|
||||||
representations = {c.idx: [True, ""] for c in structure.components}
|
representations = {
|
||||||
|
c.idx: [[], None] if c.representation.isit(Rendition.WordForm) else [True, ""]
|
||||||
|
for c in structure.components
|
||||||
|
}
|
||||||
|
representations_to_check = []
|
||||||
|
word_component_id = {}
|
||||||
|
|
||||||
|
# doprint = structure.id == '1' and matches[0]['1'].text.startswith('evrop') and matches[0]['2'].text.startswith('prv')
|
||||||
|
doprint = False
|
||||||
|
|
||||||
def render_all(lst):
|
def render_all(lst):
|
||||||
return "/".join(set(lst))
|
return "/".join([w.text for w in set(lst)])
|
||||||
|
|
||||||
def render_form(lst):
|
def render_form(lst):
|
||||||
# find most frequent
|
sorted_lst = sorted(set(lst), key=lst.count)
|
||||||
return max(set(lst), key=lst.count)
|
for word in sorted_lst:
|
||||||
|
othw = are_agreements_ok(word, representations_to_check)
|
||||||
|
if othw is not None:
|
||||||
|
if doprint:
|
||||||
|
print("AOK", othw.text, othw)
|
||||||
|
|
||||||
|
matches.representations[word_component_id[othw.id]] = othw.text
|
||||||
|
matches.representations[word_component_id[word.id]] = word.text
|
||||||
|
return
|
||||||
|
|
||||||
|
def are_agreements_ok(word, words_to_try):
|
||||||
|
for w_id, other_word, agreements in words_to_try:
|
||||||
|
if check_agreement(word, other_word, agreements):
|
||||||
|
if doprint:
|
||||||
|
print("GOOD :)")
|
||||||
|
return other_word
|
||||||
|
|
||||||
def check_msd(word, selectors):
|
def check_msd(word, selectors):
|
||||||
for key, value in selectors.items():
|
for key, value in selectors.items():
|
||||||
|
@ -239,6 +263,9 @@ class ComponentRendition:
|
||||||
return True
|
return True
|
||||||
|
|
||||||
def check_agreement(w1, w2, agreements):
|
def check_agreement(w1, w2, agreements):
|
||||||
|
if doprint:
|
||||||
|
print("CHECK", w1.text, w1, w2.text, w2)
|
||||||
|
|
||||||
for agr_case in agreements:
|
for agr_case in agreements:
|
||||||
t1 = w1.msd[0]
|
t1 = w1.msd[0]
|
||||||
v1 = TAGSET[t1].index(agr_case)
|
v1 = TAGSET[t1].index(agr_case)
|
||||||
|
@ -263,11 +290,12 @@ class ComponentRendition:
|
||||||
|
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
for words in matches.matches:
|
||||||
for words in matches:
|
# first pass, check everything but agreements
|
||||||
for w_id, w in words.items():
|
for w_id, w in words.items():
|
||||||
component = structure.get_component(w_id)
|
component = structure.get_component(w_id)
|
||||||
rep = component.representation
|
rep = component.representation
|
||||||
|
word_component_id[w.id] = w_id
|
||||||
|
|
||||||
if rep.isit(Rendition.Lemma):
|
if rep.isit(Rendition.Lemma):
|
||||||
representations[w_id][0] = False
|
representations[w_id][0] = False
|
||||||
|
@ -281,45 +309,72 @@ class ComponentRendition:
|
||||||
|
|
||||||
# it HAS to be word_form now
|
# it HAS to be word_form now
|
||||||
else:
|
else:
|
||||||
|
assert(rep.isit(Rendition.WordForm))
|
||||||
wf_type, more = rep.more
|
wf_type, more = rep.more
|
||||||
|
|
||||||
# set correct type first
|
|
||||||
if type(representations[w_id][1]) is str:
|
|
||||||
representations[w_id] = (
|
|
||||||
[], render_all if wf_type is WordFormSelection.All else render_form
|
|
||||||
)
|
|
||||||
|
|
||||||
if wf_type is WordFormSelection.All:
|
if wf_type is WordFormSelection.All:
|
||||||
add = True
|
add = True
|
||||||
|
func = render_all
|
||||||
elif wf_type is WordFormSelection.Msd:
|
elif wf_type is WordFormSelection.Msd:
|
||||||
add = check_msd(w, more)
|
add = check_msd(w, more)
|
||||||
|
func = render_form
|
||||||
|
elif wf_type is WordFormSelection.Any:
|
||||||
|
add = True
|
||||||
|
func = render_form
|
||||||
else:
|
else:
|
||||||
assert(wf_type is WordFormSelection.Agreement)
|
assert(wf_type is WordFormSelection.Agreement)
|
||||||
other_w, agreements = more
|
other_w, agreements = more
|
||||||
add = check_agreement(w, words[other_w], agreements)
|
representations_to_check.append((other_w, w, agreements))
|
||||||
|
add = True
|
||||||
|
func = lambda x: None
|
||||||
|
|
||||||
if add:
|
if add:
|
||||||
representations[w_id][0].append(w.text)
|
representations[w_id][0].append(w)
|
||||||
|
representations[w_id][1] = func
|
||||||
doprint = matches[0]['1'].text.startswith('evrop')
|
|
||||||
|
|
||||||
# just need to set representation to first group...
|
|
||||||
for w_id, w in matches[0].items():
|
|
||||||
data = representations[w_id]
|
|
||||||
if doprint:
|
|
||||||
print(data)
|
|
||||||
|
|
||||||
if type(data[1]) is str:
|
|
||||||
w.representation_failed = data[0]
|
|
||||||
w.representation = w.lemma if w.representation_failed else data[1]
|
|
||||||
else:
|
|
||||||
w.representation_failed = len(data[0]) == 0
|
|
||||||
w.representation = w.lemma if w.representation_failed else data[1](data[0])
|
|
||||||
|
|
||||||
if doprint:
|
|
||||||
print(w.representation_failed, w.representation)
|
|
||||||
|
|
||||||
if doprint:
|
if doprint:
|
||||||
|
print(len(matches), len(representations_to_check))
|
||||||
|
|
||||||
|
# for w1i, w2i, agreements in representations_to_check:
|
||||||
|
# w1, w2 = words[w1i], words[w2i]
|
||||||
|
# if doprint:
|
||||||
|
# print("? ", w1.msd, w2.msd, end="")
|
||||||
|
|
||||||
|
# if w2i not in bad_words:
|
||||||
|
#
|
||||||
|
# if check_agreement(w1, w2, agreements):
|
||||||
|
# representations[w1i][0].append(w1.text)
|
||||||
|
# if doprint:
|
||||||
|
# print(" :)")
|
||||||
|
# elif doprint:
|
||||||
|
# print(" :(")
|
||||||
|
# elif doprint:
|
||||||
|
# print(" :((")
|
||||||
|
|
||||||
|
# just need to set representation to first group,
|
||||||
|
# but in correct order, agreements last!
|
||||||
|
representation_sorted_words = []
|
||||||
|
for w_id, w in matches.matches[0].items():
|
||||||
|
rep = component.representation
|
||||||
|
if rep.isit(Rendition.WordForm) and rep.more[0] is WordFormSelection.Agreement:
|
||||||
|
representation_sorted_words.append((w_id, w))
|
||||||
|
else:
|
||||||
|
representation_sorted_words.insert(0, (w_id, w))
|
||||||
|
|
||||||
|
for w_id, w in representation_sorted_words:
|
||||||
|
data = representations[w_id]
|
||||||
|
if doprint:
|
||||||
|
print([(r.text, r.lemma, r.msd) for r in data[0]])
|
||||||
|
|
||||||
|
if type(data[1]) is str:
|
||||||
|
matches.representations[w_id] = None if data[0] else data[1]
|
||||||
|
elif len(data[0]) == 0:
|
||||||
|
matches.representations[w_id] = None
|
||||||
|
else:
|
||||||
|
data[1](data[0])
|
||||||
|
|
||||||
|
if doprint:
|
||||||
|
print(matches.representations)
|
||||||
print('--')
|
print('--')
|
||||||
|
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
|
@ -781,9 +836,6 @@ class Word:
|
||||||
self.text = xml.text
|
self.text = xml.text
|
||||||
self.links = defaultdict(list)
|
self.links = defaultdict(list)
|
||||||
|
|
||||||
self.representation = None
|
|
||||||
self.representation_failed = False
|
|
||||||
|
|
||||||
last_num = self.id.split('.')[-1]
|
last_num = self.id.split('.')[-1]
|
||||||
if last_num[0] not in '0123456789':
|
if last_num[0] not in '0123456789':
|
||||||
last_num = last_num[1:]
|
last_num = last_num[1:]
|
||||||
|
@ -807,6 +859,29 @@ class Word:
|
||||||
|
|
||||||
return self.links[link]
|
return self.links[link]
|
||||||
|
|
||||||
|
class WordMsdRenderer:
    """Choose one surface form for every (lemma, msd) pair that was collected.

    Words are gathered with :meth:`add_word`; :meth:`generate_renders` then
    records, for each lemma and msd, the text that occurred most often, and
    :meth:`render` looks that text up.
    """

    def __init__(self):
        # Every word passed to add_word(), in arrival order.
        self.all_words = []
        # lemma -> {msd -> most frequent text}; filled by generate_renders().
        self.rendered_words = {}

    def add_word(self, word):
        """Remember *word* (an object with ``lemma``, ``msd`` and ``text``)."""
        self.all_words.append(word)

    def generate_renders(self):
        """Rebuild ``rendered_words`` from all words collected so far.

        For each lemma seen, its entry in ``rendered_words`` is replaced by a
        fresh ``{msd: text}`` mapping where ``text`` is the most frequent
        text for that pair.  Ties go to the text that was added first.
        """
        # Local import: only this method needs Counter.
        from collections import Counter

        # The inner factory must be a callable; ``defaultdict([])`` raises
        # TypeError as soon as a missing lemma is looked up.
        counts = defaultdict(lambda: defaultdict(Counter))
        for w in self.all_words:
            counts[w.lemma][w.msd][w.text] += 1

        for lemma, by_msd in counts.items():
            self.rendered_words[lemma] = {
                msd: texts.most_common(1)[0][0]
                for msd, texts in by_msd.items()
            }

    def render(self, lemma, msd):
        """Return the chosen text for (*lemma*, *msd*), or ``None`` if unknown."""
        return self.rendered_words.get(lemma, {}).get(msd)
|
||||||
|
|
||||||
def is_root_id(id_):
    """Return True when *id_* consists of exactly three dot-separated parts."""
    # Three parts means exactly two separating dots.
    dot_count = id_.count('.')
    return dot_count == 2
|
||||||
|
@ -905,15 +980,17 @@ class Writer:
|
||||||
def length(self):
    """Return the number of output columns one word occupies.

    Four when ``self.all`` is truthy, otherwise three.
    """
    if self.all:
        return 4
    return 3
|
||||||
|
|
||||||
def from_word(self, word, representation=None):
    """Build the output columns for one component of a match.

    Args:
        word: the matched word, or ``None`` when the component has no word.
        representation: the text chosen to represent the word, or ``None``
            when none was chosen.  Defaults to ``None`` so the earlier
            one-argument call form keeps working.

    Returns:
        A list of ``self.length()`` empty strings when *word* is ``None``;
        ``[id, text, lemma, msd]`` when ``self.all`` is truthy; otherwise
        ``[lemma, representation, "ok"]``, falling back to
        ``[lemma, lemma, "lemma_fallback"]`` if *representation* is ``None``.
    """
    if word is None:
        return [""] * self.length()
    if self.all:
        return [word.id, word.text, word.lemma, word.msd]
    # Grouped output: no debug printing here, this runs once per row.
    if representation is None:
        return [word.lemma, word.lemma, "lemma_fallback"]
    return [word.lemma, representation, "ok"]
|
||||||
|
|
||||||
def sorted_rows(self, rows):
|
def sorted_rows(self, rows):
|
||||||
if self.sort_by < 0 or len(rows) < 2:
|
if self.sort_by < 0 or len(rows) < 2:
|
||||||
|
@ -937,14 +1014,16 @@ class Writer:
|
||||||
def write_out_worker(self, file_handler, structure_id, components, colocation_ids):
|
def write_out_worker(self, file_handler, structure_id, components, colocation_ids):
|
||||||
rows = []
|
rows = []
|
||||||
|
|
||||||
for cid, m, freq in colocation_ids.get_matches_for(structure_id, not self.all):
|
for cid, m, freq, rprsnt in colocation_ids.get_matches_for(structure_id, not self.all):
|
||||||
to_write = []
|
to_write = []
|
||||||
representation = ""
|
representation = ""
|
||||||
|
|
||||||
for idx, _comp in enumerate(components):
|
for idx, _comp in enumerate(components):
|
||||||
idx = str(idx + 1)
|
idx = str(idx + 1)
|
||||||
word = m[idx] if idx in m else None
|
word = m[idx] if idx in m else None
|
||||||
to_write.extend(self.from_word(word))
|
print(rprsnt)
|
||||||
|
rep = rprsnt[idx] if idx in rprsnt else None
|
||||||
|
to_write.extend(self.from_word(word, rep))
|
||||||
representation += " " + to_write[-2]
|
representation += " " + to_write[-2]
|
||||||
|
|
||||||
# make them equal size
|
# make them equal size
|
||||||
|
@ -993,6 +1072,19 @@ class Writer:
|
||||||
if not self.multiple_output:
|
if not self.multiple_output:
|
||||||
fp_close(fp)
|
fp_close(fp)
|
||||||
|
|
||||||
|
class StructureMatch:
    """All matches recorded under one match id for one structure.

    ``len()`` of an instance is the number of matches appended so far.
    """

    def __init__(self, match_id, structure_id):
        # Matches in the order they were appended.
        self.matches = list()
        # Starts empty; nothing in this class fills it.
        self.representations = dict()

        self.structure_id = structure_id
        self.match_id = match_id

    def append(self, match):
        """Add *match* to the end of ``self.matches``."""
        self.matches += [match]

    def __len__(self):
        """Return how many matches have been appended."""
        count = len(self.matches)
        return count
|
||||||
|
|
||||||
class ColocationIds:
|
class ColocationIds:
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
|
@ -1000,41 +1092,32 @@ class ColocationIds:
|
||||||
self.min_frequency = args.min_freq
|
self.min_frequency = args.min_freq
|
||||||
|
|
||||||
def _add_match(self, key, sid, match):
|
def _add_match(self, key, sid, match):
|
||||||
if key in self.data:
|
if key not in self.data:
|
||||||
self.data[key][1].append(match)
|
self.data[key] = StructureMatch(str(len(self.data) + 1), sid)
|
||||||
else:
|
self.data[key].append(match)
|
||||||
self.data[key] = (str(len(self.data) + 1), [match], sid)
|
|
||||||
|
|
||||||
def get(self, key, n):
    """Return item *n* of the entry stored under *key* in ``self.data``."""
    # NOTE(review): _add_match stores StructureMatch objects, and the visible
    # StructureMatch class defines no __getitem__ -- confirm this indexing
    # still works or whether this method is dead code.
    entry = self.data[key]
    return entry[n]
|
||||||
|
|
||||||
def num(self, key):
|
|
||||||
return str(len(self.get(key, 1)))
|
|
||||||
|
|
||||||
def to_id(self, key):
|
|
||||||
return self.get(key, 0)
|
|
||||||
|
|
||||||
def add_matches(self, matches):
    """Record every match in *matches* via ``_add_match``.

    *matches* maps a structure id to an iterable of items; for each item,
    element 0 is passed on as the match and element 1 as its key.
    """
    for sid, found in matches.items():
        for item in found:
            match, key = item[0], item[1]
            self._add_match(key, sid, match)
|
||||||
|
|
||||||
def get_matches_for(self, structure_id, group):
    """Yield the matches stored for the structure *structure_id*.

    Each yielded tuple is ``(match_id, words, number_of_matches,
    representations)``.  When *group* is truthy only the first match of
    every entry is yielded.
    """
    for sm in self.data.values():
        if sm.structure_id == structure_id:
            for words in sm.matches:
                yield sm.match_id, words, len(sm), sm.representations
                if group:
                    # One row per entry is enough in grouped mode.
                    break
|
||||||
|
|
||||||
def set_representations(self, structures):
    """Run ``ComponentRendition.set_representations`` on every stored entry.

    Each entry in ``self.data`` is paired with the structure from
    *structures* whose ``id`` equals the entry's ``structure_id``.
    """
    by_id = {}
    for structure in structures:
        by_id[structure.id] = structure

    for sm in self.data.values():
        ComponentRendition.set_representations(sm, by_id[sm.structure_id])
|
|
||||||
|
|
||||||
|
|
||||||
def match_file(words, structures):
|
def match_file(words, structures):
|
||||||
|
|
Loading…
Reference in New Issue
Block a user