continued work on representation, almost there...

This commit is contained in:
Ozbolt Menegatti 2019-05-16 01:53:38 +02:00
parent 84a184c44d
commit d2f1e95a8f

89
wani.py
View File

@ -203,28 +203,58 @@ class ComponentRendition:
else: else:
return None return None
def render(self, words): def isit(self, rendition):
if self.rendition == Rendition.Lemma: return self.rendition is rendition
return words[0].lemma
elif self.rendition == Rendition.Lexis:
return self.more
elif self.rendition == Rendition.Unknown:
return None
elif self.rendition == Rendition.WordForm:
# check more!
return words[0].text
else:
raise RuntimeError("Unknown rendition: {}".format(self.rendition))
@staticmethod @staticmethod
def set_representations(matches, components): def set_representations(matches, structure):
representations = {c.idx: [True, ""] for c in structure.components}
def render_all(lst):
return "/".join(set(lst))
def render_form(_lst):
return ":("
for words, agreement in matches: for words, agreement in matches:
for _, w in words.items(): if not agreement:
w.representation = ":(" continue
for w_id, w in words.items():
component = structure.get_component(w_id)
rep = component.representation
if rep.isit(Rendition.Lemma):
representations[w_id][0] = False
representations[w_id][1] = w.lemma
elif rep.isit(Rendition.Lexis):
representations[w_id][0] = False
representations[w_id][1] = rep.more
elif rep.isit(Rendition.Unknown):
representations[w_id][0] = False
representations[w_id][1] = ""
# it HAS to be word_form now
else:
# set correct type first
if type(representations[w_id][1]) is str:
representations[w_id] = (
[], render_all if rep.more is StructureSelection.All else render_form
)
representations[w_id][0].append(w.text)
# just need to set representation to first group...
for w_id, w in matches[0][0].items():
data = representations[w_id]
if type(data[1]) is str:
w.representation_failed = data[0]
w.representation = w.lemma if w.representation_failed else data[1]
else:
w.representation_failed = len(data[0]) > 0
w.representation = w.lemma if w.representation_failed else data[1](data[0])
def __str__(self): def __str__(self):
return str(self.rendition) return str(self.rendition)
@ -377,9 +407,6 @@ class Component:
self.iter_ctr = 0 self.iter_ctr = 0
def render_word(self, word):
return self.representation.render(word)
def add_next(self, next_component, link_label, order): def add_next(self, next_component, link_label, order):
self.next_element.append((next_component, link_label, Order.new(order))) self.next_element.append((next_component, link_label, Order.new(order)))
@ -397,9 +424,8 @@ class Component:
raise RuntimeError("Unreachable") raise RuntimeError("Unreachable")
def set_representation(self, representation): def set_representation(self, representation):
if len(representation) > 0: for feature in representation:
for feature in representation: self.representation.add_feature(feature.attrib)
self.representation.add_feature(feature)
def find_next(self, deps, comps, restrs, reprs): def find_next(self, deps, comps, restrs, reprs):
to_ret = [] to_ret = []
@ -720,6 +746,9 @@ class Word:
self.text = xml.text self.text = xml.text
self.links = defaultdict(list) self.links = defaultdict(list)
self.representation = None
self.representation_failed = False
last_num = self.id.split('.')[-1] last_num = self.id.split('.')[-1]
if last_num[0] not in '0123456789': if last_num[0] not in '0123456789':
last_num = last_num[1:] last_num = last_num[1:]
@ -827,7 +856,7 @@ class Writer:
if self.all: if self.all:
cols = ["Token_ID", "Word_form"] + cols + ["Msd"] cols = ["Token_ID", "Word_form"] + cols + ["Msd"]
else: else:
cols.append("Representative_form") cols.extend(["Representative_form", "RF_scenario"])
assert(len(cols) == self.length()) assert(len(cols) == self.length())
cols = ["C{}_{}".format(i + 1, thd) for i in range(MAX_NUM_COMPONENTS) for thd in cols] cols = ["C{}_{}".format(i + 1, thd) for i in range(MAX_NUM_COMPONENTS) for thd in cols]
@ -839,7 +868,7 @@ class Writer:
return cols return cols
def length(self): def length(self):
return 4 if self.all else 2 return 4 if self.all else 3
def from_word(self, word): def from_word(self, word):
if word is None: if word is None:
@ -848,7 +877,8 @@ class Writer:
return [word.id, word.text, word.lemma, word.msd] return [word.id, word.text, word.lemma, word.msd]
else: else:
assert(word.representation is not None) assert(word.representation is not None)
return [word.lemma, word.representation] failed = "ok" if word.representation_failed else "lemma_fallback"
return [word.lemma, word.representation, failed]
def sorted_rows(self, rows): def sorted_rows(self, rows):
if self.sort_by < 0 or len(rows) < 2: if self.sort_by < 0 or len(rows) < 2:
@ -880,7 +910,7 @@ class Writer:
idx = str(idx + 1) idx = str(idx + 1)
word = m[idx] if idx in m else None word = m[idx] if idx in m else None
to_write.extend(self.from_word(word)) to_write.extend(self.from_word(word))
representation += " " + to_write[-1] representation += " " + to_write[-2]
# make them equal size # make them equal size
to_write.extend([""] * (MAX_NUM_COMPONENTS * self.length() - len(to_write))) to_write.extend([""] * (MAX_NUM_COMPONENTS * self.length() - len(to_write)))
@ -1071,4 +1101,3 @@ if __name__ == '__main__':
start = time.time() start = time.time()
main(args.input, args.structures, args) main(args.input, args.structures, args)
logging.info("TIME: {}".format(time.time() - start)) logging.info("TIME: {}".format(time.time() - start))