Does not work yet: agreements are handled in representation

This commit is contained in:
Ozbolt Menegatti 2019-05-20 18:14:11 +02:00
parent 5bd0b4a064
commit dce55d04a3

147
wani.py
View File

@ -132,6 +132,11 @@ class Rendition(Enum):
Lexis = 2
Unknown = 3
class WordFormSelection(Enum):
    """Strategy for choosing which observed word forms are kept."""

    # Keep every word form.
    All = 0
    # Keep only word forms whose MSD passes the configured selectors.
    Msd = 1
    # Keep only word forms that agree with another component's word.
    Agreement = 2
class Order(Enum):
FromTo = 0
ToFrom = 1
@ -192,9 +197,15 @@ class ComponentRendition:
elif 'selection' in feature:
if feature['selection'] == "msd":
selectors = {k: v for k, v in feature.items() if k != 'selection'}
self._set_more(selectors)
self._set_more((WordFormSelection.Msd, selectors))
elif feature['selection'] == "all":
self._set_more("all")
self._set_more((WordFormSelection.All, None))
elif feature['selection'] == 'agreement':
assert(feature['head'][:4] == 'cid_')
assert(feature['msd'] is not None)
self._set_more((WordFormSelection.Agreement,
(feature['head'][4:], feature['msd'].split('+'))))
else:
raise NotImplementedError("Representation selection: {}".format(feature))
@ -211,13 +222,49 @@ class ComponentRendition:
def render_all(lst):
    """Join the distinct word forms in ``lst`` with ``/``.

    The distinct forms are sorted before joining so that the rendered
    string is the same on every run; iterating a bare ``set`` of strings
    follows hash order, which is not stable between interpreter runs.

    Returns an empty string for an empty ``lst``.
    """
    return "/".join(sorted(set(lst)))
def render_form(lst):
    """Return the most frequent word form in ``lst``.

    This single definition replaces a placeholder that returned ``":("``
    and was shadowed by the later definition of the same name.

    When several forms share the highest count, the one that occurs first
    in ``lst`` wins, so the result does not depend on set iteration order.

    Raises:
        ValueError: if ``lst`` is empty.
    """
    # Function-scope import keeps this block self-contained.
    from collections import Counter

    # One counting pass instead of calling lst.count() per distinct form.
    counts = Counter(lst)
    # Counter keeps first-seen order and max() returns the first maximum.
    return max(counts, key=counts.get)
for words, agreement in matches:
if not agreement:
def check_msd(word, selectors):
    """Tell whether ``word.msd`` satisfies every selector.

    ``selectors`` maps a feature name to a required value. For each pair,
    the feature's position is looked up in ``TAGSET`` under the word's
    category (the first MSD character) and the MSD character at that
    position is compared with ``CODES[value]``. A ``-`` on either side
    counts as a match.

    An MSD too short to contain the position is treated as a match rather
    than raising ``IndexError``; this mirrors how ``check_agreement`` skips
    positions that a word's MSD does not reach.
    NOTE(review): confirm that "position absent" should pass the selector.

    Returns:
        ``True`` when no selector is contradicted, ``False`` otherwise.
    """
    for key, value in selectors.items():
        category = word.msd[0]
        # +1 because msd[0] holds the category, not a feature value.
        position = TAGSET[category].index(key.lower()) + 1
        # Resolved before the length guard so an unknown value still raises.
        wanted = CODES[value]
        if position >= len(word.msd):
            continue
        actual = word.msd[position]
        if '-' not in [actual, wanted] and actual != wanted:
            return False
    return True
def check_agreement(w1, w2, agreements):
    """Tell whether ``w1`` and ``w2`` agree on every feature in ``agreements``.

    For each feature name, the value is read from each word's MSD at the
    position ``TAGSET`` gives for that word's category. The words disagree
    only when both values are present, neither is ``-``, and they differ.
    A feature whose position lies beyond the end of either MSD is skipped,
    i.e. it always agrees.

    Returns:
        ``True`` when no feature disagrees, ``False`` otherwise.

    Raises:
        ValueError: if a feature is not listed in ``TAGSET`` for a word's
            category (raised by ``list.index``).
    """
    def feature_value(word, feature):
        # Shared lookup for both words; None means the MSD is too short.
        # msd[0] is the category character that selects the TAGSET row;
        # feature values start at msd[1], hence the +1.
        position = TAGSET[word.msd[0]].index(feature) + 1
        if position >= len(word.msd):
            return None
        return word.msd[position]

    for agr_case in agreements:
        m1 = feature_value(w1, agr_case)
        if m1 is None:
            # Value not specified (the original comment names the
            # infinitive as an example): always agrees.
            continue

        m2 = feature_value(w2, agr_case)
        if m2 is None:
            continue

        if '-' not in [m1, m2] and m1 != m2:
            return False

    return True
for words in matches:
for w_id, w in words.items():
component = structure.get_component(w_id)
rep = component.representation
@ -234,24 +281,47 @@ class ComponentRendition:
# it HAS to be word_form now
else:
wf_type, more = rep.more
# set correct type first
if type(representations[w_id][1]) is str:
representations[w_id] = (
[], render_all if rep.more == "all" else render_form
[], render_all if wf_type is WordFormSelection.All else render_form
)
if wf_type is WordFormSelection.All:
add = True
elif wf_type is WordFormSelection.Msd:
add = check_msd(w, more)
else:
assert(wf_type is WordFormSelection.Agreement)
other_w, agreements = more
add = check_agreement(w, words[other_w], agreements)
if add:
representations[w_id][0].append(w.text)
doprint = matches[0]['1'].text.startswith('evrop')
# just need to set representation to first group...
for w_id, w in matches[0][0].items():
for w_id, w in matches[0].items():
data = representations[w_id]
if doprint:
print(data)
if type(data[1]) is str:
w.representation_failed = data[0]
w.representation = w.lemma if w.representation_failed else data[1]
else:
w.representation_failed = len(data[0]) > 0
w.representation_failed = len(data[0]) == 0
w.representation = w.lemma if w.representation_failed else data[1](data[0])
if doprint:
print(w.representation_failed, w.representation)
if doprint:
print('--')
def __str__(self):
# The string form is simply that of the stored rendition value.
return str(self.rendition)
@ -563,7 +633,6 @@ class SyntacticStructure:
def __init__(self):
# Create an empty structure: no id, no lbs, no agreements, no components.
# Identifier; printed first by __str__ and compared against by callers
# that look structures up by id.
self.id = None
# Printed after "LBS" by __str__. NOTE(review): meaning not visible here.
self.lbs = None
# Agreement records; add_agreement appends dicts with keys
# 'n1', 'n2' and 'match'.
self.agreements = []
# Component objects; components[0] is where match() and __str__ start.
self.components = []
@staticmethod
@ -611,38 +680,19 @@ class SyntacticStructure:
assert(el.tag == "feature")
if 'rendition' in el.attrib:
forms[n].append(el)
elif 'selection' in el.attrib and el.attrib["selection"] != "agreement":
forms[n].append(el)
elif 'selection' in el.attrib:
self.add_agreement(n, el)
forms[n].append(el)
else:
logging.warning("Strange representation feature in structure {}. Skipping"
.format(self.id))
continue
def add_agreement(self, n, el):
    """Append an agreement record for component ``n`` to ``self.agreements``.

    ``el`` must expose ``get``. Its ``head`` attribute has to start with
    ``cid_`` (the remainder becomes ``n2``) and its ``msd`` attribute is a
    ``+``-separated list of names stored under ``match``.
    """
    head_ref = el.get('head')
    assert(head_ref[:4] == 'cid_')

    features = el.get('msd')
    assert(features is not None)

    record = dict(n1=n, n2=head_ref[4:], match=features.split('+'))
    self.agreements.append(record)
def __str__(self):
# Multi-line dump of the structure: id, lbs, components, agreements, links,
# closed by a line of 40 dashes. Indexes components[0], so it raises
# IndexError when there are no components.
# One line per component, using each component's own str().
comp_str = "\n".join(str(comp) for comp in self.components)
# One "(n1 -[name|name]- n2) " entry per record in self.agreements.
agrs = "\n".join("({} -[{}]- {}) ".format(
a['n1'], "|".join(a['match']), a['n2']) for a in self.agreements)
# Link lines are produced by the first component's tree().
links_str = "\n".join(self.components[0].tree())
return "{} LBS {}\nCOMPONENTS\n{}\nAGREEMENTS\n{}\nLINKS\n{}\n{}".format(
self.id, self.lbs, comp_str, agrs, links_str, "-" * 40)
# NOTE(review): this second return follows an unconditional return and is
# never reached. It is the same dump without the AGREEMENTS section; decide
# which of the two formats is intended and delete the other.
return "{} LBS {}\nCOMPONENTS\n{}\n\nLINKS\n{}\n{}".format(
self.id, self.lbs, comp_str, links_str, "-" * 40)
def get_component(self, idx):
for c in self.components:
@ -695,21 +745,10 @@ class SyntacticStructure:
def match(self, word):
    """Match ``word`` against this structure, starting at its first component.

    Returns whatever ``self.components[0].match(word)`` returns, except
    that ``None`` is replaced by an empty list so callers can always
    iterate the result or take its length.

    The previous body repeated the ``None`` check and then carried an
    unreachable loop that paired each match with ``check_agreements``;
    both are removed here without changing what is returned.
    """
    matches = self.components[0].match(word)
    return [] if matches is None else matches
def build_structures(filename):
@ -898,11 +937,11 @@ class Writer:
def write_out_worker(self, file_handler, structure_id, components, colocation_ids):
rows = []
for cid, m, reason, freq in colocation_ids.get_matches_for(structure_id, not self.all):
for cid, m, freq in colocation_ids.get_matches_for(structure_id, not self.all):
to_write = []
representation = ""
for idx, comp in enumerate(components):
for idx, _comp in enumerate(components):
idx = str(idx + 1)
word = m[idx] if idx in m else None
to_write.extend(self.from_word(word))
@ -978,21 +1017,23 @@ class ColocationIds:
def add_matches(self, matches):
# Store every match from a {structure id: [match tuple, ...]} mapping by
# forwarding each tuple to self._add_match.
for sid, nms in matches.items():
for nm in nms:
# NOTE(review): the two calls below assume different tuple layouts.
# The first reads nm[2] as the key and stores (nm[0], nm[1]); the second
# reads nm[1] as the key and stores nm[0] alone. Running both registers
# every match twice under inconsistent shapes -- confirm which layout
# the producer of `matches` emits and keep only that call.
self._add_match(nm[2], sid, (nm[0], nm[1]))
self._add_match(nm[1], sid, nm[0])
def get_matches_for(self, structure_id, group):
# Generator over the stored matches whose structure id equals structure_id.
# Every value in self.data unpacks as (cid, cid_matches, sid).
for _cid_tup, (cid, cid_matches, sid) in self.data.items():
# Entries recorded for other structures are skipped.
if sid != structure_id:
continue
# NOTE(review): two loops over cid_matches follow and they expect
# incompatible item shapes: the first unpacks (words, reason) pairs and
# yields 4-tuples, the second takes bare items and yields 3-tuples.
# Only one can fit what is stored in cid_matches -- confirm and drop
# the other.
for words, reason in cid_matches:
yield (cid, words, reason, len(cid_matches))
for words in cid_matches:
# The last element is the number of matches stored under this cid.
yield (cid, words, len(cid_matches))
# A truthy `group` stops the loop early. TODO confirm which loop this
# break belongs to; the nesting is not recoverable from this view.
if group:
break
def set_representations(self, structures):
    """Compute representations for every entry stored in ``self.data``.

    Each stored value unpacks as ``(cid, cid_matches, sid)``. The entry's
    matches are passed to ``ComponentRendition.set_representations``
    together with the structure from ``structures`` whose ``id`` equals
    ``sid``.

    A leftover no-op debugging branch keyed on one hard-coded cid has been
    removed; it had no effect on the result.

    Raises:
        KeyError: if an entry's ``sid`` matches no structure in
            ``structures``.
    """
    structures_by_id = {structure.id: structure for structure in structures}
    # Only the values are needed; the dictionary keys are never used.
    for _cid, cid_matches, sid in self.data.values():
        ComponentRendition.set_representations(cid_matches, structures_by_id[sid])
@ -1004,12 +1045,12 @@ def match_file(words, structures):
for w in words:
mhere = s.match(w)
logging.debug(" GOT: {}".format(len(mhere)))
for match, reason in mhere:
for match in mhere:
colocation_id = [(idx, w.lemma) for idx, w in match.items()]
colocation_id = [s.id] + list(sorted(colocation_id, key=lambda x:x[0]))
colocation_id = tuple(colocation_id)
matches[s.id].append((match, reason, colocation_id))
matches[s.id].append((match, colocation_id))
return matches