Work in progress (does not work yet): agreements in representation
This commit is contained in:
parent
5bd0b4a064
commit
dce55d04a3
151
wani.py
151
wani.py
|
@ -132,6 +132,11 @@ class Rendition(Enum):
|
|||
Lexis = 2
|
||||
Unknown = 3
|
||||
|
||||
class WordFormSelection(Enum):
    """Strategies for choosing which word forms feed a representation.

    The member is stored as the first element of the ``(selection, more)``
    tuple handed to ``_set_more`` when a feature carries a ``selection``
    attribute.
    """

    # selection="all": every matched form is kept
    All = 0

    # selection="msd": forms are filtered by the remaining feature attributes
    Msd = 1

    # selection="agreement": forms must agree with a head component on the
    # attributes listed in the feature's "msd" value
    Agreement = 2
|
||||
|
||||
class Order(Enum):
|
||||
FromTo = 0
|
||||
ToFrom = 1
|
||||
|
@ -192,9 +197,15 @@ class ComponentRendition:
|
|||
elif 'selection' in feature:
|
||||
if feature['selection'] == "msd":
|
||||
selectors = {k: v for k, v in feature.items() if k != 'selection'}
|
||||
self._set_more(selectors)
|
||||
self._set_more((WordFormSelection.Msd, selectors))
|
||||
elif feature['selection'] == "all":
|
||||
self._set_more("all")
|
||||
self._set_more((WordFormSelection.All, None))
|
||||
elif feature['selection'] == 'agreement':
|
||||
assert(feature['head'][:4] == 'cid_')
|
||||
assert(feature['msd'] is not None)
|
||||
|
||||
self._set_more((WordFormSelection.Agreement,
|
||||
(feature['head'][4:], feature['msd'].split('+'))))
|
||||
else:
|
||||
raise NotImplementedError("Representation selection: {}".format(feature))
|
||||
|
||||
|
@ -211,13 +222,49 @@ class ComponentRendition:
|
|||
def render_all(lst):
|
||||
return "/".join(set(lst))
|
||||
|
||||
def render_form(_lst):
|
||||
return ":("
|
||||
def render_form(lst):
    """Return the most frequent word form in ``lst``.

    Ties are resolved in favour of the form that occurs first in ``lst``,
    so the result is reproducible between runs (iterating a ``set`` of
    strings is not, because of string hash randomisation).

    Args:
        lst: non-empty iterable of hashable word forms.

    Returns:
        The element of ``lst`` with the highest number of occurrences.

    Raises:
        ValueError: if ``lst`` is empty.
    """
    # Function-scope import: the file's import section is not visible here.
    from collections import Counter

    if not lst:
        raise ValueError("render_form() needs at least one word form")

    # Single counting pass instead of calling lst.count() once per distinct
    # form; most_common() keeps first-encountered order among equal counts.
    return Counter(lst).most_common(1)[0][0]
|
||||
|
||||
for words, agreement in matches:
|
||||
if not agreement:
|
||||
continue
|
||||
def check_msd(word, selectors):
    """Tell whether ``word.msd`` is compatible with every selector.

    Args:
        word: object whose ``msd`` is an indexable tag string; ``msd[0]`` is
            used as the key into ``TAGSET``.
        selectors: mapping of attribute name -> required value. Names are
            lower-cased before being looked up in ``TAGSET[msd[0]]``; values
            are translated to a tag character through ``CODES``.

    Returns:
        ``False`` as soon as one selector conflicts with the word's tag,
        ``True`` otherwise. A ``'-'`` on either side never conflicts.

    Raises:
        ValueError: if an attribute name is not listed in ``TAGSET`` for the
            word's category.
        KeyError: if the category or a selector value is unknown.
    """
    for key, value in selectors.items():
        msd = word.msd
        # msd[0] is the category itself and is not part of the attribute
        # list, hence the +1 shift into the tag string.
        position = TAGSET[msd[0]].index(key.lower()) + 1

        # A tag shorter than the requested position leaves that attribute
        # unspecified; treat it like an explicit '-' instead of raising
        # IndexError (check_agreement skips such positions as well).
        actual = msd[position] if position < len(msd) else '-'
        expected = CODES[value]

        if '-' not in (actual, expected) and actual != expected:
            return False

    return True
|
||||
|
||||
def check_agreement(w1, w2, agreements):
    """Tell whether two words agree on every listed attribute.

    Args:
        w1, w2: objects whose ``msd`` is an indexable tag string; ``msd[0]``
            is used as the key into ``TAGSET``.
        agreements: iterable of attribute names, looked up verbatim in
            ``TAGSET[msd[0]]`` of each word.

    Returns:
        ``False`` as soon as both words carry a concrete (non ``'-'``) value
        for an attribute and the values differ; ``True`` otherwise.

    Raises:
        ValueError: if an attribute is not listed in ``TAGSET`` for a word's
            category (raised by ``.index``).
    """
    def feature_code(word, attribute):
        # Tag character for `attribute`, or None when the tag is too short
        # to carry it. msd[0] is the category and is not part of the
        # attribute list, hence the +1 shift.
        position = TAGSET[word.msd[0]].index(attribute) + 1
        if position >= len(word.msd):
            return None
        return word.msd[position]

    for agr_case in agreements:
        # Nothing specified for this attribute (e.g. an infinitive):
        # it always agrees.
        m1 = feature_code(w1, agr_case)
        if m1 is None:
            continue

        # Looked up only after w1 turned out to be specified, so a lookup
        # failure for w2 surfaces in exactly the same cases as before.
        m2 = feature_code(w2, agr_case)
        if m2 is None:
            continue

        # Both specified: a mismatch breaks the agreement unless either side
        # is the '-' placeholder.
        if '-' not in (m1, m2) and m1 != m2:
            return False

    return True
|
||||
|
||||
|
||||
for words in matches:
|
||||
for w_id, w in words.items():
|
||||
component = structure.get_component(w_id)
|
||||
rep = component.representation
|
||||
|
@ -234,24 +281,47 @@ class ComponentRendition:
|
|||
|
||||
# it HAS to be word_form now
|
||||
else:
|
||||
wf_type, more = rep.more
|
||||
|
||||
# set correct type first
|
||||
if type(representations[w_id][1]) is str:
|
||||
representations[w_id] = (
|
||||
[], render_all if rep.more == "all" else render_form
|
||||
[], render_all if wf_type is WordFormSelection.All else render_form
|
||||
)
|
||||
representations[w_id][0].append(w.text)
|
||||
|
||||
if wf_type is WordFormSelection.All:
|
||||
add = True
|
||||
elif wf_type is WordFormSelection.Msd:
|
||||
add = check_msd(w, more)
|
||||
else:
|
||||
assert(wf_type is WordFormSelection.Agreement)
|
||||
other_w, agreements = more
|
||||
add = check_agreement(w, words[other_w], agreements)
|
||||
|
||||
if add:
|
||||
representations[w_id][0].append(w.text)
|
||||
|
||||
doprint = matches[0]['1'].text.startswith('evrop')
|
||||
|
||||
# just need to set representation to first group...
|
||||
for w_id, w in matches[0][0].items():
|
||||
for w_id, w in matches[0].items():
|
||||
data = representations[w_id]
|
||||
if doprint:
|
||||
print(data)
|
||||
|
||||
if type(data[1]) is str:
|
||||
w.representation_failed = data[0]
|
||||
w.representation = w.lemma if w.representation_failed else data[1]
|
||||
else:
|
||||
w.representation_failed = len(data[0]) > 0
|
||||
w.representation_failed = len(data[0]) == 0
|
||||
w.representation = w.lemma if w.representation_failed else data[1](data[0])
|
||||
|
||||
if doprint:
|
||||
print(w.representation_failed, w.representation)
|
||||
|
||||
if doprint:
|
||||
print('--')
|
||||
|
||||
def __str__(self):
|
||||
return str(self.rendition)
|
||||
|
||||
|
@ -563,7 +633,6 @@ class SyntacticStructure:
|
|||
def __init__(self):
|
||||
self.id = None
|
||||
self.lbs = None
|
||||
self.agreements = []
|
||||
self.components = []
|
||||
|
||||
@staticmethod
|
||||
|
@ -611,38 +680,19 @@ class SyntacticStructure:
|
|||
assert(el.tag == "feature")
|
||||
if 'rendition' in el.attrib:
|
||||
forms[n].append(el)
|
||||
elif 'selection' in el.attrib and el.attrib["selection"] != "agreement":
|
||||
forms[n].append(el)
|
||||
elif 'selection' in el.attrib:
|
||||
self.add_agreement(n, el)
|
||||
forms[n].append(el)
|
||||
else:
|
||||
logging.warning("Strange representation feature in structure {}. Skipping"
|
||||
.format(self.id))
|
||||
continue
|
||||
|
||||
def add_agreement(self, n, el):
    """Record an agreement between component ``n`` and the head named in ``el``.

    ``el`` must provide ``get('head')`` returning a string that starts with
    ``'cid_'`` and ``get('msd')`` returning a ``'+'``-separated string. The
    entry appended to ``self.agreements`` is a dict with keys ``'n1'``
    (``n``), ``'n2'`` (the head without its ``'cid_'`` prefix) and
    ``'match'`` (the split ``msd`` parts).
    """
    head = el.get('head')
    assert head[:4] == 'cid_'

    msd_spec = el.get('msd')
    assert msd_spec is not None

    entry = dict(n1=n, n2=head[4:], match=msd_spec.split('+'))
    self.agreements.append(entry)
|
||||
|
||||
def __str__(self):
|
||||
comp_str = "\n".join(str(comp) for comp in self.components)
|
||||
|
||||
agrs = "\n".join("({} -[{}]- {}) ".format(
|
||||
a['n1'], "|".join(a['match']), a['n2']) for a in self.agreements)
|
||||
|
||||
links_str = "\n".join(self.components[0].tree())
|
||||
|
||||
return "{} LBS {}\nCOMPONENTS\n{}\nAGREEMENTS\n{}\nLINKS\n{}\n{}".format(
|
||||
self.id, self.lbs, comp_str, agrs, links_str, "-" * 40)
|
||||
return "{} LBS {}\nCOMPONENTS\n{}\n\nLINKS\n{}\n{}".format(
|
||||
self.id, self.lbs, comp_str, links_str, "-" * 40)
|
||||
|
||||
def get_component(self, idx):
|
||||
for c in self.components:
|
||||
|
@ -695,21 +745,10 @@ class SyntacticStructure:
|
|||
|
||||
def match(self, word):
|
||||
matches = self.components[0].match(word)
|
||||
if matches is None:
|
||||
return []
|
||||
return [] if matches is None else matches
|
||||
|
||||
to_ret = []
|
||||
for m in matches:
|
||||
# if not self.check_agreements(m):
|
||||
# bad = "Agreement"
|
||||
# elif not self.check_form(m):
|
||||
# bad = "Form"
|
||||
# else:
|
||||
# bad = "OK"
|
||||
|
||||
to_ret.append((m, self.check_agreements(m)))
|
||||
|
||||
return to_ret
|
||||
# for m in matches:
|
||||
# to_ret.append((m, self.check_agreements(m)))
|
||||
|
||||
|
||||
def build_structures(filename):
|
||||
|
@ -898,11 +937,11 @@ class Writer:
|
|||
def write_out_worker(self, file_handler, structure_id, components, colocation_ids):
|
||||
rows = []
|
||||
|
||||
for cid, m, reason, freq in colocation_ids.get_matches_for(structure_id, not self.all):
|
||||
for cid, m, freq in colocation_ids.get_matches_for(structure_id, not self.all):
|
||||
to_write = []
|
||||
representation = ""
|
||||
|
||||
for idx, comp in enumerate(components):
|
||||
for idx, _comp in enumerate(components):
|
||||
idx = str(idx + 1)
|
||||
word = m[idx] if idx in m else None
|
||||
to_write.extend(self.from_word(word))
|
||||
|
@ -978,21 +1017,23 @@ class ColocationIds:
|
|||
def add_matches(self, matches):
|
||||
for sid, nms in matches.items():
|
||||
for nm in nms:
|
||||
self._add_match(nm[2], sid, (nm[0], nm[1]))
|
||||
self._add_match(nm[1], sid, nm[0])
|
||||
|
||||
def get_matches_for(self, structure_id, group):
|
||||
for _cid_tup, (cid, cid_matches, sid) in self.data.items():
|
||||
if sid != structure_id:
|
||||
continue
|
||||
|
||||
for words, reason in cid_matches:
|
||||
yield (cid, words, reason, len(cid_matches))
|
||||
for words in cid_matches:
|
||||
yield (cid, words, len(cid_matches))
|
||||
if group:
|
||||
break
|
||||
|
||||
def set_representations(self, structures):
    """Compute representations for every stored collocation.

    Args:
        structures: iterable of objects with an ``id`` attribute; they are
            indexed by that id so each entry of ``self.data`` can be paired
            with the structure it belongs to.

    Each value of ``self.data`` is unpacked as ``(cid, cid_matches, sid)``;
    ``cid_matches`` and the structure with id ``sid`` are passed to
    ``ComponentRendition.set_representations``.

    Raises:
        KeyError: if an entry refers to a structure id that is not present
            in ``structures``.
    """
    structure_by_id = {structure.id: structure for structure in structures}

    # Only the values are needed; the dict key and the cid are unused here.
    for _cid, cid_matches, sid in self.data.values():
        ComponentRendition.set_representations(cid_matches, structure_by_id[sid])
|
||||
|
||||
|
||||
|
@ -1004,12 +1045,12 @@ def match_file(words, structures):
|
|||
for w in words:
|
||||
mhere = s.match(w)
|
||||
logging.debug(" GOT: {}".format(len(mhere)))
|
||||
for match, reason in mhere:
|
||||
for match in mhere:
|
||||
colocation_id = [(idx, w.lemma) for idx, w in match.items()]
|
||||
colocation_id = [s.id] + list(sorted(colocation_id, key=lambda x:x[0]))
|
||||
colocation_id = tuple(colocation_id)
|
||||
|
||||
matches[s.id].append((match, reason, colocation_id))
|
||||
matches[s.id].append((match, colocation_id))
|
||||
|
||||
return matches
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user