Does not yet work, agreements in representation
This commit is contained in:
parent
5bd0b4a064
commit
dce55d04a3
151
wani.py
151
wani.py
|
@ -132,6 +132,11 @@ class Rendition(Enum):
|
||||||
Lexis = 2
|
Lexis = 2
|
||||||
Unknown = 3
|
Unknown = 3
|
||||||
|
|
||||||
|
class WordFormSelection(Enum):
|
||||||
|
All = 0
|
||||||
|
Msd = 1
|
||||||
|
Agreement = 2
|
||||||
|
|
||||||
class Order(Enum):
|
class Order(Enum):
|
||||||
FromTo = 0
|
FromTo = 0
|
||||||
ToFrom = 1
|
ToFrom = 1
|
||||||
|
@ -192,9 +197,15 @@ class ComponentRendition:
|
||||||
elif 'selection' in feature:
|
elif 'selection' in feature:
|
||||||
if feature['selection'] == "msd":
|
if feature['selection'] == "msd":
|
||||||
selectors = {k: v for k, v in feature.items() if k != 'selection'}
|
selectors = {k: v for k, v in feature.items() if k != 'selection'}
|
||||||
self._set_more(selectors)
|
self._set_more((WordFormSelection.Msd, selectors))
|
||||||
elif feature['selection'] == "all":
|
elif feature['selection'] == "all":
|
||||||
self._set_more("all")
|
self._set_more((WordFormSelection.All, None))
|
||||||
|
elif feature['selection'] == 'agreement':
|
||||||
|
assert(feature['head'][:4] == 'cid_')
|
||||||
|
assert(feature['msd'] is not None)
|
||||||
|
|
||||||
|
self._set_more((WordFormSelection.Agreement,
|
||||||
|
(feature['head'][4:], feature['msd'].split('+'))))
|
||||||
else:
|
else:
|
||||||
raise NotImplementedError("Representation selection: {}".format(feature))
|
raise NotImplementedError("Representation selection: {}".format(feature))
|
||||||
|
|
||||||
|
@ -211,13 +222,49 @@ class ComponentRendition:
|
||||||
def render_all(lst):
|
def render_all(lst):
|
||||||
return "/".join(set(lst))
|
return "/".join(set(lst))
|
||||||
|
|
||||||
def render_form(_lst):
|
def render_form(lst):
|
||||||
return ":("
|
# find most frequent
|
||||||
|
return max(set(lst), key=lst.count)
|
||||||
|
|
||||||
for words, agreement in matches:
|
def check_msd(word, selectors):
|
||||||
if not agreement:
|
for key, value in selectors.items():
|
||||||
continue
|
t = word.msd[0]
|
||||||
|
v = TAGSET[t].index(key.lower())
|
||||||
|
f1 = word.msd[v + 1]
|
||||||
|
f2 = CODES[value]
|
||||||
|
|
||||||
|
if '-' not in [f1, f2] and f1 != f2:
|
||||||
|
return False
|
||||||
|
|
||||||
|
return True
|
||||||
|
|
||||||
|
def check_agreement(w1, w2, agreements):
|
||||||
|
for agr_case in agreements:
|
||||||
|
t1 = w1.msd[0]
|
||||||
|
v1 = TAGSET[t1].index(agr_case)
|
||||||
|
assert(v1 >= 0)
|
||||||
|
# if none specified: nedolocnik, always agrees
|
||||||
|
if v1 + 1 >= len(w1.msd):
|
||||||
|
continue
|
||||||
|
# first is uppercase, not in TAGSET
|
||||||
|
m1 = w1.msd[v1 + 1]
|
||||||
|
|
||||||
|
# REPEAT (not DRY!)
|
||||||
|
t2 = w2.msd[0]
|
||||||
|
v2 = TAGSET[t2].index(agr_case)
|
||||||
|
assert(v2 >= 0)
|
||||||
|
if v2 + 1 >= len(w2.msd):
|
||||||
|
continue
|
||||||
|
m2 = w2.msd[v2 + 1]
|
||||||
|
|
||||||
|
# match!
|
||||||
|
if '-' not in [m1, m2] and m1 != m2:
|
||||||
|
return False
|
||||||
|
|
||||||
|
return True
|
||||||
|
|
||||||
|
|
||||||
|
for words in matches:
|
||||||
for w_id, w in words.items():
|
for w_id, w in words.items():
|
||||||
component = structure.get_component(w_id)
|
component = structure.get_component(w_id)
|
||||||
rep = component.representation
|
rep = component.representation
|
||||||
|
@ -234,24 +281,47 @@ class ComponentRendition:
|
||||||
|
|
||||||
# it HAS to be word_form now
|
# it HAS to be word_form now
|
||||||
else:
|
else:
|
||||||
|
wf_type, more = rep.more
|
||||||
|
|
||||||
# set correct type first
|
# set correct type first
|
||||||
if type(representations[w_id][1]) is str:
|
if type(representations[w_id][1]) is str:
|
||||||
representations[w_id] = (
|
representations[w_id] = (
|
||||||
[], render_all if rep.more == "all" else render_form
|
[], render_all if wf_type is WordFormSelection.All else render_form
|
||||||
)
|
)
|
||||||
representations[w_id][0].append(w.text)
|
|
||||||
|
if wf_type is WordFormSelection.All:
|
||||||
|
add = True
|
||||||
|
elif wf_type is WordFormSelection.Msd:
|
||||||
|
add = check_msd(w, more)
|
||||||
|
else:
|
||||||
|
assert(wf_type is WordFormSelection.Agreement)
|
||||||
|
other_w, agreements = more
|
||||||
|
add = check_agreement(w, words[other_w], agreements)
|
||||||
|
|
||||||
|
if add:
|
||||||
|
representations[w_id][0].append(w.text)
|
||||||
|
|
||||||
|
doprint = matches[0]['1'].text.startswith('evrop')
|
||||||
|
|
||||||
# just need to set representation to first group...
|
# just need to set representation to first group...
|
||||||
for w_id, w in matches[0][0].items():
|
for w_id, w in matches[0].items():
|
||||||
data = representations[w_id]
|
data = representations[w_id]
|
||||||
|
if doprint:
|
||||||
|
print(data)
|
||||||
|
|
||||||
if type(data[1]) is str:
|
if type(data[1]) is str:
|
||||||
w.representation_failed = data[0]
|
w.representation_failed = data[0]
|
||||||
w.representation = w.lemma if w.representation_failed else data[1]
|
w.representation = w.lemma if w.representation_failed else data[1]
|
||||||
else:
|
else:
|
||||||
w.representation_failed = len(data[0]) > 0
|
w.representation_failed = len(data[0]) == 0
|
||||||
w.representation = w.lemma if w.representation_failed else data[1](data[0])
|
w.representation = w.lemma if w.representation_failed else data[1](data[0])
|
||||||
|
|
||||||
|
if doprint:
|
||||||
|
print(w.representation_failed, w.representation)
|
||||||
|
|
||||||
|
if doprint:
|
||||||
|
print('--')
|
||||||
|
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
return str(self.rendition)
|
return str(self.rendition)
|
||||||
|
|
||||||
|
@ -563,7 +633,6 @@ class SyntacticStructure:
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.id = None
|
self.id = None
|
||||||
self.lbs = None
|
self.lbs = None
|
||||||
self.agreements = []
|
|
||||||
self.components = []
|
self.components = []
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
|
@ -611,38 +680,19 @@ class SyntacticStructure:
|
||||||
assert(el.tag == "feature")
|
assert(el.tag == "feature")
|
||||||
if 'rendition' in el.attrib:
|
if 'rendition' in el.attrib:
|
||||||
forms[n].append(el)
|
forms[n].append(el)
|
||||||
elif 'selection' in el.attrib and el.attrib["selection"] != "agreement":
|
|
||||||
forms[n].append(el)
|
|
||||||
elif 'selection' in el.attrib:
|
elif 'selection' in el.attrib:
|
||||||
self.add_agreement(n, el)
|
forms[n].append(el)
|
||||||
else:
|
else:
|
||||||
logging.warning("Strange representation feature in structure {}. Skipping"
|
logging.warning("Strange representation feature in structure {}. Skipping"
|
||||||
.format(self.id))
|
.format(self.id))
|
||||||
continue
|
continue
|
||||||
|
|
||||||
def add_agreement(self, n, el):
|
|
||||||
assert(el.get('head')[:4] == 'cid_')
|
|
||||||
|
|
||||||
n1 = n
|
|
||||||
n2 = el.get('head')[4:]
|
|
||||||
agreement_str = el.get('msd')
|
|
||||||
assert(agreement_str is not None)
|
|
||||||
|
|
||||||
self.agreements.append({
|
|
||||||
'n1': n1,
|
|
||||||
'n2': n2,
|
|
||||||
'match': agreement_str.split('+')})
|
|
||||||
|
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
comp_str = "\n".join(str(comp) for comp in self.components)
|
comp_str = "\n".join(str(comp) for comp in self.components)
|
||||||
|
|
||||||
agrs = "\n".join("({} -[{}]- {}) ".format(
|
|
||||||
a['n1'], "|".join(a['match']), a['n2']) for a in self.agreements)
|
|
||||||
|
|
||||||
links_str = "\n".join(self.components[0].tree())
|
links_str = "\n".join(self.components[0].tree())
|
||||||
|
|
||||||
return "{} LBS {}\nCOMPONENTS\n{}\nAGREEMENTS\n{}\nLINKS\n{}\n{}".format(
|
return "{} LBS {}\nCOMPONENTS\n{}\n\nLINKS\n{}\n{}".format(
|
||||||
self.id, self.lbs, comp_str, agrs, links_str, "-" * 40)
|
self.id, self.lbs, comp_str, links_str, "-" * 40)
|
||||||
|
|
||||||
def get_component(self, idx):
|
def get_component(self, idx):
|
||||||
for c in self.components:
|
for c in self.components:
|
||||||
|
@ -695,21 +745,10 @@ class SyntacticStructure:
|
||||||
|
|
||||||
def match(self, word):
|
def match(self, word):
|
||||||
matches = self.components[0].match(word)
|
matches = self.components[0].match(word)
|
||||||
if matches is None:
|
return [] if matches is None else matches
|
||||||
return []
|
|
||||||
|
|
||||||
to_ret = []
|
# for m in matches:
|
||||||
for m in matches:
|
# to_ret.append((m, self.check_agreements(m)))
|
||||||
# if not self.check_agreements(m):
|
|
||||||
# bad = "Agreement"
|
|
||||||
# elif not self.check_form(m):
|
|
||||||
# bad = "Form"
|
|
||||||
# else:
|
|
||||||
# bad = "OK"
|
|
||||||
|
|
||||||
to_ret.append((m, self.check_agreements(m)))
|
|
||||||
|
|
||||||
return to_ret
|
|
||||||
|
|
||||||
|
|
||||||
def build_structures(filename):
|
def build_structures(filename):
|
||||||
|
@ -898,11 +937,11 @@ class Writer:
|
||||||
def write_out_worker(self, file_handler, structure_id, components, colocation_ids):
|
def write_out_worker(self, file_handler, structure_id, components, colocation_ids):
|
||||||
rows = []
|
rows = []
|
||||||
|
|
||||||
for cid, m, reason, freq in colocation_ids.get_matches_for(structure_id, not self.all):
|
for cid, m, freq in colocation_ids.get_matches_for(structure_id, not self.all):
|
||||||
to_write = []
|
to_write = []
|
||||||
representation = ""
|
representation = ""
|
||||||
|
|
||||||
for idx, comp in enumerate(components):
|
for idx, _comp in enumerate(components):
|
||||||
idx = str(idx + 1)
|
idx = str(idx + 1)
|
||||||
word = m[idx] if idx in m else None
|
word = m[idx] if idx in m else None
|
||||||
to_write.extend(self.from_word(word))
|
to_write.extend(self.from_word(word))
|
||||||
|
@ -978,21 +1017,23 @@ class ColocationIds:
|
||||||
def add_matches(self, matches):
|
def add_matches(self, matches):
|
||||||
for sid, nms in matches.items():
|
for sid, nms in matches.items():
|
||||||
for nm in nms:
|
for nm in nms:
|
||||||
self._add_match(nm[2], sid, (nm[0], nm[1]))
|
self._add_match(nm[1], sid, nm[0])
|
||||||
|
|
||||||
def get_matches_for(self, structure_id, group):
|
def get_matches_for(self, structure_id, group):
|
||||||
for _cid_tup, (cid, cid_matches, sid) in self.data.items():
|
for _cid_tup, (cid, cid_matches, sid) in self.data.items():
|
||||||
if sid != structure_id:
|
if sid != structure_id:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
for words, reason in cid_matches:
|
for words in cid_matches:
|
||||||
yield (cid, words, reason, len(cid_matches))
|
yield (cid, words, len(cid_matches))
|
||||||
if group:
|
if group:
|
||||||
break
|
break
|
||||||
|
|
||||||
def set_representations(self, structures):
|
def set_representations(self, structures):
|
||||||
components_dict = {structure.id: structure for structure in structures}
|
components_dict = {structure.id: structure for structure in structures}
|
||||||
for _1, (_2, cid_matches, sid) in self.data.items():
|
for _1, (_2, cid_matches, sid) in self.data.items():
|
||||||
|
if _2 == '1309':
|
||||||
|
a = 1
|
||||||
ComponentRendition.set_representations(cid_matches, components_dict[sid])
|
ComponentRendition.set_representations(cid_matches, components_dict[sid])
|
||||||
|
|
||||||
|
|
||||||
|
@ -1004,12 +1045,12 @@ def match_file(words, structures):
|
||||||
for w in words:
|
for w in words:
|
||||||
mhere = s.match(w)
|
mhere = s.match(w)
|
||||||
logging.debug(" GOT: {}".format(len(mhere)))
|
logging.debug(" GOT: {}".format(len(mhere)))
|
||||||
for match, reason in mhere:
|
for match in mhere:
|
||||||
colocation_id = [(idx, w.lemma) for idx, w in match.items()]
|
colocation_id = [(idx, w.lemma) for idx, w in match.items()]
|
||||||
colocation_id = [s.id] + list(sorted(colocation_id, key=lambda x:x[0]))
|
colocation_id = [s.id] + list(sorted(colocation_id, key=lambda x:x[0]))
|
||||||
colocation_id = tuple(colocation_id)
|
colocation_id = tuple(colocation_id)
|
||||||
|
|
||||||
matches[s.id].append((match, reason, colocation_id))
|
matches[s.id].append((match, colocation_id))
|
||||||
|
|
||||||
return matches
|
return matches
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user