Refactoring representations. The code is now much nicer, though it is not working yet :)
Added: multiple representations per component id
This commit is contained in:
parent
307007218d
commit
bfd4d4a747
494
wani.py
494
wani.py
|
@ -126,19 +126,6 @@ class RestrictionType(Enum):
|
||||||
Lexis = 1
|
Lexis = 1
|
||||||
MatchAll = 2
|
MatchAll = 2
|
||||||
|
|
||||||
|
|
||||||
class Rendition(Enum):
|
|
||||||
Lemma = 0
|
|
||||||
WordForm = 1
|
|
||||||
Lexis = 2
|
|
||||||
Unknown = 3
|
|
||||||
|
|
||||||
class WordFormSelection(Enum):
|
|
||||||
All = 0
|
|
||||||
Msd = 1
|
|
||||||
Agreement = 2
|
|
||||||
Any = 3
|
|
||||||
|
|
||||||
class Order(Enum):
|
class Order(Enum):
|
||||||
FromTo = 0
|
FromTo = 0
|
||||||
ToFrom = 1
|
ToFrom = 1
|
||||||
|
@ -171,102 +158,73 @@ class Order(Enum):
|
||||||
else:
|
else:
|
||||||
raise NotImplementedError("Should not be here: Order match")
|
raise NotImplementedError("Should not be here: Order match")
|
||||||
|
|
||||||
class ComponentRendition:
|
|
||||||
def __init__(self):
|
|
||||||
self.more = None
|
|
||||||
self.rendition = Rendition.Unknown
|
|
||||||
|
|
||||||
def _set_rendition(self, r):
|
class ComponentRepresentation:
|
||||||
assert(self.rendition is Rendition.Unknown)
|
def __init__(self, data, word_renderer):
|
||||||
self.rendition = r
|
self.data = data
|
||||||
|
self.word_renderer = word_renderer
|
||||||
|
|
||||||
def _set_more(self, m):
|
self.words = []
|
||||||
self.more = m
|
self.rendition_text = None
|
||||||
|
self.agreement = None
|
||||||
|
|
||||||
def add_feature(self, feature):
|
def get_agreement(self):
|
||||||
if 'rendition' in feature:
|
|
||||||
if feature['rendition'] == "lemma":
|
|
||||||
self._set_rendition(Rendition.Lemma)
|
|
||||||
elif feature['rendition'] == "word_form":
|
|
||||||
self._set_rendition(Rendition.WordForm)
|
|
||||||
self._set_more((WordFormSelection.Any, None))
|
|
||||||
elif feature['rendition'] == "lexis":
|
|
||||||
self._set_rendition(Rendition.Lexis)
|
|
||||||
self._set_more(feature['string'])
|
|
||||||
else:
|
|
||||||
raise NotImplementedError("Representation rendition: {}".format(feature))
|
|
||||||
|
|
||||||
elif 'selection' in feature:
|
|
||||||
if feature['selection'] == "msd":
|
|
||||||
selectors = {k: v for k, v in feature.items() if k != 'selection'}
|
|
||||||
self._set_more((WordFormSelection.Msd, selectors))
|
|
||||||
elif feature['selection'] == "all":
|
|
||||||
self._set_more((WordFormSelection.All, None))
|
|
||||||
elif feature['selection'] == 'agreement':
|
|
||||||
assert(feature['head'][:4] == 'cid_')
|
|
||||||
assert(feature['msd'] is not None)
|
|
||||||
|
|
||||||
self._set_more((WordFormSelection.Agreement,
|
|
||||||
(feature['head'][4:], feature['msd'].split('+'))))
|
|
||||||
else:
|
|
||||||
raise NotImplementedError("Representation selection: {}".format(feature))
|
|
||||||
|
|
||||||
else:
|
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def isit(self, rendition):
|
def add_word(self, word):
|
||||||
return self.rendition is rendition
|
self.words.append(word)
|
||||||
|
|
||||||
@staticmethod
|
def render(self):
|
||||||
def set_representations(matches, structure, word_renderer):
|
if self.rendition_text is None:
|
||||||
representations = {
|
print(type(self))
|
||||||
c.idx: [[], None] if c.representation.isit(Rendition.WordForm) else [True, ""]
|
self.rendition_text = self._render()
|
||||||
for c in structure.components
|
|
||||||
}
|
|
||||||
found_agreements = {}
|
|
||||||
word_component_id = {}
|
|
||||||
|
|
||||||
def render_all(component_id, lst, _bw):
|
def rendition(self):
|
||||||
rep = "/".join(set([w.text for w in set(lst)])) if len(lst) > 0 else None
|
return "" if self.rendition_text is None else self.rendition_text
|
||||||
matches.representations[component_id] = rep
|
|
||||||
|
|
||||||
def render_form(component_id, lst, backup_word):
|
def _render(self):
|
||||||
if backup_word is not None:
|
raise NotImplementedError("Not implemented for class: {}".format(type(self)))
|
||||||
lst.append(backup_word)
|
|
||||||
|
|
||||||
|
class LemmaCR(ComponentRepresentation):
|
||||||
|
def _render(self):
|
||||||
|
return self.words[0].lemma if len(self.words) > 0 else None
|
||||||
|
|
||||||
|
class LexisCR(ComponentRepresentation):
|
||||||
|
def _render(self):
|
||||||
|
return self.data
|
||||||
|
|
||||||
|
class WordFormAllCR(ComponentRepresentation):
|
||||||
|
def _render(self):
|
||||||
|
txt = "/".join(set([w.text for w in set(self.words)])) if len(self.words) > 0 else None
|
||||||
|
return txt
|
||||||
|
|
||||||
|
class WordFormAnyCR(ComponentRepresentation):
|
||||||
|
def _render(self):
|
||||||
text_forms = {}
|
text_forms = {}
|
||||||
msd_lemma_txt_triplets = Counter([(w.msd, w.lemma, w.text) for w in lst])
|
msd_lemma_txt_triplets = Counter([(w.msd, w.lemma, w.text) for w in self.words])
|
||||||
for (msd, lemma, text), _n in reversed(msd_lemma_txt_triplets.most_common()):
|
for (msd, lemma, text), _n in reversed(msd_lemma_txt_triplets.most_common()):
|
||||||
text_forms[(msd, lemma)] = text
|
text_forms[(msd, lemma)] = text
|
||||||
|
|
||||||
lst_ctr = []
|
words_counter = []
|
||||||
for word in lst:
|
for word in self.words:
|
||||||
lst_ctr.append((word.msd, word.lemma))
|
words_counter.append((word.msd, word.lemma))
|
||||||
sorted_lst = sorted(set(lst_ctr), key=lst.count)
|
sorted_words = sorted(set(words_counter), key=words_counter.count)
|
||||||
|
|
||||||
for word_msd, word_lemma in sorted_lst:
|
for word_msd, word_lemma in sorted_words:
|
||||||
if component_id in found_agreements:
|
if self.agreement is not None:
|
||||||
other_component_id, other_word, agreements, other_texts = found_agreements[component_id]
|
if self.agreement.match(word_msd):
|
||||||
agr = are_agreements_ok(word_msd, other_word.lemma, other_word.msd, agreements, other_texts)
|
if word_lemma is None:
|
||||||
if agr is None:
|
return None
|
||||||
continue
|
else:
|
||||||
|
return text_forms[(word_msd, word_lemma)]
|
||||||
|
|
||||||
matches.representations[other_component_id] = agr
|
class WordFormMsdCR(WordFormAnyCR):
|
||||||
|
def __init__(self, *args):
|
||||||
|
super().__init__(*args)
|
||||||
|
self.backup_word = None
|
||||||
|
|
||||||
if word_lemma is not None:
|
def check_msd(self, word):
|
||||||
matches.representations[component_id] = text_forms[(msd, lemma)] #word_renderer.render(word_lemma, word_msd)
|
selectors = self.data
|
||||||
|
|
||||||
break
|
|
||||||
|
|
||||||
def are_agreements_ok(w1_msd, ow_lemma, ow_msd, agreements, ow_texts):
|
|
||||||
for w2_msd, w2_txt in word_renderer.available_words(ow_lemma, ow_texts):
|
|
||||||
if ow_msd[0] != w2_msd[0]:
|
|
||||||
continue
|
|
||||||
|
|
||||||
if check_agreement(w1_msd, w2_msd, agreements):
|
|
||||||
return w2_txt
|
|
||||||
|
|
||||||
def check_msd(word, selectors):
|
|
||||||
for key, value in selectors.items():
|
for key, value in selectors.items():
|
||||||
t = word.msd[0]
|
t = word.msd[0]
|
||||||
v = TAGSET[t].index(key.lower())
|
v = TAGSET[t].index(key.lower())
|
||||||
|
@ -277,7 +235,47 @@ class ComponentRendition:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
return True
|
return True
|
||||||
|
pass
|
||||||
|
|
||||||
|
def add_word(self, word):
|
||||||
|
if self.backup_word is None:
|
||||||
|
msd = self.word_renderer.get_lemma_msd(word.lemma, word.msd)
|
||||||
|
WordLemma = namedtuple('WordLemmaOnly', 'msd most_frequent_text lemma text')
|
||||||
|
self.backup_word = WordLemma(msd=msd, most_frequent_text=lambda *x: None, lemma=None, text=None)
|
||||||
|
|
||||||
|
if self.check_msd(word):
|
||||||
|
super().add_word(word)
|
||||||
|
|
||||||
|
def _render(self):
|
||||||
|
self.words.append(self.backup_word)
|
||||||
|
return super()._render()
|
||||||
|
|
||||||
|
class WordFormAgreementCR(ComponentRepresentation):
|
||||||
|
def __init__(self, data, word_renderer):
|
||||||
|
super().__init__(data, word_renderer)
|
||||||
|
self.agree_with, self.data = self.data
|
||||||
|
|
||||||
|
def get_agreement(self):
|
||||||
|
return self.agree_with
|
||||||
|
|
||||||
|
def match(self, word_msd):
|
||||||
|
word_category = self.words[0].msd[0]
|
||||||
|
word_lemma = self.words[0].lemma
|
||||||
|
agreements = self.data
|
||||||
|
|
||||||
|
existing = [(w.msd, w.text) for w in self.words]
|
||||||
|
|
||||||
|
for candidate_msd, candidate_text in self.word_renderer.available_words(word_lemma, existing):
|
||||||
|
if word_category != candidate_msd[0]:
|
||||||
|
continue
|
||||||
|
|
||||||
|
if WordFormAgreementCR.check_agreement(word_msd, candidate_msd, agreements):
|
||||||
|
self.rendition_text = candidate_text
|
||||||
|
return True
|
||||||
|
|
||||||
|
return False
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
def check_agreement(msd1, msd2, agreements):
|
def check_agreement(msd1, msd2, agreements):
|
||||||
for agr_case in agreements:
|
for agr_case in agreements:
|
||||||
t1 = msd1[0]
|
t1 = msd1[0]
|
||||||
|
@ -309,67 +307,220 @@ class ComponentRendition:
|
||||||
|
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
def render(self):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class ComponentRendition:
|
||||||
|
def __init__(self):
|
||||||
|
self.more = None
|
||||||
|
self.representation_factory = ComponentRepresentation
|
||||||
|
|
||||||
|
def _set_more(self, m):
|
||||||
|
self.more = m
|
||||||
|
|
||||||
|
def add_feature(self, feature):
|
||||||
|
if 'rendition' in feature:
|
||||||
|
if feature['rendition'] == "lemma":
|
||||||
|
self.representation_factory = LemmaCR
|
||||||
|
elif feature['rendition'] == "word_form":
|
||||||
|
# just by default, changes with selection
|
||||||
|
self.representation_factory = WordFormAnyCR
|
||||||
|
elif feature['rendition'] == "lexis":
|
||||||
|
self.representation_factory = LexisCR
|
||||||
|
self.mor = feature['string']
|
||||||
|
else:
|
||||||
|
raise NotImplementedError("Representation rendition: {}".format(feature))
|
||||||
|
|
||||||
|
elif 'selection' in feature:
|
||||||
|
if feature['selection'] == "msd":
|
||||||
|
self.representation_factory = WordFormMsdCR
|
||||||
|
self.more = {k: v for k, v in feature.items() if k != 'selection'}
|
||||||
|
elif feature['selection'] == "all":
|
||||||
|
self.representation_factory = WordFormAllCR
|
||||||
|
elif feature['selection'] == 'agreement':
|
||||||
|
assert(feature['head'][:4] == 'cid_')
|
||||||
|
assert(feature['msd'] is not None)
|
||||||
|
self.representation_factory = WordFormAgreementCR
|
||||||
|
self.more = (feature['head'][4:], feature['msd'].split('+'))
|
||||||
|
else:
|
||||||
|
raise NotImplementedError("Representation selection: {}".format(feature))
|
||||||
|
|
||||||
|
else:
|
||||||
|
return None
|
||||||
|
|
||||||
|
def cr_instance(self, word_renderer):
|
||||||
|
return self.representation_factory(self.more, word_renderer)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def set_representations(matches, structure, word_renderer):
|
||||||
|
representations = {}
|
||||||
|
for c in structure.components:
|
||||||
|
representations[c.idx] = []
|
||||||
|
for rep in c.representation:
|
||||||
|
representations[c.idx].append(rep.cr_instance(word_renderer))
|
||||||
|
|
||||||
|
for cid, reps in representations.items():
|
||||||
|
for rep in reps:
|
||||||
|
agr = rep.get_agreement()
|
||||||
|
if agr is None:
|
||||||
|
continue
|
||||||
|
|
||||||
|
if len(representations[agr]) != 1:
|
||||||
|
n = len(representations[agr])
|
||||||
|
raise NotImplementedError(
|
||||||
|
"Structure {}: ".format(structure.id) +
|
||||||
|
"component {} has agreement".format(cid) +
|
||||||
|
" with component {}".format(agr) +
|
||||||
|
", however there are {} (!= 1) representations".format(n) +
|
||||||
|
" of component {}!".format(agr))
|
||||||
|
|
||||||
|
representations[agr][0].agreement = rep
|
||||||
|
|
||||||
|
# representations = {
|
||||||
|
# c.idx: [[], None] if c.representation.isit(Rendition.WordForm) else [True, ""]
|
||||||
|
# for c in structure.components
|
||||||
|
# }
|
||||||
|
# found_agreements = {}
|
||||||
|
|
||||||
|
# def render_form(component_id, lst, backup_word):
|
||||||
|
# if backup_word is not None:
|
||||||
|
# lst.append(backup_word)
|
||||||
|
|
||||||
|
# text_forms = {}
|
||||||
|
# msd_lemma_txt_triplets = Counter([(w.msd, w.lemma, w.text) for w in lst])
|
||||||
|
# for (msd, lemma, text), _n in reversed(msd_lemma_txt_triplets.most_common()):
|
||||||
|
# text_forms[(msd, lemma)] = text
|
||||||
|
|
||||||
|
# lst_ctr = []
|
||||||
|
# for word in lst:
|
||||||
|
# lst_ctr.append((word.msd, word.lemma))
|
||||||
|
# sorted_lst = sorted(set(lst_ctr), key=lst.count)
|
||||||
|
|
||||||
|
# for word_msd, word_lemma in sorted_lst:
|
||||||
|
# if component_id in found_agreements:
|
||||||
|
# other_component_id, other_word, agreements, other_texts = found_agreements[component_id]
|
||||||
|
# agr = are_agreements_ok(word_msd, other_word.lemma, other_word.msd, agreements, other_texts)
|
||||||
|
# if agr is None:
|
||||||
|
# continue
|
||||||
|
|
||||||
|
# matches.representations[other_component_id] = agr
|
||||||
|
|
||||||
|
# if word_lemma is not None:
|
||||||
|
# matches.representations[component_id] = text_forms[(msd, lemma)] #word_renderer.render(word_lemma, word_msd)
|
||||||
|
|
||||||
|
# break
|
||||||
|
|
||||||
|
# def are_agreements_ok(w1_msd, ow_lemma, ow_msd, agreements, ow_texts):
|
||||||
|
# for w2_msd, w2_txt in word_renderer.available_words(ow_lemma, ow_texts):
|
||||||
|
# if ow_msd[0] != w2_msd[0]:
|
||||||
|
# continue
|
||||||
|
|
||||||
|
# if check_agreement(w1_msd, w2_msd, agreements):
|
||||||
|
# return w2_txt
|
||||||
|
|
||||||
|
|
||||||
|
# def check_agreement(msd1, msd2, agreements):
|
||||||
|
# for agr_case in agreements:
|
||||||
|
# t1 = msd1[0]
|
||||||
|
# # if not in msd, some strange msd was tried, skipping...
|
||||||
|
# if agr_case not in TAGSET[t1]:
|
||||||
|
# logging.warning("Cannot do agreement: {} for msd {} not found!".format(agr_case, msd1))
|
||||||
|
# return False
|
||||||
|
|
||||||
|
# v1 = TAGSET[t1].index(agr_case)
|
||||||
|
# # if none specified: nedolocnik, always agrees
|
||||||
|
# if v1 + 1 >= len(msd1):
|
||||||
|
# continue
|
||||||
|
# # first is uppercase, not in TAGSET
|
||||||
|
# m1 = msd1[v1 + 1]
|
||||||
|
|
||||||
|
# # REPEAT (not DRY!)
|
||||||
|
# t2 = msd2[0]
|
||||||
|
# if agr_case not in TAGSET[t2]:
|
||||||
|
# logging.warning("Cannot do agreement: {} for msd {} not found!".format(agr_case, msd2))
|
||||||
|
# return False
|
||||||
|
# v2 = TAGSET[t2].index(agr_case)
|
||||||
|
# if v2 + 1 >= len(msd2):
|
||||||
|
# continue
|
||||||
|
# m2 = msd2[v2 + 1]
|
||||||
|
|
||||||
|
# # match!
|
||||||
|
# if '-' not in [m1, m2] and m1 != m2:
|
||||||
|
# return False
|
||||||
|
|
||||||
|
# return True
|
||||||
|
|
||||||
for words in matches.matches:
|
for words in matches.matches:
|
||||||
# first pass, check everything but agreements
|
# first pass, check everything but agreements
|
||||||
for w_id, w in words.items():
|
for w_id, w in words.items():
|
||||||
component = structure.get_component(w_id)
|
component = structure.get_component(w_id)
|
||||||
rep = component.representation
|
component_representations = representations[component.idx]
|
||||||
word_component_id[w.id] = w_id
|
for representation in component_representations:
|
||||||
|
representation.add_word(w)
|
||||||
|
|
||||||
if rep.isit(Rendition.Lemma):
|
# if rep.isit(Rendition.Lemma):
|
||||||
representations[w_id][0] = False
|
# representations[w_id][0] = False
|
||||||
representations[w_id][1] = w.lemma
|
# representations[w_id][1] = w.lemma
|
||||||
elif rep.isit(Rendition.Lexis):
|
# elif rep.isit(Rendition.Lexis):
|
||||||
representations[w_id][0] = False
|
# representations[w_id][0] = False
|
||||||
representations[w_id][1] = rep.more
|
# representations[w_id][1] = rep.more
|
||||||
elif rep.isit(Rendition.Unknown):
|
# elif rep.isit(Rendition.Unknown):
|
||||||
representations[w_id][0] = False
|
# representations[w_id][0] = False
|
||||||
representations[w_id][1] = ""
|
# representations[w_id][1] = ""
|
||||||
|
|
||||||
# it HAS to be word_form now
|
# # it HAS to be word_form now
|
||||||
else:
|
# else:
|
||||||
assert(rep.isit(Rendition.WordForm))
|
# assert(rep.isit(Rendition.WordForm))
|
||||||
wf_type, more = rep.more
|
# wf_type, more = rep.more
|
||||||
add = True
|
# add = True
|
||||||
|
|
||||||
if wf_type is WordFormSelection.Msd:
|
# if wf_type is WordFormSelection.Msd:
|
||||||
add = check_msd(w, more)
|
# add = check_msd(w, more)
|
||||||
func = render_form
|
# func = render_form
|
||||||
elif wf_type is WordFormSelection.All:
|
# elif wf_type is WordFormSelection.All:
|
||||||
func = render_all
|
# func = render_all
|
||||||
elif wf_type is WordFormSelection.Any:
|
# elif wf_type is WordFormSelection.Any:
|
||||||
func = render_form
|
# func = render_form
|
||||||
else:
|
# else:
|
||||||
assert(wf_type is WordFormSelection.Agreement)
|
# assert(wf_type is WordFormSelection.Agreement)
|
||||||
other_w, agreements = more
|
# other_w, agreements = more
|
||||||
if other_w not in found_agreements:
|
# if other_w not in found_agreements:
|
||||||
found_agreements[other_w] = (w_id, w, agreements, [])
|
# found_agreements[other_w] = (w_id, w, agreements, [])
|
||||||
|
|
||||||
found_agreements[other_w][-1].append((w.msd, w.text))
|
# found_agreements[other_w][-1].append((w.msd, w.text))
|
||||||
func = lambda *x: None
|
# func = lambda *x: None
|
||||||
|
|
||||||
representations[w_id][1] = func
|
# representations[w_id][1] = func
|
||||||
if add:
|
# if add:
|
||||||
representations[w_id][0].append(w)
|
# representations[w_id][0].append(w)
|
||||||
|
|
||||||
# just need to set representation to first group,
|
for cid, reps in representations.items():
|
||||||
# but in correct order, agreements last!
|
for rep in reps:
|
||||||
representation_sorted_words = []
|
rep.render()
|
||||||
for w_id, w in matches.matches[0].items():
|
|
||||||
rep = component.representation
|
|
||||||
if rep.isit(Rendition.WordForm) and rep.more[0] is WordFormSelection.Agreement:
|
|
||||||
representation_sorted_words.append((w_id, w))
|
|
||||||
else:
|
|
||||||
representation_sorted_words.insert(0, (w_id, w))
|
|
||||||
|
|
||||||
for w_id, w in representation_sorted_words:
|
for cid, reps in representations.items():
|
||||||
data = representations[w_id]
|
rep = " ".join(rep.rendition() for rep in reps)
|
||||||
if type(data[1]) is str:
|
matches.representations[cid] = rep
|
||||||
matches.representations[w_id] = None if data[0] else data[1]
|
|
||||||
else:
|
# # just need to set representation to first group,
|
||||||
backup_msd = word_renderer.get_lemma_msd(w.lemma)
|
# # but in correct order, agreements last!
|
||||||
backup_word = lemma_only_word(backup_msd)
|
# representation_sorted_words = []
|
||||||
data[1](str(w_id), data[0], backup_word)
|
# for w_id, w in matches.matches[0].items():
|
||||||
|
# rep = component.representation
|
||||||
|
# if rep.isit(Rendition.WordForm) and rep.more[0] is WordFormSelection.Agreement:
|
||||||
|
# representation_sorted_words.append((w_id, w))
|
||||||
|
# else:
|
||||||
|
# representation_sorted_words.insert(0, (w_id, w))
|
||||||
|
|
||||||
|
# for w_id, w in representation_sorted_words:
|
||||||
|
# data = representations[w_id]
|
||||||
|
# if type(data[1]) is str:
|
||||||
|
# matches.representations[w_id] = None if data[0] else data[1]
|
||||||
|
# else:
|
||||||
|
# backup_msd = word_renderer.get_lemma_msd(w.lemma)
|
||||||
|
# backup_word = lemma_only_word(backup_msd)
|
||||||
|
# data[1](str(w_id), data[0], backup_word)
|
||||||
|
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
return str(self.rendition)
|
return str(self.rendition)
|
||||||
|
@ -519,7 +670,7 @@ class Component:
|
||||||
self.idx = idx
|
self.idx = idx
|
||||||
self.restriction = None
|
self.restriction = None
|
||||||
self.next_element = []
|
self.next_element = []
|
||||||
self.representation = ComponentRendition()
|
self.representation = []
|
||||||
self.selection = {}
|
self.selection = {}
|
||||||
|
|
||||||
self.iter_ctr = 0
|
self.iter_ctr = 0
|
||||||
|
@ -541,8 +692,11 @@ class Component:
|
||||||
raise RuntimeError("Unreachable")
|
raise RuntimeError("Unreachable")
|
||||||
|
|
||||||
def set_representation(self, representation):
|
def set_representation(self, representation):
|
||||||
for feature in representation:
|
for rep in representation:
|
||||||
self.representation.add_feature(feature.attrib)
|
crend = ComponentRendition()
|
||||||
|
for feature in rep:
|
||||||
|
crend.add_feature(feature.attrib)
|
||||||
|
self.representation.append(crend)
|
||||||
|
|
||||||
def find_next(self, deps, comps, restrs, reprs):
|
def find_next(self, deps, comps, restrs, reprs):
|
||||||
to_ret = []
|
to_ret = []
|
||||||
|
@ -721,21 +875,17 @@ class SyntacticStructure:
|
||||||
return st
|
return st
|
||||||
|
|
||||||
def add_representation(self, n, rep_el, forms):
|
def add_representation(self, n, rep_el, forms):
|
||||||
if rep_el.tag == "representation_and":
|
|
||||||
rep_el = rep_el[0]
|
|
||||||
logging.warning("Only using first reprentation in representation_and in structure {}".format(self.id))
|
|
||||||
|
|
||||||
assert(rep_el.tag == "representation")
|
assert(rep_el.tag == "representation")
|
||||||
|
to_add = []
|
||||||
for el in rep_el:
|
for el in rep_el:
|
||||||
assert(el.tag == "feature")
|
assert(el.tag == "feature")
|
||||||
if 'rendition' in el.attrib:
|
if 'rendition' in el.attrib or 'selection' in el.attrib:
|
||||||
forms[n].append(el)
|
to_add.append(el)
|
||||||
elif 'selection' in el.attrib:
|
|
||||||
forms[n].append(el)
|
|
||||||
else:
|
else:
|
||||||
logging.warning("Strange representation feature in structure {}. Skipping"
|
logging.warning("Strange representation feature in structure {}. Skipping"
|
||||||
.format(self.id))
|
.format(self.id))
|
||||||
continue
|
continue
|
||||||
|
forms[n].append(to_add)
|
||||||
|
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
comp_str = "\n".join(str(comp) for comp in self.components)
|
comp_str = "\n".join(str(comp) for comp in self.components)
|
||||||
|
@ -892,16 +1042,17 @@ class Word:
|
||||||
return word_renderer.render(self.lemma, self.msd)
|
return word_renderer.render(self.lemma, self.msd)
|
||||||
|
|
||||||
class WordMsdRenderer:
|
class WordMsdRenderer:
|
||||||
def __init__(self):
|
def __init__(self, lemma_features):
|
||||||
self.all_words = []
|
self.all_words = []
|
||||||
self.rendered_words = {}
|
self.rendered_words = {}
|
||||||
self.frequent_words = {}
|
self.frequent_words = {}
|
||||||
self.lemma_msd = {}
|
self.lemma_msd = {}
|
||||||
|
self.lemma_features = lemma_features
|
||||||
|
|
||||||
def add_words(self, words):
|
def add_words(self, words):
|
||||||
self.all_words.extend(words)
|
self.all_words.extend(words)
|
||||||
|
|
||||||
def generate_renders(self, lemma_features):
|
def generate_renders(self):
|
||||||
data = defaultdict(lambda: defaultdict(list))
|
data = defaultdict(lambda: defaultdict(list))
|
||||||
for w in self.all_words:
|
for w in self.all_words:
|
||||||
data[w.lemma][w.msd].append(w.text)
|
data[w.lemma][w.msd].append(w.text)
|
||||||
|
@ -926,11 +1077,12 @@ class WordMsdRenderer:
|
||||||
for (msd, txt), n in sorted(freq_words.items(), key=lambda x: -x[1]):
|
for (msd, txt), n in sorted(freq_words.items(), key=lambda x: -x[1]):
|
||||||
self.frequent_words[lemma].append((msd, txt, n))
|
self.frequent_words[lemma].append((msd, txt, n))
|
||||||
|
|
||||||
|
lf = self.lemma_features
|
||||||
for lemma in self.lemma_msd.keys():
|
for lemma in self.lemma_msd.keys():
|
||||||
cmsd = self.lemma_msd[lemma]
|
cmsd = self.lemma_msd[lemma]
|
||||||
if cmsd[0] in lemma_features:
|
if cmsd[0] in lf:
|
||||||
self.lemma_msd[lemma] = "".join(
|
self.lemma_msd[lemma] = "".join(
|
||||||
l1 if l1 != "-" else l2 for l1, l2 in zip(lemma_features[cmsd[0]], cmsd)
|
l1 if l1 != "-" else l2 for l1, l2 in zip(lf[cmsd[0]], cmsd)
|
||||||
)
|
)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
|
@ -952,7 +1104,7 @@ class WordMsdRenderer:
|
||||||
|
|
||||||
def available_words(self, lemma, existing_texts):
|
def available_words(self, lemma, existing_texts):
|
||||||
counted_texts = Counter(existing_texts)
|
counted_texts = Counter(existing_texts)
|
||||||
for (msd, text), n in counted_texts.most_common():
|
for (msd, text), _n in counted_texts.most_common():
|
||||||
yield (msd, text)
|
yield (msd, text)
|
||||||
|
|
||||||
if lemma in self.frequent_words:
|
if lemma in self.frequent_words:
|
||||||
|
@ -960,11 +1112,17 @@ class WordMsdRenderer:
|
||||||
if (msd, text) not in counted_texts:
|
if (msd, text) not in counted_texts:
|
||||||
yield (msd, text)
|
yield (msd, text)
|
||||||
|
|
||||||
def get_lemma_msd(self, lemma):
|
def get_lemma_msd(self, lemma, word_msd):
|
||||||
if lemma in self.lemma_msd and self.lemma_msd[lemma][0] != '-':
|
# should be here, since we collect every lemma
|
||||||
return self.lemma_msd[lemma]
|
lemma_msd = self.lemma_msd[lemma]
|
||||||
|
|
||||||
|
if lemma_msd[0] == '-':
|
||||||
|
if word_msd[0] in self.lemma_features:
|
||||||
|
return self.lemma_features[word_msd[0]]
|
||||||
else:
|
else:
|
||||||
return None
|
return '-'
|
||||||
|
else:
|
||||||
|
return lemma_msd
|
||||||
|
|
||||||
def is_root_id(id_):
|
def is_root_id(id_):
|
||||||
return len(id_.split('.')) == 3
|
return len(id_.split('.')) == 3
|
||||||
|
@ -1200,6 +1358,7 @@ class ColocationIds:
|
||||||
idx = 1
|
idx = 1
|
||||||
for _1, sm in tqdm(self.data.items()):
|
for _1, sm in tqdm(self.data.items()):
|
||||||
ComponentRendition.set_representations(sm, components_dict[sm.structure_id], word_renderer)
|
ComponentRendition.set_representations(sm, components_dict[sm.structure_id], word_renderer)
|
||||||
|
print(idx)
|
||||||
idx += 1
|
idx += 1
|
||||||
|
|
||||||
|
|
||||||
|
@ -1227,7 +1386,7 @@ def main(input_file, structures_file, args):
|
||||||
logging.debug(str(s))
|
logging.debug(str(s))
|
||||||
|
|
||||||
colocation_ids = ColocationIds()
|
colocation_ids = ColocationIds()
|
||||||
word_renderer = WordMsdRenderer()
|
word_renderer = WordMsdRenderer(lemma_msds)
|
||||||
|
|
||||||
# if True:
|
# if True:
|
||||||
# with open("match_word.p", "rb") as fp:
|
# with open("match_word.p", "rb") as fp:
|
||||||
|
@ -1279,13 +1438,14 @@ def main(input_file, structures_file, args):
|
||||||
word_renderer.add_words(words)
|
word_renderer.add_words(words)
|
||||||
|
|
||||||
# get word renders for lemma/msd
|
# get word renders for lemma/msd
|
||||||
word_renderer.generate_renders(lemma_msds)
|
word_renderer.generate_renders()
|
||||||
|
|
||||||
|
if args.output:
|
||||||
# figure out representations!
|
# figure out representations!
|
||||||
colocation_ids.set_representations(structures, word_renderer)
|
colocation_ids.set_representations(structures, word_renderer)
|
||||||
|
Writer.make_output_writer(args).write_out(structures, colocation_ids)
|
||||||
if args.all:
|
if args.all:
|
||||||
Writer.make_all_writer(args).write_out(structures, colocation_ids)
|
Writer.make_all_writer(args).write_out(structures, colocation_ids)
|
||||||
Writer.make_output_writer(args).write_out(structures, colocation_ids)
|
|
||||||
|
|
||||||
logging.debug([(k, len(v)) for k, v in matches.items()])
|
logging.debug([(k, len(v)) for k, v in matches.items()])
|
||||||
logging.debug(sum(len(v) for _, v in matches.items()))
|
logging.debug(sum(len(v) for _, v in matches.items()))
|
||||||
|
|
Loading…
Reference in New Issue
Block a user