Updating for new representations; for now, only the parsing is updated

This commit is contained in:
Ozbolt Menegatti 2019-05-12 22:13:22 +02:00
parent de6c73980e
commit b4b93022fe

67
wani.py
View File

@ -129,7 +129,8 @@ class RestrictionType(Enum):
class Rendition(Enum): class Rendition(Enum):
Lemma = 0 Lemma = 0
WordForm = 1 WordForm = 1
Unknown = 2 Lexis = 2
Unknown = 3
class Order(Enum): class Order(Enum):
FromTo = 0 FromTo = 0
@ -164,8 +165,9 @@ class Order(Enum):
raise NotImplementedError("Should not be here: Order match") raise NotImplementedError("Should not be here: Order match")
class ComponentRendition: class ComponentRendition:
def __init__(self, rendition=Rendition.Unknown): def __init__(self, rendition=Rendition.Unknown, lexis=None):
self.word_form = {} assert(type(rendition) is Rendition)
self.lexis = lexis
self.rendition = rendition self.rendition = rendition
def render(self, word): def render(self, word):
@ -173,6 +175,8 @@ class ComponentRendition:
return word.lemma return word.lemma
elif self.rendition == Rendition.WordForm: elif self.rendition == Rendition.WordForm:
return word.text return word.text
elif self.rendition == Rendition.Lexis:
return self.lexis
elif self.rendition == Rendition.Unknown: elif self.rendition == Rendition.Unknown:
return None return None
else: else:
@ -182,27 +186,30 @@ class ComponentRendition:
return str(self.rendition) return str(self.rendition)
# dont know...
class StructureSelection(Enum): class StructureSelection(Enum):
All = 0 All = 0
Frequency = 1 Msd = 1
class ComponentRepresentation: class ComponentRepresentation:
@staticmethod
def new(s): def new(s):
if 'rendition' in s: if 'rendition' in s:
if s['rendition'] == "lemma": if s['rendition'] == "lemma":
return ComponentRendition(Rendition.Lemma) return ComponentRendition(Rendition.Lemma)
elif s['rendition'] == "word_form": elif s['rendition'] == "word_form":
return ComponentRendition(Rendition.WordForm) return ComponentRendition(Rendition.WordForm)
elif s['rendition'] == "lexis":
assert(s['string'] is not None)
return ComponentRendition(Rendition.Lexis, s['string'])
else: else:
raise NotImplementedError("Rendition: {}".format(s)) raise NotImplementedError("Rendition: {}".format(s))
elif 'selection' in s: elif 'selection' in s:
if s['selection'] == "frequency": if s['selection'] == "msd":
return StructureSelection.Frequency return StructureSelection.Msd
elif s['selection'] == "all": elif s['selection'] == "all":
return StructureSelection.All return StructureSelection.All
else: else:
return {s['selection']: s['value']} raise NotImplementedError("Selection: {}".format(s))
else: else:
return None return None
@ -376,7 +383,7 @@ class Component:
def set_representation(self, representation): def set_representation(self, representation):
cr = None cr = None
if representation is not None: if len(representation) > 0:
self.representation = [] self.representation = []
for feature in representation: for feature in representation:
@ -414,9 +421,6 @@ class Component:
others, r2 = next_component.find_next(deps, comps, restrs, reprs) others, r2 = next_component.find_next(deps, comps, restrs, reprs)
to_ret.extend(others) to_ret.extend(others)
if StructureSelection.Frequency in (r1, r2):
representation = StructureSelection.Frequency
return to_ret, representation return to_ret, representation
def name_str(self): def name_str(self):
@ -566,7 +570,7 @@ class SyntacticStructure:
for comp in definitions: for comp in definitions:
n = comp.get('cid') n = comp.get('cid')
restrs[n] = None restrs[n] = None
forms[n] = None forms[n] = []
for el in comp: for el in comp:
if el.tag.startswith("restriction"): if el.tag.startswith("restriction"):
@ -575,29 +579,29 @@ class SyntacticStructure:
elif el.tag.startswith("representation"): elif el.tag.startswith("representation"):
st.add_representation(n, el, forms) st.add_representation(n, el, forms)
else: else:
raise NotImplementedError("definition??") raise NotImplementedError("Unknown definition: {} in structure {}".format(el.tag, st.id))
fake_root_component = Component({'cid': '#', 'type': 'other'}) fake_root_component = Component({'cid': '#', 'type': 'other'})
st.components, st.selection = fake_root_component.find_next(deps, comps, restrs, forms) st.components, st.selection = fake_root_component.find_next(deps, comps, restrs, forms)
return st return st
def add_representation(self, n, el, forms): def add_representation(self, n, rep_el, forms):
if el.tag == "representation": if rep_el.tag == "representation_and":
els = [el] rep_el = rep_el[0]
elif el.tag == "representation_and": logging.warning("Only using first reprentation in representation_and in structure {}".format(self.id))
els = list(el)
else: assert(rep_el.tag == "representation")
raise NotImplementedError("Unknown representation tag: {}".format(el.tag)) for el in rep_el:
assert(el.tag == "feature")
for el in els: if 'rendition' in el.attrib:
if el.get('basic') == 'form': forms[n].append(el)
assert(forms[n] is None) elif 'selection' in el.attrib and el.attrib["selection"] != "agreement":
forms[n] = el forms[n].append(el)
elif el.get('basic') == "agreement": elif 'selection' in el.attrib:
self.add_agreement(n, el) self.add_agreement(n, el)
else: else:
logging.warning("Strange representation (basic={}) in structure {}. Skipping" logging.warning("Strange representation feature in structure {}. Skipping"
.format(el.get('basic'), self.id)) .format(self.id))
continue continue
def add_agreement(self, n, el): def add_agreement(self, n, el):
@ -605,12 +609,13 @@ class SyntacticStructure:
n1 = n n1 = n
n2 = el.get('head')[4:] n2 = el.get('head')[4:]
agreement_str = next(iter(el)).get('agreement') agreement_str = el.get('msd')
assert(agreement_str is not None)
self.agreements.append({ self.agreements.append({
'n1': n1, 'n1': n1,
'n2': n2, 'n2': n2,
'match': agreement_str.split('|')}) 'match': agreement_str.split('+')})
def __str__(self): def __str__(self):
comp_str = "\n".join(str(comp) for comp in self.components) comp_str = "\n".join(str(comp) for comp in self.components)