111 lines
3.5 KiB
Python
111 lines
3.5 KiB
Python
|
from xml.etree import ElementTree
|
||
|
import logging
|
||
|
|
||
|
from component import Component, ComponentType
|
||
|
from lemma_features import get_lemma_features
|
||
|
|
||
|
class SyntacticStructure:
    """One syntactic structure read from a ``syntactic_structure`` XML element.

    Instances are normally created through :meth:`from_xml`, which fills in:

    * ``id`` -- the element's ``id`` attribute,
    * ``lbs`` -- the element's ``LBS`` attribute,
    * ``components`` -- the components returned by ``Component.find_next``
      when started from an artificial root component.
    """

    def __init__(self):
        self.id = None
        self.lbs = None
        self.components = []

    @staticmethod
    def from_xml(xml):
        """Build a structure from a ``syntactic_structure`` XML element.

        The element must have exactly one child (the "system") whose ``type``
        attribute is ``'JOS'``; that child must in turn have exactly three
        children, read in order as components, dependencies and definitions.

        Raises:
            AssertionError: if the element does not have exactly one child,
                the system type is not ``'JOS'``, or a definition carries
                more than one restriction element.
            ValueError: if the system does not have exactly three children.
            NotImplementedError: if a definition contains an element whose
                tag starts with neither ``restriction`` nor
                ``representation``.
        """
        st = SyntacticStructure()
        st.id = xml.get('id')
        st.lbs = xml.get('LBS')

        # Materialise the children once instead of iterating the element twice.
        systems = list(xml)
        assert len(systems) == 1, \
            "Structure {} must contain exactly one system, found {}".format(
                st.id, len(systems))
        system = systems[0]

        assert system.get('type') == 'JOS', \
            "Unsupported system type {!r} in structure {}".format(
                system.get('type'), st.id)
        components, dependencies, definitions = list(system)

        deps = [(dep.get('from'), dep.get('to'), dep.get('label'), dep.get('order'))
                for dep in dependencies]
        comps = {comp.get('cid'): dict(comp.attrib) for comp in components}

        # Per component id: the single restriction element (or None) and the
        # list of representations collected by add_representation().
        restrs, forms = {}, {}

        for comp in definitions:
            n = comp.get('cid')
            restrs[n] = None
            forms[n] = []

            for el in comp:
                if el.tag.startswith("restriction"):
                    # At most one restriction element per definition.
                    assert restrs[n] is None, \
                        "Multiple restrictions for component {} in structure {}".format(
                            n, st.id)
                    restrs[n] = el
                elif el.tag.startswith("representation"):
                    # NOTE: add_representation() asserts the tag is exactly
                    # "representation", which is stricter than this prefix test.
                    st.add_representation(n, el, forms)
                else:
                    raise NotImplementedError("Unknown definition: {} in structure {}"
                                              .format(el.tag, st.id))

        # The dependency walk starts from an artificial root with cid '#'.
        fake_root_component = Component({'cid': '#', 'type': 'other'})
        st.components = fake_root_component.find_next(deps, comps, restrs, forms)

        st.determine_core2w()
        return st

    def determine_core2w(self):
        """Retype the two lowest-``ppb`` core components as ``Core2w``.

        Only components whose type is ``ComponentType.Core`` are considered.
        A component's ppb is the smallest ``ppb`` among its restrictions,
        capped at 4 (which is also the value used when it has none).

        Raises:
            RuntimeError: if there are more than two core components and the
                second and third lowest ppb values are equal, so the pair
                cannot be chosen unambiguously.
        """
        ppb_components = []
        for c in self.components:
            if c.type != ComponentType.Core:
                continue

            ppb = min([4] + [r.ppb for r in c.restrictions])
            ppb_components.append((c, ppb))

        # list.sort is stable, so ties keep their original component order.
        ppb_components.sort(key=lambda pair: pair[1])
        if len(ppb_components) > 2 and ppb_components[1][1] == ppb_components[2][1]:
            # Format the id into the message; passing it as a second
            # positional argument made the error text render as a tuple.
            raise RuntimeError(
                "Cannot determine 2 'jedrna polnopomenska beseda' for {}".format(self.id))

        for c, _ in ppb_components[:2]:
            c.type = ComponentType.Core2w

    def add_representation(self, n, rep_el, forms):
        """Append the usable features of one representation to ``forms[n]``.

        Args:
            n: component id; ``forms[n]`` must already be a list.
            rep_el: a ``representation`` element whose children are all
                ``feature`` elements.
            forms: mapping from component id to a list of representations;
                modified in place.

        Features with neither a ``rendition`` nor a ``selection`` attribute
        are skipped with a warning. One list of kept features is appended per
        call, even when that list is empty.

        Raises:
            AssertionError: if ``rep_el`` or one of its children has an
                unexpected tag.
        """
        assert rep_el.tag == "representation", \
            "Expected a representation element, got {!r}".format(rep_el.tag)
        to_add = []
        for el in rep_el:
            assert el.tag == "feature", \
                "Expected a feature element, got {!r}".format(el.tag)
            if 'rendition' in el.attrib or 'selection' in el.attrib:
                to_add.append(el)
            else:
                # Lazy %-formatting: the message is only built if it is emitted.
                logging.warning(
                    "Strange representation feature in structure %s. Skipping",
                    self.id)
        forms[n].append(to_add)

    def get_component(self, idx):
        """Return the component whose ``idx`` equals *idx*.

        Raises:
            RuntimeError: if no component has that ``idx``.
        """
        for c in self.components:
            if c.idx == idx:
                return c
        raise RuntimeError("Unknown component id: {}".format(idx))

    def match(self, word):
        """Return the matches the first component produces for *word*.

        A ``None`` result from the component is normalised to an empty list.
        A structure without components (the state right after ``__init__``)
        matches nothing and also yields an empty list instead of failing with
        ``IndexError``.
        """
        if not self.components:
            return []
        matches = self.components[0].match(word)
        return [] if matches is None else matches
|
||
|
|
||
|
|
||
|
def build_structures(filename):
    """Parse every syntactic structure defined in an XML file.

    Args:
        filename: path of the XML file to read.

    Returns:
        A tuple ``(structures, lemma_features, max_num_components)`` where
        ``structures`` is the list of ``SyntacticStructure`` objects built
        from each ``syntactic_structure`` element, ``lemma_features`` is the
        result of ``get_lemma_features`` on the document root, and
        ``max_num_components`` is the largest ``len(structure.components)``
        seen (``-1`` when the file contains no structures).
    """
    # Let the XML parser read the raw bytes so that the encoding declared in
    # the XML prolog (UTF-8 by default) is used; opening the file in text
    # mode decoded it with the platform's locale encoding instead.
    et = ElementTree.parse(filename).getroot()

    structures = []
    max_num_components = -1
    for structure in et.iter('syntactic_structure'):
        to_append = SyntacticStructure.from_xml(structure)
        # Defensive: tolerate a from_xml() that declines a structure.
        if to_append is None:
            continue

        structures.append(to_append)
        max_num_components = max(max_num_components, len(to_append.components))

    lemma_features = get_lemma_features(et)
    return structures, lemma_features, max_num_components
|