You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
luscenje_struktur/luscenje_struktur/syntactic_structure.py

152 lines
5.3 KiB

from xml.etree import ElementTree
import logging
import pickle
from luscenje_struktur.codes_tagset import PPB_DEPRELS
from luscenje_struktur.component import Component, ComponentType
from luscenje_struktur.lemma_features import get_lemma_features
class SyntacticStructure:
    """A syntactic structure: an identifier plus the components it consists of.

    Instances are normally created from an XML element via ``from_xml``.
    """

    def __init__(self):
        # Identifier read from the XML ``id`` attribute (or ``tempId`` as a fallback).
        self.id = None
        # self.lbs = None
        # Components of the structure; populated by ``from_xml``.
        self.components = []
        # True when the artificial root component is stored as ``components[0]``.
        self.fake_root_included = False

    @staticmethod
    def from_xml(xml, no_stats):
        """Build a ``SyntacticStructure`` from its XML element.

        Args:
            xml: Element describing one structure. It must have exactly one
                child (the "system" element) whose ``type`` attribute is
                ``'JOS'`` or ``'UD'``. That child must in turn have exactly
                three children, read in order as components, dependencies
                and definitions.
            no_stats: When true, the core-component classification
                (``determine_core2w`` / ``determine_core2w_ud``) is skipped.

        Returns:
            The populated ``SyntacticStructure``.

        Raises:
            AssertionError: If ``xml`` does not have exactly one child, or the
                system type is neither ``'JOS'`` nor ``'UD'``.
            ValueError: If the system element does not have exactly three
                children (unpacking fails).
            NotImplementedError: If a definition contains a child whose tag
                starts with neither ``restriction`` nor ``representation``.
        """
        st = SyntacticStructure()
        st.id = xml.get('id')
        if st.id is None:
            st.id = xml.get('tempId')
        # st.lbs = xml.get('LBS')

        children = list(xml)
        assert len(children) == 1
        system = children[0]

        system_type = system.get('type')
        assert system_type in ('JOS', 'UD')

        components, dependencies, definitions = list(system)

        # Each dependency becomes a (from, to, label, order) tuple.
        deps = [(dep.get('from'), dep.get('to'), dep.get('label'), dep.get('order'))
                for dep in dependencies]
        # Component attributes, keyed by component id.
        comps = {comp.get('cid'): dict(comp.items()) for comp in components}

        # Per-component restriction elements and representation feature lists.
        restrs, forms = {}, {}
        for comp in definitions:
            n = comp.get('cid')
            restrs[n] = []
            forms[n] = []

            for el in comp:
                if el.tag.startswith("restriction"):
                    restrs[n].append(el)
                elif el.tag.startswith("representation"):
                    st.add_representation(n, el, forms)
                else:
                    raise NotImplementedError("Unknown definition: {} in structure {}"
                                              .format(el.tag, st.id))

        # The real components are discovered by walking out from an
        # artificial root component with cid '#'.
        fake_root_component = Component({'cid': '#', 'type': 'other', 'restriction': None}, system_type)
        fake_root_component_children = fake_root_component.find_next(deps, comps, restrs, forms, system_type)

        # Dependencies labelled 'modra' point to the artificial root, so the
        # root is kept as the first component whenever one is present.
        if any(dep[2] == 'modra' for dep in deps):
            st.fake_root_included = True
            st.components = [fake_root_component] + fake_root_component_children
        else:
            st.components = fake_root_component_children

        if not no_stats:
            if system_type == 'JOS':
                st.determine_core2w()
            elif system_type == 'UD':
                st.determine_core2w_ud()

        return st

    def determine_core2w_ud(self):
        """Mark the two core components as ``Core2w`` for a UD structure.

        A ``Core`` component is promoted unless it appears as the first item
        of some component's ``next_element`` entry whose second item is not
        in ``PPB_DEPRELS`` (presumably child component -> dependency label;
        confirm against ``Component``).

        Raises:
            RuntimeError: If the number of promoted components is not exactly
                two. Components promoted before the check stay promoted.
        """
        deprels = {}
        for c in self.components:
            for next_el in c.next_element:
                deprels[next_el[0]] = next_el[1]

        ppb_components_num = 0
        for c in self.components:
            if c.type != ComponentType.Core:
                continue
            if c in deprels and deprels[c] not in PPB_DEPRELS:
                continue
            ppb_components_num += 1
            c.type = ComponentType.Core2w

        # Raised explicitly (not asserted) so the check survives ``python -O``
        # and matches the error type used by ``determine_core2w``.
        if ppb_components_num != 2:
            raise RuntimeError("Cannot determine 2 'jedrna polnopomenska beseda' for", self.id)

    def determine_core2w(self):
        """Mark the two core components with the lowest ``ppb`` as ``Core2w``.

        Each ``Core`` component is scored with the smallest ``ppb`` among its
        restrictions, capped at 4. The two lowest-scoring components are
        promoted; with fewer than two core components, all of them are.

        Raises:
            RuntimeError: If there are more than two core components and the
                second and third lowest scores are tied.
        """
        ppb_components = []
        for c in self.components:
            if c.type != ComponentType.Core:
                continue

            ppb = 4
            for r in c.restrictions:
                ppb = min(r.ppb, ppb)

            ppb_components.append((c, ppb))

        ppb_components.sort(key=lambda pair: pair[1])
        if len(ppb_components) > 2 and ppb_components[1][1] == ppb_components[2][1]:
            raise RuntimeError("Cannot determine 2 'jedrna polnopomenska beseda' for", self.id)

        for c, _ in ppb_components[:2]:
            c.type = ComponentType.Core2w

    def add_representation(self, n, rep_el, forms):
        """Append the usable features of one representation to ``forms[n]``.

        Args:
            n: Component id; ``forms[n]`` must already be a list.
            rep_el: Element whose tag is exactly ``representation`` and whose
                children are all ``feature`` elements.
            forms: Mapping from component id to a list of feature lists.
                One list is appended per call, even if it ends up empty.

        Raises:
            AssertionError: If ``rep_el`` or one of its children has an
                unexpected tag.
        """
        assert rep_el.tag == "representation"
        to_add = []

        for el in rep_el:
            assert el.tag == "feature"
            if 'rendition' in el.attrib or 'selection' in el.attrib:
                to_add.append(el)
            else:
                # Features with neither attribute are logged and dropped.
                logging.warning("Strange representation feature in structure {}. Skipping"
                                .format(self.id))

        forms[n].append(to_add)

    def get_component(self, idx):
        """Return the first component whose ``idx`` equals ``idx``.

        Raises:
            RuntimeError: If no component has that ``idx``.
        """
        for c in self.components:
            if c.idx == idx:
                return c

        cs = [c.idx for c in self.components]
        msg = "Structure id={} has components: {}, but no component: {}".format(self.id, cs, (idx,))
        raise RuntimeError(msg)

    def match(self, word):
        """Match ``word`` against the first component.

        Returns:
            Whatever the first component's ``match`` returns, or an empty
            list when it returns ``None``.
        """
        matches = self.components[0].match(word)
        return [] if matches is None else matches
def build_structures(args):
    """Load every non-'single' syntactic structure from the structures XML file.

    Args:
        args: Object with the attributes ``structures`` (path of the XML
            file), ``out`` and ``stats``. The core-component classification
            is skipped when both ``out`` and ``stats`` are ``None``.

    Returns:
        A tuple ``(structures, lemma_features, max_num_components)``:
        the list of parsed ``SyntacticStructure`` objects, the result of
        ``get_lemma_features`` on the XML root, and the largest component
        count among the structures (not counting an included artificial
        root), or ``-1`` when no structure was loaded.

    Raises:
        KeyError: If a ``syntactic_structure`` element has no ``type``
            attribute.
    """
    no_stats = args.out is None and args.stats is None

    # Parse straight from the file so the XML parser decodes the bytes
    # according to the document itself instead of the locale's default
    # text encoding.
    et = ElementTree.parse(args.structures).getroot()

    structures = []
    max_num_components = -1
    for structure in et.iter('syntactic_structure'):
        if structure.attrib['type'] == 'single':
            continue

        to_append = SyntacticStructure.from_xml(structure, no_stats)
        if to_append is None:
            continue
        structures.append(to_append)

        # The artificial root, when stored, does not count as a component.
        to_append_len = len(to_append.components)
        if to_append.fake_root_included:
            to_append_len -= 1
        max_num_components = max(max_num_components, to_append_len)

    lemma_features = get_lemma_features(et)
    return structures, lemma_features, max_num_components