111 lines
		
	
	
		
			3.5 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			111 lines
		
	
	
		
			3.5 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| from xml.etree import ElementTree
 | |
| import logging
 | |
| 
 | |
| from component import Component, ComponentType
 | |
| from lemma_features import get_lemma_features
 | |
| 
 | |
class SyntacticStructure:
    """A syntactic structure built from a ``syntactic_structure`` XML element.

    Instances are normally created through :meth:`from_xml`, which fills in
    ``id``, ``lbs`` and ``components``.
    """

    def __init__(self):
        # Value of the XML element's ``id`` attribute (None until parsed).
        self.id = None
        # Value of the XML element's ``LBS`` attribute (None until parsed).
        self.lbs = None
        # Result of ``Component.find_next`` on the artificial root component.
        self.components = []

    @staticmethod
    def from_xml(xml):
        """Build a structure from its XML element.

        The element must contain exactly one child whose ``type`` attribute is
        ``'JOS'``. That child must have exactly three children, which are read
        in order as components, dependencies and definitions.

        Args:
            xml: an ElementTree element describing one syntactic structure.

        Returns:
            The populated ``SyntacticStructure``.

        Raises:
            AssertionError: if the element does not have exactly one child,
                the child's type is not ``'JOS'``, or a component definition
                holds more than one restriction element.
            ValueError: if the system element does not have exactly three
                children (raised by the tuple unpacking).
            NotImplementedError: if a definition contains an element whose tag
                starts with neither ``restriction`` nor ``representation``.
        """
        st = SyntacticStructure()
        st.id = xml.get('id')
        st.lbs = xml.get('LBS')

        systems = list(xml)
        assert len(systems) == 1, \
            "Expected exactly one system in structure {}".format(st.id)
        system = systems[0]

        assert system.get('type') == 'JOS', \
            "Unsupported system type in structure {}".format(st.id)
        components, dependencies, definitions = list(system)

        deps = [(dep.get('from'), dep.get('to'), dep.get('label'), dep.get('order'))
                for dep in dependencies]
        comps = {comp.get('cid'): dict(comp.items()) for comp in components}

        # Per component id: the single restriction element (or None) and the
        # list of collected representation feature groups.
        restrs, forms = {}, {}

        for comp in definitions:
            n = comp.get('cid')
            restrs[n] = None
            forms[n] = []

            for el in comp:
                if el.tag.startswith("restriction"):
                    # Only one restriction element is allowed per component.
                    assert restrs[n] is None, \
                        "Multiple restrictions for component {} in structure {}".format(
                            n, st.id)
                    restrs[n] = el
                elif el.tag.startswith("representation"):
                    st.add_representation(n, el, forms)
                else:
                    raise NotImplementedError("Unknown definition: {} in structure {}"
                                              .format(el.tag, st.id))

        # The component list is produced by walking the dependencies from an
        # artificial root component with cid '#'.
        fake_root_component = Component({'cid': '#', 'type': 'other'})
        st.components = fake_root_component.find_next(deps, comps, restrs, forms)

        st.determine_core2w()
        return st

    def determine_core2w(self):
        """Re-type up to two ``Core`` components as ``Core2w``.

        Every component of type ``ComponentType.Core`` is ranked by the
        smallest ``ppb`` value among its restrictions, capped at 4. The two
        lowest-ranked components get ``ComponentType.Core2w``.
        (``ppb`` presumably abbreviates the 'polnopomenska beseda' named in
        the error message below -- confirm against the restriction code.)

        Raises:
            RuntimeError: if there are more than two candidates and the second
                and third share the same rank, so the pair is ambiguous.
        """
        ppb_components = []
        for c in self.components:
            if c.type != ComponentType.Core:
                continue

            # 4 is the starting value, used as-is when there are no restrictions.
            ppb = 4
            for r in c.restrictions:
                ppb = min(r.ppb, ppb)

            ppb_components.append((c, ppb))

        # sorted() is stable, so equally ranked components keep their order.
        ppb_components = sorted(ppb_components, key=lambda pair: pair[1])
        if len(ppb_components) > 2 and ppb_components[1][1] == ppb_components[2][1]:
            raise RuntimeError(
                "Cannot determine 2 'jedrna polnopomenska beseda' for {}".format(self.id))

        for c, _ in ppb_components[:2]:
            c.type = ComponentType.Core2w

    def add_representation(self, n, rep_el, forms):
        """Collect the usable features of one ``representation`` element.

        Child features that carry a ``rendition`` or ``selection`` attribute
        are gathered into one list, which is appended to ``forms[n]``. Other
        features are skipped with a warning. The list is appended even when it
        ends up empty.

        Args:
            n: component id under which the features are stored.
            rep_el: the ``representation`` XML element.
            forms: dict mapping component ids to lists; mutated in place.

        Raises:
            AssertionError: if ``rep_el`` is not tagged ``representation`` or
                one of its children is not tagged ``feature``.
        """
        assert rep_el.tag == "representation"
        to_add = []
        for el in rep_el:
            assert el.tag == "feature"
            if 'rendition' in el.attrib or 'selection' in el.attrib:
                to_add.append(el)
            else:
                logging.warning(
                    "Strange representation feature in structure %s. Skipping",
                    self.id)
        forms[n].append(to_add)

    def get_component(self, idx):
        """Return the component whose ``idx`` equals ``idx``.

        Raises:
            RuntimeError: if no component has that index.
        """
        for c in self.components:
            if c.idx == idx:
                return c
        raise RuntimeError("Unknown component id: {}".format(idx))

    def match(self, word):
        """Match ``word`` against the first component of the structure.

        Returns:
            Whatever the first component's ``match`` returns, or an empty list
            when it returns ``None`` or the structure has no components.
        """
        if not self.components:
            # Nothing to match against; avoid an IndexError on components[0].
            return []
        matches = self.components[0].match(word)
        return [] if matches is None else matches
 | |
| 
 | |
| 
 | |
def build_structures(filename):
    """Parse the structures XML file and build all syntactic structures.

    Args:
        filename: path of the XML file to read.

    Returns:
        A tuple ``(structures, lemma_features, max_num_components)``:
        the list of ``SyntacticStructure`` objects built from every
        ``syntactic_structure`` element, the result of
        ``get_lemma_features`` on the document root, and the largest number
        of components in any structure (-1 when there are no structures).
    """
    # Read raw bytes so the XML parser decodes the document according to its
    # own encoding declaration instead of the platform's default text encoding.
    with open(filename, 'rb') as fp:
        et = ElementTree.XML(fp.read())

    structures = []
    max_num_components = -1
    for structure in et.iter('syntactic_structure'):
        to_append = SyntacticStructure.from_xml(structure)
        if to_append is None:
            # Defensive: skip structures the parser declined to build.
            continue

        structures.append(to_append)
        max_num_components = max(max_num_components, len(to_append.components))

    lemma_features = get_lemma_features(et)
    return structures, lemma_features, max_num_components
 |