"""Assign syntactic-structure ids to TEI sentence units.

Every ``tei:s`` unit without a ``structure_id`` attribute is turned into a
``SyntacticStructure`` (components + dependencies).  If an equal structure is
already present in the structures file its id is reused; otherwise a new
``syntactic_structure`` element is appended to the structures file.
"""
import argparse

import lxml.etree as lxml

from conversion_utils.jos_msds_and_properties import Converter, Msd, Properties

CONVERTER = Converter()

# Morphological features recorded per part of speech (multi-token units only).
FEATURE_CATEGORY_MAP = {
    'noun': {'case'},
    'adjective': {'case'},
    'numeral': {'case', 'form'},
    'verb': {'type'},
}

# Suffix appended to a component label for a given feature value.
LABEL_MAP = {
    'form': {'digit': 'a', 'roman': 'r', 'letter': 'b'},
    'case': {
        'nominative': '1',
        'genitive': '2',
        'dative': '3',
        'accusative': '4',
        'locative': '5',
        'instrumental': '6',
    },
    'type': {'main': 'g', 'auxiliary': 'p', 'reflexive': 'p'},
}

# Keyed by (space on the left, space on the right) of a punctuation token.
CONTACT_MAP = {
    (False, False): 'both',
    (False, True): 'left',
    (True, False): 'right',
    (True, True): 'neither',
}

DEPENDENCY_ROOT_SYMBOL = '#'
DEPENDENCY_ROOT_LABEL = 'modra'

TEI_NAMESPACES = {'tei': 'http://www.tei-c.org/ns/1.0'}


class SyntacticStructure:
    """A syntactic structure: an ordered component list plus dependencies.

    Two structures compare equal when their components and dependencies are
    equal; ``id``, ``type`` and ``example`` do not take part in the comparison.
    """

    def __init__(self):
        self.id = None
        self.type = None
        self.components = []
        self.dependencies = []
        # Filled in by set_example(); initialised so the attribute always exists.
        self.example = None

    def set_components(self, component_maps):
        """Replace the components with ones built from *component_maps*.

        Each map must contain a ``'contact'`` key (``KeyError`` otherwise) and
        a ``'POS'`` key; the remaining entries are stored as the component's
        features.  The input dicts are copied, not modified.
        """
        self.components.clear()
        for component_map in component_maps:
            features = dict(component_map)
            contact = features.pop('contact')
            label = self._generate_label(features)
            self.components.append(
                {'features': features, 'contact': contact, 'label': label})

    def set_dependencies(self, dependency_tuples):
        """Replace the dependencies with *dependency_tuples*.

        Each tuple is ``(from_index, to_index, label)``.  When there is at most
        one tuple, no dependencies are stored at all.
        """
        self.dependencies.clear()
        if len(dependency_tuples) > 1:
            for (from_index, to_index, label) in dependency_tuples:
                self.dependencies.append(
                    {'from': from_index, 'to': to_index, 'label': label})

    def set_example(self, parsed_unit):
        """Store the text of *parsed_unit*'s ``tei:w``/``tei:pc`` tokens.

        A space follows every token except those with ``join="right"``; the
        result is stripped of surrounding whitespace.
        """
        elements = xpath_find(parsed_unit, 'tei:w|tei:pc')
        pieces = [
            e.text if e.get('join') == 'right' else e.text + ' '
            for e in elements
        ]
        self.example = ''.join(pieces).strip()

    def _generate_label(self, feature_map):
        """Return the component label for *feature_map*.

        The base is the lower-cased ``'sl'`` code of the category named by
        ``feature_map['POS']``.  For structures of type ``'other'`` a suffix
        from ``LABEL_MAP`` is appended for each of form/case/type that is
        present with a known value.
        """
        category = feature_map['POS']
        specification = CONVERTER.specifications.find_category_by_name(category, 'en')
        label = specification.codes.get('sl').lower()
        if self.type == 'other':
            for name in ('form', 'case', 'type'):
                # Missing feature or unknown value contributes nothing.
                label += LABEL_MAP[name].get(feature_map.get(name), '')
        return label

    def __str__(self):
        return str(self.components) + str(self.dependencies)

    def __eq__(self, other):
        if not isinstance(other, SyntacticStructure):
            return NotImplemented
        return (other.components == self.components
                and other.dependencies == self.dependencies)

    # Instances are mutable and define __eq__, so they are deliberately
    # unhashable (this is also what Python does implicitly).
    __hash__ = None


def assign(unit_input_file_name, structure_old_file_name, unit_output_file_name, structure_new_file_name):
    """Read the units and structures files, assign ids, and write both out.

    Args:
        unit_input_file_name: TEI file with the units to process.
        structure_old_file_name: XML file with the existing structures.
        unit_output_file_name: where the updated TEI is written.
        structure_new_file_name: where the updated structures are written.
    """
    parser = lxml.XMLParser(remove_blank_text=True)
    unit_tree = lxml.parse(unit_input_file_name, parser=parser)
    structure_tree = lxml.parse(structure_old_file_name, parser=parser)

    find_and_assign_structures(unit_tree.getroot(), structure_tree.getroot())

    unit_tree.write(unit_output_file_name, encoding='UTF-8', pretty_print=True)
    structure_tree.write(structure_new_file_name, encoding='UTF-8', pretty_print=True)


def find_and_assign_structures(unit_root, structure_root):
    """Set ``structure_id`` on every unit under *unit_root* that lacks one.

    Structures not yet present under *structure_root* get the next free id
    and are appended to *structure_root* as new elements.
    """
    # TODO: check and test
    syntactic_structures = parse_xml_structures(structure_root)
    last_id = get_max_id(structure_root)
    for unit in xpath_find(unit_root, 'tei:text/tei:body/tei:p/tei:s'):
        if unit.get('structure_id') is not None:
            continue

        candidate = SyntacticStructure()
        token_count = len(xpath_find(unit, 'tei:w|tei:pc'))
        # The type must be set before set_components(): labels depend on it.
        candidate.type = 'single' if token_count == 1 else 'other'
        candidate.set_components(make_component_map(unit))
        candidate.set_dependencies(make_dependency_tuples(unit))

        syntactic_structure = next(
            (ss for ss in syntactic_structures if ss == candidate), None)
        if syntactic_structure is None:
            syntactic_structure = candidate
            last_id += 1
            syntactic_structure.id = last_id
            syntactic_structure.set_example(unit)
            syntactic_structures.append(syntactic_structure)
            structure_root.append(create_xml_structure(syntactic_structure))
        unit.set('structure_id', str(syntactic_structure.id))


def parse_xml_structures(root):
    """Return the non-collocation structures under *root* as objects.

    For structures of type ``'single'`` only the ``POS`` morphology feature is
    kept; for every other type all morphology features are kept.
    """
    syntactic_structures = []
    for structure_element in root.xpath('syntactic_structure[@type!="collocation"]'):
        syntactic_structure = SyntacticStructure()
        syntactic_structure.id = int(structure_element.get('id'))
        syntactic_structure.type = structure_element.get('type')

        dependency_tuples = [
            (dependency.get('from'), dependency.get('to'), dependency.get('label'))
            for dependency in structure_element.xpath('dependencies/dependency')
        ]
        syntactic_structure.set_dependencies(dependency_tuples)

        component_maps = []
        for component in structure_element.xpath('definition/component'):
            component_map = {}
            for feature in component.xpath('restriction[@type="morphology"]/feature'):
                # Only the first attribute of each feature element is read.
                key = feature.attrib.keys()[0]
                if key == 'POS' or syntactic_structure.type != 'single':
                    component_map[key] = feature.get(key)
            # A component without a space restriction has no contact value.
            space_features = component.xpath('restriction[@type="space"]/feature')
            component_map['contact'] = (
                space_features[0].get('contact') if space_features else None)
            component_maps.append(component_map)
        syntactic_structure.set_components(component_maps)
        syntactic_structures.append(syntactic_structure)
    return syntactic_structures


def xpath_find(element,expression):
    """Evaluate *expression* on *element* with the ``tei`` prefix bound."""
    return element.xpath(expression, namespaces=TEI_NAMESPACES)


def make_component_map(unit):
    """Return one feature dict per ``tei:w``/``tei:pc`` token of *unit*.

    Every dict has ``'POS'`` and ``'contact'``.  Extra features are added only
    when the unit has more than one token.
    """
    component_maps = []
    tokens = xpath_find(unit, 'tei:w|tei:pc')
    for token in tokens:
        # Drop the first len('MTE:') characters of the ana attribute.
        msd = Msd(token.get('ana')[len('MTE:'):], 'sl')
        properties = CONVERTER.msd_to_properties(msd, 'en')
        component_map = {
            'POS': properties.category,
            'contact': get_contact(token, properties.category),
        }
        if len(tokens) > 1:
            # NOTE(review): codes starting with 'Z' are presumably pronouns --
            # confirm against the JOS specification.
            if msd.code[0] == 'Z':
                if msd.code in {'Zp------k', 'Zp---d--k'}:
                    component_map['type'] = 'reflexive'
                    component_map['clitic'] = 'yes'
            elif properties.category in FEATURE_CATEGORY_MAP:
                feature_map = {**properties.lexeme_feature_map, **properties.form_feature_map}
                for name in FEATURE_CATEGORY_MAP[properties.category]:
                    if name in feature_map:
                        component_map[name] = feature_map[name]
        component_maps.append(component_map)
    return component_maps


def make_dependency_tuples(unit):
    """Return ``(from_index, to_index, relation)`` tuples for *unit*.

    Links come from ``tei:linkGrp[@type="JOS-SYN"]`` and are ordered by the
    integer after the last ``'.'`` of their ``target`` attribute.  Each of the
    two space-separated target ids becomes the text after its last ``'.'``
    when it contains exactly two dots, and ``'#'`` otherwise.
    """
    def target_order(link):
        target = link.get('target')
        return int(target[target.rindex('.') + 1:])

    dependencies = sorted(
        xpath_find(unit, 'tei:linkGrp[@type="JOS-SYN"]/tei:link'),
        key=target_order)

    dependency_tuples = []
    for dependency in dependencies:
        # Drop the first len('jos-syn:') characters of the ana attribute.
        relation = dependency.get('ana')[len('jos-syn:'):]
        [from_index, to_index] = [
            string[string.rindex('.') + 1:] if string.count('.') == 2 else '#'
            for string in dependency.get('target').split(' ')
        ]
        dependency_tuples.append((from_index, to_index, relation))
    return dependency_tuples


def has_space(token):
    """Return True unless *token* has ``join="right"``."""
    return token.get('join') != 'right'


def get_contact(token, category):
    """Return the ``CONTACT_MAP`` value for a punctuation token, else None.

    The left side counts as spaced when a previous sibling exists and that
    sibling does not have ``join="right"``.
    """
    if category != 'punctuation':
        return None
    previous = token.getprevious()
    left_space = previous is not None and has_space(previous)
    right_space = has_space(token)
    return CONTACT_MAP[(left_space, right_space)]


def get_max_id(root):
    """Return the largest ``id`` among ``syntactic_structure`` children.

    Returns 0 when *root* has no such children, so numbering starts at 1.
    """
    return max(
        (int(ss.get('id')) for ss in root.xpath('syntactic_structure')),
        default=0)


def _append_components(structure_element, syntactic_structure):
    """Add the ``components`` child and the joined ``label`` attribute."""
    components = lxml.SubElement(structure_element, 'components')
    components.set('order', 'fixed')
    for (index, component_map) in enumerate(syntactic_structure.components, start=1):
        component = lxml.SubElement(components, 'component')
        component.set('cid', str(index))
        component.set('type', 'core')
        component.set('label', component_map['label'])
    structure_element.set('label', '-'.join([c.get('label') for c in components]))


def _append_dependencies(structure_element, syntactic_structure):
    """Add the ``dependencies`` child with one element per dependency."""
    dependencies = lxml.SubElement(structure_element, 'dependencies')
    for dependency_map in syntactic_structure.dependencies:
        dependency = lxml.SubElement(dependencies, 'dependency')
        from_index = dependency_map['from']
        label = dependency_map['label']
        to_index = dependency_map['to']
        # Links carrying the root label are always written as coming from '#'.
        if label == DEPENDENCY_ROOT_LABEL:
            from_index = DEPENDENCY_ROOT_SYMBOL
        dependency.set('from', from_index)
        dependency.set('label', label)
        dependency.set('to', to_index)


def _append_definition(structure_element, syntactic_structure):
    """Add the ``definition`` child with per-component restrictions."""
    definition = lxml.SubElement(structure_element, 'definition')
    for (index, component_map) in enumerate(syntactic_structure.components, start=1):
        component = lxml.SubElement(definition, 'component')
        component.set('cid', str(index))
        restriction = lxml.SubElement(component, 'restriction')
        restriction.set('type', 'morphology')
        features = component_map['features']
        # Fixed key order keeps the generated XML stable.
        for key in ['POS', 'type', 'form', 'case', 'clitic']:
            if key in features:
                feature = lxml.SubElement(restriction, 'feature')
                feature.set(key, features[key])
        if component_map['contact'] is not None:
            space_restriction = lxml.SubElement(component, 'restriction')
            space_restriction.set('type', 'space')
            space_feature = lxml.SubElement(space_restriction, 'feature')
            space_feature.set('contact', component_map['contact'])


def create_xml_structure(syntactic_structure):
    """Build a ``syntactic_structure`` element for *syntactic_structure*.

    The id is written as ``tempId`` and the example text as an XML comment.
    """
    structure_element = lxml.Element('syntactic_structure')
    structure_element.set('tempId', str(syntactic_structure.id))
    structure_element.append(lxml.Comment(' example: ' + syntactic_structure.example))
    # NOTE(review): the type is always written as 'other', even when
    # syntactic_structure.type is 'single' -- confirm this is intended.
    structure_element.set('type', 'other')
    _append_components(structure_element, syntactic_structure)
    _append_dependencies(structure_element, syntactic_structure)
    _append_definition(structure_element, syntactic_structure)
    return structure_element


def main():
    """Parse the command line and run :func:`assign`."""
    arg_parser = argparse.ArgumentParser(description='Assign structure ids to single-component structures.')
    # All four paths are needed; requiring them gives a clear usage error
    # instead of a failure inside lxml when one is omitted.
    arg_parser.add_argument('-infile', type=str, required=True, help='Input TEI')
    arg_parser.add_argument('-instruct', type=str, required=True, help='Structures input file')
    arg_parser.add_argument('-outfile', type=str, required=True, help='Output TEI')
    arg_parser.add_argument('-outstruct', type=str, required=True, help='Structures output file')
    arguments = arg_parser.parse_args()
    assign(arguments.infile, arguments.instruct, arguments.outfile, arguments.outstruct)


if (__name__ == '__main__'):
    main()