You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

221 lines
11 KiB

import argparse
import lxml.etree as lxml
from conversion_utils.jos_msds_and_properties import Converter, Msd, Properties
# Shared converter instance; used below for MSD -> properties conversion and
# for looking up category codes when component labels are generated.
CONVERTER = Converter()

# Per category, the feature names that make_component_map copies into a
# component map (only for units with more than one token).
FEATURE_CATEGORY_MAP = {
    'noun': {'case'},
    'adjective': {'case'},
    'numeral': {'case', 'form'},
    'verb': {'type'},
}

# Feature name -> feature value -> suffix appended to a component label
# (applied only to structures whose type is 'other').
LABEL_MAP = {
    'form': {
        'digit': 'a',
        'roman': 'r',
        'letter': 'b',
    },
    'case': {
        'nominative': '1',
        'genitive': '2',
        'dative': '3',
        'accusative': '4',
        'locative': '5',
        'instrumental': '6',
    },
    'type': {
        'main': 'g',
        'auxiliary': 'p',
        'reflexive': 'p',
    },
}

# Keyed by (space on the left, space on the right) of a punctuation token;
# see get_contact for how the two booleans are computed.
CONTACT_MAP = {
    (False, False): 'both',
    (False, True): 'left',
    (True, False): 'right',
    (True, True): 'neither',
}

# Value written to a dependency's 'from' attribute when its label is the root label.
DEPENDENCY_ROOT_SYMBOL = '#'
DEPENDENCY_ROOT_LABEL = 'modra'
class SyntacticStructure:
    """In-memory description of one syntactic structure.

    Attributes (all public and mutable):
        id: identifier of the structure, None until assigned.
        type: structure type string ('single' or 'other' in this file),
            None until assigned. Must be set before set_components, because
            label generation depends on it.
        components: list of {'features', 'contact', 'label'} dicts.
        dependencies: list of {'from', 'to', 'label'} dicts.
        example: example text built by set_example, None until then.

    Two structures compare equal when their components and dependencies are
    equal; id, type and example are not part of the comparison.
    """

    # Instances are mutable and compared by content, so they are unhashable.
    __hash__ = None

    def __init__(self):
        self.id = None
        self.type = None
        self.components = []
        self.dependencies = []
        self.example = None

    def set_components(self, component_maps):
        """Replace the components with ones built from component_maps.

        Each map must contain the keys 'POS' and 'contact' (KeyError
        otherwise); every other entry is treated as a feature. The maps
        passed in are not modified.
        """
        self.components.clear()
        for component_map in component_maps:
            contact = component_map['contact']
            # Copy instead of popping so the caller's dict stays intact.
            features = {key: value for (key, value) in component_map.items() if key != 'contact'}
            label = self._generate_label(features)
            self.components.append({'features': features, 'contact': contact, 'label': label})

    def set_dependencies(self, dependency_tuples):
        """Replace the dependencies with ones built from (from, to, label) tuples.

        When fewer than two tuples are given, the structure is left without
        any dependencies.
        """
        self.dependencies.clear()
        if len(dependency_tuples) > 1:
            self.dependencies.extend(
                {'from': from_index, 'to': to_index, 'label': label}
                for (from_index, to_index, label) in dependency_tuples
            )

    def set_example(self, parsed_unit):
        """Build the example text from the tei:w / tei:pc children of parsed_unit.

        Token texts are concatenated; a space follows every token whose
        'join' attribute is not 'right'. Tokens without text contribute an
        empty string.
        """
        elements = xpath_find(parsed_unit, 'tei:w|tei:pc')
        pieces = []
        for element in elements:
            text = element.text or ''
            pieces.append(text if element.get('join') == 'right' else text + ' ')
        self.example = ''.join(pieces).strip()

    def _generate_label(self, feature_map):
        """Return the label for one component described by feature_map.

        The base label is the lowercased 'sl' code of the category named by
        feature_map['POS']. For structures of type 'other', the LABEL_MAP
        suffixes for 'form', 'case' and 'type' are appended, in that order;
        features that are absent or have no mapped value add nothing.
        """
        category = feature_map['POS']
        label = CONVERTER.specifications.find_category_by_name(category, 'en').codes.get('sl').lower()
        if self.type == 'other':
            for name in ('form', 'case', 'type'):
                label += LABEL_MAP[name].get(feature_map.get(name), '')
        return label

    def __str__(self):
        return str(self.components) + str(self.dependencies)

    def __eq__(self, other):
        # Let Python fall back to its default handling for foreign types
        # instead of failing on a missing attribute.
        if not isinstance(other, SyntacticStructure):
            return NotImplemented
        return other.components == self.components and other.dependencies == self.dependencies
def assign(unit_input_file_name, structure_old_file_name, unit_output_file_name, structure_new_file_name):
    """Assign structure ids to the units of a TEI file and write both results.

    Parses the unit file and the structure file, lets
    find_and_assign_structures update both trees in place, then writes the
    unit tree to unit_output_file_name and the structure tree to
    structure_new_file_name (UTF-8, pretty-printed).
    """
    xml_parser = lxml.XMLParser(remove_blank_text=True)
    unit_tree = lxml.parse(unit_input_file_name, parser=xml_parser)
    structure_tree = lxml.parse(structure_old_file_name, parser=xml_parser)

    find_and_assign_structures(unit_tree.getroot(), structure_tree.getroot())

    outputs = ((unit_tree, unit_output_file_name), (structure_tree, structure_new_file_name))
    for (tree, file_name) in outputs:
        tree.write(file_name, encoding='UTF-8', pretty_print=True)
def find_and_assign_structures(unit_root, structure_root): # TODO: check and test
    """Give every unit without a 'structure_id' attribute a structure id.

    For each such tei:s unit a candidate structure is built from its tokens
    and dependency links. If an equal structure is already known, its id is
    reused; otherwise the candidate gets the next free id, is remembered,
    and its XML form is appended to structure_root. Both trees are modified
    in place.
    """
    known_structures = parse_xml_structures(structure_root)
    last_id = get_max_id(structure_root)

    for unit in xpath_find(unit_root, 'tei:text/tei:body/tei:p/tei:s'):
        if unit.get('structure_id') is not None:
            continue

        candidate = SyntacticStructure()
        token_count = len(xpath_find(unit, 'tei:w|tei:pc'))
        # The type has to be in place before the components are set,
        # because label generation reads it.
        candidate.type = 'single' if token_count == 1 else 'other'
        candidate.set_components(make_component_map(unit))
        candidate.set_dependencies(make_dependency_tuples(unit))

        matched = None
        for known in known_structures:
            if known == candidate:
                matched = known
                break

        if matched is None:
            matched = candidate
            last_id += 1
            matched.id = last_id
            matched.set_example(unit)
            known_structures.append(matched)
            structure_root.append(create_xml_structure(matched))

        unit.set('structure_id', str(matched.id))
def parse_xml_structures(root):
    """Build SyntacticStructure objects from the structures under root.

    Every 'syntactic_structure' child whose type attribute is not
    'collocation' is read. For structures of type 'single' only the 'POS'
    morphology feature is kept; for other types every morphology feature is
    kept. A component without a space restriction gets contact None.

    Returns the list of parsed structures, in document order.
    """
    syntactic_structures = []
    for structure_element in root.xpath('syntactic_structure[@type!="collocation"]'):
        syntactic_structure = SyntacticStructure()
        syntactic_structure.id = int(structure_element.get('id'))
        # The type is set before the components because label generation reads it.
        syntactic_structure.type = structure_element.get('type')

        dependency_tuples = [
            (dependency.get('from'), dependency.get('to'), dependency.get('label'))
            for dependency in structure_element.xpath('dependencies/dependency')
        ]
        syntactic_structure.set_dependencies(dependency_tuples)

        component_maps = []
        for component in structure_element.xpath('definition/component'):
            component_map = {}
            for feature in component.xpath('restriction[@type="morphology"]/feature'):
                # Each feature element carries its name/value as its first attribute.
                key = feature.attrib.keys()[0]
                if key == 'POS' or syntactic_structure.type != 'single':
                    component_map[key] = feature.get(key)

            # An absent space restriction simply means "no contact information";
            # check for it explicitly instead of swallowing arbitrary exceptions.
            space_features = component.xpath('restriction[@type="space"]/feature')
            contact = space_features[0].get('contact') if space_features else None

            component_map['contact'] = contact
            component_maps.append(component_map)

        syntactic_structure.set_components(component_maps)
        syntactic_structures.append(syntactic_structure)
    return syntactic_structures
def xpath_find(element,expression):
    """Evaluate an XPath expression on element with the 'tei' prefix bound.

    The prefix 'tei' is mapped to 'http://www.tei-c.org/ns/1.0'; the result
    of element.xpath is returned unchanged.
    """
    tei_namespaces = {'tei': 'http://www.tei-c.org/ns/1.0'}
    return element.xpath(expression, namespaces=tei_namespaces)
def make_component_map(unit):
    """Describe every tei:w / tei:pc token of unit as a component map.

    Each map always holds 'POS' (the converted category) and 'contact'
    (see get_contact). For units with more than one token:
      * tokens whose MSD code starts with 'Z' get 'type'='reflexive' and
        'clitic'='yes' when the code is 'Zp------k' or 'Zp---d--k', and
        nothing extra otherwise;
      * other tokens get the features that FEATURE_CATEGORY_MAP lists for
        their category, when the token has them.

    Returns the list of maps in token order.
    """
    tokens = xpath_find(unit, 'tei:w|tei:pc')
    is_multi_token = len(tokens) > 1
    component_maps = []

    for token in tokens:
        # The 'ana' attribute is expected to start with the 'MTE:' prefix.
        msd = Msd(token.get('ana')[len('MTE:'):], 'sl')
        properties = CONVERTER.msd_to_properties(msd, 'en')
        category = properties.category

        component_map = {'POS': category, 'contact': get_contact(token, category)}

        if is_multi_token:
            if msd.code[0] == 'Z':
                if msd.code in {'Zp------k', 'Zp---d--k'}:
                    component_map['type'] = 'reflexive'
                    component_map['clitic'] = 'yes'
            elif category in FEATURE_CATEGORY_MAP:
                # Form features take precedence over lexeme features on a name clash.
                feature_map = {**properties.lexeme_feature_map, **properties.form_feature_map}
                for name in FEATURE_CATEGORY_MAP[category]:
                    if name in feature_map:
                        component_map[name] = feature_map[name]

        component_maps.append(component_map)

    return component_maps
def make_dependency_tuples(unit):
    """Return (from, to, relation) tuples for the JOS-SYN links of unit.

    Links are ordered by the integer after the last '.' of their 'target'
    attribute. 'target' holds two space-separated references; a reference
    with exactly two dots contributes the text after its last dot, any
    other reference contributes '#'. The relation is the 'ana' attribute
    with the leading 'jos-syn:' removed.
    """
    def last_segment(reference):
        return reference[reference.rindex('.') + 1:]

    links = sorted(
        xpath_find(unit, 'tei:linkGrp[@type="JOS-SYN"]/tei:link'),
        key=lambda link: int(last_segment(link.get('target'))),
    )

    dependency_tuples = []
    for link in links:
        relation = link.get('ana')[len('jos-syn:'):]
        from_index, to_index = [
            last_segment(reference) if reference.count('.') == 2 else '#'
            for reference in link.get('target').split(' ')
        ]
        dependency_tuples.append((from_index, to_index, relation))
    return dependency_tuples
def has_space(token):
    """Return True unless the token's 'join' attribute equals 'right'."""
    join_value = token.get('join')
    return join_value != 'right'
def get_contact(token, category):
    """Return the contact value of a punctuation token, or None otherwise.

    For category 'punctuation' the result is the CONTACT_MAP entry for
    (space on the left, space on the right): the left side counts as spaced
    only when a previous sibling exists and has_space is true for it; the
    right side is has_space of the token itself.
    """
    if category != 'punctuation':
        return None

    previous = token.getprevious()
    left_space = previous is not None and has_space(previous)
    right_space = has_space(token)
    return CONTACT_MAP[(left_space, right_space)]
def get_max_id(root):
    """Return the largest integer 'id' among root's syntactic_structure children.

    Returns 0 when there are no such children, so that a caller which
    increments before assigning starts numbering at 1 instead of failing
    on an empty structure file.
    """
    return max((int(ss.get('id')) for ss in root.xpath('syntactic_structure')), default=0)
def _xml_comment_safe(text):
    """Return text adjusted so that it is legal inside an XML comment.

    XML comments may not contain '--' or end with '-'. Each '--' is broken
    up with a space and a trailing '-' is followed by a space; text without
    these patterns is returned unchanged.
    """
    while '--' in text:
        text = text.replace('--', '- -')
    if text.endswith('-'):
        text += ' '
    return text


def create_xml_structure(syntactic_structure):
    """Build the 'syntactic_structure' XML element for syntactic_structure.

    The element carries the id as 'tempId', the fixed type 'other', a
    comment with the example text, and three children:
      * 'components'   - one 'component' per component (cid, type, label);
      * 'dependencies' - one 'dependency' per dependency (from, label, to);
      * 'definition'   - per component a morphology restriction and, when
                         the contact is not None, a space restriction.
    The element's own 'label' is the component labels joined with '-'.

    The structure's example must already be set (see set_example).
    """
    structure_element = lxml.Element('syntactic_structure')
    structure_element.set('tempId', str(syntactic_structure.id))
    # Example text comes from corpus tokens and may contain '--' or end in
    # '-', which is not allowed in an XML comment; sanitize it first.
    comment = lxml.Comment(_xml_comment_safe(' example: ' + syntactic_structure.example))
    structure_element.append(comment)
    structure_element.set('type', 'other')

    components = lxml.SubElement(structure_element, 'components')
    components.set('order', 'fixed')
    for (index, component_map) in enumerate(syntactic_structure.components, start=1):
        component = lxml.SubElement(components, 'component')
        component.set('cid', str(index))
        component.set('type', 'core')
        component.set('label', component_map['label'])
    structure_element.set('label', '-'.join([c.get('label') for c in components]))

    dependencies = lxml.SubElement(structure_element, 'dependencies')
    for dependency_map in syntactic_structure.dependencies:
        dependency = lxml.SubElement(dependencies, 'dependency')
        from_index = dependency_map['from']
        label = dependency_map['label']
        to_index = dependency_map['to']
        # A dependency with the root label always originates from the root symbol.
        if label == DEPENDENCY_ROOT_LABEL:
            from_index = DEPENDENCY_ROOT_SYMBOL
        dependency.set('from', from_index)
        dependency.set('label', label)
        dependency.set('to', to_index)

    definition = lxml.SubElement(structure_element, 'definition')
    for (index, component_map) in enumerate(syntactic_structure.components, start=1):
        component = lxml.SubElement(definition, 'component')
        component.set('cid', str(index))

        restriction = lxml.SubElement(component, 'restriction')
        restriction.set('type', 'morphology')
        features = component_map['features']
        # Fixed key order keeps the generated XML stable.
        for key in ['POS', 'type', 'form', 'case', 'clitic']:
            if key in features:
                feature = lxml.SubElement(restriction, 'feature')
                feature.set(key, features[key])

        if component_map['contact'] is not None:
            space_restriction = lxml.SubElement(component, 'restriction')
            space_restriction.set('type', 'space')
            space_feature = lxml.SubElement(space_restriction, 'feature')
            space_feature.set('contact', component_map['contact'])

    return structure_element
if __name__ == '__main__':
    # Command-line entry point: read the four file names and run assign().
    cli = argparse.ArgumentParser(description='Assign structure ids to single-component structures.')
    for (flag, help_text) in (
        ('-infile', 'Input TEI'),
        ('-instruct', 'Structures input file'),
        ('-outfile', 'Output TEI'),
        ('-outstruct', 'Structures output file'),
    ):
        cli.add_argument(flag, type=str, help=help_text)
    options = cli.parse_args()
    assign(options.infile, options.instruct, options.outfile, options.outstruct)