diff --git a/package/structure_assignment/assign_collocation_structures.py b/package/structure_assignment/assign_collocation_structures.py
new file mode 100644
index 0000000..5744a54
--- /dev/null
+++ b/package/structure_assignment/assign_collocation_structures.py
@@ -0,0 +1,57 @@
+import argparse
+import csv
+import codecs
+import re
+import lxml.etree as lxml
+
+def xpath_find(element,expression):
+    return element.xpath(expression, namespaces={'tei':'http://www.tei-c.org/ns/1.0'})
+
+def get_xml_id(element):
+    return element.get('{http://www.w3.org/XML/1998/namespace}id')
+
+def get_id_counter(xml_id):
+    return int(re.search(r'^s(\d+)\.\d+(?:\.\d+)?$', xml_id).group(1))
+
+def assign(input_file_name, csv_file_name, output_file_name):
+
+    csv_file = codecs.open(csv_file_name, 'r')
+    reader = csv.DictReader(csv_file, delimiter='\t')
+    mwe_map = {}
+    for row in reader:
+        structure_id = row['Structure_ID']
+        token_ids = [row[key] for key in sorted(row.keys()) if key.endswith('_Token_ID') and len(row[key]) > 0]
+        index = get_id_counter(token_ids[0])
+        component_count = len(token_ids)
+        if (index not in mwe_map):
+            mwe_map[index] = set()
+        mwe_map[index].add((structure_id, component_count))
+    csv_file.close()
+
+    xml_tree = lxml.parse(input_file_name)
+    xml_root = xml_tree.getroot()
+    mwes_xml = xpath_find(xml_root, './/tei:s')
+    for mwe_xml in mwes_xml:
+        index = get_id_counter(get_xml_id(mwe_xml))
+        mwe_string = ''.join([e.text if e.get('join') == 'right' else e.text + ' ' for e in xpath_find(mwe_xml, 'tei:w|tei:pc')]).strip()
+        token_count = len(xpath_find(mwe_xml, 'tei:w|tei:pc'))
+        structure_ids = set()
+        if (index in mwe_map):
+            for (structure_id, component_count) in mwe_map[index]:
+                if (component_count == token_count):
+                    structure_ids.add(int(structure_id))
+        if (len(structure_ids) > 1):
+            print('MWE #' + str(index) + ': ' + mwe_string + ': MULTIPLE_COLLOCATION_STRUCTURES (' + str(structure_ids) + ')')
+        elif (len(structure_ids) == 1):
+            mwe_xml.set('structure_id', str(list(structure_ids)[0]))
+    xml_tree.write(output_file_name, encoding='UTF-8', pretty_print=True)
+
+
+if (__name__ == '__main__'):
+
+    arg_parser = argparse.ArgumentParser(description='Assign collocation structure ids to parsed lexical units.')
+    arg_parser.add_argument('-infile', type=str, help='Input TEI')
+    arg_parser.add_argument('-csv', type=str, help='CSV file')
+    arg_parser.add_argument('-outfile', type=str, help='Output TEI')
+    arguments = arg_parser.parse_args()
+    assign(arguments.infile, arguments.csv, arguments.outfile)
diff --git a/package/structure_assignment/assign_other_structures.py b/package/structure_assignment/assign_other_structures.py
new file mode 100644
index 0000000..0551646
--- /dev/null
+++ b/package/structure_assignment/assign_other_structures.py
@@ -0,0 +1,222 @@
+import argparse
+import lxml.etree as lxml
+
+from conversion_utils.jos_msds_and_properties import Converter, Msd, Properties
+
+CONVERTER = Converter()
+
+FEATURE_CATEGORY_MAP = {'noun':{'case'},
+                        'adjective':{'case'},
+                        'numeral':{'case', 'form'},
+                        'verb':{'type'}}
+
+LABEL_MAP = {'form':{'digit':'a', 'roman':'r', 'letter':'b'},
+             'case':{'nominative':'1', 'genitive':'2', 'dative':'3', 'accusative':'4', 'locative':'5', 'instrumental':'6'},
+             'type':{'main':'g', 'auxiliary':'p', 'reflexive':'p'}}
+
+CONTACT_MAP = {(False, False): 'both', (False, True): 'left', (True, False): 'right', (True, True): 'neither'}
+
+DEPENDENCY_ROOT_SYMBOL = '#'
+DEPENDENCY_ROOT_LABEL = 'modra'
+
+class SyntacticStructure:
+    def __init__(self):
+        self.id = None
+        self.type = None
+        self.components = []
+        self.dependencies = []
+    def set_components(self, component_maps):
+        self.components.clear()
+        for component_map in component_maps:
+            contact = component_map.pop('contact')
+            label = self._generate_label(component_map)
+            self.components.append({'features': component_map, 'contact':contact, 'label':label})
+    def set_dependencies(self, dependency_tuples):
+        self.dependencies.clear()
+        if (len(dependency_tuples) > 1):
+            for (from_index, to_index, label) in dependency_tuples:
+                self.dependencies.append({'from':from_index, 'to':to_index, 'label':label})
+    def set_example(self, parsed_unit):
+        elements = xpath_find(parsed_unit, 'tei:w|tei:pc')
+        self.example = ''.join([e.text if e.get('join') == 'right' else e.text + ' ' for e in elements]).strip()
+    def _generate_label(self, feature_map):
+        category = feature_map['POS']
+        label = CONVERTER.specifications.find_category_by_name(category, 'en').codes.get('sl').lower()
+        if (self.type == 'other'):
+            for name in ['form', 'case', 'type']:
+                try:
+                    label += LABEL_MAP[name][feature_map[name]]
+                except KeyError:
+                    pass
+        return label
+    def __str__(self):
+        return str(self.components) + str(self.dependencies)
+    def __eq__(self, other):
+        return other.components == self.components and other.dependencies == self.dependencies
+
+
+def assign(unit_input_file_name, structure_old_file_name, unit_output_file_name, structure_new_file_name):
+
+    parser = lxml.XMLParser(remove_blank_text=True)
+    unit_tree = lxml.parse(unit_input_file_name, parser=parser)
+    unit_root = unit_tree.getroot()
+    structure_tree = lxml.parse(structure_old_file_name, parser=parser)
+    structure_root = structure_tree.getroot()
+
+    find_and_assign_structures(unit_root, structure_root)
+
+    unit_tree.write(unit_output_file_name, encoding='UTF-8', pretty_print=True)
+    structure_tree.write(structure_new_file_name, encoding='UTF-8', pretty_print=True)
+
+def find_and_assign_structures(unit_root, structure_root): # TODO: check and test
+    syntactic_structures = parse_xml_structures(structure_root)
+    last_id = get_max_id(structure_root)
+    for unit in xpath_find(unit_root, 'tei:text/tei:body/tei:p/tei:s'):
+        if (unit.get('structure_id') is None):
+            new_syntactic_structure = SyntacticStructure()
+            new_syntactic_structure.type = 'single' if len(xpath_find(unit, 'tei:w|tei:pc')) == 1 else 'other'
+            component_maps = make_component_map(unit)
+            new_syntactic_structure.set_components(component_maps)
+            dependency_tuples = make_dependency_tuples(unit)
+            new_syntactic_structure.set_dependencies(dependency_tuples)
+            syntactic_structure = next((ss for ss in syntactic_structures if ss == new_syntactic_structure), None)
+            if (syntactic_structure is None):
+                syntactic_structure = new_syntactic_structure
+                last_id += 1
+                syntactic_structure.id = last_id
+                syntactic_structure.set_example(unit)
+                syntactic_structures.append(syntactic_structure)
+                structure_element = create_xml_structure(syntactic_structure)
+                structure_root.append(structure_element)
+            unit.set('structure_id', str(syntactic_structure.id))
+
+def parse_xml_structures(root):
+    syntactic_structures = []
+    for structure_element in root.xpath('syntactic_structure[@type!="collocation"]'):
+        syntactic_structure = SyntacticStructure()
+        syntactic_structure.id = int(structure_element.get('id'))
+        syntactic_structure.type = structure_element.get('type')
+        dependency_tuples = []
+        for dependency in structure_element.xpath('system[@type="JOS"]/dependencies/dependency'):
+            dependency_tuples.append((dependency.get('from'), dependency.get('to'), dependency.get('label')))
+        syntactic_structure.set_dependencies(dependency_tuples)
+        component_maps = []
+        for component in structure_element.xpath('system[@type="JOS"]/definition/component'):
+            morphology_features = component.xpath('restriction[@type="morphology"]/feature')
+            component_map = {}
+            for feature in morphology_features:
+                key = feature.attrib.keys()[0]
+                if (key == 'POS' or syntactic_structure.type != 'single'):
+                    component_map[key] = feature.get(key)
+            try:
+                contact = component.xpath('restriction[@type="space"]/feature')[0].get('contact')
+            except:
+                contact = None
+            component_map['contact'] = contact
+            component_maps.append(component_map)
+        syntactic_structure.set_components(component_maps)
+        syntactic_structures.append(syntactic_structure)
+    return syntactic_structures
+
+def xpath_find(element,expression):
+    return element.xpath(expression, namespaces={'tei':'http://www.tei-c.org/ns/1.0'})
+
+def make_component_map(unit):
+    component_maps = []
+    tokens = xpath_find(unit, 'tei:w|tei:pc')
+    for (index, token) in enumerate(tokens, start=1):
+        component_map = {}
+        msd = Msd(token.get('ana')[len('MTE:'):], 'sl')
+        properties = CONVERTER.msd_to_properties(msd, 'en')
+        component_map['POS'] = properties.category
+        component_map['contact'] = get_contact(token, properties.category)
+        if (len(tokens) > 1):
+            if (msd.code[0] == 'Z'):
+                if (msd.code in {'Zp------k', 'Zp---d--k'}):
+                    component_map['type'] = 'reflexive'
+                    component_map['clitic'] = 'yes'
+            elif (properties.category in FEATURE_CATEGORY_MAP):
+                feature_map = {**properties.lexeme_feature_map, **properties.form_feature_map}
+                for name in FEATURE_CATEGORY_MAP[properties.category]:
+                    if (name in feature_map):
+                        component_map[name] = feature_map[name]
+        component_maps.append(component_map)
+    return component_maps
+
+def make_dependency_tuples(unit):
+    dependency_tuples = []
+    dependencies = xpath_find(unit, 'tei:linkGrp[@type="JOS-SYN"]/tei:link')
+    dependencies.sort(key=lambda dep: int(dep.get('target')[dep.get('target').rindex('.')+1:]))
+    for dependency in dependencies:
+        relation = dependency.get('ana')[len('jos-syn:'):]
+        [from_index, to_index] = [string[string.rindex('.')+1:] if string.count('.') == 2 else '#' for string in dependency.get('target').split(' ')]
+        dependency_tuples.append((from_index, to_index, relation))
+    return dependency_tuples
+
+def has_space(token):
+    return token.get('join') != 'right'
+
+def get_contact(token, category):
+    if (category != 'punctuation'):
+        contact = None
+    else:
+        left_space = token.getprevious() is not None and has_space(token.getprevious())
+        right_space = has_space(token)
+        contact = CONTACT_MAP[(left_space, right_space)]
+    return contact
+
+def get_max_id(root):
+    return max([int(ss.get('id')) for ss in root.xpath('syntactic_structure')])
+
+def create_xml_structure(syntactic_structure):
+    structure_element = lxml.Element('syntactic_structure')
+    structure_element.set('tempId', str(syntactic_structure.id))
+    comment = lxml.Comment(' example: ' + syntactic_structure.example)
+    structure_element.append(comment)
+    structure_element.set('type', 'other')
+    system = lxml.SubElement(structure_element, 'system')
+    system.set('type', 'JOS')
+    components = lxml.SubElement(system, 'components')
+    components.set('order', 'fixed')
+    for (index, component_map) in enumerate(syntactic_structure.components, start=1):
+        component = lxml.SubElement(components, 'component')
+        component.set('cid', str(index))
+        component.set('type', 'core')
+        component.set('label', component_map['label'])
+    structure_element.set('label', '-'.join([c.get('label') for c in components]))
+    dependencies = lxml.SubElement(system, 'dependencies')
+    for dependency_map in syntactic_structure.dependencies:
+        dependency = lxml.SubElement(dependencies, 'dependency')
+        [from_index, label, to_index] = [dependency_map[key] for key in ['from', 'label', 'to']]
+        if (label == DEPENDENCY_ROOT_LABEL):
+            from_index = DEPENDENCY_ROOT_SYMBOL
+        dependency.set('from', from_index)
+        dependency.set('label', label)
+        dependency.set('to', to_index)
+    definition = lxml.SubElement(system, 'definition')
+    for (index, component_map) in enumerate(syntactic_structure.components, start=1):
+        component = lxml.SubElement(definition, 'component')
+        component.set('cid', str(index))
+        restriction = lxml.SubElement(component, 'restriction')
+        restriction.set('type', 'morphology')
+        for key in ['POS', 'type', 'form', 'case', 'clitic']:
+            if (key in component_map['features']):
+                feature = lxml.SubElement(restriction, 'feature')
+                feature.set(key, component_map['features'][key])
+        if (component_map['contact'] is not None):
+            space_restriction = lxml.SubElement(component, 'restriction')
+            space_restriction.set('type', 'space')
+            space_feature = lxml.SubElement(space_restriction, 'feature')
+            space_feature.set('contact', component_map['contact'])
+    return structure_element
+
+if (__name__ == '__main__'):
+
+    arg_parser = argparse.ArgumentParser(description='Assign structure ids to single-component structures.')
+    arg_parser.add_argument('-infile', type=str, help='Input TEI')
+    arg_parser.add_argument('-instruct', type=str, help='Structures input file')
+    arg_parser.add_argument('-outfile', type=str, help='Output TEI')
+    arg_parser.add_argument('-outstruct', type=str, help='Structures output file')
+
+    arguments = arg_parser.parse_args()
+    assign(arguments.infile, arguments.instruct, arguments.outfile, arguments.outstruct)
diff --git a/package/structure_assignment/constants.py b/package/structure_assignment/constants.py
index fbae48c..57ad992 100644
--- a/package/structure_assignment/constants.py
+++ b/package/structure_assignment/constants.py
@@ -3,7 +3,6 @@ FILE_MAP = {'strings-list': 'strings.txt',
             'obeliks-tweaked': 'obeliks_tweaked.conllu',
             'classla-parsed': 'classla_raw.conllu',
             'classla-translated': 'classla_translated.conllu',
-            'dict': 'dict.xml',
             'structure-schema': 'structures.xsd',
             'tei-initial': 'tei_initial.xml',
             'tei-ids-collocation': 'tei_ids_collocations.xml',
diff --git a/package/structure_assignment/pipeline.py b/package/structure_assignment/pipeline.py
index c2f4be7..ac1a27c 100644
--- a/package/structure_assignment/pipeline.py
+++ b/package/structure_assignment/pipeline.py
@@ -8,11 +8,11 @@ import classla
 
 from structure_assignment.constants import *
 from structure_assignment.tweak_conllu import tweak as tweak_conllu
-from nova_slovnica.translate_jos import translate as translate_jos
-from nova_slovnica.conllu_to_xml import convert_file as conllu_to_tei
-from nova_slovnica.assign_collocation_structures import assign as assign_collocation_structures
-from nova_slovnica.assign_other_structures import assign as assign_other_structures
-from nova_slovnica.tei_to_dictionary import convert as tei_to_dictionary
+from conversion_utils.translate_conllu_jos import translate as translate_jos
+from conversion_utils.conllu_to_tei import convert_file as conllu_to_tei
+from structure_assignment.assign_collocation_structures import assign as assign_collocation_structures
+from structure_assignment.assign_other_structures import assign as assign_other_structures
+from conversion_utils.tei_to_dictionary import convert as tei_to_dictionary
 
 class Runner:
 
@@ -132,9 +132,8 @@ class Pipeline:
     def do_translate_jos(self):
         print('Translating JOS ...')
         input_file_name = self.file_map['classla-parsed']
-        dictionary_file_name = self.file_map['dict']
         output_file_name = self.file_map['classla-translated']
-        translate_jos(input_file_name, dictionary_file_name, output_file_name)
+        translate_jos(input_file_name, output_file_name)
 
     def do_conllu_to_tei(self):
         print('Converting to TEI ...')
diff --git a/scripts/setup.sh b/scripts/setup.sh
index 9a1f201..0602946 100755
--- a/scripts/setup.sh
+++ b/scripts/setup.sh
@@ -7,9 +7,7 @@ mkdir lib resources
 
 ## get dependencies
 cd lib
-git clone git@gitea.cjvt.si:redmine_projects/nova_slovnica.git
 git clone git@gitea.cjvt.si:ozbolt/luscenje_struktur.git
-git clone git@gitea.cjvt.si:generic/data_admin.git
 git clone git@gitea.cjvt.si:generic/xml_schemas.git
 git clone git@gitea.cjvt.si:generic/conversion_utils.git
 cd ..
@@ -23,7 +21,6 @@ pip install psycopg2cffi
 pip install sqlalchemy
 pip install classla
 python -c "import classla; classla.download('sl', type='standard_jos', dir='resources/classla')"
-pip install lib/nova_slovnica/python/package/
 pip install lib/luscenje_struktur/
 pip install lib/conversion_utils/
 pip install package/
@@ -32,8 +29,7 @@ deactivate
 ## put needed resources in place
 cd resources
 ln -s ../lib/luscenje_struktur/wani.py .
-ln -s ../lib/nova_slovnica/resources/dict.xml .
-ln -s ../lib/data_admin/resources/structures.xsd .
+ln -s ../lib/xml_schemas/resources/schema/structures.xsd .
 ln -s ../lib/xml_schemas/resources/schema/inventory.xsd .
 ln -s ../lib/xml_schemas/resources/schema/monolingual_dictionaries.xsd .
 cd ..