diff --git a/package/structure_assignment/constants.py b/package/structure_assignment/constants.py index 3bad673..fbae48c 100644 --- a/package/structure_assignment/constants.py +++ b/package/structure_assignment/constants.py @@ -6,17 +6,11 @@ FILE_MAP = {'strings-list': 'strings.txt', 'dict': 'dict.xml', 'structure-schema': 'structures.xsd', 'tei-initial': 'tei_initial.xml', - 'tei-single': 'tei_single.xml', - 'tei-single-ids': 'tei_single_with_ids.xml', - 'tei-multiple': 'tei_multiple.xml', - 'tei-multiple-ids-1': 'tei_multiple_with_ids1.xml', - 'tei-multiple-ids-2': 'tei_multiple_with_ids2.xml', - 'mwes-1': 'mwes1.csv', - 'mwes-2': 'mwes2.csv', + 'tei-ids-collocation': 'tei_ids_collocations.xml', + 'tei-ids-all': 'tei_ids_all.xml', + 'collocations': 'collocation_matches.csv', 'structures-old': 'structures_old.xml', 'structures-new': 'structures_new.xml', - 'dictionary-single': 'dictionary_single.xml', - 'dictionary-multiple': 'dictionary_multiple.xml', 'dictionary': 'dictionary.xml', 'dictionary-schema': 'monolingual_dictionaries.xsd' } diff --git a/package/structure_assignment/merge_dictionaries.py b/package/structure_assignment/merge_dictionaries.py deleted file mode 100644 index 7d3372c..0000000 --- a/package/structure_assignment/merge_dictionaries.py +++ /dev/null @@ -1,27 +0,0 @@ -import argparse -import re -import lxml.etree as lxml - -def get_entries(input_file_name): - return list(lxml.parse(input_file_name).getroot()) - - -def merge(single_file_name, multiple_file_name, output_file_name): - entries = get_entries(single_file_name) + get_entries(multiple_file_name) - entries.sort(key=lambda entry: int(re.search('^s(\d+)\.\d+$', entry.get('sid')).group(1))) - - root = lxml.Element('dictionary') - for entry in entries: - del entry.attrib['sid'] - root.append(entry) - tree = lxml.ElementTree(root) - tree.write(output_file_name, encoding='UTF-8', pretty_print=True) - - -if (__name__ == '__main__'): - arg_parser = argparse.ArgumentParser(description='Split input TEI into single and multiple token units.') - arg_parser.add_argument('-single', type=str, required=True, help='Input single token dictionary') - arg_parser.add_argument('-multiple', type=str, required=True, help='Input multiple token dictionary') - arg_parser.add_argument('-outfile', type=str, required=True, help='Output merged dictionary') - arguments = arg_parser.parse_args() - merge(arguments.single, arguments.multiple, arguments.outfile) diff --git a/package/structure_assignment/pipeline.py b/package/structure_assignment/pipeline.py index 9e9cf93..c2f4be7 100644 --- a/package/structure_assignment/pipeline.py +++ b/package/structure_assignment/pipeline.py @@ -10,12 +10,9 @@ from structure_assignment.constants import * from structure_assignment.tweak_conllu import tweak as tweak_conllu from nova_slovnica.translate_jos import translate as translate_jos from nova_slovnica.conllu_to_xml import convert_file as conllu_to_tei -from structure_assignment.split_tei import split as split_tei -from nova_slovnica.assign_single_structures import assign as assign_single -from nova_slovnica.assign_structures import assign as assign_multiple +from nova_slovnica.assign_collocation_structures import assign as assign_collocation_structures +from nova_slovnica.assign_other_structures import assign as assign_other_structures from nova_slovnica.tei_to_dictionary import convert as tei_to_dictionary -from nova_slovnica.create_structures import create as create_structures -from structure_assignment.merge_dictionaries import merge as merge_dictionaries class Runner: @@ -83,17 +80,10 @@ class Runner: pipeline.do_conllu_to_tei() def _parse_to_dictionary_sequence(self, pipeline): - pipeline.do_split_tei() - pipeline.do_assign_single() - pipeline.do_tei_to_dictionary_single() - pipeline.do_find_structure_units_first() - pipeline.do_assign_multiple_first() - pipeline.do_create_structures() - pipeline.do_find_structure_units_second() - pipeline.do_assign_multiple_second() - pipeline.do_tei_to_dictionary_multiple() - pipeline.do_merge_dictionaries() - + pipeline.do_find_collocation_structure_units() + pipeline.do_assign_collocation_structures() + pipeline.do_assign_other_structures() + pipeline.do_tei_to_dictionary() class Pipeline: @@ -152,49 +142,16 @@ class Pipeline: output_file_name = self.file_map['tei-initial'] conllu_to_tei(input_file_name, output_file_name) - def do_split_tei(self): - print('Splitting TEI ...') - input_file_name = self.file_map['tei-initial'] - output_single_file_name = self.file_map['tei-single'] - output_multiple_file_name = self.file_map['tei-multiple'] - split_tei(input_file_name, output_single_file_name, output_multiple_file_name) - - def do_assign_single(self): - print('Assigning single structures ...') - input_file_name = self.file_map['tei-single'] - structure_file_name = self.file_map['structures-old'] - output_file_name = self.file_map['tei-single-ids'] - assign_single(input_file_name, structure_file_name, output_file_name) - - def do_tei_to_dictionary_single(self): - print('Converting single TEI to dictionary ...') - input_file_name = self.file_map['tei-single-ids'] - output_file_name = self.file_map['dictionary-single'] - tei_to_dictionary(input_file_name, output_file_name) - - def do_tei_to_dictionary_multiple(self): - print('Converting multiple TEI to dictionary ...') - input_file_name = self.file_map['tei-multiple-ids-2'] - output_file_name = self.file_map['dictionary-multiple'] - tei_to_dictionary(input_file_name, output_file_name) - - def do_find_structure_units_first(self): - print('Finding units for existing structures ...') - self._do_find_structure_units(self.file_map['structures-old'], self.file_map['tei-multiple'], self.file_map['mwes-1']) - - def do_find_structure_units_second(self): - print('Finding units for extended structures ...') - self._do_find_structure_units(self.file_map['structures-new'], self.file_map['tei-multiple'], self.file_map['mwes-2']) - - def _do_find_structure_units(self, structure_file_name, tei_file_name, csv_file_name): + def do_find_collocation_structure_units(self): + print('Finding units for existing collocation structures ...') from wani import main as wani_main namespace = SimpleNamespace() # relevant values - namespace.structures = structure_file_name - namespace.input = [tei_file_name] - namespace.all = csv_file_name + namespace.structures = self.file_map['structures-old'] + namespace.input = [self.file_map['tei-initial']] + namespace.all = self.file_map['collocations'] namespace.skip_id_check = True namespace.fixed_restriction_order = True namespace.new_tei = True @@ -221,39 +178,26 @@ class Pipeline: wani_main(namespace) - - def _find_min_other_id(self, key): - try: - root = lxml.parse(self.file_map[key]) - other_ids = [int(oid) for oid in root.xpath('syntactic_structure[@type="other"]/@id')] - min_id = min(other_ids) - except: - min_id = 109 # This is the current value in structures.xml, and is not expected to change. Ugly, but code shouldn't reach here ... - return min_id - - def do_assign_multiple_first(self): - print('Assigning ids based on existing structures ...') - min_other_id = self._find_min_other_id('structures-old') - assign_multiple(self.file_map['tei-multiple'], self.file_map['mwes-1'], self.file_map['tei-multiple-ids-1'], min_other_id) - - def do_assign_multiple_second(self): - print('Assigning ids based on extended structures ...') - min_other_id = self._find_min_other_id('structures-new') - assign_multiple(self.file_map['tei-multiple'], self.file_map['mwes-2'], self.file_map['tei-multiple-ids-2'], min_other_id) - - def do_create_structures(self): - print('Creating missing structures ...') - input_file_name = self.file_map['structures-old'] - tei_file_name = self.file_map['tei-multiple-ids-1'] - output_file_name = self.file_map['structures-new'] - create_structures(input_file_name, tei_file_name, output_file_name) - - def do_merge_dictionaries(self): - print('Merging single and multiple dictionaries ...') - single_file_name = self.file_map['dictionary-single'] - multiple_file_name = self.file_map['dictionary-multiple'] + def do_assign_collocation_structures(self): + print('Assigning ids of collocation structures ...') + input_file_name = self.file_map['tei-initial'] + collocations_file_name = self.file_map['collocations'] + output_file_name = self.file_map['tei-ids-collocation'] + assign_collocation_structures(input_file_name, collocations_file_name, output_file_name) + + def do_assign_other_structures(self): + print('Assigning ids of single and other structures, creating if necessary ...') + input_file_name = self.file_map['tei-ids-collocation'] + structure_old_file_name = self.file_map['structures-old'] + output_file_name = self.file_map['tei-ids-all'] + structure_new_file_name = self.file_map['structures-new'] + assign_other_structures(input_file_name, structure_old_file_name, output_file_name, structure_new_file_name) + + def do_tei_to_dictionary(self): + print('Converting TEI to dictionary ...') + input_file_name = self.file_map['tei-ids-all'] output_file_name = self.file_map['dictionary'] - merge_dictionaries(single_file_name, multiple_file_name, output_file_name) + tei_to_dictionary(input_file_name, output_file_name) def _do_validate(self, schema_file_name, xml_file_name): xml_schema = lxml.XMLSchema(lxml.parse(schema_file_name)) diff --git a/package/structure_assignment/split_tei.py b/package/structure_assignment/split_tei.py deleted file mode 100644 index f1b36f7..0000000 --- a/package/structure_assignment/split_tei.py +++ /dev/null @@ -1,38 +0,0 @@ -import argparse -import lxml.etree as lxml - - -def xpath_find(element,expression): - return element.xpath(expression, namespaces={'tei':'http://www.tei-c.org/ns/1.0'}) - - -def count_tokens(paragraph): - return len(xpath_find(paragraph, './/tei:w|.//tei:pc')) - - -def split(input_file_name, single_file_name, multiple_file_name): - - tree = lxml.parse(input_file_name) - root = tree.getroot() - paragraphs = xpath_find(root, './/tei:p') - for paragraph in paragraphs: - if (count_tokens(paragraph) > 1): - paragraph.getparent().remove(paragraph) - tree.write(single_file_name, encoding='UTF-8', pretty_print=True) - - tree = lxml.parse(input_file_name) - root = tree.getroot() - paragraphs = xpath_find(root, './/tei:p') - for paragraph in paragraphs: - if (count_tokens(paragraph) == 1): - paragraph.getparent().remove(paragraph) - tree.write(multiple_file_name, encoding='UTF-8', pretty_print=True) - - -if (__name__ == '__main__'): - arg_parser = argparse.ArgumentParser(description='Split input TEI into single and multiple token units.') - arg_parser.add_argument('-infile', type=str, required=True, help='Input TEI file') - arg_parser.add_argument('-single', type=str, required=True, help='Output single token TEI file') - arg_parser.add_argument('-multiple', type=str, required=True, help='Output multiple token TEI file') - arguments = arg_parser.parse_args() - split(arguments.infile, arguments.single, arguments.multiple)