Redmine #2198: Adapted pipeline to limit wani.py to collocation structures
This commit is contained in:
		
							parent
							
								
									3c38cdbcae
								
							
						
					
					
						commit
						616e7823df
					
				| @ -6,17 +6,11 @@ FILE_MAP = {'strings-list': 'strings.txt', | |||||||
|             'dict': 'dict.xml', |             'dict': 'dict.xml', | ||||||
|             'structure-schema': 'structures.xsd', |             'structure-schema': 'structures.xsd', | ||||||
|             'tei-initial': 'tei_initial.xml', |             'tei-initial': 'tei_initial.xml', | ||||||
|             'tei-single': 'tei_single.xml', |             'tei-ids-collocation': 'tei_ids_collocations.xml', | ||||||
|             'tei-single-ids': 'tei_single_with_ids.xml', |             'tei-ids-all': 'tei_ids_all.xml', | ||||||
|             'tei-multiple': 'tei_multiple.xml', |             'collocations': 'collocation_matches.csv', | ||||||
|             'tei-multiple-ids-1': 'tei_multiple_with_ids1.xml', |  | ||||||
|             'tei-multiple-ids-2': 'tei_multiple_with_ids2.xml', |  | ||||||
|             'mwes-1': 'mwes1.csv', |  | ||||||
|             'mwes-2': 'mwes2.csv', |  | ||||||
|             'structures-old': 'structures_old.xml', |             'structures-old': 'structures_old.xml', | ||||||
|             'structures-new': 'structures_new.xml', |             'structures-new': 'structures_new.xml', | ||||||
|             'dictionary-single': 'dictionary_single.xml', |  | ||||||
|             'dictionary-multiple': 'dictionary_multiple.xml', |  | ||||||
|             'dictionary': 'dictionary.xml', |             'dictionary': 'dictionary.xml', | ||||||
|             'dictionary-schema': 'monolingual_dictionaries.xsd' |             'dictionary-schema': 'monolingual_dictionaries.xsd' | ||||||
| } | } | ||||||
|  | |||||||
| @ -1,27 +0,0 @@ | |||||||
| import argparse |  | ||||||
| import re |  | ||||||
| import lxml.etree as lxml |  | ||||||
| 
 |  | ||||||
| def get_entries(input_file_name): |  | ||||||
|     return list(lxml.parse(input_file_name).getroot()) |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| def merge(single_file_name, multiple_file_name, output_file_name): |  | ||||||
|     entries = get_entries(single_file_name) + get_entries(multiple_file_name) |  | ||||||
|     entries.sort(key=lambda entry: int(re.search('^s(\d+)\.\d+$', entry.get('sid')).group(1))) |  | ||||||
| 
 |  | ||||||
|     root = lxml.Element('dictionary') |  | ||||||
|     for entry in entries: |  | ||||||
|         del entry.attrib['sid'] |  | ||||||
|         root.append(entry) |  | ||||||
|     tree = lxml.ElementTree(root) |  | ||||||
|     tree.write(output_file_name, encoding='UTF-8', pretty_print=True) |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| if (__name__ == '__main__'): |  | ||||||
|     arg_parser = argparse.ArgumentParser(description='Split input TEI into single and multiple token units.') |  | ||||||
|     arg_parser.add_argument('-single', type=str, required=True, help='Input single token dictionary') |  | ||||||
|     arg_parser.add_argument('-multiple', type=str, required=True, help='Input multiple token dictionary') |  | ||||||
|     arg_parser.add_argument('-outfile', type=str, required=True, help='Output merged dictionary') |  | ||||||
|     arguments = arg_parser.parse_args() |  | ||||||
|     merge(arguments.single, arguments.multiple, arguments.outfile) |  | ||||||
| @ -10,12 +10,9 @@ from structure_assignment.constants import * | |||||||
| from structure_assignment.tweak_conllu import tweak as tweak_conllu | from structure_assignment.tweak_conllu import tweak as tweak_conllu | ||||||
| from nova_slovnica.translate_jos import translate as translate_jos | from nova_slovnica.translate_jos import translate as translate_jos | ||||||
| from nova_slovnica.conllu_to_xml import convert_file as conllu_to_tei | from nova_slovnica.conllu_to_xml import convert_file as conllu_to_tei | ||||||
| from structure_assignment.split_tei import split as split_tei | from nova_slovnica.assign_collocation_structures import assign as assign_collocation_structures | ||||||
| from nova_slovnica.assign_single_structures import assign as assign_single | from nova_slovnica.assign_other_structures import assign as assign_other_structures | ||||||
| from nova_slovnica.assign_structures import assign as assign_multiple |  | ||||||
| from nova_slovnica.tei_to_dictionary import convert as tei_to_dictionary | from nova_slovnica.tei_to_dictionary import convert as tei_to_dictionary | ||||||
| from nova_slovnica.create_structures import create as create_structures |  | ||||||
| from structure_assignment.merge_dictionaries import merge as merge_dictionaries |  | ||||||
| 
 | 
 | ||||||
| class Runner: | class Runner: | ||||||
| 
 | 
 | ||||||
| @ -83,17 +80,10 @@ class Runner: | |||||||
|         pipeline.do_conllu_to_tei() |         pipeline.do_conllu_to_tei() | ||||||
| 
 | 
 | ||||||
|     def _parse_to_dictionary_sequence(self, pipeline): |     def _parse_to_dictionary_sequence(self, pipeline): | ||||||
|         pipeline.do_split_tei() |         pipeline.do_find_collocation_structure_units() | ||||||
|         pipeline.do_assign_single() |         pipeline.do_assign_collocation_structures() | ||||||
|         pipeline.do_tei_to_dictionary_single() |         pipeline.do_assign_other_structures() | ||||||
|         pipeline.do_find_structure_units_first() |         pipeline.do_tei_to_dictionary() | ||||||
|         pipeline.do_assign_multiple_first() |  | ||||||
|         pipeline.do_create_structures() |  | ||||||
|         pipeline.do_find_structure_units_second() |  | ||||||
|         pipeline.do_assign_multiple_second() |  | ||||||
|         pipeline.do_tei_to_dictionary_multiple() |  | ||||||
|         pipeline.do_merge_dictionaries() |  | ||||||
| 
 |  | ||||||
| 
 | 
 | ||||||
| class Pipeline: | class Pipeline: | ||||||
| 
 | 
 | ||||||
| @ -152,49 +142,16 @@ class Pipeline: | |||||||
|         output_file_name = self.file_map['tei-initial'] |         output_file_name = self.file_map['tei-initial'] | ||||||
|         conllu_to_tei(input_file_name, output_file_name) |         conllu_to_tei(input_file_name, output_file_name) | ||||||
| 
 | 
 | ||||||
|     def do_split_tei(self): |     def do_find_collocation_structure_units(self): | ||||||
|         print('Splitting TEI ...') |         print('Finding units for existing collocation structures ...') | ||||||
|         input_file_name = self.file_map['tei-initial'] |  | ||||||
|         output_single_file_name = self.file_map['tei-single'] |  | ||||||
|         output_multiple_file_name = self.file_map['tei-multiple'] |  | ||||||
|         split_tei(input_file_name, output_single_file_name, output_multiple_file_name) |  | ||||||
| 
 |  | ||||||
|     def do_assign_single(self): |  | ||||||
|         print('Assigning single structures ...') |  | ||||||
|         input_file_name = self.file_map['tei-single'] |  | ||||||
|         structure_file_name = self.file_map['structures-old'] |  | ||||||
|         output_file_name = self.file_map['tei-single-ids'] |  | ||||||
|         assign_single(input_file_name, structure_file_name, output_file_name) |  | ||||||
| 
 |  | ||||||
|     def do_tei_to_dictionary_single(self): |  | ||||||
|         print('Converting single TEI to dictionary ...') |  | ||||||
|         input_file_name = self.file_map['tei-single-ids'] |  | ||||||
|         output_file_name = self.file_map['dictionary-single'] |  | ||||||
|         tei_to_dictionary(input_file_name, output_file_name) |  | ||||||
| 
 |  | ||||||
|     def do_tei_to_dictionary_multiple(self): |  | ||||||
|         print('Converting multiple TEI to dictionary ...') |  | ||||||
|         input_file_name = self.file_map['tei-multiple-ids-2'] |  | ||||||
|         output_file_name = self.file_map['dictionary-multiple'] |  | ||||||
|         tei_to_dictionary(input_file_name, output_file_name) |  | ||||||
| 
 |  | ||||||
|     def do_find_structure_units_first(self): |  | ||||||
|         print('Finding units for existing structures ...') |  | ||||||
|         self._do_find_structure_units(self.file_map['structures-old'], self.file_map['tei-multiple'], self.file_map['mwes-1']) |  | ||||||
| 
 |  | ||||||
|     def do_find_structure_units_second(self): |  | ||||||
|         print('Finding units for extended structures ...') |  | ||||||
|         self._do_find_structure_units(self.file_map['structures-new'], self.file_map['tei-multiple'], self.file_map['mwes-2']) |  | ||||||
| 
 |  | ||||||
|     def _do_find_structure_units(self, structure_file_name, tei_file_name, csv_file_name): |  | ||||||
| 
 | 
 | ||||||
|         from wani import main as wani_main |         from wani import main as wani_main | ||||||
|         namespace = SimpleNamespace() |         namespace = SimpleNamespace() | ||||||
| 
 | 
 | ||||||
|         # relevant values |         # relevant values | ||||||
|         namespace.structures = structure_file_name |         namespace.structures = self.file_map['structures-old'] | ||||||
|         namespace.input = [tei_file_name] |         namespace.input = [self.file_map['tei-initial']] | ||||||
|         namespace.all = csv_file_name |         namespace.all = self.file_map['collocations'] | ||||||
|         namespace.skip_id_check = True |         namespace.skip_id_check = True | ||||||
|         namespace.fixed_restriction_order = True |         namespace.fixed_restriction_order = True | ||||||
|         namespace.new_tei = True |         namespace.new_tei = True | ||||||
| @ -221,39 +178,26 @@ class Pipeline: | |||||||
|           |           | ||||||
|         wani_main(namespace) |         wani_main(namespace) | ||||||
| 
 | 
 | ||||||
|  |     def do_assign_collocation_structures(self): | ||||||
|  |         print('Assigning ids of collocation structures ...') | ||||||
|  |         input_file_name = self.file_map['tei-initial'] | ||||||
|  |         collocations_file_name = self.file_map['collocations'] | ||||||
|  |         output_file_name = self.file_map['tei-ids-collocation'] | ||||||
|  |         assign_collocation_structures(input_file_name, collocations_file_name, output_file_name) | ||||||
| 
 | 
 | ||||||
|     def _find_min_other_id(self, key): |     def do_assign_other_structures(self): | ||||||
|         try: |         print('Assigning ids of single and other structures, creating if necessary ...') | ||||||
|             root = lxml.parse(self.file_map[key]) |         input_file_name = self.file_map['tei-ids-collocation'] | ||||||
|             other_ids = [int(oid) for oid in root.xpath('syntactic_structure[@type="other"]/@id')] |         structure_old_file_name = self.file_map['structures-old'] | ||||||
|             min_id = min(other_ids) |         output_file_name = self.file_map['tei-ids-all'] | ||||||
|         except: |         structure_new_file_name = self.file_map['structures-new'] | ||||||
|             min_id = 109 # This is the current value in structures.xml, and is not expected to change. Ugly, but code shouldn't reach here ... |         assign_other_structures(input_file_name, structure_old_file_name, output_file_name, structure_new_file_name) | ||||||
|         return min_id |  | ||||||
| 
 | 
 | ||||||
|     def do_assign_multiple_first(self): |     def do_tei_to_dictionary(self): | ||||||
|         print('Assigning ids based on existing structures ...') |         print('Converting TEI to dictionary ...') | ||||||
|         min_other_id = self._find_min_other_id('structures-old') |         input_file_name = self.file_map['tei-ids-all'] | ||||||
|         assign_multiple(self.file_map['tei-multiple'], self.file_map['mwes-1'], self.file_map['tei-multiple-ids-1'], min_other_id) |  | ||||||
| 
 |  | ||||||
|     def do_assign_multiple_second(self): |  | ||||||
|         print('Assigning ids based on extended structures ...') |  | ||||||
|         min_other_id = self._find_min_other_id('structures-new') |  | ||||||
|         assign_multiple(self.file_map['tei-multiple'], self.file_map['mwes-2'], self.file_map['tei-multiple-ids-2'], min_other_id) |  | ||||||
| 
 |  | ||||||
|     def do_create_structures(self): |  | ||||||
|         print('Creating missing structures ...') |  | ||||||
|         input_file_name = self.file_map['structures-old'] |  | ||||||
|         tei_file_name = self.file_map['tei-multiple-ids-1'] |  | ||||||
|         output_file_name = self.file_map['structures-new'] |  | ||||||
|         create_structures(input_file_name, tei_file_name, output_file_name) |  | ||||||
| 
 |  | ||||||
|     def do_merge_dictionaries(self): |  | ||||||
|         print('Merging single and multiple dictionaries ...') |  | ||||||
|         single_file_name = self.file_map['dictionary-single'] |  | ||||||
|         multiple_file_name = self.file_map['dictionary-multiple'] |  | ||||||
|         output_file_name = self.file_map['dictionary'] |         output_file_name = self.file_map['dictionary'] | ||||||
|         merge_dictionaries(single_file_name, multiple_file_name, output_file_name) |         tei_to_dictionary(input_file_name, output_file_name) | ||||||
| 
 | 
 | ||||||
|     def _do_validate(self, schema_file_name, xml_file_name): |     def _do_validate(self, schema_file_name, xml_file_name): | ||||||
|         xml_schema = lxml.XMLSchema(lxml.parse(schema_file_name)) |         xml_schema = lxml.XMLSchema(lxml.parse(schema_file_name)) | ||||||
|  | |||||||
| @ -1,38 +0,0 @@ | |||||||
| import argparse |  | ||||||
| import lxml.etree as lxml |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| def xpath_find(element,expression): |  | ||||||
|     return element.xpath(expression, namespaces={'tei':'http://www.tei-c.org/ns/1.0'}) |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| def count_tokens(paragraph): |  | ||||||
|     return len(xpath_find(paragraph, './/tei:w|.//tei:pc')) |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| def split(input_file_name, single_file_name, multiple_file_name): |  | ||||||
| 
 |  | ||||||
|     tree = lxml.parse(input_file_name) |  | ||||||
|     root = tree.getroot() |  | ||||||
|     paragraphs = xpath_find(root, './/tei:p') |  | ||||||
|     for paragraph in paragraphs: |  | ||||||
|         if (count_tokens(paragraph) > 1): |  | ||||||
|             paragraph.getparent().remove(paragraph) |  | ||||||
|     tree.write(single_file_name, encoding='UTF-8', pretty_print=True) |  | ||||||
| 
 |  | ||||||
|     tree = lxml.parse(input_file_name) |  | ||||||
|     root = tree.getroot() |  | ||||||
|     paragraphs = xpath_find(root, './/tei:p') |  | ||||||
|     for paragraph in paragraphs: |  | ||||||
|         if (count_tokens(paragraph) == 1): |  | ||||||
|             paragraph.getparent().remove(paragraph) |  | ||||||
|     tree.write(multiple_file_name, encoding='UTF-8', pretty_print=True) |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| if (__name__ == '__main__'): |  | ||||||
|     arg_parser = argparse.ArgumentParser(description='Split input TEI into single and multiple token units.') |  | ||||||
|     arg_parser.add_argument('-infile', type=str, required=True, help='Input TEI file') |  | ||||||
|     arg_parser.add_argument('-single', type=str, required=True, help='Output single token TEI file') |  | ||||||
|     arg_parser.add_argument('-multiple', type=str, required=True, help='Output multiple token TEI file') |  | ||||||
|     arguments = arg_parser.parse_args() |  | ||||||
|     split(arguments.infile, arguments.single, arguments.multiple) |  | ||||||
		Loading…
	
		Reference in New Issue
	
	Block a user