From cb6960f58a011c183de3c1d692bba4e3b9656e2e Mon Sep 17 00:00:00 2001
From: Cyprian Laskowski
Date: Mon, 15 Mar 2021 16:24:01 +0100
Subject: [PATCH] Redmine #1835: minor improvements

---
 package/structure_assignment/pipeline.py | 19 ++++++++++++++++++-
 resources/.gitignore                     |  2 +-
 scripts/process.py                       | 18 +++++++++++-------
 3 files changed, 30 insertions(+), 9 deletions(-)

diff --git a/package/structure_assignment/pipeline.py b/package/structure_assignment/pipeline.py
index 3fff914..dfb2913 100644
--- a/package/structure_assignment/pipeline.py
+++ b/package/structure_assignment/pipeline.py
@@ -27,7 +27,7 @@ def create_nlp(resource_directory):
 
 class Pipeline:
 
-    def __init__(self, nlp, resource_directory):
+    def __init__(self, resource_directory, nlp=None):
         self.nlp = nlp
         self.tmp_directory = tempfile.mkdtemp()
         resource_file_names = [resource_directory + '/' + f for f in os.listdir(resource_directory)]
@@ -42,16 +42,19 @@ class Pipeline:
         shutil.copyfile(file_name, self.file_map[file_key])
 
     def do_tokenise(self):
+        print('Tokenising with obeliks ...')
         input_file_name = self.file_map['strings-list']
         output_file_name = self.file_map['obeliks-tokenised']
         obeliks.run(in_file=input_file_name, out_file=output_file_name, conllu=True)
 
     def do_tweak_conllu(self):
+        print('Tweaking conllu ...')
         input_file_name = self.file_map['obeliks-tokenised']
         output_file_name = self.file_map['obeliks-tweaked']
         tweak_conllu(input_file_name, output_file_name)
 
     def do_parse(self):
+        print('Parsing with classla ...')
         input_file_name = self.file_map['obeliks-tweaked']
         output_file_name = self.file_map['classla-parsed']
         doc = Document(text=None)
@@ -61,42 +64,50 @@ class Pipeline:
         result.conll_file.write_conll(output_file_name)
 
     def do_translate_jos(self):
+        print('Translating JOS ...')
         input_file_name = self.file_map['classla-parsed']
         dictionary_file_name = self.file_map['dict']
         output_file_name = self.file_map['classla-translated']
         translate_jos(input_file_name, dictionary_file_name, output_file_name)
 
     def do_conllu_to_tei(self):
+        print('Converting to TEI ...')
         input_file_name = self.file_map['classla-translated']
         output_file_name = self.file_map['tei-initial']
         conllu_to_tei(input_file_name, output_file_name)
 
     def do_split_tei(self):
+        print('Splitting TEI ...')
         input_file_name = self.file_map['tei-initial']
         output_single_file_name = self.file_map['tei-single']
         output_multiple_file_name = self.file_map['tei-multiple']
         split_tei(input_file_name, output_single_file_name, output_multiple_file_name)
 
     def do_assign_single(self):
+        print('Assigning single structures ...')
         input_file_name = self.file_map['tei-single']
         structure_file_name = self.file_map['structures-old']
         output_file_name = self.file_map['tei-single-ids']
         assign_single(input_file_name, structure_file_name, output_file_name)
 
     def do_tei_to_dictionary_single(self):
+        print('Converting single TEI to dictionary ...')
         input_file_name = self.file_map['tei-single-ids']
         output_file_name = self.file_map['dictionary-single']
         tei_to_dictionary(input_file_name, output_file_name)
 
     def do_tei_to_dictionary_multiple(self):
+        print('Converting multiple TEI to dictionary ...')
        input_file_name = self.file_map['tei-multiple-ids-2']
         output_file_name = self.file_map['dictionary-multiple']
         tei_to_dictionary(input_file_name, output_file_name)
 
     def do_find_structure_units_first(self):
+        print('Finding units for existing structures ...')
         self._do_find_structure_units(self.file_map['structures-old'], self.file_map['tei-multiple'], self.file_map['mwes-1'])
 
     def do_find_structure_units_second(self):
+        print('Finding units for extended structures ...')
         self._do_find_structure_units(self.file_map['structures-new'], self.file_map['tei-multiple'], self.file_map['mwes-2'])
 
     def _do_find_structure_units(self, structure_file_name, tei_file_name, csv_file_name):
@@ -145,20 +156,24 @@ class Pipeline:
         return min_id
 
     def do_assign_multiple_first(self):
+        print('Assigning ids based on existing structures ...')
         min_other_id = self._find_min_other_id('structures-old')
         assign_multiple(self.file_map['tei-multiple'], self.file_map['mwes-1'], self.file_map['tei-multiple-ids-1'], min_other_id)
 
     def do_assign_multiple_second(self):
+        print('Assigning ids based on extended structures ...')
         min_other_id = self._find_min_other_id('structures-new')
         assign_multiple(self.file_map['tei-multiple'], self.file_map['mwes-2'], self.file_map['tei-multiple-ids-2'], min_other_id)
 
     def do_create_structures(self):
+        print('Creating missing structures ...')
         input_file_name = self.file_map['structures-old']
         tei_file_name = self.file_map['tei-multiple-ids-1']
         output_file_name = self.file_map['structures-new']
         create_structures(input_file_name, tei_file_name, output_file_name)
 
     def do_merge_dictionaries(self):
+        print('Merging single and multiple dictionaries ...')
         single_file_name = self.file_map['dictionary-single']
         multiple_file_name = self.file_map['dictionary-multiple']
         output_file_name = self.file_map['dictionary']
@@ -170,11 +185,13 @@ class Pipeline:
         xml_schema.assertValid(xml_tree)
 
     def do_validate_structures(self):
+        print('Validating structures ...')
         schema_file_name = self.file_map['structure-schema']
         xml_file_name = self.file_map['structures-new']
         self._do_validate(schema_file_name, xml_file_name)
 
     def do_validate_dictionary(self):
+        print('Validating dictionary ...')
         schema_file_name = self.file_map['dictionary-schema']
         xml_file_name = self.file_map['dictionary']
         self._do_validate(schema_file_name, xml_file_name)
diff --git a/resources/.gitignore b/resources/.gitignore
index b40c628..5006d66 100644
--- a/resources/.gitignore
+++ b/resources/.gitignore
@@ -1,7 +1,7 @@
 /classla
 /dict.xml
-/obeliks.jar
 /structures.xml
 /structures.xsd
 /inventory.xsd
 /monolingual_dictionaries.xsd
+/wani.py
diff --git a/scripts/process.py b/scripts/process.py
index 504a1e4..fcfb48a 100644
--- a/scripts/process.py
+++ b/scripts/process.py
@@ -1,23 +1,27 @@
 import argparse
+import tempfile
+import os
 
 from structure_assignment.pipeline import Pipeline, create_nlp
 
 resource_directory = '../resources'
 
 def run_all(input_file_name, output_file_name, nlp, structure_file_name):
-    tmp_file_name = '/tmp/tmp.xml' # TODO: do better than this
+    tmp_file_name = tempfile.mkstemp()[1]  # mkstemp returns (fd, path)
     string_to_parse(input_file_name, tmp_file_name, nlp)
     parse_to_dictionary(tmp_file_name, output_file_name, structure_file_name)
+    os.remove(tmp_file_name)
     validate_structures(structure_file_name)
     validate_dictionary(output_file_name)
 
 def strings_to_dictionary(input_file_name, output_file_name, nlp, structure_file_name):
-    tmp_file_name = '/tmp/tmp.xml' # TODO: do better than this
+    tmp_file_name = tempfile.mkstemp()[1]  # mkstemp returns (fd, path)
     string_to_parse(input_file_name, tmp_file_name, nlp)
     parse_to_dictionary(tmp_file_name, output_file_name, structure_file_name)
+    os.remove(tmp_file_name)
 
 def strings_to_parse(input_file_name, output_file_name, nlp):
-    pipeline = Pipeline(nlp, resource_directory)
+    pipeline = Pipeline(resource_directory, nlp)
     pipeline.import_file(input_file_name, 'strings-list')
     pipeline.do_tokenise()
     pipeline.do_tweak_conllu()
@@ -29,7 +33,7 @@ def strings_to_parse(input_file_name, output_file_name, nlp):
     pipeline.cleanup()
 
 def parse_to_dictionary(input_file_name, output_file_name, structure_file_name):
-    pipeline = Pipeline(None, resource_directory)
+    pipeline = Pipeline(resource_directory)
     pipeline.import_file(input_file_name, 'tei-initial')
     pipeline.do_split_tei()
     pipeline.do_assign_single()
@@ -46,13 +50,13 @@ def parse_to_dictionary(input_file_name, output_file_name, structure_file_name):
     pipeline.cleanup()
 
 def validate_structures(input_file_name):
-    pipeline = Pipeline(None, resource_directory)
+    pipeline = Pipeline(resource_directory)
     pipeline.import_file(input_file_name, 'structures-new')
     pipeline.do_validate_structures()
     pipeline.cleanup()
 
 def validate_dictionary(input_file_name):
-    pipeline = Pipeline(None, resource_directory)
+    pipeline = Pipeline(resource_directory)
     pipeline.import_file(input_file_name, 'dictionary')
     pipeline.do_validate_dictionary()
     pipeline.cleanup()
@@ -85,4 +89,4 @@ if (__name__ == '__main__'):
     elif (part_name == 'validate_dictionary'):
         validate_dictionary(input_file_name)
     elif (part_name == 'all'):
-        run_all(input_file_name)
+        run_all(input_file_name, output_file_name, nlp, structure_file_name)
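
A minimal usage sketch of the reworked constructor (illustrative only, not
applied by this patch; the resource path is the placeholder already used in
scripts/process.py):

    from structure_assignment.pipeline import Pipeline, create_nlp

    resource_directory = '../resources'

    # Parsing stages still need the classla NLP object up front ...
    nlp = create_nlp(resource_directory)
    parsing_pipeline = Pipeline(resource_directory, nlp)

    # ... while validation-only callers can now omit nlp entirely,
    # instead of passing an explicit None as the first argument.
    validation_pipeline = Pipeline(resource_directory)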