Redmine #1835: minor improvements

This commit is contained in:
Cyprian Laskowski 2021-03-15 16:24:01 +01:00
parent 08c291b5db
commit cb6960f58a
3 changed files with 30 additions and 9 deletions

View File

@ -27,7 +27,7 @@ def create_nlp(resource_directory):
class Pipeline: class Pipeline:
def __init__(self, nlp, resource_directory): def __init__(self, resource_directory, nlp=None):
self.nlp = nlp self.nlp = nlp
self.tmp_directory = tempfile.mkdtemp() self.tmp_directory = tempfile.mkdtemp()
resource_file_names = [resource_directory + '/' + f for f in os.listdir(resource_directory)] resource_file_names = [resource_directory + '/' + f for f in os.listdir(resource_directory)]
@ -42,16 +42,19 @@ class Pipeline:
shutil.copyfile(file_name, self.file_map[file_key]) shutil.copyfile(file_name, self.file_map[file_key])
def do_tokenise(self): def do_tokenise(self):
print('Tokenising with obeliks ...')
input_file_name = self.file_map['strings-list'] input_file_name = self.file_map['strings-list']
output_file_name = self.file_map['obeliks-tokenised'] output_file_name = self.file_map['obeliks-tokenised']
obeliks.run(in_file=input_file_name, out_file=output_file_name, conllu=True) obeliks.run(in_file=input_file_name, out_file=output_file_name, conllu=True)
def do_tweak_conllu(self): def do_tweak_conllu(self):
print('Tweaking conllu ...')
input_file_name = self.file_map['obeliks-tokenised'] input_file_name = self.file_map['obeliks-tokenised']
output_file_name = self.file_map['obeliks-tweaked'] output_file_name = self.file_map['obeliks-tweaked']
tweak_conllu(input_file_name, output_file_name) tweak_conllu(input_file_name, output_file_name)
def do_parse(self): def do_parse(self):
print('Parsing with classla ...')
input_file_name = self.file_map['obeliks-tweaked'] input_file_name = self.file_map['obeliks-tweaked']
output_file_name = self.file_map['classla-parsed'] output_file_name = self.file_map['classla-parsed']
doc = Document(text=None) doc = Document(text=None)
@ -61,42 +64,50 @@ class Pipeline:
result.conll_file.write_conll(output_file_name) result.conll_file.write_conll(output_file_name)
def do_translate_jos(self): def do_translate_jos(self):
print('Translating JOS ...')
input_file_name = self.file_map['classla-parsed'] input_file_name = self.file_map['classla-parsed']
dictionary_file_name = self.file_map['dict'] dictionary_file_name = self.file_map['dict']
output_file_name = self.file_map['classla-translated'] output_file_name = self.file_map['classla-translated']
translate_jos(input_file_name, dictionary_file_name, output_file_name) translate_jos(input_file_name, dictionary_file_name, output_file_name)
def do_conllu_to_tei(self): def do_conllu_to_tei(self):
print('Converting to TEI ...')
input_file_name = self.file_map['classla-translated'] input_file_name = self.file_map['classla-translated']
output_file_name = self.file_map['tei-initial'] output_file_name = self.file_map['tei-initial']
conllu_to_tei(input_file_name, output_file_name) conllu_to_tei(input_file_name, output_file_name)
def do_split_tei(self): def do_split_tei(self):
print('Splitting TEI ...')
input_file_name = self.file_map['tei-initial'] input_file_name = self.file_map['tei-initial']
output_single_file_name = self.file_map['tei-single'] output_single_file_name = self.file_map['tei-single']
output_multiple_file_name = self.file_map['tei-multiple'] output_multiple_file_name = self.file_map['tei-multiple']
split_tei(input_file_name, output_single_file_name, output_multiple_file_name) split_tei(input_file_name, output_single_file_name, output_multiple_file_name)
def do_assign_single(self): def do_assign_single(self):
print('Assigning single structures ...')
input_file_name = self.file_map['tei-single'] input_file_name = self.file_map['tei-single']
structure_file_name = self.file_map['structures-old'] structure_file_name = self.file_map['structures-old']
output_file_name = self.file_map['tei-single-ids'] output_file_name = self.file_map['tei-single-ids']
assign_single(input_file_name, structure_file_name, output_file_name) assign_single(input_file_name, structure_file_name, output_file_name)
def do_tei_to_dictionary_single(self): def do_tei_to_dictionary_single(self):
print('Converting single TEI to dictionary ...')
input_file_name = self.file_map['tei-single-ids'] input_file_name = self.file_map['tei-single-ids']
output_file_name = self.file_map['dictionary-single'] output_file_name = self.file_map['dictionary-single']
tei_to_dictionary(input_file_name, output_file_name) tei_to_dictionary(input_file_name, output_file_name)
def do_tei_to_dictionary_multiple(self): def do_tei_to_dictionary_multiple(self):
print('Converting multiple TEI to dictionary ...')
input_file_name = self.file_map['tei-multiple-ids-2'] input_file_name = self.file_map['tei-multiple-ids-2']
output_file_name = self.file_map['dictionary-multiple'] output_file_name = self.file_map['dictionary-multiple']
tei_to_dictionary(input_file_name, output_file_name) tei_to_dictionary(input_file_name, output_file_name)
def do_find_structure_units_first(self): def do_find_structure_units_first(self):
print('Finding units for existing structures ...')
self._do_find_structure_units(self.file_map['structures-old'], self.file_map['tei-multiple'], self.file_map['mwes-1']) self._do_find_structure_units(self.file_map['structures-old'], self.file_map['tei-multiple'], self.file_map['mwes-1'])
def do_find_structure_units_second(self): def do_find_structure_units_second(self):
print('Finding units for extended structures ...')
self._do_find_structure_units(self.file_map['structures-new'], self.file_map['tei-multiple'], self.file_map['mwes-2']) self._do_find_structure_units(self.file_map['structures-new'], self.file_map['tei-multiple'], self.file_map['mwes-2'])
def _do_find_structure_units(self, structure_file_name, tei_file_name, csv_file_name): def _do_find_structure_units(self, structure_file_name, tei_file_name, csv_file_name):
@ -145,20 +156,24 @@ class Pipeline:
return min_id return min_id
def do_assign_multiple_first(self): def do_assign_multiple_first(self):
print('Assigning ids based on existing structures ...')
min_other_id = self._find_min_other_id('structures-old') min_other_id = self._find_min_other_id('structures-old')
assign_multiple(self.file_map['tei-multiple'], self.file_map['mwes-1'], self.file_map['tei-multiple-ids-1'], min_other_id) assign_multiple(self.file_map['tei-multiple'], self.file_map['mwes-1'], self.file_map['tei-multiple-ids-1'], min_other_id)
def do_assign_multiple_second(self): def do_assign_multiple_second(self):
print('Assigning ids based on extended structures ...')
min_other_id = self._find_min_other_id('structures-new') min_other_id = self._find_min_other_id('structures-new')
assign_multiple(self.file_map['tei-multiple'], self.file_map['mwes-2'], self.file_map['tei-multiple-ids-2'], min_other_id) assign_multiple(self.file_map['tei-multiple'], self.file_map['mwes-2'], self.file_map['tei-multiple-ids-2'], min_other_id)
def do_create_structures(self): def do_create_structures(self):
print('Creating missing structures ...')
input_file_name = self.file_map['structures-old'] input_file_name = self.file_map['structures-old']
tei_file_name = self.file_map['tei-multiple-ids-1'] tei_file_name = self.file_map['tei-multiple-ids-1']
output_file_name = self.file_map['structures-new'] output_file_name = self.file_map['structures-new']
create_structures(input_file_name, tei_file_name, output_file_name) create_structures(input_file_name, tei_file_name, output_file_name)
def do_merge_dictionaries(self): def do_merge_dictionaries(self):
print('Merging single and multiple dictionaries ...')
single_file_name = self.file_map['dictionary-single'] single_file_name = self.file_map['dictionary-single']
multiple_file_name = self.file_map['dictionary-multiple'] multiple_file_name = self.file_map['dictionary-multiple']
output_file_name = self.file_map['dictionary'] output_file_name = self.file_map['dictionary']
@ -170,11 +185,13 @@ class Pipeline:
xml_schema.assertValid(xml_tree) xml_schema.assertValid(xml_tree)
def do_validate_structures(self): def do_validate_structures(self):
print('Validating structures ...')
schema_file_name = self.file_map['structure-schema'] schema_file_name = self.file_map['structure-schema']
xml_file_name = self.file_map['structures-new'] xml_file_name = self.file_map['structures-new']
self._do_validate(schema_file_name, xml_file_name) self._do_validate(schema_file_name, xml_file_name)
def do_validate_dictionary(self): def do_validate_dictionary(self):
print('Validating dictionary ...')
schema_file_name = self.file_map['dictionary-schema'] schema_file_name = self.file_map['dictionary-schema']
xml_file_name = self.file_map['dictionary'] xml_file_name = self.file_map['dictionary']
self._do_validate(schema_file_name, xml_file_name) self._do_validate(schema_file_name, xml_file_name)

View File

@ -1,7 +1,7 @@
/classla /classla
/dict.xml /dict.xml
/obeliks.jar
/structures.xml /structures.xml
/structures.xsd /structures.xsd
/inventory.xsd /inventory.xsd
/monolingual_dictionaries.xsd /monolingual_dictionaries.xsd
/wani.py

View File

@ -1,23 +1,27 @@
import argparse import argparse
import tempfile
import os
from structure_assignment.pipeline import Pipeline, create_nlp from structure_assignment.pipeline import Pipeline, create_nlp
resource_directory = '../resources' resource_directory = '../resources'
def run_all(input_file_name, output_file_name, nlp, structure_file_name): def run_all(input_file_name, output_file_name, nlp, structure_file_name):
tmp_file_name = '/tmp/tmp.xml' # TODO: do better than this tmp_file_name = tempfile.mkstemp()[1]
string_to_parse(input_file_name, tmp_file_name, nlp) string_to_parse(input_file_name, tmp_file_name, nlp)
parse_to_dictionary(tmp_file_name, output_file_name, structure_file_name) parse_to_dictionary(tmp_file_name, output_file_name, structure_file_name)
os.remove(tmp_file_name)
validate_structures(structure_file_name) validate_structures(structure_file_name)
validate_dictionary(output_file_name) validate_dictionary(output_file_name)
def strings_to_dictionary(input_file_name, output_file_name, nlp, structure_file_name): def strings_to_dictionary(input_file_name, output_file_name, nlp, structure_file_name):
tmp_file_name = '/tmp/tmp.xml' # TODO: do better than this tmp_file_name = tempfile.mkstemp()[1]
string_to_parse(input_file_name, tmp_file_name, nlp) string_to_parse(input_file_name, tmp_file_name, nlp)
parse_to_dictionary(tmp_file_name, output_file_name, structure_file_name) parse_to_dictionary(tmp_file_name, output_file_name, structure_file_name)
os.remove(tmp_file_name)
def strings_to_parse(input_file_name, output_file_name, nlp): def strings_to_parse(input_file_name, output_file_name, nlp):
pipeline = Pipeline(nlp, resource_directory) pipeline = Pipeline(resource_directory, nlp)
pipeline.import_file(input_file_name, 'strings-list') pipeline.import_file(input_file_name, 'strings-list')
pipeline.do_tokenise() pipeline.do_tokenise()
pipeline.do_tweak_conllu() pipeline.do_tweak_conllu()
@ -29,7 +33,7 @@ def strings_to_parse(input_file_name, output_file_name, nlp):
pipeline.cleanup() pipeline.cleanup()
def parse_to_dictionary(input_file_name, output_file_name, structure_file_name): def parse_to_dictionary(input_file_name, output_file_name, structure_file_name):
pipeline = Pipeline(None, resource_directory) pipeline = Pipeline(resource_directory)
pipeline.import_file(input_file_name, 'tei-initial') pipeline.import_file(input_file_name, 'tei-initial')
pipeline.do_split_tei() pipeline.do_split_tei()
pipeline.do_assign_single() pipeline.do_assign_single()
@ -46,13 +50,13 @@ def parse_to_dictionary(input_file_name, output_file_name, structure_file_name):
pipeline.cleanup() pipeline.cleanup()
def validate_structures(input_file_name): def validate_structures(input_file_name):
pipeline = Pipeline(None, resource_directory) pipeline = Pipeline(resource_directory)
pipeline.import_file(input_file_name, 'structures-new') pipeline.import_file(input_file_name, 'structures-new')
pipeline.do_validate_structures() pipeline.do_validate_structures()
pipeline.cleanup() pipeline.cleanup()
def validate_dictionary(input_file_name): def validate_dictionary(input_file_name):
pipeline = Pipeline(None, resource_directory) pipeline = Pipeline(resource_directory)
pipeline.import_file(input_file_name, 'dictionary') pipeline.import_file(input_file_name, 'dictionary')
pipeline.do_validate_dictionary() pipeline.do_validate_dictionary()
pipeline.cleanup() pipeline.cleanup()
@ -85,4 +89,4 @@ if (__name__ == '__main__'):
elif (part_name == 'validate_dictionary'): elif (part_name == 'validate_dictionary'):
validate_dictionary(input_file_name) validate_dictionary(input_file_name)
elif (part_name == 'all'): elif (part_name == 'all'):
run_all(input_file_name) run_all(input_file_name, output_file_name, nlp, structure_file_name)