Redmine #1835: minor improvements
This commit is contained in:
parent
08c291b5db
commit
cb6960f58a
|
@ -27,7 +27,7 @@ def create_nlp(resource_directory):
|
||||||
|
|
||||||
class Pipeline:
|
class Pipeline:
|
||||||
|
|
||||||
def __init__(self, nlp, resource_directory):
|
def __init__(self, resource_directory, nlp=None):
|
||||||
self.nlp = nlp
|
self.nlp = nlp
|
||||||
self.tmp_directory = tempfile.mkdtemp()
|
self.tmp_directory = tempfile.mkdtemp()
|
||||||
resource_file_names = [resource_directory + '/' + f for f in os.listdir(resource_directory)]
|
resource_file_names = [resource_directory + '/' + f for f in os.listdir(resource_directory)]
|
||||||
|
@ -42,16 +42,19 @@ class Pipeline:
|
||||||
shutil.copyfile(file_name, self.file_map[file_key])
|
shutil.copyfile(file_name, self.file_map[file_key])
|
||||||
|
|
||||||
def do_tokenise(self):
|
def do_tokenise(self):
|
||||||
|
print('Tokenising with obeliks ...')
|
||||||
input_file_name = self.file_map['strings-list']
|
input_file_name = self.file_map['strings-list']
|
||||||
output_file_name = self.file_map['obeliks-tokenised']
|
output_file_name = self.file_map['obeliks-tokenised']
|
||||||
obeliks.run(in_file=input_file_name, out_file=output_file_name, conllu=True)
|
obeliks.run(in_file=input_file_name, out_file=output_file_name, conllu=True)
|
||||||
|
|
||||||
def do_tweak_conllu(self):
|
def do_tweak_conllu(self):
|
||||||
|
print('Tweaking conllu ...')
|
||||||
input_file_name = self.file_map['obeliks-tokenised']
|
input_file_name = self.file_map['obeliks-tokenised']
|
||||||
output_file_name = self.file_map['obeliks-tweaked']
|
output_file_name = self.file_map['obeliks-tweaked']
|
||||||
tweak_conllu(input_file_name, output_file_name)
|
tweak_conllu(input_file_name, output_file_name)
|
||||||
|
|
||||||
def do_parse(self):
|
def do_parse(self):
|
||||||
|
print('Parsing with classla ...')
|
||||||
input_file_name = self.file_map['obeliks-tweaked']
|
input_file_name = self.file_map['obeliks-tweaked']
|
||||||
output_file_name = self.file_map['classla-parsed']
|
output_file_name = self.file_map['classla-parsed']
|
||||||
doc = Document(text=None)
|
doc = Document(text=None)
|
||||||
|
@ -61,42 +64,50 @@ class Pipeline:
|
||||||
result.conll_file.write_conll(output_file_name)
|
result.conll_file.write_conll(output_file_name)
|
||||||
|
|
||||||
def do_translate_jos(self):
|
def do_translate_jos(self):
|
||||||
|
print('Translating JOS ...')
|
||||||
input_file_name = self.file_map['classla-parsed']
|
input_file_name = self.file_map['classla-parsed']
|
||||||
dictionary_file_name = self.file_map['dict']
|
dictionary_file_name = self.file_map['dict']
|
||||||
output_file_name = self.file_map['classla-translated']
|
output_file_name = self.file_map['classla-translated']
|
||||||
translate_jos(input_file_name, dictionary_file_name, output_file_name)
|
translate_jos(input_file_name, dictionary_file_name, output_file_name)
|
||||||
|
|
||||||
def do_conllu_to_tei(self):
|
def do_conllu_to_tei(self):
|
||||||
|
print('Converting to TEI ...')
|
||||||
input_file_name = self.file_map['classla-translated']
|
input_file_name = self.file_map['classla-translated']
|
||||||
output_file_name = self.file_map['tei-initial']
|
output_file_name = self.file_map['tei-initial']
|
||||||
conllu_to_tei(input_file_name, output_file_name)
|
conllu_to_tei(input_file_name, output_file_name)
|
||||||
|
|
||||||
def do_split_tei(self):
|
def do_split_tei(self):
|
||||||
|
print('Splitting TEI ...')
|
||||||
input_file_name = self.file_map['tei-initial']
|
input_file_name = self.file_map['tei-initial']
|
||||||
output_single_file_name = self.file_map['tei-single']
|
output_single_file_name = self.file_map['tei-single']
|
||||||
output_multiple_file_name = self.file_map['tei-multiple']
|
output_multiple_file_name = self.file_map['tei-multiple']
|
||||||
split_tei(input_file_name, output_single_file_name, output_multiple_file_name)
|
split_tei(input_file_name, output_single_file_name, output_multiple_file_name)
|
||||||
|
|
||||||
def do_assign_single(self):
|
def do_assign_single(self):
|
||||||
|
print('Assigning single structures ...')
|
||||||
input_file_name = self.file_map['tei-single']
|
input_file_name = self.file_map['tei-single']
|
||||||
structure_file_name = self.file_map['structures-old']
|
structure_file_name = self.file_map['structures-old']
|
||||||
output_file_name = self.file_map['tei-single-ids']
|
output_file_name = self.file_map['tei-single-ids']
|
||||||
assign_single(input_file_name, structure_file_name, output_file_name)
|
assign_single(input_file_name, structure_file_name, output_file_name)
|
||||||
|
|
||||||
def do_tei_to_dictionary_single(self):
|
def do_tei_to_dictionary_single(self):
|
||||||
|
print('Converting single TEI to dictionary ...')
|
||||||
input_file_name = self.file_map['tei-single-ids']
|
input_file_name = self.file_map['tei-single-ids']
|
||||||
output_file_name = self.file_map['dictionary-single']
|
output_file_name = self.file_map['dictionary-single']
|
||||||
tei_to_dictionary(input_file_name, output_file_name)
|
tei_to_dictionary(input_file_name, output_file_name)
|
||||||
|
|
||||||
def do_tei_to_dictionary_multiple(self):
|
def do_tei_to_dictionary_multiple(self):
|
||||||
|
print('Converting multiple TEI to dictionary ...')
|
||||||
input_file_name = self.file_map['tei-multiple-ids-2']
|
input_file_name = self.file_map['tei-multiple-ids-2']
|
||||||
output_file_name = self.file_map['dictionary-multiple']
|
output_file_name = self.file_map['dictionary-multiple']
|
||||||
tei_to_dictionary(input_file_name, output_file_name)
|
tei_to_dictionary(input_file_name, output_file_name)
|
||||||
|
|
||||||
def do_find_structure_units_first(self):
|
def do_find_structure_units_first(self):
|
||||||
|
print('Finding units for existing structures ...')
|
||||||
self._do_find_structure_units(self.file_map['structures-old'], self.file_map['tei-multiple'], self.file_map['mwes-1'])
|
self._do_find_structure_units(self.file_map['structures-old'], self.file_map['tei-multiple'], self.file_map['mwes-1'])
|
||||||
|
|
||||||
def do_find_structure_units_second(self):
|
def do_find_structure_units_second(self):
|
||||||
|
print('Finding units for extended structures ...')
|
||||||
self._do_find_structure_units(self.file_map['structures-new'], self.file_map['tei-multiple'], self.file_map['mwes-2'])
|
self._do_find_structure_units(self.file_map['structures-new'], self.file_map['tei-multiple'], self.file_map['mwes-2'])
|
||||||
|
|
||||||
def _do_find_structure_units(self, structure_file_name, tei_file_name, csv_file_name):
|
def _do_find_structure_units(self, structure_file_name, tei_file_name, csv_file_name):
|
||||||
|
@ -145,20 +156,24 @@ class Pipeline:
|
||||||
return min_id
|
return min_id
|
||||||
|
|
||||||
def do_assign_multiple_first(self):
|
def do_assign_multiple_first(self):
|
||||||
|
print('Assigning ids based on existing structures ...')
|
||||||
min_other_id = self._find_min_other_id('structures-old')
|
min_other_id = self._find_min_other_id('structures-old')
|
||||||
assign_multiple(self.file_map['tei-multiple'], self.file_map['mwes-1'], self.file_map['tei-multiple-ids-1'], min_other_id)
|
assign_multiple(self.file_map['tei-multiple'], self.file_map['mwes-1'], self.file_map['tei-multiple-ids-1'], min_other_id)
|
||||||
|
|
||||||
def do_assign_multiple_second(self):
|
def do_assign_multiple_second(self):
|
||||||
|
print('Assigning ids based on extended structures ...')
|
||||||
min_other_id = self._find_min_other_id('structures-new')
|
min_other_id = self._find_min_other_id('structures-new')
|
||||||
assign_multiple(self.file_map['tei-multiple'], self.file_map['mwes-2'], self.file_map['tei-multiple-ids-2'], min_other_id)
|
assign_multiple(self.file_map['tei-multiple'], self.file_map['mwes-2'], self.file_map['tei-multiple-ids-2'], min_other_id)
|
||||||
|
|
||||||
def do_create_structures(self):
|
def do_create_structures(self):
|
||||||
|
print('Creating missing structures ...')
|
||||||
input_file_name = self.file_map['structures-old']
|
input_file_name = self.file_map['structures-old']
|
||||||
tei_file_name = self.file_map['tei-multiple-ids-1']
|
tei_file_name = self.file_map['tei-multiple-ids-1']
|
||||||
output_file_name = self.file_map['structures-new']
|
output_file_name = self.file_map['structures-new']
|
||||||
create_structures(input_file_name, tei_file_name, output_file_name)
|
create_structures(input_file_name, tei_file_name, output_file_name)
|
||||||
|
|
||||||
def do_merge_dictionaries(self):
|
def do_merge_dictionaries(self):
|
||||||
|
print('Merging single and multiple dictionaries ...')
|
||||||
single_file_name = self.file_map['dictionary-single']
|
single_file_name = self.file_map['dictionary-single']
|
||||||
multiple_file_name = self.file_map['dictionary-multiple']
|
multiple_file_name = self.file_map['dictionary-multiple']
|
||||||
output_file_name = self.file_map['dictionary']
|
output_file_name = self.file_map['dictionary']
|
||||||
|
@ -170,11 +185,13 @@ class Pipeline:
|
||||||
xml_schema.assertValid(xml_tree)
|
xml_schema.assertValid(xml_tree)
|
||||||
|
|
||||||
def do_validate_structures(self):
|
def do_validate_structures(self):
|
||||||
|
print('Validating structures ...')
|
||||||
schema_file_name = self.file_map['structure-schema']
|
schema_file_name = self.file_map['structure-schema']
|
||||||
xml_file_name = self.file_map['structures-new']
|
xml_file_name = self.file_map['structures-new']
|
||||||
self._do_validate(schema_file_name, xml_file_name)
|
self._do_validate(schema_file_name, xml_file_name)
|
||||||
|
|
||||||
def do_validate_dictionary(self):
|
def do_validate_dictionary(self):
|
||||||
|
print('Validating dictionary ...')
|
||||||
schema_file_name = self.file_map['dictionary-schema']
|
schema_file_name = self.file_map['dictionary-schema']
|
||||||
xml_file_name = self.file_map['dictionary']
|
xml_file_name = self.file_map['dictionary']
|
||||||
self._do_validate(schema_file_name, xml_file_name)
|
self._do_validate(schema_file_name, xml_file_name)
|
||||||
|
|
2
resources/.gitignore
vendored
2
resources/.gitignore
vendored
|
@ -1,7 +1,7 @@
|
||||||
/classla
|
/classla
|
||||||
/dict.xml
|
/dict.xml
|
||||||
/obeliks.jar
|
|
||||||
/structures.xml
|
/structures.xml
|
||||||
/structures.xsd
|
/structures.xsd
|
||||||
/inventory.xsd
|
/inventory.xsd
|
||||||
/monolingual_dictionaries.xsd
|
/monolingual_dictionaries.xsd
|
||||||
|
/wani.py
|
||||||
|
|
|
@ -1,23 +1,27 @@
|
||||||
import argparse
|
import argparse
|
||||||
|
import tempfile
|
||||||
|
import os
|
||||||
|
|
||||||
from structure_assignment.pipeline import Pipeline, create_nlp
|
from structure_assignment.pipeline import Pipeline, create_nlp
|
||||||
|
|
||||||
resource_directory = '../resources'
|
resource_directory = '../resources'
|
||||||
|
|
||||||
def run_all(input_file_name, output_file_name, nlp, structure_file_name):
|
def run_all(input_file_name, output_file_name, nlp, structure_file_name):
|
||||||
tmp_file_name = '/tmp/tmp.xml' # TODO: do better than this
|
tmp_file_descriptor, tmp_file_name = tempfile.mkstemp()
os.close(tmp_file_descriptor)
|
||||||
string_to_parse(input_file_name, tmp_file_name, nlp)
|
strings_to_parse(input_file_name, tmp_file_name, nlp)
|
||||||
parse_to_dictionary(tmp_file_name, output_file_name, structure_file_name)
|
parse_to_dictionary(tmp_file_name, output_file_name, structure_file_name)
|
||||||
|
os.remove(tmp_file_name)
|
||||||
validate_structures(structure_file_name)
|
validate_structures(structure_file_name)
|
||||||
validate_dictionary(output_file_name)
|
validate_dictionary(output_file_name)
|
||||||
|
|
||||||
def strings_to_dictionary(input_file_name, output_file_name, nlp, structure_file_name):
|
def strings_to_dictionary(input_file_name, output_file_name, nlp, structure_file_name):
|
||||||
tmp_file_name = '/tmp/tmp.xml' # TODO: do better than this
|
tmp_file_descriptor, tmp_file_name = tempfile.mkstemp()
os.close(tmp_file_descriptor)
|
||||||
string_to_parse(input_file_name, tmp_file_name, nlp)
|
strings_to_parse(input_file_name, tmp_file_name, nlp)
|
||||||
parse_to_dictionary(tmp_file_name, output_file_name, structure_file_name)
|
parse_to_dictionary(tmp_file_name, output_file_name, structure_file_name)
|
||||||
|
os.remove(tmp_file_name)
|
||||||
|
|
||||||
def strings_to_parse(input_file_name, output_file_name, nlp):
|
def strings_to_parse(input_file_name, output_file_name, nlp):
|
||||||
pipeline = Pipeline(nlp, resource_directory)
|
pipeline = Pipeline(resource_directory, nlp)
|
||||||
pipeline.import_file(input_file_name, 'strings-list')
|
pipeline.import_file(input_file_name, 'strings-list')
|
||||||
pipeline.do_tokenise()
|
pipeline.do_tokenise()
|
||||||
pipeline.do_tweak_conllu()
|
pipeline.do_tweak_conllu()
|
||||||
|
@ -29,7 +33,7 @@ def strings_to_parse(input_file_name, output_file_name, nlp):
|
||||||
pipeline.cleanup()
|
pipeline.cleanup()
|
||||||
|
|
||||||
def parse_to_dictionary(input_file_name, output_file_name, structure_file_name):
|
def parse_to_dictionary(input_file_name, output_file_name, structure_file_name):
|
||||||
pipeline = Pipeline(None, resource_directory)
|
pipeline = Pipeline(resource_directory)
|
||||||
pipeline.import_file(input_file_name, 'tei-initial')
|
pipeline.import_file(input_file_name, 'tei-initial')
|
||||||
pipeline.do_split_tei()
|
pipeline.do_split_tei()
|
||||||
pipeline.do_assign_single()
|
pipeline.do_assign_single()
|
||||||
|
@ -46,13 +50,13 @@ def parse_to_dictionary(input_file_name, output_file_name, structure_file_name):
|
||||||
pipeline.cleanup()
|
pipeline.cleanup()
|
||||||
|
|
||||||
def validate_structures(input_file_name):
|
def validate_structures(input_file_name):
|
||||||
pipeline = Pipeline(None, resource_directory)
|
pipeline = Pipeline(resource_directory)
|
||||||
pipeline.import_file(input_file_name, 'structures-new')
|
pipeline.import_file(input_file_name, 'structures-new')
|
||||||
pipeline.do_validate_structures()
|
pipeline.do_validate_structures()
|
||||||
pipeline.cleanup()
|
pipeline.cleanup()
|
||||||
|
|
||||||
def validate_dictionary(input_file_name):
|
def validate_dictionary(input_file_name):
|
||||||
pipeline = Pipeline(None, resource_directory)
|
pipeline = Pipeline(resource_directory)
|
||||||
pipeline.import_file(input_file_name, 'dictionary')
|
pipeline.import_file(input_file_name, 'dictionary')
|
||||||
pipeline.do_validate_dictionary()
|
pipeline.do_validate_dictionary()
|
||||||
pipeline.cleanup()
|
pipeline.cleanup()
|
||||||
|
@ -85,4 +89,4 @@ if (__name__ == '__main__'):
|
||||||
elif (part_name == 'validate_dictionary'):
|
elif (part_name == 'validate_dictionary'):
|
||||||
validate_dictionary(input_file_name)
|
validate_dictionary(input_file_name)
|
||||||
elif (part_name == 'all'):
|
elif (part_name == 'all'):
|
||||||
run_all(input_file_name)
|
run_all(input_file_name, output_file_name, nlp, structure_file_name)
|
||||||
|
|
Loading…
Reference in New Issue
Block a user