import shutil
import codecs
import tempfile

import lxml.etree as lxml

import classla
import cordex
import classla.models.parser as classla_manual

from structure_assignment.constants import *  # provides NLP_CONFIG_MAP and FILE_MAP
from structure_assignment.tweak_conllu import tweak as tweak_conllu
from conversion_utils.translate_conllu_jos import translate as translate_jos
from conversion_utils.conllu_to_tei import convert_file as conllu_to_tei
from structure_assignment.assign_collocation_structures import assign as assign_collocation_structures
from structure_assignment.assign_other_structures import assign as assign_other_structures
from conversion_utils.tei_to_dictionary import convert as tei_to_dictionary


class Runner:

    def __init__(self, nlp_needed, classla_directory=None):
        self.classla_directory = classla_directory
        if nlp_needed:
            NLP_CONFIG_MAP['dir'] = classla_directory
            self.nlp = classla.Pipeline('sl', **NLP_CONFIG_MAP)

    def run_all(self, input_file_name, output_file_name, input_structure_file_name, output_structure_file_name):
        pipeline = Pipeline(self.nlp)
        pipeline.import_file(input_file_name, 'strings-list')
        pipeline.import_file(input_structure_file_name, 'structures-old')
        self._strings_to_parse_sequence(pipeline)
        self._parse_to_dictionary_sequence(pipeline)
        pipeline.do_validate_structures()
        pipeline.export_file(output_structure_file_name, 'structures-new')
        pipeline.do_validate_dictionary()
        pipeline.export_file(output_file_name, 'dictionary')
        self.cleanup(pipeline)

    def strings_to_dictionary(self, input_file_name, output_file_name, input_structure_file_name, output_structure_file_name):
        pipeline = Pipeline(self.nlp)
        pipeline.import_file(input_file_name, 'strings-list')
        pipeline.import_file(input_structure_file_name, 'structures-old')
        self._strings_to_parse_sequence(pipeline)
        self._parse_to_dictionary_sequence(pipeline)
        pipeline.export_file(output_file_name, 'dictionary')
        pipeline.export_file(output_structure_file_name, 'structures-new')
        self.cleanup(pipeline)

    def strings_to_parse(self, input_file_name, output_file_name):
        pipeline = Pipeline(self.nlp)
        pipeline.import_file(input_file_name, 'strings-list')
        self._strings_to_parse_sequence(pipeline)
        pipeline.export_file(output_file_name, 'tei-initial')
        self.cleanup(pipeline)

    def tagged_to_dictionary(self, strings_file_name, input_file_name, output_file_name, input_structure_file_name, output_structure_file_name):
        # TODO: refactor/tidy
        classla_conllu_file_name = '/tmp/classla.conllu'
        merged_conllu_file_name = '/tmp/merged.conllu'
        parsed_conllu_file_name = '/tmp/parsed.conllu'
        pipeline = Pipeline(self.nlp)
        pipeline.import_file(strings_file_name, 'strings-list')
        pipeline.do_tokenise()
        pipeline.do_tweak_conllu()
        pipeline.do_parse()
        pipeline.export_file(classla_conllu_file_name, 'classla-parsed')

        # Merge the pre-tagged conllu with the classla-generated conllu line by line:
        # columns 3 (UPOS), 5 (FEATS) and 9 (MISC) are taken from the classla output,
        # all other columns from the pre-tagged input.
        classla_conllu_file = codecs.open(classla_conllu_file_name, 'r')
        tagged_conllu_file = codecs.open(input_file_name, 'r')
        merged_conllu_file = codecs.open(merged_conllu_file_name, 'w')
        for (classla_line, tagged_line) in zip(classla_conllu_file, tagged_conllu_file):
            classla_line = classla_line.strip()
            tagged_line = tagged_line.strip()
            if ((len(classla_line) == 0 and len(tagged_line) == 0)
                    or (classla_line.startswith('#') and tagged_line.startswith('#'))):
                merged_line = classla_line
            else:
                classla_columns = classla_line.split('\t')
                tagged_columns = tagged_line.split('\t')
                assert len(classla_columns) == 10, 'Missing token in classla-generated conllu ({}).'.format(tagged_line)
                assert len(tagged_columns) == 10, 'Missing token in pre-tagged conllu ({}).'.format(classla_line)
                assert classla_columns[1] == tagged_columns[1], 'Pre-tagged token form ({}) does not match classla-generated token form ({}).'.format(tagged_columns[1], classla_columns[1])
                merged_columns = [classla_columns[i] if i in (3, 5, 9) else tagged_columns[i] for i in range(10)]
                merged_line = '\t'.join(merged_columns)
            merged_conllu_file.write(merged_line + '\n')
        merged_conllu_file.close()
        tagged_conllu_file.close()
        classla_conllu_file.close()

        # Run the classla dependency parser directly on the merged conllu,
        # invoking the parser module with command-line-style arguments.
        classla_map = {
            'save_dir': self.classla_directory + '/sl/depparse',
            'save_name': 'standard_jos.pt',
            'eval_file': merged_conllu_file_name,
            'output_file': parsed_conllu_file_name,
            'gold_file': merged_conllu_file_name,
            'shorthand': 'sl_ssj',
            'mode': 'predict',
            'pretrain_file': self.classla_directory + '/sl/pretrain/standard.pt'
        }
        classla_arguments = []
        for (key, value) in classla_map.items():
            classla_arguments += ['--' + key, value]
        classla_manual.main(args=classla_arguments)

        pipeline.import_file(parsed_conllu_file_name, 'classla-parsed')
        pipeline.do_translate_jos()
        pipeline.do_conllu_to_tei()
        pipeline.import_file(input_structure_file_name, 'structures-old')
        self._parse_to_dictionary_sequence(pipeline)
        pipeline.export_file(output_file_name, 'dictionary')
        pipeline.export_file(output_structure_file_name, 'structures-new')
        self.cleanup(pipeline)

    def parse_to_dictionary(self, input_file_name, output_file_name, input_structure_file_name, output_structure_file_name):
        pipeline = Pipeline()
        pipeline.import_file(input_file_name, 'tei-initial')
        pipeline.import_file(input_structure_file_name, 'structures-old')
        self._parse_to_dictionary_sequence(pipeline)
        pipeline.export_file(output_file_name, 'dictionary')
        pipeline.export_file(output_structure_file_name, 'structures-new')
        self.cleanup(pipeline)

    def validate_structures(self, input_file_name):
        pipeline = Pipeline()
        pipeline.import_file(input_file_name, 'structures-new')
        pipeline.do_validate_structures()
        self.cleanup(pipeline)

    def validate_dictionary(self, input_file_name):
        pipeline = Pipeline()
        pipeline.import_file(input_file_name, 'dictionary')
        pipeline.do_validate_dictionary()
        self.cleanup(pipeline)

    def _strings_to_parse_sequence(self, pipeline):
        pipeline.do_tokenise()
        pipeline.do_tweak_conllu()
        pipeline.do_parse()
        pipeline.do_translate_jos()
        pipeline.do_conllu_to_tei()

    def _parse_to_dictionary_sequence(self, pipeline):
        pipeline.do_find_collocation_structure_units()
        pipeline.do_assign_collocation_structures()
        pipeline.do_assign_other_structures()
        pipeline.do_tei_to_dictionary()

    def cleanup(self, pipeline):
        pipeline.cleanup()


class Pipeline:

    def __init__(self, nlp=None):
        self.nlp = nlp
        self.tmp_directory = tempfile.mkdtemp()
        self.file_map = {key: self.tmp_directory + '/' + FILE_MAP[key] for key in FILE_MAP}

    def import_file(self, file_name, file_key):
        shutil.copyfile(file_name, self.file_map[file_key])

    def do_tokenise(self):
        print('Tokenising with obeliks ...')
        input_file_name = self.file_map['strings-list']
        output_file_name = self.file_map['obeliks-tokenised']
        with open(input_file_name, 'r') as input_file:
            input_conllu = input_file.read()
        tokeniser = classla.Pipeline('sl', processors='tokenize', dir=self.nlp.dir)
        output_conllu = tokeniser(input_conllu).to_conll()
        with open(output_file_name, 'w') as output_file:
            output_file.write(output_conllu)

    def do_tweak_conllu(self):
        print('Tweaking conllu ...')
        input_file_name = self.file_map['obeliks-tokenised']
        output_file_name = self.file_map['obeliks-tweaked']
        tweak_conllu(input_file_name, output_file_name)

    def do_parse(self):
        print('Parsing with classla ...')
        input_file_name = self.file_map['obeliks-tweaked']
        output_file_name = self.file_map['classla-parsed']
        with open(input_file_name, 'r') as input_file:
            input_conllu = input_file.read()
        doc = self.nlp(input_conllu)
        with open(output_file_name, 'w') as output_file:
            output_file.write(doc.to_conll())

    def do_translate_jos(self):
        print('Translating JOS ...')
        input_file_name = self.file_map['classla-parsed']
        scope = 'msd'
        output_file_name = self.file_map['classla-translated']
        translate_jos(input_file_name, scope, output_file_name)

    def do_conllu_to_tei(self):
        print('Converting to TEI ...')
        input_file_name = self.file_map['classla-translated']
        output_file_name = self.file_map['tei-initial']
        conllu_to_tei(input_file_name, output_file_name)

    def do_find_collocation_structure_units(self):
        print('Finding units for existing collocation structures ...')
        structure_file_name = self.file_map['structures-old']
        input_file_name = self.file_map['tei-initial']
        output_file_name = self.file_map['collocations']
        mapper_file_name = self.file_map['collocation-mapper']
        extractor = cordex.Pipeline(structure_file_name, fixed_restriction_order=True, statistics=False,
                                    collocation_sentence_map_dest=mapper_file_name, jos_msd_lang='sl')
        extraction = extractor(input_file_name)
        extraction.write(output_file_name)

    def do_assign_collocation_structures(self):
        print('Assigning ids of collocation structures ...')
        input_file_name = self.file_map['tei-initial']
        structure_file_name = self.file_map['structures-old']
        collocations_file_name = self.file_map['collocations']
        mapper_file_name = self.file_map['collocation-mapper']
        output_file_name = self.file_map['tei-ids-collocation']
        assign_collocation_structures(input_file_name, structure_file_name, collocations_file_name, mapper_file_name, output_file_name)

    def do_assign_other_structures(self):
        print('Assigning ids of single and other structures, creating if necessary ...')
        input_file_name = self.file_map['tei-ids-collocation']
        structure_old_file_name = self.file_map['structures-old']
        output_file_name = self.file_map['tei-ids-all']
        structure_new_file_name = self.file_map['structures-new']
        assign_other_structures(input_file_name, structure_old_file_name, output_file_name, structure_new_file_name)

    def do_tei_to_dictionary(self):
        print('Converting TEI to dictionary ...')
        input_file_name = self.file_map['tei-ids-all']
        output_file_name = self.file_map['dictionary']
        tei_to_dictionary(input_file_name, output_file_name)

    def _do_validate(self, schema_file_name, xml_file_name):
        xml_schema = lxml.XMLSchema(lxml.parse(schema_file_name))
        xml_tree = lxml.parse(xml_file_name)
        xml_schema.assertValid(xml_tree)

    def do_validate_structures(self):
        print('Validating structures ...')
        schema_file_name = self.file_map['structure-schema']
        xml_file_name = self.file_map['structures-new']
        self._do_validate(schema_file_name, xml_file_name)

    def do_validate_dictionary(self):
        print('Validating dictionary ...')
        schema_file_name = self.file_map['dictionary-schema']
        xml_file_name = self.file_map['dictionary']
        self._do_validate(schema_file_name, xml_file_name)

    def export_file(self, file_name, file_key):
        shutil.copyfile(self.file_map[file_key], file_name)

    def cleanup(self):
        shutil.rmtree(self.tmp_directory, ignore_errors=True)
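

# Example usage (a minimal sketch; the paths and file names below are hypothetical
# placeholders, and the classla resources are assumed to have been downloaded to
# classla_directory beforehand):
#
#   runner = Runner(nlp_needed=True, classla_directory='/path/to/classla_resources')
#   runner.strings_to_dictionary('strings.txt', 'dictionary.xml',
#                                'structures_old.xml', 'structures_new.xml')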