"""Runner and Pipeline classes for the structure assignment pipeline:
tokenise and parse Slovene strings with classla, convert the parses to TEI,
assign collocation and other structures, and generate a dictionary, with
XML-schema validation of the outputs.
"""

import codecs
import shutil
import sys
import tempfile
from types import SimpleNamespace

import lxml.etree as lxml

import classla
import classla.models.parser as classla_manual

from structure_assignment.constants import *
from structure_assignment.tweak_conllu import tweak as tweak_conllu
from conversion_utils.translate_conllu_jos import translate as translate_jos
from conversion_utils.conllu_to_tei import convert_file as conllu_to_tei
from structure_assignment.assign_collocation_structures import assign as assign_collocation_structures
from structure_assignment.assign_other_structures import assign as assign_other_structures
from conversion_utils.tei_to_dictionary import convert as tei_to_dictionary


class Runner:
    """Top-level entry points; each public method runs one variant of the pipeline."""

    def __init__(self, nlp_needed, classla_directory=None, wani_file_name=None):
        self.classla_directory = classla_directory
        self.wani_directory = None
        if nlp_needed:
            NLP_CONFIG_MAP['dir'] = classla_directory
            self.nlp = classla.Pipeline('sl', **NLP_CONFIG_MAP)
        if wani_file_name is not None:
            self._provide_wani(wani_file_name)

    def _provide_wani(self, wani_file_name):
        # TODO: remove once wani is incorporated into the luscenje_struktur package
        self.wani_directory = tempfile.mkdtemp()
        shutil.copy(wani_file_name, self.wani_directory)
        sys.path.insert(0, self.wani_directory)

    def run_all(self, input_file_name, output_file_name, input_structure_file_name, output_structure_file_name):
        pipeline = Pipeline(self.nlp)
        pipeline.import_file(input_file_name, 'strings-list')
        pipeline.import_file(input_structure_file_name, 'structures-old')
        self._strings_to_parse_sequence(pipeline)
        self._parse_to_dictionary_sequence(pipeline)
        pipeline.do_validate_structures()
        pipeline.export_file(output_structure_file_name, 'structures-new')
        pipeline.do_validate_dictionary()
        pipeline.export_file(output_file_name, 'dictionary')
        self.cleanup(pipeline)

    def strings_to_dictionary(self, input_file_name, output_file_name, input_structure_file_name, output_structure_file_name):
        pipeline = Pipeline(self.nlp)
        pipeline.import_file(input_file_name, 'strings-list')
        pipeline.import_file(input_structure_file_name, 'structures-old')
        self._strings_to_parse_sequence(pipeline)
        self._parse_to_dictionary_sequence(pipeline)
        pipeline.export_file(output_file_name, 'dictionary')
        pipeline.export_file(output_structure_file_name, 'structures-new')
        self.cleanup(pipeline)

    def strings_to_parse(self, input_file_name, output_file_name):
        pipeline = Pipeline(self.nlp)
        pipeline.import_file(input_file_name, 'strings-list')
        self._strings_to_parse_sequence(pipeline)
        pipeline.export_file(output_file_name, 'tei-initial')
        self.cleanup(pipeline)

    def tagged_to_dictionary(self, strings_file_name, input_file_name, output_file_name, input_structure_file_name, output_structure_file_name):
        # TODO: refactor/tidy
        classla_conllu_file_name = '/tmp/classla.conllu'
        merged_conllu_file_name = '/tmp/merged.conllu'
        parsed_conllu_file_name = '/tmp/parsed.conllu'

        pipeline = Pipeline(self.nlp)
        pipeline.import_file(strings_file_name, 'strings-list')
        pipeline.do_tokenise()
        pipeline.do_tweak_conllu()
        pipeline.do_parse()
        pipeline.export_file(classla_conllu_file_name, 'classla-parsed')

        # Merge the classla reparse with the pre-tagged input, line by line.
        classla_conllu_file = codecs.open(classla_conllu_file_name, 'r')
        tagged_conllu_file = codecs.open(input_file_name, 'r')
        merged_conllu_file = codecs.open(merged_conllu_file_name, 'w')
        for (classla_line, tagged_line) in zip(classla_conllu_file, tagged_conllu_file):
            classla_line = classla_line.strip()
            tagged_line = tagged_line.strip()
            if ((len(classla_line) == 0 and len(tagged_line) == 0)
                    or (classla_line.startswith('#') and tagged_line.startswith('#'))):
                merged_line = classla_line
            else:
                classla_columns = classla_line.split('\t')
                tagged_columns = tagged_line.split('\t')
                assert len(classla_columns) == 10, 'Missing token in classla-generated conllu ({}).'.format(tagged_line)
                assert len(tagged_columns) == 10, 'Missing token in pre-tagged conllu ({}).'.format(classla_line)
                assert classla_columns[1] == tagged_columns[1], 'Pre-tagged token form ({}) does not match classla-generated token form ({}).'.format(tagged_columns[1], classla_columns[1])
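                # CoNLL-U columns: 0 ID, 1 FORM, 2 LEMMA, 3 UPOS, 4 XPOS, 5 FEATS,
                # 6 HEAD, 7 DEPREL, 8 DEPS, 9 MISC. Keep UPOS (3), FEATS (5) and
                # MISC (9) from the classla reparse, the rest from the pre-tagged input.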
                merged_columns = [classla_columns[i] if i in (3, 5, 9) else tagged_columns[i] for i in range(10)]
                merged_line = '\t'.join(merged_columns)
            merged_conllu_file.write(merged_line + '\n')
        merged_conllu_file.close()
        tagged_conllu_file.close()
        classla_conllu_file.close()

        # Run the classla dependency parser manually over the merged file.
        classla_map = {
            'save_dir': self.classla_directory + '/sl/depparse',
            'save_name': 'standard_jos.pt',
            'eval_file': merged_conllu_file_name,
            'output_file': parsed_conllu_file_name,
            'gold_file': merged_conllu_file_name,
            'shorthand': 'sl_ssj',
            'mode': 'predict',
            'pretrain_file': self.classla_directory + '/sl/pretrain/standard.pt'
        }
        classla_arguments = []
        for (key, value) in classla_map.items():
            classla_arguments += ['--' + key, value]
        classla_manual.main(args=classla_arguments)

        pipeline.import_file(parsed_conllu_file_name, 'classla-parsed')
        pipeline.do_translate_jos()
        pipeline.do_conllu_to_tei()
        pipeline.import_file(input_structure_file_name, 'structures-old')
        self._parse_to_dictionary_sequence(pipeline)
        pipeline.export_file(output_file_name, 'dictionary')
        pipeline.export_file(output_structure_file_name, 'structures-new')
        self.cleanup(pipeline)

    def parse_to_dictionary(self, input_file_name, output_file_name, input_structure_file_name, output_structure_file_name):
        pipeline = Pipeline()
        pipeline.import_file(input_file_name, 'tei-initial')
        pipeline.import_file(input_structure_file_name, 'structures-old')
        self._parse_to_dictionary_sequence(pipeline)
        pipeline.export_file(output_file_name, 'dictionary')
        pipeline.export_file(output_structure_file_name, 'structures-new')
        self.cleanup(pipeline)

    def validate_structures(self, input_file_name):
        pipeline = Pipeline()
        pipeline.import_file(input_file_name, 'structures-new')
        pipeline.do_validate_structures()
        self.cleanup(pipeline)

    def validate_dictionary(self, input_file_name):
        pipeline = Pipeline()
        pipeline.import_file(input_file_name, 'dictionary')
        pipeline.do_validate_dictionary()
        self.cleanup(pipeline)

    def _strings_to_parse_sequence(self, pipeline):
        pipeline.do_tokenise()
        pipeline.do_tweak_conllu()
        pipeline.do_parse()
        pipeline.do_translate_jos()
        pipeline.do_conllu_to_tei()

    def _parse_to_dictionary_sequence(self, pipeline):
        pipeline.do_find_collocation_structure_units()
        pipeline.do_assign_collocation_structures()
        pipeline.do_assign_other_structures()
        pipeline.do_tei_to_dictionary()

    def cleanup(self, pipeline):
        # Guard: the wani directory only exists if _provide_wani was called.
        if self.wani_directory is not None:
            shutil.rmtree(self.wani_directory, True)
        pipeline.cleanup()


class Pipeline:
    """One pipeline run; intermediate files live in a temporary directory keyed by FILE_MAP."""

    def __init__(self, nlp=None):
        self.nlp = nlp
        self.tmp_directory = tempfile.mkdtemp()
        self.file_map = {key: self.tmp_directory + '/' + name for (key, name) in FILE_MAP.items()}

    def import_file(self, file_name, file_key):
        shutil.copyfile(file_name, self.file_map[file_key])
    def do_tokenise(self):
        print('Tokenising with obeliks ...')
        input_file_name = self.file_map['strings-list']
        output_file_name = self.file_map['obeliks-tokenised']
        with open(input_file_name, 'r') as input_file:
            input_conllu = input_file.read()
        tokeniser = classla.Pipeline('sl', processors='tokenize', dir=self.nlp.dir)
        output_conllu = tokeniser(input_conllu).to_conll()
        with open(output_file_name, 'w') as output_file:
            output_file.write(output_conllu)

    def do_tweak_conllu(self):
        print('Tweaking conllu ...')
        input_file_name = self.file_map['obeliks-tokenised']
        output_file_name = self.file_map['obeliks-tweaked']
        tweak_conllu(input_file_name, output_file_name)

    def do_parse(self):
        print('Parsing with classla ...')
        input_file_name = self.file_map['obeliks-tweaked']
        output_file_name = self.file_map['classla-parsed']
        with open(input_file_name, 'r') as input_file:
            input_conllu = input_file.read()
        doc = self.nlp(input_conllu)
        with open(output_file_name, 'w') as output_file:
            output_file.write(doc.to_conll())

    def do_translate_jos(self):
        print('Translating JOS ...')
        input_file_name = self.file_map['classla-parsed']
        output_file_name = self.file_map['classla-translated']
        translate_jos(input_file_name, output_file_name)

    def do_conllu_to_tei(self):
        print('Converting to TEI ...')
        input_file_name = self.file_map['classla-translated']
        output_file_name = self.file_map['tei-initial']
        conllu_to_tei(input_file_name, output_file_name)

    def do_find_collocation_structure_units(self):
        print('Finding units for existing collocation structures ...')
        # Imported lazily: wani only becomes importable after Runner._provide_wani
        # has added its directory to sys.path.
        from wani import main as wani_main
        namespace = SimpleNamespace()
        # relevant values
        namespace.structures = self.file_map['structures-old']
        namespace.input = [self.file_map['tei-initial']]
        namespace.all = self.file_map['collocations']
        namespace.skip_id_check = True
        namespace.fixed_restriction_order = True
        namespace.new_tei = True
        # default values
        namespace.sloleks_db = None
        namespace.out = None
        namespace.out_no_stat = None
        namespace.stats = None
        namespace.no_msd_translate = False
        namespace.min_freq = 0
        namespace.verbose = 'info'
        namespace.count_files = False
        namespace.multiple_output = False
        namespace.load_sloleks = False
        namespace.sort_by = -1
        namespace.sort_reversed = False
        namespace.db = None
        namespace.collocation_sentence_map_dest = None
        namespace.new_db = False
        namespace.pc_tag = 'pc'
        namespace.separator = '\t'
        namespace.ignore_punctuations = False
        wani_main(namespace)

    def do_assign_collocation_structures(self):
        print('Assigning ids of collocation structures ...')
        input_file_name = self.file_map['tei-initial']
        collocations_file_name = self.file_map['collocations']
        output_file_name = self.file_map['tei-ids-collocation']
        assign_collocation_structures(input_file_name, collocations_file_name, output_file_name)

    def do_assign_other_structures(self):
        print('Assigning ids of single and other structures, creating if necessary ...')
        input_file_name = self.file_map['tei-ids-collocation']
        structure_old_file_name = self.file_map['structures-old']
        output_file_name = self.file_map['tei-ids-all']
        structure_new_file_name = self.file_map['structures-new']
        assign_other_structures(input_file_name, structure_old_file_name, output_file_name, structure_new_file_name)

    def do_tei_to_dictionary(self):
        print('Converting TEI to dictionary ...')
        input_file_name = self.file_map['tei-ids-all']
        output_file_name = self.file_map['dictionary']
        tei_to_dictionary(input_file_name, output_file_name)

    def _do_validate(self, schema_file_name, xml_file_name):
        xml_schema = lxml.XMLSchema(lxml.parse(schema_file_name))
        xml_tree = lxml.parse(xml_file_name)
        xml_schema.assertValid(xml_tree)

    def do_validate_structures(self):
        print('Validating structures ...')
        schema_file_name = self.file_map['structure-schema']
        xml_file_name = self.file_map['structures-new']
        self._do_validate(schema_file_name, xml_file_name)
    def do_validate_dictionary(self):
        print('Validating dictionary ...')
        schema_file_name = self.file_map['dictionary-schema']
        xml_file_name = self.file_map['dictionary']
        self._do_validate(schema_file_name, xml_file_name)

    def export_file(self, file_name, file_key):
        shutil.copyfile(self.file_map[file_key], file_name)

    def cleanup(self):
        shutil.rmtree(self.tmp_directory, True)
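
# A minimal usage sketch: the paths below are hypothetical placeholders for a
# classla models directory, the wani script and the input/output files; adjust
# them to your installation.
if __name__ == '__main__':
    runner = Runner(nlp_needed=True,
                    classla_directory='/path/to/classla_models',
                    wani_file_name='/path/to/wani.py')
    runner.run_all('/path/to/strings.txt',
                   '/path/to/dictionary.xml',
                   '/path/to/structures_old.xml',
                   '/path/to/structures_new.xml')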