"""Structure assignment pipeline.

Chains tokenisation and parsing (classla/obeliks), JOS tag translation,
CoNLL-U to TEI conversion, collocation and other structure assignment, and
schema validation, producing a dictionary file and an updated structure file.
"""

import shutil
import codecs
import tempfile
from types import SimpleNamespace

import lxml.etree as lxml
import classla
import classla.models.parser as classla_manual

from structure_assignment.constants import *
from structure_assignment.tweak_conllu import tweak as tweak_conllu
from conversion_utils.translate_conllu_jos import translate as translate_jos
from conversion_utils.conllu_to_tei import convert_file as conllu_to_tei
from structure_assignment.assign_collocation_structures import assign as assign_collocation_structures
from structure_assignment.assign_other_structures import assign as assign_other_structures
from conversion_utils.tei_to_dictionary import convert as tei_to_dictionary


class Runner:

    def __init__(self, nlp_needed, classla_directory=None, wani_file_name=None):
        self.classla_directory = classla_directory
        if nlp_needed:
            NLP_CONFIG_MAP['dir'] = classla_directory
            self.nlp = classla.Pipeline('sl', **NLP_CONFIG_MAP)
        if wani_file_name is not None:
            self._provide_wani(wani_file_name)

    def _provide_wani(self, wani_file_name):  # TODO: remove once wani is incorporated into luscenje_struktur package
        self.wani_directory = tempfile.mkdtemp()
        shutil.copy(wani_file_name, self.wani_directory)
        import sys
        sys.path.insert(0, self.wani_directory)

    def run_all(self, input_file_name, output_file_name, input_structure_file_name, output_structure_file_name):
        pipeline = Pipeline(self.nlp)
        pipeline.import_file(input_file_name, 'strings-list')
        pipeline.import_file(input_structure_file_name, 'structures-old')
        self._strings_to_parse_sequence(pipeline)
        self._parse_to_dictionary_sequence(pipeline)
        pipeline.do_validate_structures()
        pipeline.export_file(output_structure_file_name, 'structures-new')
        pipeline.do_validate_dictionary()
        pipeline.export_file(output_file_name, 'dictionary')
        self.cleanup(pipeline)

    def strings_to_dictionary(self, input_file_name, output_file_name, input_structure_file_name, output_structure_file_name):
        pipeline = Pipeline(self.nlp)
        pipeline.import_file(input_file_name, 'strings-list')
        pipeline.import_file(input_structure_file_name, 'structures-old')
        self._strings_to_parse_sequence(pipeline)
        self._parse_to_dictionary_sequence(pipeline)
        pipeline.export_file(output_file_name, 'dictionary')
        pipeline.export_file(output_structure_file_name, 'structures-new')
        self.cleanup(pipeline)

    def strings_to_parse(self, input_file_name, output_file_name):
        pipeline = Pipeline(self.nlp)
        pipeline.import_file(input_file_name, 'strings-list')
        self._strings_to_parse_sequence(pipeline)
        pipeline.export_file(output_file_name, 'tei-initial')
        self.cleanup(pipeline)

    def tagged_to_dictionary(self, strings_file_name, input_file_name, output_file_name, input_structure_file_name, output_structure_file_name):  # TODO: refactor/tidy
        classla_conllu_file_name = '/tmp/classla.conllu'
        merged_conllu_file_name = '/tmp/merged.conllu'
        parsed_conllu_file_name = '/tmp/parsed.conllu'
        pipeline = Pipeline(self.nlp)
        pipeline.import_file(strings_file_name, 'strings-list')
        pipeline.do_tokenise()
        pipeline.do_tweak_conllu()
        pipeline.do_parse()
        pipeline.export_file(classla_conllu_file_name, 'classla-parsed')
        classla_conllu_file = codecs.open(classla_conllu_file_name, 'r')
        tagged_conllu_file = codecs.open(input_file_name, 'r')
        merged_conllu_file = codecs.open(merged_conllu_file_name, 'w')
        # Merge the classla-parsed and pre-tagged files line by line: comment
        # and sentence-boundary lines are copied through, while token lines
        # keep the pre-tagged columns except UPOS (3), FEATS (5) and MISC (9),
        # which are taken from the classla output.
        for (classla_line, tagged_line) in zip(classla_conllu_file, tagged_conllu_file):
            classla_line = classla_line.strip()
            tagged_line = tagged_line.strip()
            if ((len(classla_line) == 0 and len(tagged_line) == 0)
                    or (classla_line.startswith('#') and tagged_line.startswith('#'))):
                merged_line = classla_line
            else:
                classla_columns = classla_line.split('\t')
                tagged_columns = tagged_line.split('\t')
                assert len(classla_columns) == 10, 'Missing token in classla-generated conllu ({}).'.format(tagged_line)
                assert len(tagged_columns) == 10, 'Missing token in pre-tagged conllu ({}).'.format(classla_line)
                assert classla_columns[1] == tagged_columns[1], 'Pre-tagged token form ({}) does not match classla-generated token form ({}).'.format(tagged_columns[1], classla_columns[1])
                merged_columns = [classla_columns[i] if i in (3, 5, 9) else tagged_columns[i] for i in range(10)]
                merged_line = '\t'.join(merged_columns)
            merged_conllu_file.write(merged_line + '\n')
        merged_conllu_file.close()
        tagged_conllu_file.close()
        classla_conllu_file.close()
        classla_map = {
            'save_dir': self.classla_directory + '/sl/depparse',
            'save_name': 'standard_jos.pt',
            'eval_file': merged_conllu_file_name,
            'output_file': parsed_conllu_file_name,
            'gold_file': merged_conllu_file_name,
            'shorthand': 'sl_ssj',
            'mode': 'predict',
            'pretrain_file': self.classla_directory + '/sl/pretrain/standard.pt'
        }
        classla_arguments = []
        for (key, value) in classla_map.items():
            classla_arguments += ['--' + key, value]
        classla_manual.main(args=classla_arguments)
        pipeline.import_file(parsed_conllu_file_name, 'classla-parsed')
        pipeline.do_translate_jos()
        pipeline.do_conllu_to_tei()
        pipeline.import_file(input_structure_file_name, 'structures-old')
        self._parse_to_dictionary_sequence(pipeline)
        pipeline.export_file(output_file_name, 'dictionary')
        pipeline.export_file(output_structure_file_name, 'structures-new')
        self.cleanup(pipeline)

    def parse_to_dictionary(self, input_file_name, output_file_name, input_structure_file_name, output_structure_file_name):
        pipeline = Pipeline()
        pipeline.import_file(input_file_name, 'tei-initial')
        pipeline.import_file(input_structure_file_name, 'structures-old')
        self._parse_to_dictionary_sequence(pipeline)
        pipeline.export_file(output_file_name, 'dictionary')
        pipeline.export_file(output_structure_file_name, 'structures-new')
        self.cleanup(pipeline)

    def validate_structures(self, input_file_name):
        pipeline = Pipeline()
        pipeline.import_file(input_file_name, 'structures-new')
        pipeline.do_validate_structures()
        self.cleanup(pipeline)

    def validate_dictionary(self, input_file_name):
        pipeline = Pipeline()
        pipeline.import_file(input_file_name, 'dictionary')
        pipeline.do_validate_dictionary()
        self.cleanup(pipeline)

    def _strings_to_parse_sequence(self, pipeline):
        pipeline.do_tokenise()
        pipeline.do_tweak_conllu()
        pipeline.do_parse()
        pipeline.do_translate_jos()
        pipeline.do_conllu_to_tei()

    def _parse_to_dictionary_sequence(self, pipeline):
        pipeline.do_find_collocation_structure_units()
        pipeline.do_assign_collocation_structures()
        pipeline.do_assign_other_structures()
        pipeline.do_tei_to_dictionary()

    def cleanup(self, pipeline):
        # The wani directory only exists if a wani file was provided at
        # construction time; guard against an AttributeError otherwise.
        if hasattr(self, 'wani_directory'):
            shutil.rmtree(self.wani_directory, True)
        pipeline.cleanup()
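
# Overview of the Pipeline stages below and the temporary files (by file-map
# key) they read and write, as wired in the methods of the class; the keys
# themselves come from FILE_MAP in structure_assignment.constants:
#
#   strings-list -> do_tokenise -> obeliks-tokenised
#   -> do_tweak_conllu -> obeliks-tweaked
#   -> do_parse -> classla-parsed
#   -> do_translate_jos -> classla-translated
#   -> do_conllu_to_tei -> tei-initial
#   -> do_find_collocation_structure_units -> collocations
#   -> do_assign_collocation_structures -> tei-ids-collocation
#   -> do_assign_other_structures -> tei-ids-all (and structures-new)
#   -> do_tei_to_dictionary -> dictionary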


class Pipeline:

    def __init__(self, nlp=None):
        self.nlp = nlp
        self.tmp_directory = tempfile.mkdtemp()
        self.file_map = {key: self.tmp_directory + '/' + file_name for (key, file_name) in FILE_MAP.items()}

    def import_file(self, file_name, file_key):
        shutil.copyfile(file_name, self.file_map[file_key])

    def do_tokenise(self):
        print('Tokenising with obeliks ...')
        input_file_name = self.file_map['strings-list']
        output_file_name = self.file_map['obeliks-tokenised']
        with open(input_file_name, 'r') as input_file:
            input_conllu = input_file.read()
        tokeniser = classla.Pipeline('sl', processors='tokenize', dir=self.nlp.dir)
        output_conllu = tokeniser(input_conllu).to_conll()
        with open(output_file_name, 'w') as output_file:
            output_file.write(output_conllu)

    def do_tweak_conllu(self):
        print('Tweaking conllu ...')
        input_file_name = self.file_map['obeliks-tokenised']
        output_file_name = self.file_map['obeliks-tweaked']
        tweak_conllu(input_file_name, output_file_name)

    def do_parse(self):
        print('Parsing with classla ...')
        input_file_name = self.file_map['obeliks-tweaked']
        output_file_name = self.file_map['classla-parsed']
        with open(input_file_name, 'r') as input_file:
            input_conllu = input_file.read()
        doc = self.nlp(input_conllu)
        with open(output_file_name, 'w') as output_file:
            output_file.write(doc.to_conll())

    def do_translate_jos(self):
        print('Translating JOS ...')
        input_file_name = self.file_map['classla-parsed']
        output_file_name = self.file_map['classla-translated']
        translate_jos(input_file_name, output_file_name)

    def do_conllu_to_tei(self):
        print('Converting to TEI ...')
        input_file_name = self.file_map['classla-translated']
        output_file_name = self.file_map['tei-initial']
        conllu_to_tei(input_file_name, output_file_name)

    def do_find_collocation_structure_units(self):
        print('Finding units for existing collocation structures ...')
        from wani import main as wani_main
        namespace = SimpleNamespace()
        # relevant values
        namespace.structures = self.file_map['structures-old']
        namespace.input = [self.file_map['tei-initial']]
        namespace.all = self.file_map['collocations']
        namespace.skip_id_check = True
        namespace.fixed_restriction_order = True
        namespace.new_tei = True
        # default values
        namespace.sloleks_db = None
        namespace.out = None
        namespace.out_no_stat = None
        namespace.stats = None
        namespace.no_msd_translate = False
        namespace.min_freq = 0
        namespace.verbose = 'info'
        namespace.count_files = False
        namespace.multiple_output = False
        namespace.load_sloleks = False
        namespace.sort_by = -1
        namespace.sort_reversed = False
        namespace.db = None
        namespace.collocation_sentence_map_dest = None
        namespace.new_db = False
        namespace.pc_tag = 'pc'
        namespace.separator = '\t'
        namespace.ignore_punctuations = False
        wani_main(namespace)

    def do_assign_collocation_structures(self):
        print('Assigning ids of collocation structures ...')
        input_file_name = self.file_map['tei-initial']
        collocations_file_name = self.file_map['collocations']
        output_file_name = self.file_map['tei-ids-collocation']
        assign_collocation_structures(input_file_name, collocations_file_name, output_file_name)

    def do_assign_other_structures(self):
        print('Assigning ids of single and other structures, creating if necessary ...')
        input_file_name = self.file_map['tei-ids-collocation']
        structure_old_file_name = self.file_map['structures-old']
        output_file_name = self.file_map['tei-ids-all']
        structure_new_file_name = self.file_map['structures-new']
        assign_other_structures(input_file_name, structure_old_file_name, output_file_name, structure_new_file_name)

    def do_tei_to_dictionary(self):
        print('Converting TEI to dictionary ...')
        input_file_name = self.file_map['tei-ids-all']
        output_file_name = self.file_map['dictionary']
        tei_to_dictionary(input_file_name, output_file_name)

    def _do_validate(self, schema_file_name, xml_file_name):
        xml_schema = lxml.XMLSchema(lxml.parse(schema_file_name))
        xml_tree = lxml.parse(xml_file_name)
        xml_schema.assertValid(xml_tree)

    def do_validate_structures(self):
        print('Validating structures ...')
        schema_file_name = self.file_map['structure-schema']
        xml_file_name = self.file_map['structures-new']
        self._do_validate(schema_file_name, xml_file_name)

    def do_validate_dictionary(self):
        print('Validating dictionary ...')
        schema_file_name = self.file_map['dictionary-schema']
        xml_file_name = self.file_map['dictionary']
        self._do_validate(schema_file_name, xml_file_name)

    def export_file(self, file_name, file_key):
        shutil.copyfile(self.file_map[file_key], file_name)

    def cleanup(self):
        shutil.rmtree(self.tmp_directory, True)
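

# Minimal usage sketch. The resource directory and file names below are
# hypothetical placeholders, not part of this module; adjust them to your
# setup before running.
if __name__ == '__main__':
    # nlp_needed=True loads the classla pipeline from the given directory;
    # the wani file is copied to a temporary directory and put on sys.path.
    runner = Runner(nlp_needed=True,
                    classla_directory='/path/to/classla_resources',
                    wani_file_name='/path/to/wani.py')
    # Parse the input strings, assign structures, validate, and export both
    # the dictionary and the updated structure specification.
    runner.run_all('strings.txt', 'dictionary.xml',
                   'structures_old.xml', 'structures_new.xml')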