You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
289 lines
13 KiB
289 lines
13 KiB
import shutil
|
|
import codecs
|
|
import tempfile
|
|
from types import SimpleNamespace
|
|
import lxml.etree as lxml
|
|
|
|
import classla
|
|
import classla.models.parser as classla_manual
|
|
|
|
from structure_assignment.constants import *
|
|
from structure_assignment.tweak_conllu import tweak as tweak_conllu
|
|
from conversion_utils.translate_conllu_jos import translate as translate_jos
|
|
from conversion_utils.conllu_to_tei import convert_file as conllu_to_tei
|
|
from structure_assignment.assign_collocation_structures import assign as assign_collocation_structures
|
|
from structure_assignment.assign_other_structures import assign as assign_other_structures
|
|
from conversion_utils.tei_to_dictionary import convert as tei_to_dictionary
|
|
|
|
class Runner:
    """Orchestrates the structure-assignment pipelines.

    Drives a :class:`Pipeline` instance end-to-end: from raw strings or
    pre-tagged conllu input through tokenisation, parsing, structure
    assignment, and export of the dictionary and structure TEI files.
    """

    def __init__(self, nlp_needed, classla_directory=None, wani_file_name=None):
        """Prepare the runner.

        :param nlp_needed: if true, load the classla NLP pipeline eagerly
        :param classla_directory: directory holding the classla models
        :param wani_file_name: optional path to the wani script (temporary hack)
        """
        self.classla_directory = classla_directory
        # Set by _provide_wani() only when a wani script is supplied.
        # cleanup() checks for it, so runs without wani must not crash there.
        self.wani_directory = None
        if nlp_needed:
            NLP_CONFIG_MAP['dir'] = classla_directory
            self.nlp = classla.Pipeline('sl', **NLP_CONFIG_MAP)
        if wani_file_name is not None:
            self._provide_wani(wani_file_name)

    def _provide_wani(self, wani_file_name):  # TODO: remove once wani is incorporated into luscenje_struktur package
        """Copy the wani script into a temp directory and make it importable."""
        self.wani_directory = tempfile.mkdtemp()
        shutil.copy(wani_file_name, self.wani_directory)
        import sys
        sys.path.insert(0, self.wani_directory)

    def run_all(self, input_file_name, output_file_name, input_structure_file_name, output_structure_file_name):
        """Run the full strings-to-dictionary pipeline, validating both outputs."""
        pipeline = Pipeline(self.nlp)
        pipeline.import_file(input_file_name, 'strings-list')
        pipeline.import_file(input_structure_file_name, 'structures-old')
        self._strings_to_parse_sequence(pipeline)
        self._parse_to_dictionary_sequence(pipeline)
        pipeline.do_validate_structures()
        pipeline.export_file(output_structure_file_name, 'structures-new')
        pipeline.do_validate_dictionary()
        pipeline.export_file(output_file_name, 'dictionary')
        self.cleanup(pipeline)

    def strings_to_dictionary(self, input_file_name, output_file_name, input_structure_file_name, output_structure_file_name):
        """Run strings-to-dictionary without schema validation of the outputs."""
        pipeline = Pipeline(self.nlp)
        pipeline.import_file(input_file_name, 'strings-list')
        pipeline.import_file(input_structure_file_name, 'structures-old')
        self._strings_to_parse_sequence(pipeline)
        self._parse_to_dictionary_sequence(pipeline)
        pipeline.export_file(output_file_name, 'dictionary')
        pipeline.export_file(output_structure_file_name, 'structures-new')
        self.cleanup(pipeline)

    def strings_to_parse(self, input_file_name, output_file_name):
        """Tokenise and parse raw strings, exporting the initial TEI only."""
        pipeline = Pipeline(self.nlp)
        pipeline.import_file(input_file_name, 'strings-list')
        self._strings_to_parse_sequence(pipeline)
        pipeline.export_file(output_file_name, 'tei-initial')
        self.cleanup(pipeline)

    def tagged_to_dictionary(self, strings_file_name, input_file_name, output_file_name, input_structure_file_name, output_structure_file_name):  # TODO: refactor/tidy
        """Build a dictionary from pre-tagged conllu input.

        Tokenises/parses the raw strings with classla, merges the classla
        columns (UPOS, FEATS, MISC) into the pre-tagged conllu, re-parses the
        merged file with the manual classla parser, then runs the usual
        parse-to-dictionary sequence.
        """
        # TODO: these fixed paths should move into the pipeline temp directory.
        classla_conllu_file_name = '/tmp/classla.conlu'
        merged_conllu_file_name = '/tmp/merged.conlu'
        parsed_conllu_file_name = '/tmp/parsed.conlu'

        pipeline = Pipeline(self.nlp)
        pipeline.import_file(strings_file_name, 'strings-list')
        pipeline.do_tokenise()
        pipeline.do_tweak_conllu()
        pipeline.do_parse()
        pipeline.export_file(classla_conllu_file_name, 'classla-parsed')

        # Merge line-by-line: the classla output and the pre-tagged file must
        # align token for token.  Context managers guarantee the handles are
        # closed even if an assertion fires mid-merge.
        with codecs.open(classla_conllu_file_name, 'r') as classla_conllu_file, \
             codecs.open(input_file_name, 'r') as tagged_conllu_file, \
             codecs.open(merged_conllu_file_name, 'w') as merged_conllu_file:
            for (classla_line, tagged_line) in zip(classla_conllu_file, tagged_conllu_file):
                classla_line = classla_line.strip()
                tagged_line = tagged_line.strip()
                if ((len(classla_line) == 0 and len(tagged_line) == 0)
                    or (classla_line.startswith('#') and tagged_line.startswith('#'))):
                    # Blank separators and comment headers pass through from classla.
                    merged_line = classla_line
                else:
                    classla_columns = classla_line.split('\t')
                    tagged_columns = tagged_line.split('\t')
                    # NOTE(review): the format arguments below look swapped
                    # (each message reports the *other* file's line length) —
                    # kept as-is; confirm intent before changing.
                    assert len(classla_columns) == 10, 'Missing token in classla-generated conllu ({}).'.format(len(tagged_line))
                    assert len(tagged_columns) == 10, 'Missing token in pre-tagged conllu ({}).'.format(len(classla_line))
                    assert classla_columns[1] == tagged_columns[1], 'Pre-tagged token form ({}) does not match classla-generated token form ({}).'.format(classla_columns[1], tagged_columns[1])
                    # Take UPOS (3), FEATS (5) and MISC (9) from classla; keep
                    # everything else from the pre-tagged file.
                    merged_columns = [classla_columns[i] if i in (3, 5, 9) else tagged_columns[i] for i in range(10)]
                    merged_line = '\t'.join(merged_columns)
                merged_conllu_file.write(merged_line + '\n')

        # Run the manual classla dependency parser over the merged conllu.
        classla_map = {
            'save_dir': self.classla_directory + '/sl/depparse',
            'save_name': 'standard_jos.pt',
            'eval_file': merged_conllu_file_name,
            'output_file': parsed_conllu_file_name,
            'gold_file': merged_conllu_file_name,
            'shorthand': 'sl_ssj',
            'mode': 'predict',
            'pretrain_file': self.classla_directory + '/sl/pretrain/standard.pt'
        }
        classla_arguments = []
        for (key, value) in classla_map.items():
            classla_arguments += ['--' + key, value]
        classla_manual.main(args=classla_arguments)

        pipeline.import_file(parsed_conllu_file_name, 'classla-parsed')
        pipeline.do_translate_jos()
        pipeline.do_conllu_to_tei()
        pipeline.import_file(input_structure_file_name, 'structures-old')
        self._parse_to_dictionary_sequence(pipeline)
        pipeline.export_file(output_file_name, 'dictionary')
        pipeline.export_file(output_structure_file_name, 'structures-new')
        self.cleanup(pipeline)

    def parse_to_dictionary(self, input_file_name, output_file_name, input_structure_file_name, output_structure_file_name):
        """Build the dictionary from an already-parsed initial TEI file."""
        pipeline = Pipeline()
        pipeline.import_file(input_file_name, 'tei-initial')
        pipeline.import_file(input_structure_file_name, 'structures-old')
        self._parse_to_dictionary_sequence(pipeline)
        pipeline.export_file(output_file_name, 'dictionary')
        pipeline.export_file(output_structure_file_name, 'structures-new')
        self.cleanup(pipeline)

    def validate_structures(self, input_file_name):
        """Validate a structure file against the structure schema."""
        pipeline = Pipeline()
        pipeline.import_file(input_file_name, 'structures-new')
        pipeline.do_validate_structures()
        self.cleanup(pipeline)

    def validate_dictionary(self, input_file_name):
        """Validate a dictionary file against the dictionary schema."""
        pipeline = Pipeline()
        pipeline.import_file(input_file_name, 'dictionary')
        pipeline.do_validate_dictionary()
        self.cleanup(pipeline)

    def _strings_to_parse_sequence(self, pipeline):
        """Steps that turn raw strings into the initial parsed TEI."""
        pipeline.do_tokenise()
        pipeline.do_tweak_conllu()
        pipeline.do_parse()
        pipeline.do_translate_jos()
        pipeline.do_conllu_to_tei()

    def _parse_to_dictionary_sequence(self, pipeline):
        """Steps that turn the parsed TEI into the dictionary output."""
        pipeline.do_find_collocation_structure_units()
        pipeline.do_assign_collocation_structures()
        pipeline.do_assign_other_structures()
        pipeline.do_tei_to_dictionary()

    def cleanup(self, pipeline):
        """Remove the temporary wani directory (if any) and the pipeline's files.

        Guarded with getattr: wani_directory exists only when a wani script was
        supplied; the unconditional rmtree previously raised AttributeError on
        every run without wani.
        """
        wani_directory = getattr(self, 'wani_directory', None)
        if wani_directory is not None:
            shutil.rmtree(wani_directory, ignore_errors=True)
        pipeline.cleanup()
|
|
|
|
|
|
class Pipeline:
    """One pipeline run over a private temporary working directory.

    Each ``do_*`` step reads and writes intermediate files in the temp
    directory; files are addressed by the symbolic keys of ``FILE_MAP``.
    ``import_file`` / ``export_file`` move data across the boundary.
    """

    def __init__(self, nlp=None):
        """:param nlp: a loaded classla pipeline, or None for steps that need no parsing"""
        self.nlp = nlp
        self.tmp_directory = tempfile.mkdtemp()
        # Map each symbolic key to a concrete path inside the temp directory.
        self.file_map = {key: self.tmp_directory + '/' + name
                         for (key, name) in FILE_MAP.items()}

    def import_file(self, file_name, file_key):
        """Copy an external file into the pipeline under the given key."""
        shutil.copyfile(file_name, self.file_map[file_key])

    def do_tokenise(self):
        """Tokenise the input strings with obeliks (via classla)."""
        print('Tokenising with obeliks ...')
        input_file_name = self.file_map['strings-list']
        output_file_name = self.file_map['obeliks-tokenised']
        # Explicit utf-8: conllu data must not depend on the locale default.
        with open(input_file_name, 'r', encoding='utf-8') as input_file:
            input_conllu = input_file.read()
        tokeniser = classla.Pipeline('sl', processors='tokenize', dir=self.nlp.dir)
        output_conllu = tokeniser(input_conllu).to_conll()
        with open(output_file_name, 'w', encoding='utf-8') as output_file:
            output_file.write(output_conllu)

    def do_tweak_conllu(self):
        """Apply the conllu tweaks needed before parsing."""
        print('Tweaking conllu ...')
        input_file_name = self.file_map['obeliks-tokenised']
        output_file_name = self.file_map['obeliks-tweaked']
        tweak_conllu(input_file_name, output_file_name)

    def do_parse(self):
        """Parse the tweaked conllu with the loaded classla pipeline."""
        print('Parsing with classla ...')
        input_file_name = self.file_map['obeliks-tweaked']
        output_file_name = self.file_map['classla-parsed']
        with open(input_file_name, 'r', encoding='utf-8') as input_file:
            input_conllu = input_file.read()
        doc = self.nlp(input_conllu)
        with open(output_file_name, 'w', encoding='utf-8') as output_file:
            output_file.write(doc.to_conll())

    def do_translate_jos(self):
        """Translate JOS annotations in the parsed conllu."""
        print('Translating JOS ...')
        input_file_name = self.file_map['classla-parsed']
        output_file_name = self.file_map['classla-translated']
        translate_jos(input_file_name, output_file_name)

    def do_conllu_to_tei(self):
        """Convert the translated conllu into the initial TEI file."""
        print('Converting to TEI ...')
        input_file_name = self.file_map['classla-translated']
        output_file_name = self.file_map['tei-initial']
        conllu_to_tei(input_file_name, output_file_name)

    def do_find_collocation_structure_units(self):
        """Find units matching the existing collocation structures (via wani)."""
        print('Finding units for existing collocation structures ...')

        # Deferred import: wani is only on sys.path after Runner._provide_wani.
        from wani import main as wani_main
        # wani expects an argparse-style namespace; build one by hand.
        namespace = SimpleNamespace()

        # relevant values
        namespace.structures = self.file_map['structures-old']
        namespace.input = [self.file_map['tei-initial']]
        namespace.all = self.file_map['collocations']
        namespace.skip_id_check = True
        namespace.fixed_restriction_order = True
        namespace.new_tei = True

        # default values
        namespace.sloleks_db = None
        namespace.out = None
        namespace.out_no_stat = None
        namespace.stats = None
        namespace.no_msd_translate = False
        namespace.min_freq = 0
        namespace.verbose = 'info'
        namespace.count_files = False
        namespace.multiple_output = False
        namespace.load_sloleks = False
        namespace.sort_by = -1
        namespace.sort_reversed = False
        namespace.db = None
        namespace.collocation_sentence_map_dest = None
        namespace.new_db = False
        namespace.pc_tag = 'pc'
        namespace.separator = '\t'
        namespace.ignore_punctuations = False

        wani_main(namespace)

    def do_assign_collocation_structures(self):
        """Assign collocation-structure ids to the TEI units."""
        print('Assigning ids of collocation structures ...')
        input_file_name = self.file_map['tei-initial']
        collocations_file_name = self.file_map['collocations']
        output_file_name = self.file_map['tei-ids-collocation']
        assign_collocation_structures(input_file_name, collocations_file_name, output_file_name)

    def do_assign_other_structures(self):
        """Assign ids of single/other structures, creating new ones as needed."""
        print('Assigning ids of single and other structures, creating if necessary ...')
        input_file_name = self.file_map['tei-ids-collocation']
        structure_old_file_name = self.file_map['structures-old']
        output_file_name = self.file_map['tei-ids-all']
        structure_new_file_name = self.file_map['structures-new']
        assign_other_structures(input_file_name, structure_old_file_name, output_file_name, structure_new_file_name)

    def do_tei_to_dictionary(self):
        """Convert the fully-annotated TEI into the dictionary format."""
        print('Converting TEI to dictionary ...')
        input_file_name = self.file_map['tei-ids-all']
        output_file_name = self.file_map['dictionary']
        tei_to_dictionary(input_file_name, output_file_name)

    def _do_validate(self, schema_file_name, xml_file_name):
        """Validate an XML file against an XML schema; raises on invalid input."""
        xml_schema = lxml.XMLSchema(lxml.parse(schema_file_name))
        xml_tree = lxml.parse(xml_file_name)
        xml_schema.assertValid(xml_tree)

    def do_validate_structures(self):
        """Validate the new structures file against the structure schema."""
        print('Validating structures ...')
        schema_file_name = self.file_map['structure-schema']
        xml_file_name = self.file_map['structures-new']
        self._do_validate(schema_file_name, xml_file_name)

    def do_validate_dictionary(self):
        """Validate the dictionary file against the dictionary schema."""
        print('Validating dictionary ...')
        schema_file_name = self.file_map['dictionary-schema']
        xml_file_name = self.file_map['dictionary']
        self._do_validate(schema_file_name, xml_file_name)

    def export_file(self, file_name, file_key):
        """Copy a pipeline file out to an external destination."""
        shutil.copyfile(self.file_map[file_key], file_name)

    def cleanup(self):
        """Remove the temporary working directory, ignoring removal errors."""
        shutil.rmtree(self.tmp_directory, ignore_errors=True)