You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

280 lines
12 KiB

import os
import shutil
import sys
import tempfile
from types import SimpleNamespace

import classla
import lxml.etree as lxml

from nova_slovnica.assign_single_structures import assign as assign_single
from nova_slovnica.assign_structures import assign as assign_multiple
from nova_slovnica.conllu_to_xml import convert_file as conllu_to_tei
from nova_slovnica.create_structures import create as create_structures
from nova_slovnica.tei_to_dictionary import convert as tei_to_dictionary
from nova_slovnica.translate_jos import translate as translate_jos
from structure_assignment.constants import *
from structure_assignment.merge_dictionaries import merge as merge_dictionaries
from structure_assignment.split_tei import split as split_tei
from structure_assignment.tweak_conllu import tweak as tweak_conllu
class Runner:
    """High-level entry points for the structure-assignment tool.

    Each public method builds a fresh ``Pipeline`` over the shared resource
    directory, imports the requested input files, runs a sequence of pipeline
    steps and exports the results.
    """

    def __init__(self, resource_directory, nlp_needed):
        """Prepare the runner.

        :param resource_directory: directory holding the pipeline resources
            (classla models are expected under its ``classla/`` subdirectory)
        :param nlp_needed: when True, load the classla NLP pipeline used by
            the parsing steps; otherwise ``self.nlp`` stays ``None``
        """
        self.resource_directory = resource_directory
        # Default to None so later access fails loudly as a None pipeline
        # rather than an AttributeError (the original never set the attribute
        # when nlp_needed was false).
        self.nlp = None
        if nlp_needed:
            # Work on a copy so the shared NLP_CONFIG_MAP constant is not
            # mutated as a side effect of constructing a Runner.
            nlp_config = dict(NLP_CONFIG_MAP)
            nlp_config['dir'] = resource_directory + '/classla'
            self.nlp = classla.Pipeline('sl', **nlp_config)

    def run_all(self, input_file_name, output_file_name, input_structure_file_name, output_structure_file_name):
        """Run the full pipeline: parse strings, build the dictionary, then
        validate and export both the new structures and the dictionary."""
        pipeline = Pipeline(self.resource_directory, self.nlp)
        pipeline.import_file(input_file_name, 'strings-list')
        pipeline.import_file(input_structure_file_name, 'structures-old')
        self._strings_to_parse_sequence(pipeline)
        self._parse_to_dictionary_sequence(pipeline)
        pipeline.do_validate_structures()
        pipeline.export_file(output_structure_file_name, 'structures-new')
        pipeline.do_validate_dictionary()
        pipeline.export_file(output_file_name, 'dictionary')
        pipeline.cleanup()

    def strings_to_dictionary(self, input_file_name, output_file_name, input_structure_file_name, output_structure_file_name):
        """Parse strings and build the dictionary, without validation."""
        pipeline = Pipeline(self.resource_directory, self.nlp)
        pipeline.import_file(input_file_name, 'strings-list')
        pipeline.import_file(input_structure_file_name, 'structures-old')
        self._strings_to_parse_sequence(pipeline)
        self._parse_to_dictionary_sequence(pipeline)
        pipeline.export_file(output_file_name, 'dictionary')
        pipeline.export_file(output_structure_file_name, 'structures-new')
        pipeline.cleanup()

    def strings_to_parse(self, input_file_name, output_file_name):
        """Parse a list of strings and export the initial TEI document."""
        pipeline = Pipeline(self.resource_directory, self.nlp)
        pipeline.import_file(input_file_name, 'strings-list')
        self._strings_to_parse_sequence(pipeline)
        pipeline.export_file(output_file_name, 'tei-initial')
        pipeline.cleanup()

    def parse_to_dictionary(self, input_file_name, output_file_name, input_structure_file_name, output_structure_file_name):
        """Build the dictionary from an already-parsed TEI document.

        No NLP pipeline is needed for this path, so none is passed along.
        """
        pipeline = Pipeline(self.resource_directory)
        pipeline.import_file(input_file_name, 'tei-initial')
        pipeline.import_file(input_structure_file_name, 'structures-old')
        self._parse_to_dictionary_sequence(pipeline)
        pipeline.export_file(output_file_name, 'dictionary')
        pipeline.export_file(output_structure_file_name, 'structures-new')
        pipeline.cleanup()

    def validate_structures(self, input_file_name):
        """Validate a structure specification file against its schema."""
        pipeline = Pipeline(self.resource_directory)
        pipeline.import_file(input_file_name, 'structures-new')
        pipeline.do_validate_structures()
        pipeline.cleanup()

    def validate_dictionary(self, input_file_name):
        """Validate a dictionary file against its schema."""
        pipeline = Pipeline(self.resource_directory)
        pipeline.import_file(input_file_name, 'dictionary')
        pipeline.do_validate_dictionary()
        pipeline.cleanup()

    def _strings_to_parse_sequence(self, pipeline):
        """Steps taking a raw strings list to an initial TEI document."""
        pipeline.do_tokenise()
        pipeline.do_tweak_conllu()
        pipeline.do_parse()
        pipeline.do_translate_jos()
        pipeline.do_conllu_to_tei()

    def _parse_to_dictionary_sequence(self, pipeline):
        """Steps taking an initial TEI document to the merged dictionary.

        Multi-word units are matched twice: once against the existing
        structures, then again after missing structures have been created.
        """
        pipeline.do_split_tei()
        pipeline.do_assign_single()
        pipeline.do_tei_to_dictionary_single()
        pipeline.do_find_structure_units_first()
        pipeline.do_assign_multiple_first()
        pipeline.do_create_structures()
        pipeline.do_find_structure_units_second()
        pipeline.do_assign_multiple_second()
        pipeline.do_tei_to_dictionary_multiple()
        pipeline.do_merge_dictionaries()
class Pipeline:
    """Executes the individual processing steps over a private temporary
    working directory.

    Resource files are copied into the working directory at construction
    time; data moves in and out through :meth:`import_file` and
    :meth:`export_file`, keyed by the logical names in ``FILE_MAP``.
    Callers are responsible for calling :meth:`cleanup` when finished.
    """

    def __init__(self, resource_directory, nlp=None):
        """Set up the working directory and the logical file map.

        :param resource_directory: directory with flat resource files (and a
            ``classla/`` subdirectory with tokeniser models)
        :param nlp: optional pre-loaded classla pipeline used by do_parse()
        """
        self.nlp = nlp
        self.tmp_directory = tempfile.mkdtemp()
        # Copy every flat resource file into the working directory;
        # subdirectories (e.g. classla models) are intentionally skipped.
        for entry in os.listdir(resource_directory):
            resource_file_name = resource_directory + '/' + entry
            if os.path.isfile(resource_file_name):
                shutil.copy(resource_file_name, self.tmp_directory)
        # wani.py is among the copied resources; make it importable for
        # _do_find_structure_units().
        sys.path.insert(0, self.tmp_directory)
        self.file_map = {key: self.tmp_directory + '/' + name for (key, name) in FILE_MAP.items()}
        self.classla_directory = resource_directory + '/classla'

    def import_file(self, file_name, file_key):
        """Copy an external file into the slot named by *file_key*."""
        shutil.copyfile(file_name, self.file_map[file_key])

    def do_tokenise(self):
        """Tokenise the raw strings list with the obeliks tokeniser."""
        print('Tokenising with obeliks ...')
        input_file_name = self.file_map['strings-list']
        output_file_name = self.file_map['obeliks-tokenised']
        with open(input_file_name, 'r', encoding='utf-8') as input_file:
            input_text = input_file.read()
        tokeniser = classla.Pipeline('sl', processors='tokenize', dir=self.classla_directory)
        output_conllu = tokeniser(input_text).to_conll()
        with open(output_file_name, 'w', encoding='utf-8') as output_file:
            output_file.write(output_conllu)

    def do_tweak_conllu(self):
        """Adjust the tokenised CoNLL-U for downstream parsing."""
        print('Tweaking conllu ...')
        tweak_conllu(self.file_map['obeliks-tokenised'], self.file_map['obeliks-tweaked'])

    def do_parse(self):
        """Parse the tweaked CoNLL-U with the injected classla pipeline."""
        print('Parsing with classla ...')
        input_file_name = self.file_map['obeliks-tweaked']
        output_file_name = self.file_map['classla-parsed']
        with open(input_file_name, 'r', encoding='utf-8') as input_file:
            input_conllu = input_file.read()
        doc = self.nlp(input_conllu)
        with open(output_file_name, 'w', encoding='utf-8') as output_file:
            output_file.write(doc.to_conll())

    def do_translate_jos(self):
        """Translate tags to the JOS tagset using the bundled dictionary."""
        print('Translating JOS ...')
        translate_jos(self.file_map['classla-parsed'], self.file_map['dict'], self.file_map['classla-translated'])

    def do_conllu_to_tei(self):
        """Convert the translated CoNLL-U into the initial TEI document."""
        print('Converting to TEI ...')
        conllu_to_tei(self.file_map['classla-translated'], self.file_map['tei-initial'])

    def do_split_tei(self):
        """Split the TEI into single-token and multi-token entries."""
        print('Splitting TEI ...')
        split_tei(self.file_map['tei-initial'], self.file_map['tei-single'], self.file_map['tei-multiple'])

    def do_assign_single(self):
        """Assign structure ids to the single-token entries."""
        print('Assigning single structures ...')
        assign_single(self.file_map['tei-single'], self.file_map['structures-old'], self.file_map['tei-single-ids'])

    def do_tei_to_dictionary_single(self):
        """Convert the id-annotated single-token TEI to dictionary form."""
        print('Converting single TEI to dictionary ...')
        tei_to_dictionary(self.file_map['tei-single-ids'], self.file_map['dictionary-single'])

    def do_tei_to_dictionary_multiple(self):
        """Convert the second-pass multi-token TEI to dictionary form."""
        print('Converting multiple TEI to dictionary ...')
        tei_to_dictionary(self.file_map['tei-multiple-ids-2'], self.file_map['dictionary-multiple'])

    def do_find_structure_units_first(self):
        """First MWE pass: match units against the existing structures."""
        print('Finding units for existing structures ...')
        self._do_find_structure_units(self.file_map['structures-old'], self.file_map['tei-multiple'], self.file_map['mwes-1'])

    def do_find_structure_units_second(self):
        """Second MWE pass: match units against the extended structures."""
        print('Finding units for extended structures ...')
        self._do_find_structure_units(self.file_map['structures-new'], self.file_map['tei-multiple'], self.file_map['mwes-2'])

    def _do_find_structure_units(self, structure_file_name, tei_file_name, csv_file_name):
        """Run wani (imported from the working directory) over one TEI file.

        wani expects an argparse-style namespace, so every option it reads
        must be present, including the defaults it would normally get from
        its own command line.
        """
        from wani import main as wani_main
        namespace = SimpleNamespace()
        # relevant values
        namespace.structures = structure_file_name
        namespace.input = [tei_file_name]
        namespace.all = csv_file_name
        namespace.skip_id_check = True
        namespace.fixed_restriction_order = True
        namespace.new_tei = True
        # default values
        namespace.sloleks_db = None
        namespace.out = None
        namespace.out_no_stat = None
        namespace.stats = None
        namespace.no_msd_translate = False
        namespace.min_freq = 0
        namespace.verbose = 'info'
        namespace.count_files = False
        namespace.multiple_output = False
        namespace.load_sloleks = False
        namespace.sort_by = -1
        namespace.sort_reversed = False
        namespace.db = None
        namespace.collocation_sentence_map_dest = None
        namespace.new_db = False
        namespace.pc_tag = 'pc'
        namespace.separator = '\t'
        namespace.ignore_punctuations = False
        wani_main(namespace)

    def _find_min_other_id(self, key):
        """Return the smallest id of the type="other" structures in *key*.

        Falls back to a hard-coded value when the file is missing, malformed
        or contains no such structures.
        """
        try:
            root = lxml.parse(self.file_map[key])
            other_ids = [int(oid) for oid in root.xpath('syntactic_structure[@type="other"]/@id')]
            min_id = min(other_ids)
        # Narrowed from a bare except: parse errors, unreadable file,
        # non-integer ids, or an empty id list (min() raises ValueError).
        except (OSError, ValueError, lxml.XMLSyntaxError):
            min_id = 109  # This is the current value in structures.xml, and is not expected to change. Ugly, but code shouldn't reach here ...
        return min_id

    def do_assign_multiple_first(self):
        """Assign ids to multi-token entries using existing structures."""
        print('Assigning ids based on existing structures ...')
        min_other_id = self._find_min_other_id('structures-old')
        assign_multiple(self.file_map['tei-multiple'], self.file_map['mwes-1'], self.file_map['tei-multiple-ids-1'], min_other_id)

    def do_assign_multiple_second(self):
        """Assign ids to multi-token entries using extended structures."""
        print('Assigning ids based on extended structures ...')
        min_other_id = self._find_min_other_id('structures-new')
        assign_multiple(self.file_map['tei-multiple'], self.file_map['mwes-2'], self.file_map['tei-multiple-ids-2'], min_other_id)

    def do_create_structures(self):
        """Create structures missing after the first assignment pass."""
        print('Creating missing structures ...')
        create_structures(self.file_map['structures-old'], self.file_map['tei-multiple-ids-1'], self.file_map['structures-new'])

    def do_merge_dictionaries(self):
        """Merge the single- and multi-token dictionaries into one file."""
        print('Merging single and multiple dictionaries ...')
        merge_dictionaries(self.file_map['dictionary-single'], self.file_map['dictionary-multiple'], self.file_map['dictionary'])

    def _do_validate(self, schema_file_name, xml_file_name):
        """Validate an XML file against an XML Schema; raises on failure."""
        xml_schema = lxml.XMLSchema(lxml.parse(schema_file_name))
        xml_schema.assertValid(lxml.parse(xml_file_name))

    def do_validate_structures(self):
        """Validate the new structures file against its schema."""
        print('Validating structures ...')
        self._do_validate(self.file_map['structure-schema'], self.file_map['structures-new'])

    def do_validate_dictionary(self):
        """Validate the merged dictionary against its schema."""
        print('Validating dictionary ...')
        self._do_validate(self.file_map['dictionary-schema'], self.file_map['dictionary'])

    def export_file(self, file_name, file_key):
        """Copy the file in slot *file_key* out to an external path."""
        shutil.copyfile(self.file_map[file_key], file_name)

    def cleanup(self):
        """Remove the temporary working directory, ignoring errors."""
        shutil.rmtree(self.tmp_directory, True)