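"""Pipeline runner for assigning syntactic structures to Slovene strings.

Tokenises input strings with obeliks, parses them with classla, converts
the parses to TEI, assigns ids for single- and multi-token structures,
builds a merged dictionary, and validates the outputs against XML schemas.
"""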
import os
import shutil
import sys
import tempfile
from types import SimpleNamespace

import lxml.etree as lxml
import obeliks
import classla

from structure_assignment.constants import *
from structure_assignment.tweak_conllu import tweak as tweak_conllu
from nova_slovnica.translate_jos import translate as translate_jos
from nova_slovnica.conllu_to_xml import convert_file as conllu_to_tei
from structure_assignment.split_tei import split as split_tei
from nova_slovnica.assign_single_structures import assign as assign_single
from nova_slovnica.assign_structures import assign as assign_multiple
from nova_slovnica.tei_to_dictionary import convert as tei_to_dictionary
from nova_slovnica.create_structures import create as create_structures
from structure_assignment.merge_dictionaries import merge as merge_dictionaries

class Runner:

    def __init__(self, resource_directory, nlp_needed):
        self.resource_directory = resource_directory
        self.nlp = None  # only created when needed; the parse steps require it
        if nlp_needed:
            NLP_CONFIG_MAP['dir'] = resource_directory + '/classla'
            self.nlp = classla.Pipeline('sl', **NLP_CONFIG_MAP)

    def run_all(self, input_file_name, output_file_name, structure_file_name):
        pipeline = Pipeline(self.resource_directory, self.nlp)
        pipeline.import_file(input_file_name, 'strings-list')
        self._strings_to_parse_sequence(pipeline)
        self._parse_to_dictionary_sequence(pipeline)
        pipeline.export_file(output_file_name, 'dictionary')
        pipeline.export_file(structure_file_name, 'structures-new')
        self.validate_structures(structure_file_name)
        self.validate_dictionary(output_file_name)
        pipeline.cleanup()

    def strings_to_dictionary(self, input_file_name, output_file_name, structure_file_name):
        pipeline = Pipeline(self.resource_directory, self.nlp)
        pipeline.import_file(input_file_name, 'strings-list')
        self._strings_to_parse_sequence(pipeline)
        self._parse_to_dictionary_sequence(pipeline)
        pipeline.export_file(output_file_name, 'dictionary')
        pipeline.export_file(structure_file_name, 'structures-new')
        pipeline.cleanup()

    def strings_to_parse(self, input_file_name, output_file_name):
        pipeline = Pipeline(self.resource_directory, self.nlp)
        pipeline.import_file(input_file_name, 'strings-list')
        self._strings_to_parse_sequence(pipeline)
        pipeline.export_file(output_file_name, 'tei-initial')
        pipeline.cleanup()

    def parse_to_dictionary(self, input_file_name, output_file_name, structure_file_name):
        pipeline = Pipeline(self.resource_directory)
        pipeline.import_file(input_file_name, 'tei-initial')
        self._parse_to_dictionary_sequence(pipeline)
        pipeline.export_file(output_file_name, 'dictionary')
        pipeline.export_file(structure_file_name, 'structures-new')
        pipeline.cleanup()

    def validate_structures(self, input_file_name):
        pipeline = Pipeline(self.resource_directory)
        pipeline.import_file(input_file_name, 'structures-new')
        self._validate_structures_sequence(pipeline)
        pipeline.cleanup()

    def validate_dictionary(self, input_file_name):
        pipeline = Pipeline(self.resource_directory)
        pipeline.import_file(input_file_name, 'dictionary')
        self._validate_dictionary_sequence(pipeline)
        pipeline.cleanup()

    def _strings_to_parse_sequence(self, pipeline):
        # tokenise and parse raw strings into an initial TEI document
        pipeline.do_tokenise()
        pipeline.do_tweak_conllu()
        pipeline.do_parse()
        pipeline.do_translate_jos()
        pipeline.do_conllu_to_tei()

    def _parse_to_dictionary_sequence(self, pipeline):
        # assign structure ids to the parsed TEI and build the merged dictionary
        pipeline.do_split_tei()
        pipeline.do_assign_single()
        pipeline.do_tei_to_dictionary_single()
        pipeline.do_find_structure_units_first()
        pipeline.do_assign_multiple_first()
        pipeline.do_create_structures()
        pipeline.do_find_structure_units_second()
        pipeline.do_assign_multiple_second()
        pipeline.do_tei_to_dictionary_multiple()
        pipeline.do_merge_dictionaries()

    def _validate_structures_sequence(self, pipeline):
        pipeline.do_validate_structures()

    def _validate_dictionary_sequence(self, pipeline):
        pipeline.do_validate_dictionary()

class Pipeline:

    def __init__(self, resource_directory, nlp=None):
        self.nlp = nlp
        self.tmp_directory = tempfile.mkdtemp()
        # copy the resource files into a private temporary working directory
        resource_file_names = [resource_directory + '/' + f for f in os.listdir(resource_directory)]
        for resource_file_name in resource_file_names:
            if os.path.isfile(resource_file_name):
                shutil.copy(resource_file_name, self.tmp_directory)
        # make the copied resources (e.g. wani.py) importable
        sys.path.insert(0, self.tmp_directory)
        self.file_map = {key: self.tmp_directory + '/' + file_name for (key, file_name) in FILE_MAP.items()}

    def import_file(self, file_name, file_key):
        shutil.copyfile(file_name, self.file_map[file_key])

    def do_tokenise(self):
        print('Tokenising with obeliks ...')
        input_file_name = self.file_map['strings-list']
        output_file_name = self.file_map['obeliks-tokenised']
        obeliks.run(in_file=input_file_name, out_file=output_file_name, conllu=True)

    def do_tweak_conllu(self):
        print('Tweaking conllu ...')
        input_file_name = self.file_map['obeliks-tokenised']
        output_file_name = self.file_map['obeliks-tweaked']
        tweak_conllu(input_file_name, output_file_name)

    def do_parse(self):
        print('Parsing with classla ...')
        input_file_name = self.file_map['obeliks-tweaked']
        output_file_name = self.file_map['classla-parsed']
        with open(input_file_name, 'r') as input_file:
            input_conllu = input_file.read()
        doc = self.nlp(input_conllu)
        with open(output_file_name, 'w') as output_file:
            output_file.write(doc.to_conll())

    def do_translate_jos(self):
        print('Translating JOS ...')
        input_file_name = self.file_map['classla-parsed']
        dictionary_file_name = self.file_map['dict']
        output_file_name = self.file_map['classla-translated']
        translate_jos(input_file_name, dictionary_file_name, output_file_name)

    def do_conllu_to_tei(self):
        print('Converting to TEI ...')
        input_file_name = self.file_map['classla-translated']
        output_file_name = self.file_map['tei-initial']
        conllu_to_tei(input_file_name, output_file_name)

    def do_split_tei(self):
        print('Splitting TEI ...')
        input_file_name = self.file_map['tei-initial']
        output_single_file_name = self.file_map['tei-single']
        output_multiple_file_name = self.file_map['tei-multiple']
        split_tei(input_file_name, output_single_file_name, output_multiple_file_name)

    def do_assign_single(self):
        print('Assigning single structures ...')
        input_file_name = self.file_map['tei-single']
        structure_file_name = self.file_map['structures-old']
        output_file_name = self.file_map['tei-single-ids']
        assign_single(input_file_name, structure_file_name, output_file_name)

    def do_tei_to_dictionary_single(self):
        print('Converting single TEI to dictionary ...')
        input_file_name = self.file_map['tei-single-ids']
        output_file_name = self.file_map['dictionary-single']
        tei_to_dictionary(input_file_name, output_file_name)

    def do_tei_to_dictionary_multiple(self):
        print('Converting multiple TEI to dictionary ...')
        input_file_name = self.file_map['tei-multiple-ids-2']
        output_file_name = self.file_map['dictionary-multiple']
        tei_to_dictionary(input_file_name, output_file_name)

    def do_find_structure_units_first(self):
        print('Finding units for existing structures ...')
        self._do_find_structure_units(self.file_map['structures-old'], self.file_map['tei-multiple'], self.file_map['mwes-1'])

    def do_find_structure_units_second(self):
        print('Finding units for extended structures ...')
        self._do_find_structure_units(self.file_map['structures-new'], self.file_map['tei-multiple'], self.file_map['mwes-2'])

    def _do_find_structure_units(self, structure_file_name, tei_file_name, csv_file_name):
        # imported lazily: wani is expected to be among the resource files
        # copied into tmp_directory, which __init__ prepended to sys.path
        from wani import main as wani_main
        # mimic the argparse namespace that wani's command line would build
        namespace = SimpleNamespace()
        # relevant values
        namespace.structures = structure_file_name
        namespace.input = [tei_file_name]
        namespace.all = csv_file_name
        namespace.skip_id_check = True
        namespace.fixed_restriction_order = True
        namespace.new_tei = True
        # default values
        namespace.sloleks_db = None
        namespace.out = None
        namespace.out_no_stat = None
        namespace.stats = None
        namespace.no_msd_translate = False
        namespace.min_freq = 0
        namespace.verbose = 'info'
        namespace.count_files = False
        namespace.multiple_output = False
        namespace.load_sloleks = False
        namespace.sort_by = -1
        namespace.sort_reversed = False
        namespace.db = None
        namespace.collocation_sentence_map_dest = None
        namespace.new_db = False
        namespace.pc_tag = 'pc'
        namespace.separator = '\t'
        namespace.ignore_punctuations = False
        wani_main(namespace)

    def _find_min_other_id(self, key):
        try:
            root = lxml.parse(self.file_map[key])
            other_ids = [int(oid) for oid in root.xpath('syntactic_structure[@type="other"]/@id')]
            min_id = min(other_ids)
        except Exception:
            # fall back to the current value in structures.xml, which is not
            # expected to change; code shouldn't normally reach here
            min_id = 109
        return min_id

    def do_assign_multiple_first(self):
        print('Assigning ids based on existing structures ...')
        min_other_id = self._find_min_other_id('structures-old')
        assign_multiple(self.file_map['tei-multiple'], self.file_map['mwes-1'], self.file_map['tei-multiple-ids-1'], min_other_id)

    def do_assign_multiple_second(self):
        print('Assigning ids based on extended structures ...')
        min_other_id = self._find_min_other_id('structures-new')
        assign_multiple(self.file_map['tei-multiple'], self.file_map['mwes-2'], self.file_map['tei-multiple-ids-2'], min_other_id)

    def do_create_structures(self):
        print('Creating missing structures ...')
        input_file_name = self.file_map['structures-old']
        tei_file_name = self.file_map['tei-multiple-ids-1']
        output_file_name = self.file_map['structures-new']
        create_structures(input_file_name, tei_file_name, output_file_name)

    def do_merge_dictionaries(self):
        print('Merging single and multiple dictionaries ...')
        single_file_name = self.file_map['dictionary-single']
        multiple_file_name = self.file_map['dictionary-multiple']
        output_file_name = self.file_map['dictionary']
        merge_dictionaries(single_file_name, multiple_file_name, output_file_name)

    def _do_validate(self, schema_file_name, xml_file_name):
        xml_schema = lxml.XMLSchema(lxml.parse(schema_file_name))
        xml_tree = lxml.parse(xml_file_name)
        xml_schema.assertValid(xml_tree)

    def do_validate_structures(self):
        print('Validating structures ...')
        schema_file_name = self.file_map['structure-schema']
        xml_file_name = self.file_map['structures-new']
        self._do_validate(schema_file_name, xml_file_name)

    def do_validate_dictionary(self):
        print('Validating dictionary ...')
        schema_file_name = self.file_map['dictionary-schema']
        xml_file_name = self.file_map['dictionary']
        self._do_validate(schema_file_name, xml_file_name)

    def export_file(self, file_name, file_key):
        shutil.copyfile(self.file_map[file_key], file_name)

    def cleanup(self):
        shutil.rmtree(self.tmp_directory, ignore_errors=True)
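
# A minimal usage sketch (not part of the original module). The file names
# below are hypothetical, and 'resources' is assumed to be a directory
# containing the classla models, wani.py, and the other resource files
# that Pipeline copies into its temporary directory.
if __name__ == '__main__':
    runner = Runner('resources', nlp_needed=True)
    runner.run_all('strings.txt', 'dictionary.xml', 'structures.xml')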