# Structure-assignment pipeline: tokenise input strings, parse with classla,
# translate JOS tags, convert to TEI, assign structures and build dictionaries.
import os
import shutil
import sys
import tempfile
from types import SimpleNamespace

import obeliks

import classla
from classla import Document
# NOTE(review): CoNLLFile is required by Pipeline.do_parse but its import is
# disabled; do_parse will raise NameError until this is restored or the step
# is ported to the current classla API.
#from classla.models.common.conll import CoNLLFile

from nova_slovnica.translate_jos import translate as translate_jos
from nova_slovnica.conllu_to_xml import convert_file as conllu_to_tei
from nova_slovnica.assign_single_structures import assign as assign_single
from nova_slovnica.assign_structures import assign as assign_multiple
from nova_slovnica.tei_to_dictionary import convert as tei_to_dictionary
from nova_slovnica.create_structures import create as create_structures
from structure_assignment.constants import *
from structure_assignment.tweak_conllu import tweak as tweak_conllu
from structure_assignment.split_tei import split as split_tei
from structure_assignment.merge_dictionaries import merge as merge_dictionaries
def create_nlp(resource_directory):
    """Create a Slovenian classla pipeline with models under *resource_directory*.

    :param resource_directory: directory whose ``classla`` subdirectory holds
        the classla models
    :return: a configured ``classla.Pipeline``

    Works on a copy of ``NLP_CONFIG_MAP`` so the shared module-level
    configuration dict is not mutated as a side effect (the original
    assigned into the global directly).
    """
    config = dict(NLP_CONFIG_MAP)
    config['dir'] = resource_directory + '/classla'
    return classla.Pipeline('sl', **config)
class Pipeline:
    """Drives the structure-assignment workflow over a private temp directory.

    All intermediate files live in a temporary directory created in
    ``__init__``; ``self.file_map`` maps the logical file keys of the
    module-level ``FILE_MAP`` to concrete paths there.  Typical use:
    ``import_file(...)``, run the ``do_*`` steps in order,
    ``export_file(...)``, then ``cleanup()``.
    """

    def __init__(self, nlp, resource_directory):
        """Create the working directory and copy resources into it.

        :param nlp: a classla ``Pipeline`` instance (see ``create_nlp``)
        :param resource_directory: directory containing the resource files
            (dictionaries, structure specifications, helper scripts)
        """
        self.nlp = nlp
        self.tmp_directory = tempfile.mkdtemp()
        # Copy every plain file from the resource directory into the
        # working directory so the pipeline steps find them locally.
        resource_file_names = [resource_directory + '/' + f for f in os.listdir(resource_directory)]
        for resource_file_name in resource_file_names:
            if (os.path.isfile(resource_file_name)):
                shutil.copy(resource_file_name, self.tmp_directory)
        # Resources may include importable helper modules (e.g. wani.py,
        # imported lazily in _do_find_structure_units), so make the working
        # directory importable.
        import sys
        sys.path.insert(0, self.tmp_directory)
        self.file_map = {key: self.tmp_directory + '/' + FILE_MAP[key] for key in FILE_MAP.keys()}

    def import_file(self, file_name, file_key):
        """Copy external file *file_name* to the path registered for *file_key*."""
        shutil.copyfile(file_name, self.file_map[file_key])

    def do_tokenise(self):
        """Tokenise the raw strings list into CoNLL-U with obeliks."""
        input_file_name = self.file_map['strings-list']
        output_file_name = self.file_map['obeliks-tokenised']
        obeliks.run(in_file=input_file_name, out_file=output_file_name, conllu=True)

    def do_tweak_conllu(self):
        """Post-process the obeliks output (structure_assignment.tweak_conllu)."""
        input_file_name = self.file_map['obeliks-tokenised']
        output_file_name = self.file_map['obeliks-tweaked']
        tweak_conllu(input_file_name, output_file_name)

    def do_parse(self):
        """Parse the tweaked CoNLL-U file with the classla pipeline."""
        input_file_name = self.file_map['obeliks-tweaked']
        output_file_name = self.file_map['classla-parsed']
        doc = Document(text=None)
        # NOTE(review): the CoNLLFile import at the top of this module is
        # commented out, so this line raises NameError until it is restored
        # or this step is ported to the current classla API — confirm.
        conll_file = CoNLLFile(filename=input_file_name)
        doc.conll_file = conll_file
        # Fixed: original called bare `nlp(doc)` — an undefined global; the
        # pipeline instance is stored on self in __init__.
        result = self.nlp(doc)
        result.conll_file.write_conll(output_file_name)

    def do_translate_jos(self):
        """Translate JOS tags in the parsed file using the 'dict' resource."""
        input_file_name = self.file_map['classla-parsed']
        dictionary_file_name = self.file_map['dict']
        output_file_name = self.file_map['classla-translated']
        translate_jos(input_file_name, dictionary_file_name, output_file_name)

    def do_conllu_to_tei(self):
        """Convert the translated CoNLL-U file into the initial TEI document."""
        input_file_name = self.file_map['classla-translated']
        output_file_name = self.file_map['tei-initial']
        conllu_to_tei(input_file_name, output_file_name)

    def do_split_tei(self):
        """Split the initial TEI into single-token and multiple-token parts."""
        input_file_name = self.file_map['tei-initial']
        output_single_file_name = self.file_map['tei-single']
        output_multiple_file_name = self.file_map['tei-multiple']
        split_tei(input_file_name, output_single_file_name, output_multiple_file_name)

    def do_assign_single(self):
        """Assign structure ids to single-token entries from the old structures."""
        input_file_name = self.file_map['tei-single']
        structure_file_name = self.file_map['structures-old']
        output_file_name = self.file_map['tei-single-ids']
        assign_single(input_file_name, structure_file_name, output_file_name)

    def do_tei_to_dictionary_single(self):
        """Convert the id-annotated single-token TEI to a dictionary file."""
        input_file_name = self.file_map['tei-single-ids']
        output_file_name = self.file_map['dictionary-single']
        tei_to_dictionary(input_file_name, output_file_name)

    def do_tei_to_dictionary_multiple(self):
        """Convert the id-annotated multiple-token TEI to a dictionary file."""
        input_file_name = self.file_map['tei-multiple-ids-2']
        output_file_name = self.file_map['dictionary-multiple']
        tei_to_dictionary(input_file_name, output_file_name)

    def do_find_structure_units_first(self):
        """First structure-unit search: old structures -> mwes-1 CSV."""
        self._do_find_structure_units(self.file_map['structures-old'], self.file_map['tei-multiple'], self.file_map['mwes-1'])

    def do_find_structure_units_second(self):
        """Second structure-unit search: new structures -> mwes-2 CSV."""
        self._do_find_structure_units(self.file_map['structures-new'], self.file_map['tei-multiple'], self.file_map['mwes-2'])

    def _do_find_structure_units(self, structure_file_name, tei_file_name, csv_file_name):
        """Run wani to find structure units in *tei_file_name*, writing them
        to *csv_file_name*.

        wani is a command-line tool, so its argparse namespace is built by
        hand here.  wani.py is copied into the working directory by
        __init__, which also puts that directory on sys.path — hence the
        late import.
        """
        from wani import main as wani_main
        namespace = SimpleNamespace()

        # relevant values
        namespace.structures = structure_file_name
        namespace.input = [tei_file_name]
        namespace.all = csv_file_name
        namespace.skip_id_check = True
        namespace.fixed_restriction_order = True
        namespace.new_tei = True

        # default values (mirror wani's command-line defaults)
        namespace.sloleks_db = None
        namespace.out = None
        namespace.out_no_stat = None
        namespace.stats = None
        namespace.no_msd_translate = False
        namespace.min_freq = 0
        namespace.verbose = 'info'
        namespace.count_files = False
        namespace.multiple_output = False
        namespace.load_sloleks = False
        namespace.sort_by = -1
        namespace.sort_reversed = False
        namespace.db = None
        namespace.collocation_sentence_map_dest = None
        namespace.new_db = False
        namespace.pc_tag = 'pc'
        namespace.separator = '\t'
        namespace.ignore_punctuations = False

        wani_main(namespace)

    def do_assign_multiple_first(self):
        """Assign mwes-1 structure units to the multiple-token TEI."""
        assign_multiple(self.file_map['tei-multiple'], self.file_map['mwes-1'], self.file_map['tei-multiple-ids-1'])

    def do_assign_multiple_second(self):
        """Assign mwes-2 structure units to the multiple-token TEI."""
        assign_multiple(self.file_map['tei-multiple'], self.file_map['mwes-2'], self.file_map['tei-multiple-ids-2'])

    def do_create_structures(self):
        """Derive new structure specifications from the first assignment pass."""
        input_file_name = self.file_map['structures-old']
        tei_file_name = self.file_map['tei-multiple-ids-1']
        output_file_name = self.file_map['structures-new']
        create_structures(input_file_name, tei_file_name, output_file_name)

    def do_merge_dictionaries(self):
        """Merge the single- and multiple-token dictionaries into one."""
        single_file_name = self.file_map['dictionary-single']
        multiple_file_name = self.file_map['dictionary-multiple']
        output_file_name = self.file_map['dictionary']
        merge_dictionaries(single_file_name, multiple_file_name, output_file_name)

    def export_file(self, file_name, file_key):
        """Copy the file registered for *file_key* out to *file_name*."""
        shutil.copyfile(self.file_map[file_key], file_name)

    def cleanup(self):
        """Remove the temporary working directory (errors ignored)."""
        shutil.rmtree(self.tmp_directory, True)