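"""Pipeline for assigning syntactic structures to Slovene strings:
obeliks tokenisation, classla parsing, JOS tag translation, TEI
conversion, structure assignment, and dictionary creation/merging
with XML schema validation."""
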
import os
import shutil
import sys
import tempfile
from types import SimpleNamespace

import lxml.etree as lxml
import obeliks
import classla
from classla import Document
from classla.models.common.conll import CoNLLFile  # needed by do_parse (older classla API)

from structure_assignment.constants import *
from structure_assignment.tweak_conllu import tweak as tweak_conllu
from nova_slovnica.translate_jos import translate as translate_jos
from nova_slovnica.conllu_to_xml import convert_file as conllu_to_tei
from structure_assignment.split_tei import split as split_tei
from nova_slovnica.assign_single_structures import assign as assign_single
from nova_slovnica.assign_structures import assign as assign_multiple
from nova_slovnica.tei_to_dictionary import convert as tei_to_dictionary
from nova_slovnica.create_structures import create as create_structures
from structure_assignment.merge_dictionaries import merge as merge_dictionaries


def create_nlp(resource_directory):
    NLP_CONFIG_MAP['dir'] = resource_directory + '/classla'
    return classla.Pipeline('sl', **NLP_CONFIG_MAP)


class Pipeline:

    def __init__(self, nlp, resource_directory):
        self.nlp = nlp
        self.tmp_directory = tempfile.mkdtemp()
        resource_file_names = [resource_directory + '/' + f for f in os.listdir(resource_directory)]
        for resource_file_name in resource_file_names:
            if os.path.isfile(resource_file_name):
                shutil.copy(resource_file_name, self.tmp_directory)
        sys.path.insert(0, self.tmp_directory)
        self.file_map = {key: self.tmp_directory + '/' + file_name for (key, file_name) in FILE_MAP.items()}

    def import_file(self, file_name, file_key):
        shutil.copyfile(file_name, self.file_map[file_key])

    def do_tokenise(self):
        input_file_name = self.file_map['strings-list']
        output_file_name = self.file_map['obeliks-tokenised']
        obeliks.run(in_file=input_file_name, out_file=output_file_name, conllu=True)

    def do_tweak_conllu(self):
        input_file_name = self.file_map['obeliks-tokenised']
        output_file_name = self.file_map['obeliks-tweaked']
        tweak_conllu(input_file_name, output_file_name)

    def do_parse(self):
        input_file_name = self.file_map['obeliks-tweaked']
        output_file_name = self.file_map['classla-parsed']
        # Relies on the older classla CoNLLFile API (import restored above):
        # wrap the tokenised CoNLL-U file in an empty Document, run the
        # pipeline, and write the parsed result back out as CoNLL-U.
        doc = Document(text=None)
        doc.conll_file = CoNLLFile(filename=input_file_name)
        result = self.nlp(doc)
        result.conll_file.write_conll(output_file_name)

    def do_translate_jos(self):
        input_file_name = self.file_map['classla-parsed']
        dictionary_file_name = self.file_map['dict']
        output_file_name = self.file_map['classla-translated']
        translate_jos(input_file_name, dictionary_file_name, output_file_name)

    def do_conllu_to_tei(self):
        input_file_name = self.file_map['classla-translated']
        output_file_name = self.file_map['tei-initial']
        conllu_to_tei(input_file_name, output_file_name)

    def do_split_tei(self):
        input_file_name = self.file_map['tei-initial']
        output_single_file_name = self.file_map['tei-single']
        output_multiple_file_name = self.file_map['tei-multiple']
        split_tei(input_file_name, output_single_file_name, output_multiple_file_name)

    def do_assign_single(self):
        input_file_name = self.file_map['tei-single']
        structure_file_name = self.file_map['structures-old']
        output_file_name = self.file_map['tei-single-ids']
        assign_single(input_file_name, structure_file_name, output_file_name)

    def do_tei_to_dictionary_single(self):
        input_file_name = self.file_map['tei-single-ids']
        output_file_name = self.file_map['dictionary-single']
        tei_to_dictionary(input_file_name, output_file_name)

    def do_tei_to_dictionary_multiple(self):
        input_file_name = self.file_map['tei-multiple-ids-2']
        output_file_name = self.file_map['dictionary-multiple']
        tei_to_dictionary(input_file_name, output_file_name)
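
    # Note: the find/assign pairs below appear to implement a two-pass flow:
    # pass one matches units against the existing inventory (structures-old),
    # do_create_structures then derives structures-new from the first-pass
    # assignments, and pass two re-matches against the extended inventory.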
    def do_find_structure_units_first(self):
        self._do_find_structure_units(self.file_map['structures-old'], self.file_map['tei-multiple'], self.file_map['mwes-1'])

    def do_find_structure_units_second(self):
        self._do_find_structure_units(self.file_map['structures-new'], self.file_map['tei-multiple'], self.file_map['mwes-2'])

    def _do_find_structure_units(self, structure_file_name, tei_file_name, csv_file_name):
        # imported here because wani.py is one of the resource files copied
        # into tmp_directory, which __init__ puts on sys.path
        from wani import main as wani_main
        namespace = SimpleNamespace()
        # relevant values
        namespace.structures = structure_file_name
        namespace.input = [tei_file_name]
        namespace.all = csv_file_name
        namespace.skip_id_check = True
        namespace.fixed_restriction_order = True
        namespace.new_tei = True
        # default values
        namespace.sloleks_db = None
        namespace.out = None
        namespace.out_no_stat = None
        namespace.stats = None
        namespace.no_msd_translate = False
        namespace.min_freq = 0
        namespace.verbose = 'info'
        namespace.count_files = False
        namespace.multiple_output = False
        namespace.load_sloleks = False
        namespace.sort_by = -1
        namespace.sort_reversed = False
        namespace.db = None
        namespace.collocation_sentence_map_dest = None
        namespace.new_db = False
        namespace.pc_tag = 'pc'
        namespace.separator = '\t'
        namespace.ignore_punctuations = False
        wani_main(namespace)

    def _find_min_other_id(self, key):
        try:
            root = lxml.parse(self.file_map[key])
            other_ids = [int(oid) for oid in root.xpath('syntactic_structure[@type="other"]/@id')]
            min_id = min(other_ids)
        except Exception:
            min_id = 109  # This is the current value in structures.xml, and is not expected to change. Ugly, but code shouldn't reach here ...
        return min_id

    def do_assign_multiple_first(self):
        min_other_id = self._find_min_other_id('structures-old')
        assign_multiple(self.file_map['tei-multiple'], self.file_map['mwes-1'], self.file_map['tei-multiple-ids-1'], min_other_id)

    def do_assign_multiple_second(self):
        min_other_id = self._find_min_other_id('structures-new')
        assign_multiple(self.file_map['tei-multiple'], self.file_map['mwes-2'], self.file_map['tei-multiple-ids-2'], min_other_id)

    def do_create_structures(self):
        input_file_name = self.file_map['structures-old']
        tei_file_name = self.file_map['tei-multiple-ids-1']
        output_file_name = self.file_map['structures-new']
        create_structures(input_file_name, tei_file_name, output_file_name)

    def do_merge_dictionaries(self):
        single_file_name = self.file_map['dictionary-single']
        multiple_file_name = self.file_map['dictionary-multiple']
        output_file_name = self.file_map['dictionary']
        merge_dictionaries(single_file_name, multiple_file_name, output_file_name)

    def _do_validate(self, schema_file_name, xml_file_name):
        xml_schema = lxml.XMLSchema(lxml.parse(schema_file_name))
        xml_tree = lxml.parse(xml_file_name)
        xml_schema.assertValid(xml_tree)

    def do_validate_structures(self):
        schema_file_name = self.file_map['structure-schema']
        xml_file_name = self.file_map['structures-new']
        self._do_validate(schema_file_name, xml_file_name)

    def do_validate_dictionary(self):
        schema_file_name = self.file_map['dictionary-schema']
        xml_file_name = self.file_map['dictionary']
        self._do_validate(schema_file_name, xml_file_name)

    def export_file(self, file_name, file_key):
        shutil.copyfile(self.file_map[file_key], file_name)

    def cleanup(self):
        shutil.rmtree(self.tmp_directory, ignore_errors=True)
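

if __name__ == '__main__':
    # Minimal usage sketch. The paths and input file name are placeholders,
    # and the step order (up to the initial TEI output) is inferred from the
    # method definitions above; adjust both for a real run.
    resource_directory = '/path/to/resources'  # hypothetical resource directory
    nlp = create_nlp(resource_directory)
    pipeline = Pipeline(nlp, resource_directory)
    pipeline.import_file('strings.txt', 'strings-list')  # hypothetical input list
    pipeline.do_tokenise()
    pipeline.do_tweak_conllu()
    pipeline.do_parse()
    pipeline.do_translate_jos()
    pipeline.do_conllu_to_tei()
    pipeline.export_file('tei_initial.xml', 'tei-initial')  # hypothetical output path
    pipeline.cleanup()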