@ -1,4 +1,3 @@
import os
import shutil
import tempfile
from types import SimpleNamespace
@ -16,14 +15,22 @@ from conversion_utils.tei_to_dictionary import convert as tei_to_dictionary
class Runner :
def __init__(self, classla_directory, nlp_needed, wani=None):
    """Set up the runner.

    classla_directory: directory holding the classla models; stored on the
        instance and passed to the classla pipeline via NLP_CONFIG_MAP['dir'].
    nlp_needed: when truthy, instantiate the Slovene ('sl') classla pipeline.
    wani: optional path to the wani script file; when given, it is staged on
        sys.path via _provide_wani().  # NOTE(review): wani semantics assumed
        # from _provide_wani below — confirm against callers.
    """
    self.classla_directory = classla_directory
    # Default to None so attribute access stays safe when no NLP pipeline
    # was requested (callers pass self.nlp on to Pipeline, whose nlp
    # parameter also defaults to None).
    self.nlp = None
    if nlp_needed:
        NLP_CONFIG_MAP['dir'] = classla_directory
        self.nlp = classla.Pipeline('sl', **NLP_CONFIG_MAP)
    if wani is not None:
        self._provide_wani(wani)
def _provide_wani(self, wani_file_name):
    """Copy the wani script into a fresh temp directory and prepend that
    directory to sys.path so the script becomes importable by module name.

    TODO: remove once wani is incorporated into the luscenje_struktur package.
    """
    self.wani_directory = tempfile.mkdtemp()
    shutil.copy(wani_file_name, self.wani_directory)
    import sys
    sys.path.insert(0, self.wani_directory)
def run_all ( self , input_file_name , output_file_name , input_structure_file_name , output_structure_file_name ) :
pipeline = Pipeline ( self . resource_directory , self . nlp )
pipeline = Pipeline ( self . nlp)
pipeline . import_file ( input_file_name , ' strings-list ' )
pipeline . import_file ( input_structure_file_name , ' structures-old ' )
self . _strings_to_parse_sequence ( pipeline )
@ -35,7 +42,7 @@ class Runner:
pipeline . cleanup ( )
def strings_to_dictionary ( self , input_file_name , output_file_name , input_structure_file_name , output_structure_file_name ) :
pipeline = Pipeline ( self . resource_directory, self . nlp)
pipeline = Pipeline ( self . nlp)
pipeline . import_file ( input_file_name , ' strings-list ' )
pipeline . import_file ( input_structure_file_name , ' structures-old ' )
self . _strings_to_parse_sequence ( pipeline )
@ -45,14 +52,14 @@ class Runner:
pipeline . cleanup ( )
def strings_to_parse(self, input_file_name, output_file_name):
    """Parse a strings-list input file and export the result as initial TEI.

    input_file_name: path of the 'strings-list' input.
    output_file_name: path where the 'tei-initial' output is written.
    """
    pipeline = Pipeline(self.nlp)
    # finally-block guarantees the pipeline's temp directory is removed
    # even when parsing or export fails.
    try:
        pipeline.import_file(input_file_name, 'strings-list')
        self._strings_to_parse_sequence(pipeline)
        pipeline.export_file(output_file_name, 'tei-initial')
    finally:
        pipeline.cleanup()
def parse_to_dictionary ( self , input_file_name , output_file_name , input_structure_file_name , output_structure_file_name ) :
pipeline = Pipeline ( self . resource_directory )
pipeline = Pipeline ( )
pipeline . import_file ( input_file_name , ' tei-initial ' )
pipeline . import_file ( input_structure_file_name , ' structures-old ' )
self . _parse_to_dictionary_sequence ( pipeline )
@ -61,13 +68,13 @@ class Runner:
pipeline . cleanup ( )
def validate_structures(self, input_file_name):
    """Validate a 'structures-new' file using a throwaway pipeline.

    No NLP pipeline is needed for validation, so Pipeline() is built bare.
    """
    pipeline = Pipeline()
    try:
        pipeline.import_file(input_file_name, 'structures-new')
        pipeline.do_validate_structures()
    finally:
        # Always remove the pipeline's temp directory, even on failure.
        pipeline.cleanup()
def validate_dictionary(self, input_file_name):
    """Validate a 'dictionary' file using a throwaway pipeline.

    No NLP pipeline is needed for validation, so Pipeline() is built bare.
    """
    pipeline = Pipeline()
    try:
        pipeline.import_file(input_file_name, 'dictionary')
        pipeline.do_validate_dictionary()
    finally:
        # Always remove the pipeline's temp directory, even on failure.
        pipeline.cleanup()
@ -87,17 +94,10 @@ class Runner:
class Pipeline :
def __init__(self, nlp=None):
    """Create a pipeline that works inside its own temporary directory.

    nlp: optional pre-built classla pipeline shared by the caller
        (Runner); None when no NLP processing is required.
    """
    self.nlp = nlp
    self.tmp_directory = tempfile.mkdtemp()
    # Map each logical file key to a concrete path inside the temp
    # directory; import_file/export_file address files through this map.
    self.file_map = {key: self.tmp_directory + '/' + name
                     for key, name in FILE_MAP.items()}
def import_file(self, file_name, file_key):
    """Copy an external input file into the temp workspace.

    file_key: logical key looked up in self.file_map to obtain the
        destination path the rest of the pipeline expects.
    """
    destination = self.file_map[file_key]
    shutil.copyfile(file_name, destination)
@ -108,7 +108,7 @@ class Pipeline:
output_file_name = self . file_map [ ' obeliks-tokenised ' ]
with open ( input_file_name , ' r ' ) as input_file :
input_conllu = input_file . read ( )
tokeniser = classla . Pipeline ( ' sl ' , processors = ' tokenize ' , dir = self . classla_directory )
tokeniser = classla . Pipeline ( ' sl ' , processors = ' tokenize ' , dir = self . nlp. dir )
output_conllu = tokeniser ( input_conllu ) . to_conll ( )
with open ( output_file_name , ' w ' ) as output_file :
output_file . write ( output_conllu )
@ -220,3 +220,4 @@ class Pipeline:
def cleanup(self):
    """Remove the pipeline's temporary directories, ignoring errors.

    wani_directory is only set when a wani script was staged; guard the
    attribute so cleanup never raises AttributeError for ordinary
    pipelines that never staged one.
    """
    shutil.rmtree(self.tmp_directory, True)
    wani_directory = getattr(self, 'wani_directory', None)
    if wani_directory is not None:
        shutil.rmtree(wani_directory, True)