@ -1,9 +1,9 @@
import shutil
import tempfile
from types import SimpleNamespace
import lxml . etree as lxml
import classla
import cordex
from structure_assignment . constants import *
from structure_assignment . tweak_conllu import tweak as tweak_conllu
@ -15,19 +15,11 @@ from conversion_utils.tei_to_dictionary import convert as tei_to_dictionary
class Runner :
def __init__ ( self , nlp_needed , classla_directory = None , wani_file_name = None ):
def __init__ ( self , nlp_needed , classla_directory = None ):
self . classla_directory = classla_directory
if ( nlp_needed ) :
NLP_CONFIG_MAP [ ' dir ' ] = classla_directory
self . nlp = classla . Pipeline ( ' sl ' , * * NLP_CONFIG_MAP )
if ( wani_file_name is not None ) :
self . _provide_wani ( wani_file_name )
def _provide_wani ( self , wani_file_name ) : # TODO: remove once wani is incorporated into luscenje_struktur package
self . wani_directory = tempfile . mkdtemp ( )
shutil . copy ( wani_file_name , self . wani_directory )
import sys
sys . path . insert ( 0 , self . wani_directory )
def run_all ( self , input_file_name , output_file_name , input_structure_file_name , output_structure_file_name ) :
pipeline = Pipeline ( self . nlp )
@ -93,7 +85,6 @@ class Runner:
pipeline . do_tei_to_dictionary ( )
def cleanup(self, pipeline):
    """Remove the runner's temporary directory (if any) and clean up ``pipeline``.

    Args:
        pipeline: Object whose ``cleanup()`` method is called once the
            runner's own temporary directory has been dealt with.
    """
    # wani_directory is only assigned inside _provide_wani, so it may be
    # missing; looking it up directly would raise AttributeError and skip
    # pipeline.cleanup() altogether.
    wani_directory = getattr(self, 'wani_directory', None)
    if wani_directory is not None:
        # Second positional argument is ignore_errors: removal is best-effort.
        shutil.rmtree(wani_directory, True)
    pipeline.cleanup()
@ -149,38 +140,13 @@ class Pipeline:
def do_find_collocation_structure_units ( self ) :
# Runs collocation extraction using three file_map entries: the
# ' structures-old ' and ' tei-initial ' entries as inputs and the
# ' collocations ' entry as the output destination.
# NOTE(review): two extraction paths appear back to back in this span: a
# wani-based call and, at the end, a cordex-based call. Both use the same
# three file_map entries. This looks like the before/after of a migration
# from wani to cordex; confirm which one is meant to remain.
print ( ' Finding units for existing collocation structures ... ' )
# Imported inside the method rather than at module level; presumably because
# wani only becomes importable after Runner._provide_wani has put its
# directory on sys.path (TODO confirm).
from wani import main as wani_main
# wani_main receives a single object, so its options are set as attributes
# on a SimpleNamespace.
namespace = SimpleNamespace ( )
# relevant values
namespace . structures = self . file_map [ ' structures-old ' ]
namespace . input = [ self . file_map [ ' tei-initial ' ] ]
namespace . all = self . file_map [ ' collocations ' ]
namespace . skip_id_check = True
namespace . fixed_restriction_order = True
namespace . new_tei = True
# default values
# presumably these mirror the defaults of wani's own option parser; verify
# against wani before changing any of them.
namespace . sloleks_db = None
namespace . out = None
namespace . out_no_stat = None
namespace . stats = None
namespace . no_msd_translate = False
namespace . min_freq = 0
namespace . verbose = ' info '
namespace . count_files = False
namespace . multiple_output = False
namespace . load_sloleks = False
namespace . sort_by = - 1
namespace . sort_reversed = False
namespace . db = None
namespace . collocation_sentence_map_dest = None
namespace . new_db = False
namespace . pc_tag = ' pc '
namespace . separator = ' \t '
namespace . ignore_punctuations = False
wani_main ( namespace )
# cordex-based path: build an extractor from the structure file, apply it to
# the input file, then write the extraction result to the output file.
structure_file_name = self . file_map [ ' structures-old ' ]
input_file_name = self . file_map [ ' tei-initial ' ]
output_file_name = self . file_map [ ' collocations ' ]
extractor = cordex . Pipeline ( structure_file_name , fixed_restriction_order = True , statistics = False , collocation_sentence_map_dest = None )
extraction = extractor ( input_file_name )
extraction . write ( output_file_name , token_output = True )
def do_assign_collocation_structures ( self ) :
print ( ' Assigning ids of collocation structures ... ' )