import shutil
import codecs
import tempfile
from types import SimpleNamespace
import lxml . etree as lxml
import classla
import classla . models . parser as classla_manual
from structure_assignment . constants import *
from structure_assignment . tweak_conllu import tweak as tweak_conllu
from conversion_utils . translate_conllu_jos import translate as translate_jos
from conversion_utils . conllu_to_tei import convert_file as conllu_to_tei
from structure_assignment . assign_collocation_structures import assign as assign_collocation_structures
from structure_assignment . assign_other_structures import assign as assign_other_structures
from conversion_utils . tei_to_dictionary import convert as tei_to_dictionary
class Runner:
    """High-level driver for the structure-assignment pipeline.

    Each public method builds a fresh ``Pipeline`` scratch area, imports the
    inputs, runs the requested subset of steps, exports the results and then
    cleans up.
    """

    def __init__(self, nlp_needed, classla_directory=None, wani_file_name=None):
        """Prepare the runner.

        :param nlp_needed: when true, load the classla ``sl`` pipeline from
            ``classla_directory`` (required by the tokenise/parse steps).
        :param classla_directory: classla resource directory; also used by
            ``tagged_to_dictionary`` to locate the depparse/pretrain models.
        :param wani_file_name: optional path to the wani module, copied to a
            temporary directory and put on ``sys.path``.
        """
        self.classla_directory = classla_directory
        # Default both optional resources so that methods which read them
        # (e.g. cleanup(), which unconditionally removes wani_directory)
        # do not raise AttributeError when they were never initialised.
        self.nlp = None
        self.wani_directory = None
        if nlp_needed:
            NLP_CONFIG_MAP['dir'] = classla_directory
            self.nlp = classla.Pipeline('sl', **NLP_CONFIG_MAP)
        if wani_file_name is not None:
            self._provide_wani(wani_file_name)

    def _provide_wani(self, wani_file_name):  # TODO: remove once wani is incorporated into luscenje_struktur package
        """Copy the wani module into a temp directory and make it importable."""
        self.wani_directory = tempfile.mkdtemp()
        shutil.copy(wani_file_name, self.wani_directory)
        import sys
        sys.path.insert(0, self.wani_directory)

    def run_all(self, input_file_name, output_file_name, input_structure_file_name, output_structure_file_name):
        """Full run: strings -> parse -> dictionary, validating both outputs."""
        pipeline = Pipeline(self.nlp)
        pipeline.import_file(input_file_name, 'strings-list')
        pipeline.import_file(input_structure_file_name, 'structures-old')
        self._strings_to_parse_sequence(pipeline)
        self._parse_to_dictionary_sequence(pipeline)
        pipeline.do_validate_structures()
        pipeline.export_file(output_structure_file_name, 'structures-new')
        pipeline.do_validate_dictionary()
        pipeline.export_file(output_file_name, 'dictionary')
        self.cleanup(pipeline)

    def strings_to_dictionary(self, input_file_name, output_file_name, input_structure_file_name, output_structure_file_name):
        """Strings -> parse -> dictionary, without schema validation."""
        pipeline = Pipeline(self.nlp)
        pipeline.import_file(input_file_name, 'strings-list')
        pipeline.import_file(input_structure_file_name, 'structures-old')
        self._strings_to_parse_sequence(pipeline)
        self._parse_to_dictionary_sequence(pipeline)
        pipeline.export_file(output_file_name, 'dictionary')
        pipeline.export_file(output_structure_file_name, 'structures-new')
        self.cleanup(pipeline)

    def strings_to_parse(self, input_file_name, output_file_name):
        """Strings -> initial TEI parse only."""
        pipeline = Pipeline(self.nlp)
        pipeline.import_file(input_file_name, 'strings-list')
        self._strings_to_parse_sequence(pipeline)
        pipeline.export_file(output_file_name, 'tei-initial')
        self.cleanup(pipeline)

    def tagged_to_dictionary(self, strings_file_name, input_file_name, output_file_name, input_structure_file_name, output_structure_file_name):  # TODO: refactor/tidy
        """Merge a pre-tagged conllu with a classla tokenisation, reparse with
        the JOS dependency model, and build the dictionary.

        The merge takes columns 3 (UPOS), 5 (FEATS) and 9 (MISC) from the
        classla output and all other columns from the pre-tagged input.
        """
        # TODO: hard-coded /tmp paths (note the '.conlu' spelling) — kept as-is
        # to preserve behaviour; should move into the pipeline scratch dir.
        classla_conllu_file_name = '/tmp/classla.conlu'
        merged_conllu_file_name = '/tmp/merged.conlu'
        parsed_conllu_file_name = '/tmp/parsed.conlu'
        pipeline = Pipeline(self.nlp)
        pipeline.import_file(strings_file_name, 'strings-list')
        pipeline.do_tokenise()
        pipeline.do_tweak_conllu()
        pipeline.do_parse()
        pipeline.export_file(classla_conllu_file_name, 'classla-parsed')
        # Context managers guarantee the three files are closed even if an
        # assertion below fires (the original left them open on error).
        with codecs.open(classla_conllu_file_name, 'r') as classla_conllu_file, \
                codecs.open(input_file_name, 'r') as tagged_conllu_file, \
                codecs.open(merged_conllu_file_name, 'w') as merged_conllu_file:
            for (classla_line, tagged_line) in zip(classla_conllu_file, tagged_conllu_file):
                classla_line = classla_line.strip()
                tagged_line = tagged_line.strip()
                if ((len(classla_line) == 0 and len(tagged_line) == 0)
                        or (classla_line.startswith('#') and tagged_line.startswith('#'))):
                    merged_line = classla_line
                else:
                    classla_columns = classla_line.split('\t')
                    tagged_columns = tagged_line.split('\t')
                    # Bug fix: the original formatted the *other* file's line
                    # length into each message; show the offending line instead.
                    assert len(classla_columns) == 10, 'Missing token in classla-generated conllu ({}).'.format(classla_line)
                    assert len(tagged_columns) == 10, 'Missing token in pre-tagged conllu ({}).'.format(tagged_line)
                    # Bug fix: format arguments now follow the order named in
                    # the message (pre-tagged first, classla second).
                    assert classla_columns[1] == tagged_columns[1], 'Pre-tagged token form ({}) does not match classla-generated token form ({}).'.format(tagged_columns[1], classla_columns[1])
                    merged_columns = [classla_columns[i] if i in (3, 5, 9) else tagged_columns[i] for i in range(10)]
                    merged_line = '\t'.join(merged_columns)
                merged_conllu_file.write(merged_line + '\n')
        # Run the classla dependency parser manually on the merged file.
        classla_map = {
            'save_dir': self.classla_directory + '/sl/depparse',
            'save_name': 'standard_jos.pt',
            'eval_file': merged_conllu_file_name,
            'output_file': parsed_conllu_file_name,
            'gold_file': merged_conllu_file_name,
            'shorthand': 'sl_ssj',
            'mode': 'predict',
            'pretrain_file': self.classla_directory + '/sl/pretrain/standard.pt'
        }
        classla_arguments = []
        for (key, value) in classla_map.items():
            classla_arguments += ['--' + key, value]
        classla_manual.main(args=classla_arguments)
        pipeline.import_file(parsed_conllu_file_name, 'classla-parsed')
        pipeline.do_translate_jos()
        pipeline.do_conllu_to_tei()
        pipeline.import_file(input_structure_file_name, 'structures-old')
        self._parse_to_dictionary_sequence(pipeline)
        pipeline.export_file(output_file_name, 'dictionary')
        pipeline.export_file(output_structure_file_name, 'structures-new')
        self.cleanup(pipeline)

    def parse_to_dictionary(self, input_file_name, output_file_name, input_structure_file_name, output_structure_file_name):
        """Existing TEI parse -> dictionary (no NLP needed)."""
        pipeline = Pipeline()
        pipeline.import_file(input_file_name, 'tei-initial')
        pipeline.import_file(input_structure_file_name, 'structures-old')
        self._parse_to_dictionary_sequence(pipeline)
        pipeline.export_file(output_file_name, 'dictionary')
        pipeline.export_file(output_structure_file_name, 'structures-new')
        self.cleanup(pipeline)

    def validate_structures(self, input_file_name):
        """Validate a structures file against the structure schema."""
        pipeline = Pipeline()
        pipeline.import_file(input_file_name, 'structures-new')
        pipeline.do_validate_structures()
        self.cleanup(pipeline)

    def validate_dictionary(self, input_file_name):
        """Validate a dictionary file against the dictionary schema."""
        pipeline = Pipeline()
        pipeline.import_file(input_file_name, 'dictionary')
        pipeline.do_validate_dictionary()
        self.cleanup(pipeline)

    def _strings_to_parse_sequence(self, pipeline):
        """Step sequence: raw strings to initial TEI."""
        pipeline.do_tokenise()
        pipeline.do_tweak_conllu()
        pipeline.do_parse()
        pipeline.do_translate_jos()
        pipeline.do_conllu_to_tei()

    def _parse_to_dictionary_sequence(self, pipeline):
        """Step sequence: initial TEI to dictionary with structure ids."""
        pipeline.do_find_collocation_structure_units()
        pipeline.do_assign_collocation_structures()
        pipeline.do_assign_other_structures()
        pipeline.do_tei_to_dictionary()

    def cleanup(self, pipeline):
        """Remove the temporary wani directory (if any) and the pipeline scratch."""
        # Guard: wani_directory is None when no wani file was provided.
        if self.wani_directory is not None:
            shutil.rmtree(self.wani_directory, True)
        pipeline.cleanup()
class Pipeline:
    """One processing run over a private temporary scratch directory.

    Every intermediate artefact lives at a fixed key in ``file_map``; each
    ``do_*`` step reads and writes those slots, so steps can be chained in
    any order the caller needs.
    """

    def __init__(self, nlp=None):
        # nlp may be None for runs that never tokenise or parse.
        self.nlp = nlp
        self.tmp_directory = tempfile.mkdtemp()
        # Resolve every logical file key to a concrete path in the scratch dir.
        self.file_map = {key: self.tmp_directory + '/' + name for (key, name) in FILE_MAP.items()}

    def import_file(self, file_name, file_key):
        """Copy an external file into the scratch slot named by file_key."""
        shutil.copyfile(file_name, self.file_map[file_key])

    def do_tokenise(self):
        """Tokenise the input strings with obeliks (via classla)."""
        print('Tokenising with obeliks ...')
        source_path = self.file_map['strings-list']
        target_path = self.file_map['obeliks-tokenised']
        with open(source_path, 'r') as source:
            raw_text = source.read()
        tokeniser = classla.Pipeline('sl', processors='tokenize', dir=self.nlp.dir)
        tokenised_conllu = tokeniser(raw_text).to_conll()
        with open(target_path, 'w') as target:
            target.write(tokenised_conllu)

    def do_tweak_conllu(self):
        """Apply project-specific conllu adjustments before parsing."""
        print('Tweaking conllu ...')
        tweak_conllu(self.file_map['obeliks-tokenised'], self.file_map['obeliks-tweaked'])

    def do_parse(self):
        """Run the full classla pipeline on the tweaked conllu."""
        print('Parsing with classla ...')
        with open(self.file_map['obeliks-tweaked'], 'r') as source:
            conllu_text = source.read()
        document = self.nlp(conllu_text)
        with open(self.file_map['classla-parsed'], 'w') as target:
            target.write(document.to_conll())

    def do_translate_jos(self):
        """Translate JOS tags in the parsed conllu."""
        print('Translating JOS ...')
        translate_jos(self.file_map['classla-parsed'], self.file_map['classla-translated'])

    def do_conllu_to_tei(self):
        """Convert the translated conllu to initial TEI."""
        print('Converting to TEI ...')
        conllu_to_tei(self.file_map['classla-translated'], self.file_map['tei-initial'])

    def do_find_collocation_structure_units(self):
        """Locate units matching the existing collocation structures via wani."""
        print('Finding units for existing collocation structures ...')
        from wani import main as wani_main
        arguments = SimpleNamespace(
            # relevant values
            structures=self.file_map['structures-old'],
            input=[self.file_map['tei-initial']],
            all=self.file_map['collocations'],
            skip_id_check=True,
            fixed_restriction_order=True,
            new_tei=True,
            # default values
            sloleks_db=None,
            out=None,
            out_no_stat=None,
            stats=None,
            no_msd_translate=False,
            min_freq=0,
            verbose='info',
            count_files=False,
            multiple_output=False,
            load_sloleks=False,
            sort_by=-1,
            sort_reversed=False,
            db=None,
            collocation_sentence_map_dest=None,
            new_db=False,
            pc_tag='pc',
            separator='\t',
            ignore_punctuations=False,
        )
        wani_main(arguments)

    def do_assign_collocation_structures(self):
        """Write collocation structure ids into the TEI."""
        print('Assigning ids of collocation structures ...')
        assign_collocation_structures(
            self.file_map['tei-initial'],
            self.file_map['collocations'],
            self.file_map['tei-ids-collocation'])

    def do_assign_other_structures(self):
        """Assign ids for single/other structures, creating new ones as needed."""
        print('Assigning ids of single and other structures, creating if necessary ...')
        assign_other_structures(
            self.file_map['tei-ids-collocation'],
            self.file_map['structures-old'],
            self.file_map['tei-ids-all'],
            self.file_map['structures-new'])

    def do_tei_to_dictionary(self):
        """Convert the fully-annotated TEI into dictionary format."""
        print('Converting TEI to dictionary ...')
        tei_to_dictionary(self.file_map['tei-ids-all'], self.file_map['dictionary'])

    def _do_validate(self, schema_file_name, xml_file_name):
        """Validate an XML file against an XSD schema; raises on failure."""
        schema = lxml.XMLSchema(lxml.parse(schema_file_name))
        schema.assertValid(lxml.parse(xml_file_name))

    def do_validate_structures(self):
        """Validate the new structures file against the structure schema."""
        print('Validating structures ...')
        self._do_validate(self.file_map['structure-schema'], self.file_map['structures-new'])

    def do_validate_dictionary(self):
        """Validate the dictionary file against the dictionary schema."""
        print('Validating dictionary ...')
        self._do_validate(self.file_map['dictionary-schema'], self.file_map['dictionary'])

    def export_file(self, file_name, file_key):
        """Copy the scratch slot named by file_key out to an external path."""
        shutil.copyfile(self.file_map[file_key], file_name)

    def cleanup(self):
        """Delete the scratch directory, ignoring errors."""
        shutil.rmtree(self.tmp_directory, True)