@ -1,9 +1,11 @@
import shutil
import codecs
import tempfile
import lxml . etree as lxml
import classla
import cordex
import classla . models . parser as classla_manual
from structure_assignment . constants import *
from structure_assignment . tweak_conllu import tweak as tweak_conllu
@ -50,6 +52,65 @@ class Runner:
pipeline . export_file ( output_file_name , ' tei-initial ' )
self . cleanup ( pipeline )
def tagged_to_dictionary(self, strings_file_name, input_file_name, output_file_name, input_structure_file_name, output_structure_file_name):  # TODO: refactor/tidy
    """Build dictionary output from input that already carries manual tags.

    Steps, in order:

    1. ``strings_file_name`` is imported as ``'strings-list'``, tokenised,
       tweaked and parsed by the pipeline; the result is exported as
       ``'classla-parsed'`` CoNLL-U.
    2. That export is merged line by line with the pre-tagged CoNLL-U in
       ``input_file_name``: columns 3, 5 and 9 (zero-based) come from the
       classla output, every other column from the pre-tagged file.
    3. The classla dependency parser is run in ``predict`` mode on the
       merged file.
    4. The parsed file is imported back into the pipeline, translated,
       converted to TEI and passed through
       ``_parse_to_dictionary_sequence`` together with the structures
       imported from ``input_structure_file_name``.

    Args:
        strings_file_name: File imported as ``'strings-list'``.
        input_file_name: Pre-tagged CoNLL-U file, expected to align line
            for line with the classla-generated CoNLL-U.
        output_file_name: Destination of the ``'dictionary'`` export.
        input_structure_file_name: File imported as ``'structures-old'``.
        output_structure_file_name: Destination of the
            ``'structures-new'`` export.

    Raises:
        AssertionError: If the two CoNLL-U files do not align (different
            length, a token line without 10 columns, or differing token
            forms).
    """
    import itertools
    import os

    # Zero-based CoNLL-U columns taken from the classla output; in the
    # standard column order these are UPOS, FEATS and MISC.
    classla_column_indices = (3, 5, 9)

    # A private scratch directory replaces the former fixed /tmp paths, so
    # concurrent runs cannot overwrite each other's intermediate files and
    # nothing is left behind afterwards.
    with tempfile.TemporaryDirectory() as work_directory:
        classla_conllu_file_name = os.path.join(work_directory, 'classla.conlu')
        merged_conllu_file_name = os.path.join(work_directory, 'merged.conlu')
        parsed_conllu_file_name = os.path.join(work_directory, 'parsed.conlu')

        pipeline = Pipeline(self.nlp)
        pipeline.import_file(strings_file_name, 'strings-list')
        pipeline.do_tokenise()
        pipeline.do_tweak_conllu()
        pipeline.do_parse()
        pipeline.export_file(classla_conllu_file_name, 'classla-parsed')

        # CoNLL-U is UTF-8 by specification; being explicit avoids decoding
        # with whatever the locale default happens to be.
        with open(classla_conllu_file_name, 'r', encoding='utf-8') as classla_conllu_file, \
                open(input_file_name, 'r', encoding='utf-8') as tagged_conllu_file, \
                open(merged_conllu_file_name, 'w', encoding='utf-8') as merged_conllu_file:
            # zip_longest pads the shorter file with empty lines, so a length
            # mismatch trips the column-count assertions below instead of
            # silently truncating the merged output.
            line_pairs = itertools.zip_longest(classla_conllu_file, tagged_conllu_file, fillvalue='')
            for (classla_line, tagged_line) in line_pairs:
                classla_line = classla_line.strip()
                tagged_line = tagged_line.strip()
                both_blank = not classla_line and not tagged_line
                both_comments = classla_line.startswith('#') and tagged_line.startswith('#')
                if both_blank or both_comments:
                    # Sentence separators and comment lines are taken from classla.
                    merged_line = classla_line
                else:
                    classla_columns = classla_line.split('\t')
                    tagged_columns = tagged_line.split('\t')
                    # Each message shows the counterpart line that has no match.
                    assert len(classla_columns) == 10, 'Missing token in classla-generated conllu ( {} ).'.format(tagged_line)
                    assert len(tagged_columns) == 10, 'Missing token in pre-tagged conllu ( {} ).'.format(classla_line)
                    # Arguments follow the order the message names them in:
                    # pre-tagged form first, classla-generated form second.
                    assert classla_columns[1] == tagged_columns[1], 'Pre-tagged token form ( {} ) does not match classla-generated token form ( {} ).'.format(tagged_columns[1], classla_columns[1])
                    merged_columns = list(tagged_columns)
                    for column_index in classla_column_indices:
                        merged_columns[column_index] = classla_columns[column_index]
                    merged_line = '\t'.join(merged_columns)
                merged_conllu_file.write(merged_line + '\n')

        classla_map = {
            'save_dir': self.classla_directory + '/sl/depparse',
            'save_name': 'standard_jos.pt',
            'eval_file': merged_conllu_file_name,
            'output_file': parsed_conllu_file_name,
            'gold_file': merged_conllu_file_name,
            'shorthand': 'sl_ssj',
            'mode': 'predict',
            'pretrain_file': self.classla_directory + '/sl/pretrain/standard.pt',
        }
        # Flatten the mapping into a command-line style list: --key value ...
        classla_arguments = []
        for (key, value) in classla_map.items():
            classla_arguments.extend(['--' + key, value])
        classla_manual.main(args=classla_arguments)

        # Everything that reads the intermediate files stays inside the
        # ``with`` block, so they still exist while the pipeline uses them.
        pipeline.import_file(parsed_conllu_file_name, 'classla-parsed')
        pipeline.do_translate_jos()
        pipeline.do_conllu_to_tei()
        pipeline.import_file(input_structure_file_name, 'structures-old')
        self._parse_to_dictionary_sequence(pipeline)
        pipeline.export_file(output_file_name, 'dictionary')
        pipeline.export_file(output_structure_file_name, 'structures-new')
        self.cleanup(pipeline)
def parse_to_dictionary ( self , input_file_name , output_file_name , input_structure_file_name , output_structure_file_name ) :
pipeline = Pipeline ( )
pipeline . import_file ( input_file_name , ' tei-initial ' )