# structure_assignment/scripts/pipeline.py

import codecs
import os
import re
import shutil
import subprocess
import sys

import classla
import lxml.etree as lxml
from classla import Document
from classla.models.common.conll import CoNLLFile
# Command-line arguments: first the input file path, then the output file path.
# They are read at module load time, so loading this module with fewer than
# two arguments raises IndexError.
input_file_name = sys.argv[1]
output_file_name = sys.argv[2]
# Scratch directory for all intermediate files; run_pipeline deletes and
# recreates it at the start of every run.
TMP_DIRECTORY = '../tmp/structure_assignment'

# Helper scripts launched as child processes with "python <script> ...".
CONLLU_TEI_SCRIPT_NAME = 'conllu_to_xml.py'
MWE_EXTRACTION_SCRIPT_NAME = 'wani.py'
STRUCTURE_ASSIGNMENT_SCRIPT_NAME = 'assign_structures.py'
STRUCTURE_CREATION_SCRIPT_NAME = 'create_structures.py'
TEI_DICTIONARY_SCRIPT_NAME = 'tei_to_dictionary.py'

# Static resources read by the pipeline stages.
OBELIKS_JAR_FILE_NAME = '../resources/obeliks.jar'
TRANSLATION_FILE_NAME = '../resources/dict.xml'
CLASSLA_MODELS_DIRECTORY = '../resources/classla'
STRUCTURE_CURRENT_FILE_NAME = '../resources/structures.xml'
STRUCTURE_SCHEMA_FILE_NAME = '../resources/structures.xsd'
DICTIONARY_SCHEMA_FILE_NAME = '../resources/monolingual_dictionaries.xsd'

# Intermediate files, listed in the order the pipeline produces them.
STRING_LIST_FILE_NAME = TMP_DIRECTORY + '/strings.txt'
OBELIKS_RAW_FILE_NAME = TMP_DIRECTORY + '/obeliks_raw.conllu'
OBELIKS_TWEAKED_FILE_NAME = TMP_DIRECTORY + '/obeliks_tweaked.conllu'
CLASSLA_FILE_NAME = TMP_DIRECTORY + '/classla.conllu'
TEI_INIT_FILE_NAME = TMP_DIRECTORY + '/tei_initial.xml'
TEI_STRUCTURE_1_FILE_NAME = TMP_DIRECTORY + '/tei_with_structure_ids1.xml'
TEI_STRUCTURE_2_FILE_NAME = TMP_DIRECTORY + '/tei_with_structure_ids2.xml'
MWE_CSV_1_FILE_NAME = TMP_DIRECTORY + '/mwes1.csv'
MWE_CSV_2_FILE_NAME = TMP_DIRECTORY + '/mwes2.csv'
STRUCTURE_NEW_FILE_NAME = TMP_DIRECTORY + '/structures_new.xml'
DICTIONARY_XML_FILE_NAME = TMP_DIRECTORY + '/dictionary.xml'

# Keyword arguments forwarded to classla.Pipeline in run_classla.
NLP_CONFIG_MAP = {
    'treebank': 'sl_ssj_jos',
    'processors': 'tokenize,pos,lemma,depparse',
    'tokenize_pretokenized': True,
    'models_dir': CLASSLA_MODELS_DIRECTORY,
}

# Prefix that fix_xml_ids puts in front of numeric "sent_id" values.
XML_ID_PREFIX = 's'
def run_pipeline(input_file_name, output_file_name):
    """Run every pipeline stage in sequence and copy the result to the output path.

    All intermediate files are written under TMP_DIRECTORY, which is emptied
    first. MWE extraction and structure assignment are run twice: once against
    the current structure specifications and once against the newly created
    ones.

    Args:
        input_file_name: Path of the input file; it is copied to
            STRING_LIST_FILE_NAME before processing starts.
        output_file_name: Path that receives a copy of the final dictionary XML.
    """

    def extract_and_assign(structure_file_name, mwe_csv_file_name, tei_output_file_name):
        # One extraction + assignment pass over the initial TEI file.
        run_mwe_extraction(structure_file_name, TEI_INIT_FILE_NAME, mwe_csv_file_name)
        run_structure_assignment(
            STRING_LIST_FILE_NAME,
            TEI_INIT_FILE_NAME,
            mwe_csv_file_name,
            tei_output_file_name,
        )

    # Start from an empty scratch directory; a missing directory is not an error.
    shutil.rmtree(TMP_DIRECTORY, ignore_errors=True)
    os.makedirs(TMP_DIRECTORY, exist_ok=True)
    shutil.copyfile(input_file_name, STRING_LIST_FILE_NAME)

    # Obeliks -> sentence-id fix -> classla -> TEI.
    run_obeliks4J(STRING_LIST_FILE_NAME, OBELIKS_RAW_FILE_NAME)
    fix_xml_ids(OBELIKS_RAW_FILE_NAME, OBELIKS_TWEAKED_FILE_NAME)
    run_classla(OBELIKS_TWEAKED_FILE_NAME, CLASSLA_FILE_NAME)
    run_tei_conversion(CLASSLA_FILE_NAME, TEI_INIT_FILE_NAME)

    # First pass uses the existing structures, then missing ones are created
    # and the extended specification is validated.
    extract_and_assign(STRUCTURE_CURRENT_FILE_NAME, MWE_CSV_1_FILE_NAME, TEI_STRUCTURE_1_FILE_NAME)
    run_structure_creation(STRUCTURE_CURRENT_FILE_NAME, TEI_STRUCTURE_1_FILE_NAME, STRUCTURE_NEW_FILE_NAME)
    validate_structures(STRUCTURE_NEW_FILE_NAME)

    # Second pass repeats extraction and assignment with the new structures.
    extract_and_assign(STRUCTURE_NEW_FILE_NAME, MWE_CSV_2_FILE_NAME, TEI_STRUCTURE_2_FILE_NAME)

    # Convert to the dictionary format, validate it and deliver the result.
    run_dictionary_conversion(TEI_STRUCTURE_2_FILE_NAME, DICTIONARY_XML_FILE_NAME)
    validate_dictionary(DICTIONARY_XML_FILE_NAME)
    shutil.copyfile(DICTIONARY_XML_FILE_NAME, output_file_name)
def run_obeliks4J(obeliks_file_name, classla_file_name):
    """Run the Obeliks jar on a file and have it write its result to another file.

    The function previously ignored both parameters and always used
    STRING_LIST_FILE_NAME and OBELIKS_RAW_FILE_NAME; it now honours the
    arguments (the existing caller passes exactly those two constants).

    NOTE: the parameter names are misleading but kept for backward
    compatibility. ``obeliks_file_name`` is the file handed to Obeliks and
    ``classla_file_name`` is the file Obeliks writes.

    Args:
        obeliks_file_name: Path passed to Obeliks after ``-if``.
        classla_file_name: Path passed to Obeliks after ``-o``.

    Raises:
        subprocess.CalledProcessError: If the java process exits with a
            non-zero status.
        OSError: If the ``java`` executable cannot be started.
    """
    print('Running obeliks ...')
    # An argument list (no shell) keeps paths with spaces or shell
    # metacharacters intact.
    obeliks_command = [
        'java', '-jar', OBELIKS_JAR_FILE_NAME,
        '-d',
        '-if', obeliks_file_name,
        '-o', classla_file_name,
    ]
    # check=True surfaces a failed run here instead of in a later stage.
    subprocess.run(obeliks_command, check=True)
def fix_xml_ids(input_file_name, output_file_name):
    """Copy a file line by line, prefixing numeric ``sent_id`` values.

    A line of the exact form ``# sent_id = <digits>.<digits>`` is rewritten so
    that XML_ID_PREFIX precedes the id; every other line is copied unchanged.

    Both files are handled as UTF-8 (the encoding of CoNLL-U files) rather
    than the platform default, and are closed even if an error occurs.

    Args:
        input_file_name: Path of the file to read.
        output_file_name: Path of the file to write (overwritten if present).
    """
    print('Fixing xml ids ...')
    # Compiled once, outside the per-line loop.
    sent_id_pattern = re.compile(r'^(# sent_id = )(\d+\.\d+)$')
    with open(input_file_name, 'r', encoding='utf-8') as input_file, \
            open(output_file_name, 'w', encoding='utf-8') as output_file:
        for line in input_file:
            match = sent_id_pattern.search(line)
            if match:
                # "$" matches before the trailing newline, so the groups do
                # not contain it and it has to be added back.
                line = match.group(1) + XML_ID_PREFIX + match.group(2) + '\n'
            output_file.write(line)
def run_classla(obeliks_file_name, classla_file_name):
    """Process a CoNLL file with the classla pipeline and write the result.

    Args:
        obeliks_file_name: Path of the CoNLL file loaded into the document.
        classla_file_name: Path the processed CoNLL output is written to.
    """
    print('Running classla ...')
    # Empty document whose content comes from the CoNLL file instead of raw text.
    document = Document(text=None)
    document.conll_file = CoNLLFile(filename=obeliks_file_name)
    # Pipeline settings (treebank, processors, models directory) come from
    # NLP_CONFIG_MAP.
    pipeline = classla.Pipeline('sl', **NLP_CONFIG_MAP)
    pipeline(document).conll_file.write_conll(classla_file_name)
def run_tei_conversion(classla_file_name, tei_file_name):
    """Run the CoNLL-U-to-TEI conversion script as a child process.

    Args:
        classla_file_name: Path passed to the script as its first argument.
        tei_file_name: Path passed to the script as its second argument.

    Raises:
        subprocess.CalledProcessError: If the script exits with a non-zero
            status.
        OSError: If the ``python`` executable cannot be started.
    """
    print('Converting to tei ...')
    # An argument list (no shell) keeps paths with spaces or shell
    # metacharacters intact.
    convert_command = [
        'python', CONLLU_TEI_SCRIPT_NAME,
        classla_file_name,
        tei_file_name,
        '--translate', TRANSLATION_FILE_NAME,
    ]
    # check=True surfaces a failed conversion here instead of in a later stage.
    subprocess.run(convert_command, check=True)
def run_mwe_extraction(structure_file_name, tei_file_name, mwe_csv_file_name):
    """Run the MWE extraction script as a child process.

    The command line is printed before it is executed.

    Args:
        structure_file_name: Path passed to the script as its first argument.
        tei_file_name: Path passed to the script as its second argument.
        mwe_csv_file_name: Path passed to the script after ``--all``.

    Raises:
        subprocess.CalledProcessError: If the script exits with a non-zero
            status.
        OSError: If the ``python`` executable cannot be started.
    """
    print('Extracting MWEs from tei ...')
    # An argument list (no shell) keeps paths with spaces or shell
    # metacharacters intact.
    extraction_command = [
        'python', MWE_EXTRACTION_SCRIPT_NAME,
        structure_file_name,
        tei_file_name,
        '--all', mwe_csv_file_name,
        '--skip-id-check',
        '--fixed-restriction-order',
    ]
    print(' '.join(extraction_command))
    # check=True surfaces a failed extraction here instead of in a later stage.
    subprocess.run(extraction_command, check=True)
def validate_structures(structure_file_name):
    """Validate a structure specification file against STRUCTURE_SCHEMA_FILE_NAME.

    Args:
        structure_file_name: Path of the XML file to validate.

    Raises:
        Whatever ``XMLSchema.assertValid`` raises when the document does not
        conform to the schema, plus any lxml parsing error.
    """
    print('Validating updated structure specifications ...')
    schema_tree = lxml.parse(STRUCTURE_SCHEMA_FILE_NAME)
    validator = lxml.XMLSchema(schema_tree)
    validator.assertValid(lxml.parse(structure_file_name))
def run_structure_assignment(input_file_name, tei_file_name, mwe_csv_file_name, output_file_name):
    """Run the structure-assignment script as a child process.

    The four paths are passed to the script positionally, in parameter order.

    Args:
        input_file_name: First script argument.
        tei_file_name: Second script argument.
        mwe_csv_file_name: Third script argument.
        output_file_name: Fourth script argument.

    Raises:
        subprocess.CalledProcessError: If the script exits with a non-zero
            status.
        OSError: If the ``python`` executable cannot be started.
    """
    print('Assigning structure ids ...')
    # An argument list (no shell) keeps paths with spaces or shell
    # metacharacters intact.
    assignment_command = [
        'python', STRUCTURE_ASSIGNMENT_SCRIPT_NAME,
        input_file_name,
        tei_file_name,
        mwe_csv_file_name,
        output_file_name,
    ]
    # check=True surfaces a failed assignment here instead of in a later stage.
    subprocess.run(assignment_command, check=True)
def run_structure_creation(input_file_name, tei_file_name, output_file_name):
    """Run the structure-creation script as a child process.

    Args:
        input_file_name: Path passed to the script after ``-infile``.
        tei_file_name: Path passed to the script after ``-tei``.
        output_file_name: Path passed to the script after ``-outfile``.

    Raises:
        subprocess.CalledProcessError: If the script exits with a non-zero
            status.
        OSError: If the ``python`` executable cannot be started.
    """
    print('Creating missing structures ...')
    # An argument list (no shell) keeps paths with spaces or shell
    # metacharacters intact.
    creation_command = [
        'python', STRUCTURE_CREATION_SCRIPT_NAME,
        '-infile', input_file_name,
        '-tei', tei_file_name,
        '-outfile', output_file_name,
    ]
    # check=True surfaces a failed run here instead of in a later stage.
    subprocess.run(creation_command, check=True)
def run_dictionary_conversion(tei_file_name, xml_file_name):
    """Run the TEI-to-dictionary conversion script as a child process.

    Args:
        tei_file_name: Path passed to the script after ``-infile``.
        xml_file_name: Path passed to the script after ``-outfile``.

    Raises:
        subprocess.CalledProcessError: If the script exits with a non-zero
            status.
        OSError: If the ``python`` executable cannot be started.
    """
    print('Converting to dictionary xml format ...')
    # An argument list (no shell) keeps paths with spaces or shell
    # metacharacters intact.
    convert_command = [
        'python', TEI_DICTIONARY_SCRIPT_NAME,
        '-infile', tei_file_name,
        '-outfile', xml_file_name,
    ]
    # check=True surfaces a failed conversion here instead of in a later stage.
    subprocess.run(convert_command, check=True)
def validate_dictionary(dictionary_file_name):
    """Validate a dictionary XML file against DICTIONARY_SCHEMA_FILE_NAME.

    Args:
        dictionary_file_name: Path of the XML file to validate.

    Raises:
        Whatever ``XMLSchema.assertValid`` raises when the document does not
        conform to the schema, plus any lxml parsing error.
    """
    print('Validating output dictionary file ...')
    schema_tree = lxml.parse(DICTIONARY_SCHEMA_FILE_NAME)
    validator = lxml.XMLSchema(schema_tree)
    validator.assertValid(lxml.parse(dictionary_file_name))
# Script entry point: run the whole pipeline with the two paths taken from the
# command line. This executes whenever the module is loaded.
run_pipeline(input_file_name, output_file_name)