# structure_assignment/scripts/pipeline1.py
#
# (repository viewer metadata: 72 lines, 2.5 KiB, Python)

import argparse
import codecs
import os
import re
import shutil
import subprocess

import classla
from classla import Document
from classla.models.common.conll import CoNLLFile

from constants import *
# Command-line interface: both file names are mandatory. Marking them as
# required makes argparse print a usage error up front instead of letting the
# pipeline start (and clear its working directory) with a None file name.
arg_parser = argparse.ArgumentParser(
    description='Parse Slovene strings and convert to TEI.'
)
arg_parser.add_argument(
    '-inlist', type=str, required=True, help='Input list file'
)
arg_parser.add_argument(
    '-outtei', type=str, required=True, help='Output TEI file'
)
arguments = arg_parser.parse_args()
# 2020-11-10 13:07:44 +00:00  (timestamp apparently left over from the repository viewer)
# File names supplied on the command line.
input_file_name = arguments.inlist
output_file_name = arguments.outtei

# Keyword arguments passed to classla.Pipeline in run_classla.
NLP_CONFIG_MAP = {
    'treebank': 'sl_ssj_jos',
    'processors': 'tokenize,pos,lemma,depparse',
    # run_classla feeds the pipeline an existing CoNLL file rather than raw
    # text, so classla's own tokenization is presumably meant to be bypassed.
    'tokenize_pretokenized': True,
    'models_dir': CLASSLA_MODELS_DIRECTORY,
}

# Prefix that fix_xml_ids puts in front of numeric sentence ids.
XML_ID_PREFIX = 's'
def run_pipeline(input_file_name, output_file_name):
    """Run every pipeline stage, from the input string list to the TEI output.

    The input file is copied into a freshly recreated TMP_DIRECTORY, passed
    through the stages in order (each one reads the file written by the
    previous one), and the final TEI file is copied to the requested output.

    Args:
        input_file_name: Path of the input list file.
        output_file_name: Path the resulting TEI file is copied to.
    """
    # Recreate the working directory; errors while removing it are ignored.
    shutil.rmtree(TMP_DIRECTORY, ignore_errors=True)
    os.makedirs(TMP_DIRECTORY, exist_ok=True)
    shutil.copyfile(input_file_name, STRING_LIST_FILE_NAME)

    # (stage function, file it reads, file it writes), in execution order.
    stages = (
        (run_obeliks4J, STRING_LIST_FILE_NAME, OBELIKS_RAW_FILE_NAME),
        (fix_xml_ids, OBELIKS_RAW_FILE_NAME, OBELIKS_TWEAKED_FILE_NAME),
        (run_classla, OBELIKS_TWEAKED_FILE_NAME, CLASSLA_FILE_NAME),
        (run_tei_conversion, CLASSLA_FILE_NAME, TEI_INIT_FILE_NAME),
    )
    for stage, source_name, target_name in stages:
        stage(source_name, target_name)

    shutil.copyfile(TEI_INIT_FILE_NAME, output_file_name)
def run_obeliks4J(obeliks_file_name, classla_file_name):
    """Run the obeliks4J jar on an input file and write its output.

    Despite their names (kept for interface compatibility), the parameters
    are the input and output of this step.

    Args:
        obeliks_file_name: Path of the file passed to obeliks via ``-if``.
        classla_file_name: Path of the file obeliks writes via ``-o``.

    Raises:
        subprocess.CalledProcessError: If the java process exits with a
            non-zero status.
    """
    print('Running obeliks ...')
    # An argument list (no shell) keeps paths containing spaces or shell
    # metacharacters intact; check=True stops the pipeline when obeliks fails
    # instead of letting later stages run on a missing or stale file.
    obeliks_command = [
        'java',
        '-jar',
        OBELIKS_JAR_FILE_NAME,
        '-d',
        '-if',
        obeliks_file_name,
        '-o',
        classla_file_name,
    ]
    subprocess.run(obeliks_command, check=True)
def fix_xml_ids(input_file_name, output_file_name):
    """Copy a file, prefixing numeric sentence ids with XML_ID_PREFIX.

    A line of the exact form ``# sent_id = <digits>.<digits>`` is rewritten
    as ``# sent_id = <XML_ID_PREFIX><digits>.<digits>``; every other line is
    copied unchanged. Presumably needed because the ids end up as XML ids,
    which may not start with a digit -- confirm against the TEI converter.

    Args:
        input_file_name: Path of the file to read.
        output_file_name: Path of the file to write.
    """
    print('Fixing xml ids ...')
    sent_id_pattern = re.compile(r'^(# sent_id = )(\d+\.\d+)$')
    # Explicit UTF-8 so non-ASCII (e.g. Slovene) characters survive regardless
    # of the platform's default encoding; ``with`` closes both files even if
    # reading or writing raises.
    with open(input_file_name, 'r', encoding='utf-8') as input_file, \
            open(output_file_name, 'w', encoding='utf-8') as output_file:
        for line in input_file:
            match = sent_id_pattern.search(line)
            if match:
                # ``$`` matched before the line's newline, so add it back.
                line = match.group(1) + XML_ID_PREFIX + match.group(2) + '\n'
            output_file.write(line)
def run_classla(obeliks_file_name, classla_file_name):
    """Process a CoNLL file with the classla pipeline and write the result.

    The pipeline is configured by NLP_CONFIG_MAP.

    Args:
        obeliks_file_name: Path of the CoNLL file to load.
        classla_file_name: Path the processed CoNLL output is written to.
    """
    print('Running classla ...')
    # Start from an empty document and attach the existing CoNLL file to it.
    document = Document(text=None)
    document.conll_file = CoNLLFile(filename=obeliks_file_name)
    pipeline = classla.Pipeline('sl', **NLP_CONFIG_MAP)
    annotated = pipeline(document)
    annotated.conll_file.write_conll(classla_file_name)
def run_tei_conversion(classla_file_name, tei_file_name):
    """Convert a CoNLL file to TEI by running the external conversion script.

    Args:
        classla_file_name: Path of the CoNLL file passed to the script.
        tei_file_name: Path of the TEI file the script is asked to write.

    Raises:
        subprocess.CalledProcessError: If the conversion script exits with a
            non-zero status.
    """
    print('Converting to tei ...')
    # An argument list (no shell) keeps paths containing spaces intact, and
    # check=True surfaces a failed conversion instead of silently continuing.
    # The interpreter is still looked up as 'python', as before.
    convert_command = [
        'python',
        CONLLU_TEI_SCRIPT_NAME,
        classla_file_name,
        tei_file_name,
        '--translate',
        TRANSLATION_FILE_NAME,
    ]
    subprocess.run(convert_command, check=True)
# Run the pipeline only when this file is executed as a script, so that
# importing the module does not start a run as a side effect.
if __name__ == '__main__':
    run_pipeline(input_file_name, output_file_name)