Redmine #1487: added support for tagged_to_dictionary

This commit is contained in:
Cyprian Laskowski 2022-09-28 17:24:44 +02:00
parent fad88a992b
commit 8d453eb20b

View File

@ -1,9 +1,11 @@
import shutil import shutil
import codecs
import tempfile import tempfile
from types import SimpleNamespace from types import SimpleNamespace
import lxml.etree as lxml import lxml.etree as lxml
import classla import classla
import classla.models.parser as classla_manual
from structure_assignment.constants import * from structure_assignment.constants import *
from structure_assignment.tweak_conllu import tweak as tweak_conllu from structure_assignment.tweak_conllu import tweak as tweak_conllu
@ -58,6 +60,65 @@ class Runner:
pipeline.export_file(output_file_name, 'tei-initial') pipeline.export_file(output_file_name, 'tei-initial')
self.cleanup(pipeline) self.cleanup(pipeline)
def tagged_to_dictionary(self, strings_file_name, input_file_name, output_file_name, input_structure_file_name, output_structure_file_name): # TODO: refactor/tidy
classla_conllu_file_name = '/tmp/classla.conlu'
merged_conllu_file_name = '/tmp/merged.conlu'
parsed_conllu_file_name = '/tmp/parsed.conlu'
pipeline = Pipeline(self.nlp)
pipeline.import_file(strings_file_name, 'strings-list')
pipeline.do_tokenise()
pipeline.do_tweak_conllu()
pipeline.do_parse()
pipeline.export_file(classla_conllu_file_name, 'classla-parsed')
classla_conllu_file = codecs.open(classla_conllu_file_name, 'r')
tagged_conllu_file = codecs.open(input_file_name, 'r')
merged_conllu_file = codecs.open(merged_conllu_file_name, 'w')
for (classla_line, tagged_line) in zip(classla_conllu_file, tagged_conllu_file):
classla_line = classla_line.strip()
tagged_line = tagged_line.strip()
if ((len(classla_line) == 0 and len(tagged_line) == 0)
or (classla_line.startswith('#') and tagged_line.startswith('#'))):
merged_line = classla_line
else:
classla_columns = classla_line.split('\t')
tagged_columns = tagged_line.split('\t')
assert len(classla_columns) == len(tagged_columns) == 10 # conllu columns
assert classla_columns[0] == tagged_columns[0] # match index
assert classla_columns[1] == tagged_columns[1] # match token
merged_columns = [classla_columns[i] if i in (3,5,9) else tagged_columns[i] for i in range(10)]
merged_line = '\t'.join(merged_columns)
merged_conllu_file.write(merged_line + '\n')
merged_conllu_file.close()
tagged_conllu_file.close()
classla_conllu_file.close()
classla_map = {
'save_dir':self.classla_directory + '/sl/depparse',
'save_name':'standard_jos.pt',
'eval_file':merged_conllu_file_name,
'output_file':parsed_conllu_file_name,
'gold_file':merged_conllu_file_name,
'shorthand':'sl_ssj',
'mode':'predict',
'pretrain_file':self.classla_directory + '/sl/pretrain/standard.pt'
}
classla_arguments = []
for (key, value) in classla_map.items():
classla_arguments += ['--' + key, value]
classla_manual.main(args=classla_arguments)
pipeline.import_file(parsed_conllu_file_name, 'classla-parsed')
pipeline.do_translate_jos()
pipeline.do_conllu_to_tei()
pipeline.import_file(input_structure_file_name, 'structures-old')
self._parse_to_dictionary_sequence(pipeline)
pipeline.export_file(output_file_name, 'dictionary')
pipeline.export_file(output_structure_file_name, 'structures-new')
self.cleanup(pipeline)
def parse_to_dictionary(self, input_file_name, output_file_name, input_structure_file_name, output_structure_file_name): def parse_to_dictionary(self, input_file_name, output_file_name, input_structure_file_name, output_structure_file_name):
pipeline = Pipeline() pipeline = Pipeline()
pipeline.import_file(input_file_name, 'tei-initial') pipeline.import_file(input_file_name, 'tei-initial')