Compare commits

...

2 Commits

@ -4,14 +4,18 @@ Pipeline for parsing a list of arbitrary Slovene strings and assigning
each to a syntactic structure in the DDD database, generating
provisional new structures if necessary.
## Installation

Installation requires the [CLASSLA](https://github.com/clarinsi/classla) standard_jos models, as
well as (for now) the wani.py script from
[luscenje_struktur](https://gitea.cjvt.si/ozbolt/luscenje_struktur):

```
pip install .
python -c "import classla; classla.download('sl', dir='resources/classla', type='standard_jos')"
curl -o resources/wani.py https://gitea.cjvt.si/ozbolt/luscenje_struktur/raw/branch/master/wani.py
```

The classla directory and wani.py file do not necessarily need to be placed under resources/, but
the wrapper script scripts/process.py assumes that they are.
## Usage

@ -1,9 +1,11 @@
import shutil
import codecs
import tempfile
from types import SimpleNamespace
import lxml.etree as lxml
import classla
import classla.models.parser as classla_manual
from structure_assignment.constants import *
from structure_assignment.tweak_conllu import tweak as tweak_conllu
@ -58,6 +60,65 @@ class Runner:
pipeline.export_file(output_file_name, 'tei-initial') pipeline.export_file(output_file_name, 'tei-initial')
self.cleanup(pipeline) self.cleanup(pipeline)
def tagged_to_dictionary(self, strings_file_name, input_file_name, output_file_name, input_structure_file_name, output_structure_file_name): # TODO: refactor/tidy
    """Parse externally tagged input and export it as a dictionary file.

    The raw strings are tokenised and parsed with classla; the resulting
    CoNLL-U is merged column-wise with the externally tagged CoNLL-U file
    (``input_file_name``), which must align with classla's output line by
    line; the merged file is dependency-parsed by driving classla's parser
    module directly; the parse is then pushed through the shared
    parse-to-dictionary sequence.

    :param strings_file_name: file with the input strings ('strings-list' format)
    :param input_file_name: CoNLL-U file with external tags for the same strings
    :param output_file_name: destination for the generated dictionary file
    :param input_structure_file_name: existing structure specification file
    :param output_structure_file_name: destination for the updated structures
    :raises AssertionError: if the tagged file does not align with classla's
        tokenisation (column count, token index or token form mismatch)
    """
    # Use a private scratch directory instead of fixed /tmp file names so
    # that concurrent runs cannot clobber each other's intermediates (the
    # original also misspelled the '.conllu' extension).
    temp_directory_name = tempfile.mkdtemp()
    classla_conllu_file_name = temp_directory_name + '/classla.conllu'
    merged_conllu_file_name = temp_directory_name + '/merged.conllu'
    parsed_conllu_file_name = temp_directory_name + '/parsed.conllu'
    try:
        # Tokenise and parse the raw strings with classla.
        pipeline = Pipeline(self.nlp)
        pipeline.import_file(strings_file_name, 'strings-list')
        pipeline.do_tokenise()
        pipeline.do_tweak_conllu()
        pipeline.do_parse()
        pipeline.export_file(classla_conllu_file_name, 'classla-parsed')

        # Merge classla's output with the externally tagged file.  Columns
        # 3, 5 and 9 (UPOS, FEATS, MISC -- presumably; TODO confirm) are
        # taken from classla, everything else from the tagged input.  The
        # 'with' block guarantees the handles are closed even when one of
        # the alignment asserts below fires (the original leaked them).
        with codecs.open(classla_conllu_file_name, 'r') as classla_conllu_file, \
             codecs.open(input_file_name, 'r') as tagged_conllu_file, \
             codecs.open(merged_conllu_file_name, 'w') as merged_conllu_file:
            for (classla_line, tagged_line) in zip(classla_conllu_file, tagged_conllu_file):
                classla_line = classla_line.strip()
                tagged_line = tagged_line.strip()
                if ((len(classla_line) == 0 and len(tagged_line) == 0)
                    or (classla_line.startswith('#') and tagged_line.startswith('#'))):
                    # Sentence separators and comment lines pass through unchanged.
                    merged_line = classla_line
                else:
                    classla_columns = classla_line.split('\t')
                    tagged_columns = tagged_line.split('\t')
                    assert len(classla_columns) == len(tagged_columns) == 10 # conllu columns
                    assert classla_columns[0] == tagged_columns[0] # match index
                    assert classla_columns[1] == tagged_columns[1] # match token
                    merged_columns = [classla_columns[i] if i in (3,5,9) else tagged_columns[i] for i in range(10)]
                    merged_line = '\t'.join(merged_columns)
                merged_conllu_file.write(merged_line + '\n')

        # Dependency-parse the merged file by invoking classla's parser
        # module directly in predict mode with the standard_jos model
        # files under self.classla_directory.
        classla_map = {
            'save_dir': self.classla_directory + '/sl/depparse',
            'save_name': 'standard_jos.pt',
            'eval_file': merged_conllu_file_name,
            'output_file': parsed_conllu_file_name,
            'gold_file': merged_conllu_file_name,
            'shorthand': 'sl_ssj',
            'mode': 'predict',
            'pretrain_file': self.classla_directory + '/sl/pretrain/standard.pt',
        }
        classla_arguments = []
        for (key, value) in classla_map.items():
            classla_arguments += ['--' + key, value]
        classla_manual.main(args=classla_arguments)

        # Re-import the parsed CoNLL-U, convert to TEI and run the shared
        # parse-to-dictionary sequence, then export the results.
        pipeline.import_file(parsed_conllu_file_name, 'classla-parsed')
        pipeline.do_translate_jos()
        pipeline.do_conllu_to_tei()
        pipeline.import_file(input_structure_file_name, 'structures-old')
        self._parse_to_dictionary_sequence(pipeline)
        pipeline.export_file(output_file_name, 'dictionary')
        pipeline.export_file(output_structure_file_name, 'structures-new')
        self.cleanup(pipeline)
    finally:
        # Always remove the scratch directory, even on failure (the
        # original left its /tmp intermediates behind).
        shutil.rmtree(temp_directory_name, ignore_errors=True)
def parse_to_dictionary(self, input_file_name, output_file_name, input_structure_file_name, output_structure_file_name): def parse_to_dictionary(self, input_file_name, output_file_name, input_structure_file_name, output_structure_file_name):
pipeline = Pipeline() pipeline = Pipeline()
pipeline.import_file(input_file_name, 'tei-initial') pipeline.import_file(input_file_name, 'tei-initial')

Loading…
Cancel
Save