Merge branch 'master' into cordex
This commit is contained in:
commit
b36e07253f
6
.gitignore
vendored
6
.gitignore
vendored
|
@ -1,2 +1,6 @@
|
||||||
__pycache__
|
__pycache__
|
||||||
tmp
|
resources
|
||||||
|
tmp
|
||||||
|
venv
|
||||||
|
build
|
||||||
|
*.egg-info
|
||||||
|
|
16
README.md
16
README.md
|
@ -4,14 +4,18 @@ Pipeline for parsing a list of arbitrary Slovene strings and assigning
|
||||||
each to a syntactic structure in the DDD database, generating
|
each to a syntactic structure in the DDD database, generating
|
||||||
provisional new structures if necessary.
|
provisional new structures if necessary.
|
||||||
|
|
||||||
## Setup
|
## Installation
|
||||||
|
|
||||||
Most of the scripts come from other repositories and python libraries.
|
Installation requires the [CLASSLA](https://github.com/clarinsi/classla) standard_jos models, as
|
||||||
Run the set-up script:
|
well as (for now) the wani.py script from
|
||||||
|
[luscenje_struktur](https://gitea.cjvt.si/ozbolt/luscenje_struktur):
|
||||||
|
|
||||||
```
|
pip install .
|
||||||
$ scripts/setup.sh
|
python -c "import classla; classla.download('sl', dir='resources/classla', type='standard_jos')"
|
||||||
```
|
curl -o resources/wani.py https://gitea.cjvt.si/ozbolt/luscenje_struktur/raw/branch/master/wani.py
|
||||||
|
|
||||||
|
The classla directory and wani.py file do not necessarily need to be placed under resources/, but
|
||||||
|
the wrapper script scripts/process.py assumes that they are.
|
||||||
|
|
||||||
## Usage
|
## Usage
|
||||||
|
|
||||||
|
|
|
@ -1,9 +1,11 @@
|
||||||
import shutil
|
import shutil
|
||||||
|
import codecs
|
||||||
import tempfile
|
import tempfile
|
||||||
import lxml.etree as lxml
|
import lxml.etree as lxml
|
||||||
|
|
||||||
import classla
|
import classla
|
||||||
import cordex
|
import cordex
|
||||||
|
import classla.models.parser as classla_manual
|
||||||
|
|
||||||
from structure_assignment.constants import *
|
from structure_assignment.constants import *
|
||||||
from structure_assignment.tweak_conllu import tweak as tweak_conllu
|
from structure_assignment.tweak_conllu import tweak as tweak_conllu
|
||||||
|
@ -50,6 +52,65 @@ class Runner:
|
||||||
pipeline.export_file(output_file_name, 'tei-initial')
|
pipeline.export_file(output_file_name, 'tei-initial')
|
||||||
self.cleanup(pipeline)
|
self.cleanup(pipeline)
|
||||||
|
|
||||||
|
def tagged_to_dictionary(self, strings_file_name, input_file_name, output_file_name, input_structure_file_name, output_structure_file_name):
    """Build dictionary output from strings that already have a pre-tagged CoNLL-U file.

    The strings are tokenised and parsed with classla, the result is merged
    line by line with the pre-tagged CoNLL-U, the merged file is re-parsed
    with the classla dependency parser, and the parse is then run through
    the rest of the pipeline.

    Args:
        strings_file_name: Path imported into the pipeline as 'strings-list'.
        input_file_name: Path of the pre-tagged CoNLL-U file.
        output_file_name: Path the 'dictionary' result is exported to.
        input_structure_file_name: Path imported as 'structures-old'.
        output_structure_file_name: Path the 'structures-new' result is
            exported to.

    Raises:
        AssertionError: If the classla-generated and pre-tagged CoNLL-U
            files do not line up token by token.
    """
    import os

    # Intermediate files live in a private directory so that concurrent runs
    # cannot overwrite each other's files; it is removed when the block exits.
    with tempfile.TemporaryDirectory() as work_directory:
        classla_conllu_file_name = os.path.join(work_directory, 'classla.conlu')
        merged_conllu_file_name = os.path.join(work_directory, 'merged.conlu')
        parsed_conllu_file_name = os.path.join(work_directory, 'parsed.conlu')

        pipeline = Pipeline(self.nlp)
        pipeline.import_file(strings_file_name, 'strings-list')
        pipeline.do_tokenise()
        pipeline.do_tweak_conllu()
        pipeline.do_parse()
        pipeline.export_file(classla_conllu_file_name, 'classla-parsed')

        self._merge_tagged_conllu(classla_conllu_file_name, input_file_name, merged_conllu_file_name)

        # Command-line style options handed to the classla parser entry point.
        classla_map = {
            'save_dir': self.classla_directory + '/sl/depparse',
            'save_name': 'standard_jos.pt',
            'eval_file': merged_conllu_file_name,
            'output_file': parsed_conllu_file_name,
            'gold_file': merged_conllu_file_name,
            'shorthand': 'sl_ssj',
            'mode': 'predict',
            'pretrain_file': self.classla_directory + '/sl/pretrain/standard.pt',
        }
        classla_arguments = []
        for key, value in classla_map.items():
            classla_arguments += ['--' + key, value]
        classla_manual.main(args=classla_arguments)

        pipeline.import_file(parsed_conllu_file_name, 'classla-parsed')
        pipeline.do_translate_jos()
        pipeline.do_conllu_to_tei()
        pipeline.import_file(input_structure_file_name, 'structures-old')
        self._parse_to_dictionary_sequence(pipeline)
        pipeline.export_file(output_file_name, 'dictionary')
        pipeline.export_file(output_structure_file_name, 'structures-new')
        self.cleanup(pipeline)

@staticmethod
def _merge_tagged_conllu(classla_conllu_file_name, tagged_conllu_file_name, merged_conllu_file_name):
    """Merge a classla-generated CoNLL-U file with a pre-tagged one, line by line.

    Blank lines and comment lines present on both sides are copied from the
    classla file. For token lines, columns 3, 5 and 9 (zero-based) come from
    the classla file and every other column from the pre-tagged file.

    Raises:
        AssertionError: If a token line does not have 10 tab-separated
            columns on either side, or if the token forms (column 1) differ.
    """
    classla_column_indices = (3, 5, 9)

    # CoNLL-U is UTF-8; decode explicitly rather than relying on the locale.
    # The context manager closes all three files even if an assertion fires.
    with open(classla_conllu_file_name, 'r', encoding='utf-8') as classla_conllu_file, \
            open(tagged_conllu_file_name, 'r', encoding='utf-8') as tagged_conllu_file, \
            open(merged_conllu_file_name, 'w', encoding='utf-8') as merged_conllu_file:
        # NOTE(review): zip() stops at the shorter file, so trailing lines of
        # the longer one are silently dropped -- confirm this is acceptable.
        for classla_line, tagged_line in zip(classla_conllu_file, tagged_conllu_file):
            classla_line = classla_line.strip()
            tagged_line = tagged_line.strip()

            both_blank = not classla_line and not tagged_line
            both_comments = classla_line.startswith('#') and tagged_line.startswith('#')
            if both_blank or both_comments:
                merged_line = classla_line
            else:
                classla_columns = classla_line.split('\t')
                tagged_columns = tagged_line.split('\t')
                # Each message shows the *other* file's line, i.e. the token
                # that has no counterpart on the failing side.
                assert len(classla_columns) == 10, 'Missing token in classla-generated conllu ({}).'.format(tagged_line)
                assert len(tagged_columns) == 10, 'Missing token in pre-tagged conllu ({}).'.format(classla_line)
                assert classla_columns[1] == tagged_columns[1], 'Pre-tagged token form ({}) does not match classla-generated token form ({}).'.format(tagged_columns[1], classla_columns[1])
                merged_columns = [
                    classla_columns[index] if index in classla_column_indices else tagged_column
                    for index, tagged_column in enumerate(tagged_columns)
                ]
                merged_line = '\t'.join(merged_columns)

            merged_conllu_file.write(merged_line + '\n')
|
||||||
|
|
||||||
def parse_to_dictionary(self, input_file_name, output_file_name, input_structure_file_name, output_structure_file_name):
|
def parse_to_dictionary(self, input_file_name, output_file_name, input_structure_file_name, output_structure_file_name):
|
||||||
pipeline = Pipeline()
|
pipeline = Pipeline()
|
||||||
pipeline.import_file(input_file_name, 'tei-initial')
|
pipeline.import_file(input_file_name, 'tei-initial')
|
||||||
|
|
Loading…
Reference in New Issue
Block a user