From fad88a992b236497cccbcf735d47972103761e10 Mon Sep 17 00:00:00 2001 From: Cyprian Laskowski Date: Wed, 28 Sep 2022 17:24:14 +0200 Subject: [PATCH 1/7] Redmine #1487: tweaked readme --- README.md | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index a2bfcd7..e7de3d8 100644 --- a/README.md +++ b/README.md @@ -4,14 +4,18 @@ Pipeline for parsing a list of arbitrary Slovene strings and assigning each to a syntactic structure in the DDD database, generating provisional new structures if necessary. -## Setup +## Installation -Most of the scripts come from other repositories and python libraries. -Run the set-up script: +Installation requires the [CLASSLA](https://github.com/clarinsi/classla) standard_jos models, as +well as (for now) the wani.py script from +[luscenje_struktur](https://gitea.cjvt.si/ozbolt/luscenje_struktur): -``` -$ scripts/setup.sh -``` + pip install . + python -c "import classla; classla.download('sl', dir='resources/classla', type='standard_jos')" + curl -o resources/wani.py https://gitea.cjvt.si/ozbolt/luscenje_struktur/raw/branch/master/wani.py + +The classla directory and wani.py file do not necessarily need to be placed under resources/, but +the wrapper script scripts/process.py assumes that they are. ## Usage From 8d453eb20bab2b77664756a7d15d5ce769f7101a Mon Sep 17 00:00:00 2001 From: Cyprian Laskowski Date: Wed, 28 Sep 2022 17:24:44 +0200 Subject: [PATCH 2/7] Redmine #1487: added support for tokens_to_dictionary --- structure_assignment/pipeline.py | 61 ++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) diff --git a/structure_assignment/pipeline.py b/structure_assignment/pipeline.py index d940b84..ecf95d6 100644 --- a/structure_assignment/pipeline.py +++ b/structure_assignment/pipeline.py @@ -1,9 +1,11 @@ import shutil +import codecs import tempfile from types import SimpleNamespace import lxml.etree as lxml import classla +import classla.models.parser as classla_manual from structure_assignment.constants import * from structure_assignment.tweak_conllu import tweak as tweak_conllu @@ -58,6 +60,65 @@ class Runner: pipeline.export_file(output_file_name, 'tei-initial') self.cleanup(pipeline) + def tagged_to_dictionary(self, strings_file_name, input_file_name, output_file_name, input_structure_file_name, output_structure_file_name): # TODO: refactor/tidy + + classla_conllu_file_name = '/tmp/classla.conlu' + merged_conllu_file_name = '/tmp/merged.conlu' + parsed_conllu_file_name = '/tmp/parsed.conlu' + + pipeline = Pipeline(self.nlp) + pipeline.import_file(strings_file_name, 'strings-list') + pipeline.do_tokenise() + pipeline.do_tweak_conllu() + pipeline.do_parse() + pipeline.export_file(classla_conllu_file_name, 'classla-parsed') + + classla_conllu_file = codecs.open(classla_conllu_file_name, 'r') + tagged_conllu_file = codecs.open(input_file_name, 'r') + merged_conllu_file = codecs.open(merged_conllu_file_name, 'w') + for (classla_line, tagged_line) in zip(classla_conllu_file, tagged_conllu_file): + classla_line = classla_line.strip() + tagged_line = tagged_line.strip() + if ((len(classla_line) == 0 and len(tagged_line) == 0) + or (classla_line.startswith('#') and tagged_line.startswith('#'))): + merged_line = classla_line + else: + classla_columns = classla_line.split('\t') + tagged_columns = tagged_line.split('\t') + assert len(classla_columns) == len(tagged_columns) == 10 # conllu columns + assert classla_columns[0] == tagged_columns[0] # match index + assert classla_columns[1] == tagged_columns[1] # match token + merged_columns = [classla_columns[i] if i in (3,5,9) else tagged_columns[i] for i in range(10)] + merged_line = '\t'.join(merged_columns) + merged_conllu_file.write(merged_line + '\n') + merged_conllu_file.close() + tagged_conllu_file.close() + classla_conllu_file.close() + + classla_map = { + 'save_dir':self.classla_directory + '/sl/depparse', + 'save_name':'standard_jos.pt', + 'eval_file':merged_conllu_file_name, + 'output_file':parsed_conllu_file_name, + 'gold_file':merged_conllu_file_name, + 'shorthand':'sl_ssj', + 'mode':'predict', + 'pretrain_file':self.classla_directory + '/sl/pretrain/standard.pt' + } + classla_arguments = [] + for (key, value) in classla_map.items(): + classla_arguments += ['--' + key, value] + classla_manual.main(args=classla_arguments) + + pipeline.import_file(parsed_conllu_file_name, 'classla-parsed') + pipeline.do_translate_jos() + pipeline.do_conllu_to_tei() + pipeline.import_file(input_structure_file_name, 'structures-old') + self._parse_to_dictionary_sequence(pipeline) + pipeline.export_file(output_file_name, 'dictionary') + pipeline.export_file(output_structure_file_name, 'structures-new') + self.cleanup(pipeline) + def parse_to_dictionary(self, input_file_name, output_file_name, input_structure_file_name, output_structure_file_name): pipeline = Pipeline() pipeline.import_file(input_file_name, 'tei-initial') From 2e2a523eedcc110febd617cb6d2fcac6e7315a29 Mon Sep 17 00:00:00 2001 From: Cyprian Laskowski Date: Fri, 30 Sep 2022 15:09:59 +0200 Subject: [PATCH 3/7] Redmine #1487: added error messages to assert statements --- structure_assignment/pipeline.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/structure_assignment/pipeline.py b/structure_assignment/pipeline.py index ecf95d6..becfbd6 100644 --- a/structure_assignment/pipeline.py +++ b/structure_assignment/pipeline.py @@ -85,9 +85,9 @@ class Runner: else: classla_columns = classla_line.split('\t') tagged_columns = tagged_line.split('\t') - assert len(classla_columns) == len(tagged_columns) == 10 # conllu columns - assert classla_columns[0] == tagged_columns[0] # match index - assert classla_columns[1] == tagged_columns[1] # match token + assert len(classla_columns) == 10, 'Missing token in classla-generated conllu ({})'.format(len(tagged_line) + assert len(tagged_columns) == 10, 'Missing token in pre-tagged conllu ({})'.format(len(classla_line) + assert classla_columns[1] == tagged_columns[1], 'Pre-tagged token form ({}) does not match classla-generated token form ({}).'.format(classla_tokens[0], tagged_columns[0]) merged_columns = [classla_columns[i] if i in (3,5,9) else tagged_columns[i] for i in range(10)] merged_line = '\t'.join(merged_columns) merged_conllu_file.write(merged_line + '\n') From c9553b3c29792ec0a7018b11434f673e6b6000c4 Mon Sep 17 00:00:00 2001 From: Cyprian Laskowski Date: Fri, 30 Sep 2022 15:14:07 +0200 Subject: [PATCH 4/7] Redmine #1487: syntax error fix --- structure_assignment/pipeline.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/structure_assignment/pipeline.py b/structure_assignment/pipeline.py index becfbd6..a076ea0 100644 --- a/structure_assignment/pipeline.py +++ b/structure_assignment/pipeline.py @@ -85,8 +85,8 @@ class Runner: else: classla_columns = classla_line.split('\t') tagged_columns = tagged_line.split('\t') - assert len(classla_columns) == 10, 'Missing token in classla-generated conllu ({})'.format(len(tagged_line) - assert len(tagged_columns) == 10, 'Missing token in pre-tagged conllu ({})'.format(len(classla_line) + assert len(classla_columns) == 10, 'Missing token in classla-generated conllu ({}).'.format(len(tagged_line)) + assert len(tagged_columns) == 10, 'Missing token in pre-tagged conllu ({}).'.format(len(classla_line)) assert classla_columns[1] == tagged_columns[1], 'Pre-tagged token form ({}) does not match classla-generated token form ({}).'.format(classla_tokens[0], tagged_columns[0]) merged_columns = [classla_columns[i] if i in (3,5,9) else tagged_columns[i] for i in range(10)] merged_line = '\t'.join(merged_columns) From 7bf4903154bc2c1ef102ed2cbbb10721ccea0ebb Mon Sep 17 00:00:00 2001 From: Cyprian Laskowski Date: Fri, 30 Sep 2022 15:16:35 +0200 Subject: [PATCH 5/7] Redmine #1487: error message fix --- structure_assignment/pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/structure_assignment/pipeline.py b/structure_assignment/pipeline.py index a076ea0..9110bc5 100644 --- a/structure_assignment/pipeline.py +++ b/structure_assignment/pipeline.py @@ -87,7 +87,7 @@ class Runner: tagged_columns = tagged_line.split('\t') assert len(classla_columns) == 10, 'Missing token in classla-generated conllu ({}).'.format(len(tagged_line)) assert len(tagged_columns) == 10, 'Missing token in pre-tagged conllu ({}).'.format(len(classla_line)) - assert classla_columns[1] == tagged_columns[1], 'Pre-tagged token form ({}) does not match classla-generated token form ({}).'.format(classla_tokens[0], tagged_columns[0]) + assert classla_columns[1] == tagged_columns[1], 'Pre-tagged token form ({}) does not match classla-generated token form ({}).'.format(classla_columns[1], tagged_columns[1]) merged_columns = [classla_columns[i] if i in (3,5,9) else tagged_columns[i] for i in range(10)] merged_line = '\t'.join(merged_columns) merged_conllu_file.write(merged_line + '\n') From b7a0b4d06642c726ed35160b46a599e30d6367a6 Mon Sep 17 00:00:00 2001 From: Cyprian Laskowski Date: Fri, 30 Sep 2022 15:20:11 +0200 Subject: [PATCH 6/7] Redmine #1487: error message fix --- structure_assignment/pipeline.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/structure_assignment/pipeline.py b/structure_assignment/pipeline.py index 9110bc5..06eb5db 100644 --- a/structure_assignment/pipeline.py +++ b/structure_assignment/pipeline.py @@ -85,8 +85,8 @@ class Runner: else: classla_columns = classla_line.split('\t') tagged_columns = tagged_line.split('\t') - assert len(classla_columns) == 10, 'Missing token in classla-generated conllu ({}).'.format(len(tagged_line)) - assert len(tagged_columns) == 10, 'Missing token in pre-tagged conllu ({}).'.format(len(classla_line)) + assert len(classla_columns) == 10, 'Missing token in classla-generated conllu ({}).'.format(tagged_line) + assert len(tagged_columns) == 10, 'Missing token in pre-tagged conllu ({}).'.format(classla_line) assert classla_columns[1] == tagged_columns[1], 'Pre-tagged token form ({}) does not match classla-generated token form ({}).'.format(classla_columns[1], tagged_columns[1]) merged_columns = [classla_columns[i] if i in (3,5,9) else tagged_columns[i] for i in range(10)] merged_line = '\t'.join(merged_columns) From 86eddd5e8fb93c2c6a86047899d62b55c88824e4 Mon Sep 17 00:00:00 2001 From: Cyprian Laskowski Date: Wed, 9 Aug 2023 10:28:44 +0200 Subject: [PATCH 7/7] Redmine #1487: updated gitignore list --- .gitignore | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 1767514..1ee7789 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,6 @@ __pycache__ -tmp \ No newline at end of file +resources +tmp +venv +build +*.egg-info