Merge branch 'master' into cordex
Commit b36e07253f
.gitignore (vendored): 6 changed lines

@@ -1,2 +1,6 @@
 __pycache__
-tmp
+resources
+tmp
+venv
+build
+*.egg-info
README.md: 16 changed lines

@@ -4,14 +4,18 @@ Pipeline for parsing a list of arbitrary Slovene strings and assigning
 each to a syntactic structure in the DDD database, generating
 provisional new structures if necessary.
 
-## Setup
+## Installation
 
-Most of the scripts come from other repositories and python libraries.
-Run the set-up script:
+Installation requires the [CLASSLA](https://github.com/clarinsi/classla) standard_jos models, as
+well as (for now) the wani.py script from
+[luscenje_struktur](https://gitea.cjvt.si/ozbolt/luscenje_struktur):
 
 ```
-$ scripts/setup.sh
-```
+pip install .
+python -c "import classla; classla.download('sl', dir='resources/classla', type='standard_jos')"
+curl -o resources/wani.py https://gitea.cjvt.si/ozbolt/luscenje_struktur/raw/branch/master/wani.py
+```
+
+The classla directory and wani.py file do not necessarily need to be placed under resources/, but
+the wrapper script scripts/process.py assumes that they are.
 
 ## Usage
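For reference, models downloaded with the command above would typically be loaded back with the same dir and type values. A minimal sketch, assuming the standard classla Pipeline API; the example sentence is only illustrative and none of this is code from the repository:

```python
import classla

# Load the standard_jos models from the directory used with classla.download()
# in the installation step above (sketch, not repository code).
nlp = classla.Pipeline('sl', dir='resources/classla', type='standard_jos')

doc = nlp('Vsi ljudje se rodijo svobodni.')
print(doc.to_conll())
```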
The remaining hunks are from the Python module that defines class Runner:

@@ -1,9 +1,11 @@
 import shutil
+import codecs
 import tempfile
 import lxml.etree as lxml
 
 import classla
 import cordex
+import classla.models.parser as classla_manual
 
 from structure_assignment.constants import *
 from structure_assignment.tweak_conllu import tweak as tweak_conllu
@@ -50,6 +52,65 @@ class Runner:
         pipeline.export_file(output_file_name, 'tei-initial')
         self.cleanup(pipeline)
 
+    def tagged_to_dictionary(self, strings_file_name, input_file_name, output_file_name, input_structure_file_name, output_structure_file_name): # TODO: refactor/tidy
+
+        classla_conllu_file_name = '/tmp/classla.conlu'
+        merged_conllu_file_name = '/tmp/merged.conlu'
+        parsed_conllu_file_name = '/tmp/parsed.conlu'
+
+        pipeline = Pipeline(self.nlp)
+        pipeline.import_file(strings_file_name, 'strings-list')
+        pipeline.do_tokenise()
+        pipeline.do_tweak_conllu()
+        pipeline.do_parse()
+        pipeline.export_file(classla_conllu_file_name, 'classla-parsed')
+
+        classla_conllu_file = codecs.open(classla_conllu_file_name, 'r')
+        tagged_conllu_file = codecs.open(input_file_name, 'r')
+        merged_conllu_file = codecs.open(merged_conllu_file_name, 'w')
+        for (classla_line, tagged_line) in zip(classla_conllu_file, tagged_conllu_file):
+            classla_line = classla_line.strip()
+            tagged_line = tagged_line.strip()
+            if ((len(classla_line) == 0 and len(tagged_line) == 0)
+                    or (classla_line.startswith('#') and tagged_line.startswith('#'))):
+                merged_line = classla_line
+            else:
+                classla_columns = classla_line.split('\t')
+                tagged_columns = tagged_line.split('\t')
+                assert len(classla_columns) == 10, 'Missing token in classla-generated conllu ({}).'.format(tagged_line)
+                assert len(tagged_columns) == 10, 'Missing token in pre-tagged conllu ({}).'.format(classla_line)
+                assert classla_columns[1] == tagged_columns[1], 'Pre-tagged token form ({}) does not match classla-generated token form ({}).'.format(classla_columns[1], tagged_columns[1])
+                merged_columns = [classla_columns[i] if i in (3,5,9) else tagged_columns[i] for i in range(10)]
+                merged_line = '\t'.join(merged_columns)
+            merged_conllu_file.write(merged_line + '\n')
+        merged_conllu_file.close()
+        tagged_conllu_file.close()
+        classla_conllu_file.close()
+
+        classla_map = {
+            'save_dir':self.classla_directory + '/sl/depparse',
+            'save_name':'standard_jos.pt',
+            'eval_file':merged_conllu_file_name,
+            'output_file':parsed_conllu_file_name,
+            'gold_file':merged_conllu_file_name,
+            'shorthand':'sl_ssj',
+            'mode':'predict',
+            'pretrain_file':self.classla_directory + '/sl/pretrain/standard.pt'
+        }
+        classla_arguments = []
+        for (key, value) in classla_map.items():
+            classla_arguments += ['--' + key, value]
+        classla_manual.main(args=classla_arguments)
+
+        pipeline.import_file(parsed_conllu_file_name, 'classla-parsed')
+        pipeline.do_translate_jos()
+        pipeline.do_conllu_to_tei()
+        pipeline.import_file(input_structure_file_name, 'structures-old')
+        self._parse_to_dictionary_sequence(pipeline)
+        pipeline.export_file(output_file_name, 'dictionary')
+        pipeline.export_file(output_structure_file_name, 'structures-new')
+        self.cleanup(pipeline)
+
     def parse_to_dictionary(self, input_file_name, output_file_name, input_structure_file_name, output_structure_file_name):
         pipeline = Pipeline()
         pipeline.import_file(input_file_name, 'tei-initial')
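The merge loop in the new method combines the two CoNLL-U streams column by column: indices 3, 5 and 9 are UPOS, FEATS and MISC in zero-indexed CoNLL-U column order (ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC), so those values are taken from the classla output while every other column is kept from the pre-tagged file. A small worked example of that rule on a single token row, with made-up values rather than data from the repository:

```python
# Zero-indexed CoNLL-U columns:
# 0=ID 1=FORM 2=LEMMA 3=UPOS 4=XPOS 5=FEATS 6=HEAD 7=DEPREL 8=DEPS 9=MISC
classla_columns = ['1', 'vode', 'voda', 'NOUN', 'Ncfsg',
                   'Case=Gen|Gender=Fem|Number=Sing', '0', 'root', '_', 'SpaceAfter=No']
tagged_columns = ['1', 'vode', 'voda', 'X', 'Ncfsg',
                  'Case=Gen|Gender=Fem|Number=Sing', '2', 'nsubj', '_', '_']

# Same rule as in tagged_to_dictionary: UPOS (3), FEATS (5) and MISC (9) come
# from the classla output, all other columns from the pre-tagged file.
merged = [classla_columns[i] if i in (3, 5, 9) else tagged_columns[i] for i in range(10)]
print('\t'.join(merged))
# 1  vode  voda  NOUN  Ncfsg  Case=Gen|Gender=Fem|Number=Sing  2  nsubj  _  SpaceAfter=No
```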