From cb6960f58a011c183de3c1d692bba4e3b9656e2e Mon Sep 17 00:00:00 2001
From: Cyprian Laskowski
Date: Mon, 15 Mar 2021 16:24:01 +0100
Subject: [PATCH] Redmine #1835: minor improvements

---
 package/structure_assignment/pipeline.py | 19 ++++++++++++++++++-
 resources/.gitignore                     |  2 +-
 scripts/process.py                       | 18 +++++++++++-------
 3 files changed, 30 insertions(+), 9 deletions(-)

diff --git a/package/structure_assignment/pipeline.py b/package/structure_assignment/pipeline.py
index 3fff914..dfb2913 100644
--- a/package/structure_assignment/pipeline.py
+++ b/package/structure_assignment/pipeline.py
@@ -27,7 +27,7 @@ def create_nlp(resource_directory):
 
 class Pipeline:
 
-    def __init__(self, nlp, resource_directory):
+    def __init__(self, resource_directory, nlp=None):
         self.nlp = nlp
         self.tmp_directory = tempfile.mkdtemp()
         resource_file_names = [resource_directory + '/' + f for f in os.listdir(resource_directory)]
@@ -42,16 +42,19 @@ class Pipeline:
         shutil.copyfile(file_name, self.file_map[file_key])
 
     def do_tokenise(self):
+        print('Tokenising with obeliks ...')
         input_file_name = self.file_map['strings-list']
         output_file_name = self.file_map['obeliks-tokenised']
         obeliks.run(in_file=input_file_name, out_file=output_file_name, conllu=True)
 
     def do_tweak_conllu(self):
+        print('Tweaking conllu ...')
         input_file_name = self.file_map['obeliks-tokenised']
         output_file_name = self.file_map['obeliks-tweaked']
         tweak_conllu(input_file_name, output_file_name)
 
     def do_parse(self):
+        print('Parsing with classla ...')
         input_file_name = self.file_map['obeliks-tweaked']
         output_file_name = self.file_map['classla-parsed']
         doc = Document(text=None)
@@ -61,42 +64,50 @@ class Pipeline:
         result.conll_file.write_conll(output_file_name)
 
     def do_translate_jos(self):
+        print('Translating JOS ...')
         input_file_name = self.file_map['classla-parsed']
         dictionary_file_name = self.file_map['dict']
         output_file_name = self.file_map['classla-translated']
         translate_jos(input_file_name, dictionary_file_name, output_file_name)
 
     def do_conllu_to_tei(self):
+        print('Converting to TEI ...')
         input_file_name = self.file_map['classla-translated']
         output_file_name = self.file_map['tei-initial']
         conllu_to_tei(input_file_name, output_file_name)
 
     def do_split_tei(self):
+        print('Splitting TEI ...')
         input_file_name = self.file_map['tei-initial']
         output_single_file_name = self.file_map['tei-single']
         output_multiple_file_name = self.file_map['tei-multiple']
         split_tei(input_file_name, output_single_file_name, output_multiple_file_name)
 
     def do_assign_single(self):
+        print('Assigning single structures ...')
         input_file_name = self.file_map['tei-single']
         structure_file_name = self.file_map['structures-old']
         output_file_name = self.file_map['tei-single-ids']
         assign_single(input_file_name, structure_file_name, output_file_name)
 
     def do_tei_to_dictionary_single(self):
+        print('Converting single TEI to dictionary ...')
         input_file_name = self.file_map['tei-single-ids']
         output_file_name = self.file_map['dictionary-single']
         tei_to_dictionary(input_file_name, output_file_name)
 
     def do_tei_to_dictionary_multiple(self):
+        print('Converting multiple TEI to dictionary ...')
        input_file_name = self.file_map['tei-multiple-ids-2']
         output_file_name = self.file_map['dictionary-multiple']
         tei_to_dictionary(input_file_name, output_file_name)
 
     def do_find_structure_units_first(self):
+        print('Finding units for existing structures ...')
         self._do_find_structure_units(self.file_map['structures-old'], self.file_map['tei-multiple'], self.file_map['mwes-1'])
 
     def do_find_structure_units_second(self):
+        print('Finding units for extended structures ...')
         self._do_find_structure_units(self.file_map['structures-new'], self.file_map['tei-multiple'], self.file_map['mwes-2'])
 
     def _do_find_structure_units(self, structure_file_name, tei_file_name, csv_file_name):
@@ -145,20 +156,24 @@ class Pipeline:
         return min_id
 
     def do_assign_multiple_first(self):
+        print('Assigning ids based on existing structures ...')
         min_other_id = self._find_min_other_id('structures-old')
         assign_multiple(self.file_map['tei-multiple'], self.file_map['mwes-1'], self.file_map['tei-multiple-ids-1'], min_other_id)
 
     def do_assign_multiple_second(self):
+        print('Assigning ids based on extended structures ...')
         min_other_id = self._find_min_other_id('structures-new')
         assign_multiple(self.file_map['tei-multiple'], self.file_map['mwes-2'], self.file_map['tei-multiple-ids-2'], min_other_id)
 
     def do_create_structures(self):
+        print('Creating missing structures ...')
         input_file_name = self.file_map['structures-old']
         tei_file_name = self.file_map['tei-multiple-ids-1']
         output_file_name = self.file_map['structures-new']
         create_structures(input_file_name, tei_file_name, output_file_name)
 
     def do_merge_dictionaries(self):
+        print('Merging single and multiple dictionaries ...')
         single_file_name = self.file_map['dictionary-single']
         multiple_file_name = self.file_map['dictionary-multiple']
         output_file_name = self.file_map['dictionary']
@@ -170,11 +185,13 @@ class Pipeline:
         xml_schema.assertValid(xml_tree)
 
     def do_validate_structures(self):
+        print('Validating structures ...')
         schema_file_name = self.file_map['structure-schema']
         xml_file_name = self.file_map['structures-new']
         self._do_validate(schema_file_name, xml_file_name)
 
     def do_validate_dictionary(self):
+        print('Validating dictionary ...')
         schema_file_name = self.file_map['dictionary-schema']
         xml_file_name = self.file_map['dictionary']
         self._do_validate(schema_file_name, xml_file_name)
diff --git a/resources/.gitignore b/resources/.gitignore
index b40c628..5006d66 100644
--- a/resources/.gitignore
+++ b/resources/.gitignore
@@ -1,7 +1,7 @@
 /classla
 /dict.xml
-/obeliks.jar
 /structures.xml
 /structures.xsd
 /inventory.xsd
 /monolingual_dictionaries.xsd
+/wani.py
diff --git a/scripts/process.py b/scripts/process.py
index 504a1e4..fcfb48a 100644
--- a/scripts/process.py
+++ b/scripts/process.py
@@ -1,23 +1,27 @@
 import argparse
+import tempfile
+import os
 
 from structure_assignment.pipeline import Pipeline, create_nlp
 
 resource_directory = '../resources'
 
 def run_all(input_file_name, output_file_name, nlp, structure_file_name):
-    tmp_file_name = '/tmp/tmp.xml' # TODO: do better than this
+    tmp_file_name = tempfile.mkstemp()[1]  # mkstemp returns (fd, path)
     string_to_parse(input_file_name, tmp_file_name, nlp)
     parse_to_dictionary(tmp_file_name, output_file_name, structure_file_name)
+    os.remove(tmp_file_name)
     validate_structures(structure_file_name)
     validate_dictionary(output_file_name)
 
 def strings_to_dictionary(input_file_name, output_file_name, nlp, structure_file_name):
-    tmp_file_name = '/tmp/tmp.xml' # TODO: do better than this
+    tmp_file_name = tempfile.mkstemp()[1]  # mkstemp returns (fd, path)
     string_to_parse(input_file_name, tmp_file_name, nlp)
     parse_to_dictionary(tmp_file_name, output_file_name, structure_file_name)
+    os.remove(tmp_file_name)
 
 def strings_to_parse(input_file_name, output_file_name, nlp):
-    pipeline = Pipeline(nlp, resource_directory)
+    pipeline = Pipeline(resource_directory, nlp)
     pipeline.import_file(input_file_name, 'strings-list')
     pipeline.do_tokenise()
     pipeline.do_tweak_conllu()
@@ -29,7 +33,7 @@ def strings_to_parse(input_file_name, output_file_name, nlp):
     pipeline.cleanup()
 
 def parse_to_dictionary(input_file_name, output_file_name, structure_file_name):
-    pipeline = Pipeline(None, resource_directory)
+    pipeline = Pipeline(resource_directory)
     pipeline.import_file(input_file_name, 'tei-initial')
     pipeline.do_split_tei()
     pipeline.do_assign_single()
@@ -46,13 +50,13 @@ def parse_to_dictionary(input_file_name, output_file_name, structure_file_name):
     pipeline.cleanup()
 
 def validate_structures(input_file_name):
-    pipeline = Pipeline(None, resource_directory)
+    pipeline = Pipeline(resource_directory)
     pipeline.import_file(input_file_name, 'structures-new')
     pipeline.do_validate_structures()
     pipeline.cleanup()
 
 def validate_dictionary(input_file_name):
-    pipeline = Pipeline(None, resource_directory)
+    pipeline = Pipeline(resource_directory)
     pipeline.import_file(input_file_name, 'dictionary')
     pipeline.do_validate_dictionary()
     pipeline.cleanup()
@@ -85,4 +89,4 @@ if (__name__ == '__main__'):
     elif (part_name == 'validate_dictionary'):
         validate_dictionary(input_file_name)
     elif (part_name == 'all'):
-        run_all(input_file_name)
+        run_all(input_file_name, output_file_name, nlp, structure_file_name)
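
A minimal usage sketch of the reworked constructor (illustrative only, not
applied by this patch; the resource path is the placeholder already used in
scripts/process.py):

    from structure_assignment.pipeline import Pipeline, create_nlp

    resource_directory = '../resources'

    # Parsing stages still need the classla NLP object up front ...
    nlp = create_nlp(resource_directory)
    parsing_pipeline = Pipeline(resource_directory, nlp)

    # ... while validation-only callers can now omit nlp entirely,
    # instead of passing an explicit None as the first argument.
    validation_pipeline = Pipeline(resource_directory)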