Redmine #1835: turned pipeline2 scripts into modules

2021-03-12 16:35:51 +01:00
parent 5395d8def0
commit f5d4a009ea
7 changed files with 180 additions and 84 deletions
@@ -1,5 +1,6 @@
 import os

+
 import lxml.etree as lxml

 from flask import Flask, Response
@@ -23,21 +24,21 @@ def test(string):
        string_file.write(string + '\n')

    try:
-        pipeline = Pipeline(nlp)
-        pipeline.import_file(string_file_name, 'strings-list')
-        pipeline.do_tokenise()
-        pipeline.do_tweak_conllu()
-        pipeline.do_parse()
-        pipeline.do_translate_jos()
-        pipeline.do_conllu_to_tei()
-        pipeline.export_file(parse_file_name, 'tei-initial')
-        pipeline.cleanup()
+        # pipeline = Pipeline(nlp)
+        # pipeline.import_file(string_file_name, 'strings-list')
+        # pipeline.do_tokenise()
+        # pipeline.do_tweak_conllu()
+        # pipeline.do_parse()
+        # pipeline.do_translate_jos()
+        # pipeline.do_conllu_to_tei()
+        # pipeline.export_file(parse_file_name, 'tei-initial')
+        # pipeline.cleanup()
+        import sys
+        sys.path.insert(0, resource_directory)
+        print(sys.path)
+        import wani
        tei = lxml.parse(parse_file_name).getroot()
        message = lxml.tostring(tei, encoding='UTF-8', pretty_print=True).decode()
-        ok = True
    except Exception as e:
-        message = str(e)
-        ok = False
-
-    results = {'ok':ok, 'message':message}
+        message = lxml.tostring('<error>' + str(e) + '</error>').decode()
    return Response(message, mimetype='text/xml')
@@ -1,11 +1,7 @@
 # scripts
-TEI_SPLIT_SCRIPT_NAME = 'split_tei.py'
 MWE_EXTRACTION_SCRIPT_NAME = 'wani.py'
-STRUCTURE_SINGLE_ASSIGNMENT_SCRIPT_NAME = 'assign_single_structures.py'
 STRUCTURE_ASSIGNMENT_SCRIPT_NAME = 'assign_structures.py'
 STRUCTURE_CREATION_SCRIPT_NAME = 'create_structures.py'
-STRUCTURE_CREATION_SCRIPT_NAME = 'create_structures.py'
-TEI_DICTIONARY_SCRIPT_NAME = 'tei_to_dictionary.py'
 DICTIONARY_MERGE_SCRIPT_NAME = 'merge_dictionaries.py'

 # resources
@@ -27,7 +23,7 @@ FILE_MAP = {'strings-list': 'strings.txt',
            'tei-multiple-ids-2': 'tei_multiple_with_ids2.xml',
            'mwes-1': 'mwes1.csv',
            'mwes-2': 'mwes2.csv',
-            'structures-old': 'structures_old.xml',
+            'structures-old': 'structures.xml',
            'structures-new': 'structures_new.xml',
            'dictionary-single': 'dictionary_single.xml',
            'dictionary-multiple': 'dictionary_multiple.xml',
@@ -0,0 +1,27 @@
+import argparse
+import re
+import lxml.etree as lxml
+
+def get_entries(input_file_name):
+    """Parse an XML file and return the direct children of its root element as a list."""
+    document_root = lxml.parse(input_file_name).getroot()
+    return [child for child in document_root]
+
+
+def merge(single_file_name, multiple_file_name, output_file_name):
+    """Merge two dictionary files into one, ordered by the number in each entry's sid.
+
+    The children of both input roots are combined (single-token entries
+    first) and sorted by the integer captured from their ``sid`` attribute,
+    which must look like ``s<number>.<number>``. The sort is stable, so
+    entries sharing the same number keep their combined order. The ``sid``
+    attribute is then removed from every entry and the entries are written
+    under a new ``dictionary`` root element.
+
+    An entry without a ``sid`` raises TypeError during sorting; an entry
+    whose ``sid`` does not match the expected form raises AttributeError.
+    """
+    # Raw string: in a plain literal '\d' and '\.' are invalid escape sequences.
+    sid_pattern = re.compile(r'^s(\d+)\.\d+$')
+
+    entries = get_entries(single_file_name) + get_entries(multiple_file_name)
+    entries.sort(key=lambda entry: int(sid_pattern.search(entry.get('sid')).group(1)))
+
+    root = lxml.Element('dictionary')
+    for entry in entries:
+        # The sid is only needed for ordering; it is dropped from the output.
+        del entry.attrib['sid']
+        root.append(entry)
+    tree = lxml.ElementTree(root)
+    tree.write(output_file_name, encoding='UTF-8', pretty_print=True)
+
+
+if __name__ == '__main__':
+    # Command-line entry point: read the single and multiple token
+    # dictionaries and write the merged dictionary to the output file.
+    arg_parser = argparse.ArgumentParser(description='Merge single and multiple token dictionaries into one.')
+    arg_parser.add_argument('-single', type=str, required=True, help='Input single token dictionary')
+    arg_parser.add_argument('-multiple', type=str, required=True, help='Input multiple token dictionary')
+    arg_parser.add_argument('-outfile', type=str, required=True, help='Output merged dictionary')
+    arguments = arg_parser.parse_args()
+    merge(arguments.single, arguments.multiple, arguments.outfile)
@@ -1,19 +1,24 @@
-import codecs
-import shutil
 import os
+import shutil
 import tempfile
-from copy import deepcopy
+from types import SimpleNamespace

 import obeliks

 import classla
 from classla import Document
-from classla.models.common.conll import CoNLLFile
+#from classla.models.common.conll import CoNLLFile

 from structure_assignment.constants import *
 from structure_assignment.tweak_conllu import tweak as tweak_conllu
 from nova_slovnica.translate_jos import translate as translate_jos
 from nova_slovnica.conllu_to_xml import convert_file as conllu_to_tei
+from structure_assignment.split_tei import split as split_tei
+from nova_slovnica.assign_single_structures import assign as assign_single
+from nova_slovnica.assign_structures import assign as assign_multiple
+from nova_slovnica.tei_to_dictionary import convert as tei_to_dictionary
+from nova_slovnica.create_structures import create as create_structures
+from structure_assignment.merge_dictionaries import merge as merge_dictionaries

 def create_nlp(resource_directory):
    NLP_CONFIG_MAP['dir'] = resource_directory + '/classla'
@@ -21,9 +26,15 @@ def create_nlp(resource_directory):

 class Pipeline:

-    def __init__(self, nlp):
+    def __init__(self, nlp, resource_directory):
        self.nlp = nlp
        self.tmp_directory = tempfile.mkdtemp()
+        resource_file_names = [resource_directory + '/' + f for f in os.listdir(resource_directory)]
+        for resource_file_name in resource_file_names:
+            if (os.path.isfile(resource_file_name)):
+                shutil.copy(resource_file_name, self.tmp_directory)
+        import sys
+        sys.path.insert(0, self.tmp_directory)
        self.file_map = {key: self.tmp_directory + '/' + FILE_MAP[key] for key in FILE_MAP.keys()}

    def import_file(self, file_name, file_key):
@@ -50,7 +61,7 @@ class Pipeline:

    def do_translate_jos(self):
        input_file_name = self.file_map['classla-parsed']
-        dictionary_file_name = resource_directory + '/dict.xml'
+        dictionary_file_name = self.file_map['dict']
        output_file_name = self.file_map['classla-translated']
        translate_jos(input_file_name, dictionary_file_name, output_file_name)

@@ -59,6 +70,88 @@ class Pipeline:
        output_file_name = self.file_map['tei-initial']
        conllu_to_tei(input_file_name, output_file_name)

+    def do_split_tei(self):
+        """Run split_tei on 'tei-initial', with 'tei-single' and 'tei-multiple' as its output files."""
+        paths = self.file_map
+        split_tei(paths['tei-initial'], paths['tei-single'], paths['tei-multiple'])
+
+    def do_assign_single(self):
+        """Run assign_single on 'tei-single' with the 'structures-old' file; the output file is 'tei-single-ids'."""
+        paths = self.file_map
+        assign_single(paths['tei-single'], paths['structures-old'], paths['tei-single-ids'])
+
+    def do_tei_to_dictionary_single(self):
+        """Run tei_to_dictionary on 'tei-single-ids'; the output file is 'dictionary-single'."""
+        paths = self.file_map
+        tei_to_dictionary(paths['tei-single-ids'], paths['dictionary-single'])
+
+    def do_tei_to_dictionary_multiple(self):
+        """Run tei_to_dictionary on 'tei-multiple-ids-2'; the output file is 'dictionary-multiple'."""
+        paths = self.file_map
+        tei_to_dictionary(paths['tei-multiple-ids-2'], paths['dictionary-multiple'])
+
+    def do_find_structure_units_first(self):
+        """First structure-unit pass: uses the 'structures-old', 'tei-multiple' and 'mwes-1' files."""
+        structure_file_name = self.file_map['structures-old']
+        tei_file_name = self.file_map['tei-multiple']
+        csv_file_name = self.file_map['mwes-1']
+        self._do_find_structure_units(structure_file_name, tei_file_name, csv_file_name)
+
+    def do_find_structure_units_second(self):
+        """Second structure-unit pass: uses the 'structures-new', 'tei-multiple' and 'mwes-2' files."""
+        structure_file_name = self.file_map['structures-new']
+        tei_file_name = self.file_map['tei-multiple']
+        csv_file_name = self.file_map['mwes-2']
+        self._do_find_structure_units(structure_file_name, tei_file_name, csv_file_name)
+
+    def _do_find_structure_units(self, structure_file_name, tei_file_name, csv_file_name):
+        """Call wani's main() with an argument namespace built for the given files.
+
+        structure_file_name is passed as ``structures``, tei_file_name as the
+        only element of ``input`` and csv_file_name as ``all``. Every other
+        attribute of the namespace is set to a fixed value.
+        """
+        # wani is imported here rather than at module level.
+        # NOTE(review): presumably it only becomes importable once the
+        # resource files are on sys.path - confirm against __init__.
+        from wani import main as wani_main
+
+        namespace = SimpleNamespace(
+            # relevant values
+            structures=structure_file_name,
+            input=[tei_file_name],
+            all=csv_file_name,
+            skip_id_check=True,
+            fixed_restriction_order=True,
+            new_tei=True,
+            # default values
+            sloleks_db=None,
+            out=None,
+            out_no_stat=None,
+            stats=None,
+            no_msd_translate=False,
+            min_freq=0,
+            verbose='info',
+            count_files=False,
+            multiple_output=False,
+            load_sloleks=False,
+            sort_by=-1,
+            sort_reversed=False,
+            db=None,
+            collocation_sentence_map_dest=None,
+            new_db=False,
+            pc_tag='pc',
+            separator='\t',
+            ignore_punctuations=False,
+        )
+
+        wani_main(namespace)
+
+
+    def do_assign_multiple_first(self):
+        """Run assign_multiple with the 'tei-multiple', 'mwes-1' and 'tei-multiple-ids-1' files, in that order."""
+        tei_file_name = self.file_map['tei-multiple']
+        mwes_file_name = self.file_map['mwes-1']
+        ids_file_name = self.file_map['tei-multiple-ids-1']
+        assign_multiple(tei_file_name, mwes_file_name, ids_file_name)
+
+    def do_assign_multiple_second(self):
+        """Run assign_multiple with the 'tei-multiple', 'mwes-2' and 'tei-multiple-ids-2' files, in that order."""
+        tei_file_name = self.file_map['tei-multiple']
+        mwes_file_name = self.file_map['mwes-2']
+        ids_file_name = self.file_map['tei-multiple-ids-2']
+        assign_multiple(tei_file_name, mwes_file_name, ids_file_name)
+
+    def do_create_structures(self):
+        """Run create_structures on 'structures-old' and 'tei-multiple-ids-1'; the output file is 'structures-new'."""
+        paths = self.file_map
+        create_structures(paths['structures-old'], paths['tei-multiple-ids-1'], paths['structures-new'])
+
+    def do_merge_dictionaries(self):
+        """Run merge_dictionaries on 'dictionary-single' and 'dictionary-multiple'; the output file is 'dictionary'."""
+        paths = self.file_map
+        merge_dictionaries(paths['dictionary-single'], paths['dictionary-multiple'], paths['dictionary'])
+
    def export_file(self, file_name, file_key):
        shutil.copyfile(self.file_map[file_key], file_name)

@@ -0,0 +1,38 @@
+import argparse
+import lxml.etree as lxml
+
+
+def xpath_find(element, expression):
+    """Evaluate an XPath expression on element with the 'tei' prefix bound to the TEI namespace URI."""
+    tei_namespaces = {'tei': 'http://www.tei-c.org/ns/1.0'}
+    return element.xpath(expression, namespaces=tei_namespaces)
+
+
+def count_tokens(paragraph):
+    """Return the number of tei:w and tei:pc descendants of the paragraph."""
+    token_nodes = xpath_find(paragraph, './/tei:w|.//tei:pc')
+    return len(token_nodes)
+
+
+def split(input_file_name, single_file_name, multiple_file_name):
+    """Write two filtered copies of a TEI file, split by paragraph token count.
+
+    The single-token output drops every tei:p with more than one token; the
+    multiple-token output drops every tei:p with exactly one token. A
+    paragraph with no tokens matches neither condition and so stays in both
+    outputs. The input file is parsed afresh for each output.
+    """
+
+    def write_without(should_drop, target_file_name):
+        # Parse a fresh copy so removals for one output do not affect the other.
+        document = lxml.parse(input_file_name)
+        for candidate in xpath_find(document.getroot(), './/tei:p'):
+            if should_drop(count_tokens(candidate)):
+                candidate.getparent().remove(candidate)
+        document.write(target_file_name, encoding='UTF-8', pretty_print=True)
+
+    write_without(lambda token_count: token_count > 1, single_file_name)
+    write_without(lambda token_count: token_count == 1, multiple_file_name)
+
+
+if (__name__ == '__main__'):
+    # Command-line entry point: parse the three required file arguments and run the split.
+    cli = argparse.ArgumentParser(description='Split input TEI into single and multiple token units.')
+    for flag, help_text in (
+        ('-infile', 'Input TEI file'),
+        ('-single', 'Output single token TEI file'),
+        ('-multiple', 'Output multiple token TEI file'),
+    ):
+        cli.add_argument(flag, type=str, required=True, help=help_text)
+    options = cli.parse_args()
+    split(options.infile, options.single, options.multiple)
@@ -1,25 +0,0 @@
-import argparse
-import re
-import lxml.etree as lxml
-
-arg_parser = argparse.ArgumentParser(description='Split input TEI into single and multiple token units.')
-arg_parser.add_argument('-single', type=str, required=True, help='Input single token dictionary')
-arg_parser.add_argument('-multiple', type=str, required=True, help='Input multiple token dictionary')
-arg_parser.add_argument('-outfile', type=str, required=True, help='Output merged dictionary')
-arguments = arg_parser.parse_args()
-single_file_name = arguments.single
-multiple_file_name = arguments.multiple
-output_file_name = arguments.outfile
-
-def get_entries(input_file_name):
-    return list(lxml.parse(input_file_name).getroot())
-
-entries = get_entries(single_file_name) + get_entries(multiple_file_name)
-entries.sort(key=lambda entry: int(re.search('^s(\d+)\.\d+$', entry.get('sid')).group(1)))
-
-root = lxml.Element('dictionary')
-for entry in entries:
-    del entry.attrib['sid']
-    root.append(entry)
-tree = lxml.ElementTree(root)
-tree.write(output_file_name, encoding='UTF-8', pretty_print=True)
@@ -1,34 +0,0 @@
-import argparse
-import lxml.etree as lxml
-
-arg_parser = argparse.ArgumentParser(description='Split input TEI into single and multiple token units.')
-arg_parser.add_argument('-infile', type=str, required=True, help='Input TEI file')
-arg_parser.add_argument('-single', type=str, required=True, help='Output single token TEI file')
-arg_parser.add_argument('-multiple', type=str, required=True, help='Output multiple token TEI file')
-arguments = arg_parser.parse_args()
-input_file_name = arguments.infile
-single_file_name = arguments.single
-multiple_file_name = arguments.multiple
-
-TEI_NAMESPACE = 'http://www.tei-c.org/ns/1.0'
-def xpath_find(element,expression):
-    return element.xpath(expression, namespaces={'tei':TEI_NAMESPACE})
-
-def count_tokens(paragraph):
-    return len(xpath_find(paragraph, './/tei:w|.//tei:pc'))
-
-tree = lxml.parse(input_file_name)
-root = tree.getroot()
-paragraphs = xpath_find(root, './/tei:p')
-for paragraph in paragraphs:
-    if (count_tokens(paragraph) > 1):
-        paragraph.getparent().remove(paragraph)
-tree.write(single_file_name, encoding='UTF-8', pretty_print=True)
-
-tree = lxml.parse(input_file_name)
-root = tree.getroot()
-paragraphs = xpath_find(root, './/tei:p')
-for paragraph in paragraphs:
-    if (count_tokens(paragraph) == 1):
-        paragraph.getparent().remove(paragraph)
-tree.write(multiple_file_name, encoding='UTF-8', pretty_print=True)