Redmine #1835: turned pipeline2 scripts into modules

This commit is contained in:
Cyprian Laskowski 2021-03-12 16:35:51 +01:00
parent 5395d8def0
commit f5d4a009ea
7 changed files with 180 additions and 84 deletions

View File

@ -1,5 +1,6 @@
import os import os
import lxml.etree as lxml import lxml.etree as lxml
from flask import Flask, Response from flask import Flask, Response
@ -23,21 +24,21 @@ def test(string):
string_file.write(string + '\n') string_file.write(string + '\n')
try: try:
pipeline = Pipeline(nlp) # pipeline = Pipeline(nlp)
pipeline.import_file(string_file_name, 'strings-list') # pipeline.import_file(string_file_name, 'strings-list')
pipeline.do_tokenise() # pipeline.do_tokenise()
pipeline.do_tweak_conllu() # pipeline.do_tweak_conllu()
pipeline.do_parse() # pipeline.do_parse()
pipeline.do_translate_jos() # pipeline.do_translate_jos()
pipeline.do_conllu_to_tei() # pipeline.do_conllu_to_tei()
pipeline.export_file(parse_file_name, 'tei-initial') # pipeline.export_file(parse_file_name, 'tei-initial')
pipeline.cleanup() # pipeline.cleanup()
import sys
sys.path.insert(0, resource_directory)
print(sys.path)
import wani
tei = lxml.parse(parse_file_name).getroot() tei = lxml.parse(parse_file_name).getroot()
message = lxml.tostring(tei, encoding='UTF-8', pretty_print=True).decode() message = lxml.tostring(tei, encoding='UTF-8', pretty_print=True).decode()
ok = True
except Exception as e: except Exception as e:
message = str(e) message = lxml.tostring('<error>' + str(e) + '</error>').decode()
ok = False
results = {'ok':ok, 'message':message}
return Response(message, mimetype='text/xml') return Response(message, mimetype='text/xml')

View File

@ -1,11 +1,7 @@
# scripts # scripts
TEI_SPLIT_SCRIPT_NAME = 'split_tei.py'
MWE_EXTRACTION_SCRIPT_NAME = 'wani.py' MWE_EXTRACTION_SCRIPT_NAME = 'wani.py'
STRUCTURE_SINGLE_ASSIGNMENT_SCRIPT_NAME = 'assign_single_structures.py'
STRUCTURE_ASSIGNMENT_SCRIPT_NAME = 'assign_structures.py' STRUCTURE_ASSIGNMENT_SCRIPT_NAME = 'assign_structures.py'
STRUCTURE_CREATION_SCRIPT_NAME = 'create_structures.py' STRUCTURE_CREATION_SCRIPT_NAME = 'create_structures.py'
STRUCTURE_CREATION_SCRIPT_NAME = 'create_structures.py'
TEI_DICTIONARY_SCRIPT_NAME = 'tei_to_dictionary.py'
DICTIONARY_MERGE_SCRIPT_NAME = 'merge_dictionaries.py' DICTIONARY_MERGE_SCRIPT_NAME = 'merge_dictionaries.py'
# resources # resources
@ -27,7 +23,7 @@ FILE_MAP = {'strings-list': 'strings.txt',
'tei-multiple-ids-2': 'tei_multiple_with_ids2.xml', 'tei-multiple-ids-2': 'tei_multiple_with_ids2.xml',
'mwes-1': 'mwes1.csv', 'mwes-1': 'mwes1.csv',
'mwes-2': 'mwes2.csv', 'mwes-2': 'mwes2.csv',
'structures-old': 'structures_old.xml', 'structures-old': 'structures.xml',
'structures-new': 'structures_new.xml', 'structures-new': 'structures_new.xml',
'dictionary-single': 'dictionary_single.xml', 'dictionary-single': 'dictionary_single.xml',
'dictionary-multiple': 'dictionary_multiple.xml', 'dictionary-multiple': 'dictionary_multiple.xml',

View File

@ -0,0 +1,27 @@
import argparse
import re
import lxml.etree as lxml
def get_entries(input_file_name):
    """Return the entry elements found directly under the file's root."""
    tree = lxml.parse(input_file_name)
    return [entry for entry in tree.getroot()]
def merge(single_file_name, multiple_file_name, output_file_name):
    """Merge two dictionary XML files into one, ordered by sentence number.

    Entries from both inputs are sorted on the numeric part of their
    ``sid`` attribute (expected format ``s<num>.<num>``); the ``sid``
    attribute is dropped from the merged output.
    """
    # Raw string: '\d' in a plain literal is an invalid escape sequence
    # (SyntaxWarning on modern Python); compile once instead of per entry.
    sid_pattern = re.compile(r'^s(\d+)\.\d+$')
    entries = get_entries(single_file_name) + get_entries(multiple_file_name)
    entries.sort(key=lambda entry: int(sid_pattern.search(entry.get('sid')).group(1)))
    root = lxml.Element('dictionary')
    for entry in entries:
        del entry.attrib['sid']  # only needed for ordering
        root.append(entry)
    tree = lxml.ElementTree(root)
    tree.write(output_file_name, encoding='UTF-8', pretty_print=True)
if __name__ == '__main__':
    # Command-line entry point for merging the two dictionary files.
    # NOTE: the description was copy-pasted from split_tei.py and wrongly
    # claimed to split TEI; corrected to describe the merge.
    arg_parser = argparse.ArgumentParser(description='Merge single and multiple token dictionaries.')
    arg_parser.add_argument('-single', type=str, required=True, help='Input single token dictionary')
    arg_parser.add_argument('-multiple', type=str, required=True, help='Input multiple token dictionary')
    arg_parser.add_argument('-outfile', type=str, required=True, help='Output merged dictionary')
    arguments = arg_parser.parse_args()
    merge(arguments.single, arguments.multiple, arguments.outfile)

View File

@ -1,19 +1,24 @@
import codecs
import shutil
import os import os
import shutil
import tempfile import tempfile
from copy import deepcopy from types import SimpleNamespace
import obeliks import obeliks
import classla import classla
from classla import Document from classla import Document
from classla.models.common.conll import CoNLLFile #from classla.models.common.conll import CoNLLFile
from structure_assignment.constants import * from structure_assignment.constants import *
from structure_assignment.tweak_conllu import tweak as tweak_conllu from structure_assignment.tweak_conllu import tweak as tweak_conllu
from nova_slovnica.translate_jos import translate as translate_jos from nova_slovnica.translate_jos import translate as translate_jos
from nova_slovnica.conllu_to_xml import convert_file as conllu_to_tei from nova_slovnica.conllu_to_xml import convert_file as conllu_to_tei
from structure_assignment.split_tei import split as split_tei
from nova_slovnica.assign_single_structures import assign as assign_single
from nova_slovnica.assign_structures import assign as assign_multiple
from nova_slovnica.tei_to_dictionary import convert as tei_to_dictionary
from nova_slovnica.create_structures import create as create_structures
from structure_assignment.merge_dictionaries import merge as merge_dictionaries
def create_nlp(resource_directory): def create_nlp(resource_directory):
NLP_CONFIG_MAP['dir'] = resource_directory + '/classla' NLP_CONFIG_MAP['dir'] = resource_directory + '/classla'
@ -21,9 +26,15 @@ def create_nlp(resource_directory):
class Pipeline: class Pipeline:
def __init__(self, nlp): def __init__(self, nlp, resource_directory):
self.nlp = nlp self.nlp = nlp
self.tmp_directory = tempfile.mkdtemp() self.tmp_directory = tempfile.mkdtemp()
resource_file_names = [resource_directory + '/' + f for f in os.listdir(resource_directory)]
for resource_file_name in resource_file_names:
if (os.path.isfile(resource_file_name)):
shutil.copy(resource_file_name, self.tmp_directory)
import sys
sys.path.insert(0, self.tmp_directory)
self.file_map = {key: self.tmp_directory + '/' + FILE_MAP[key] for key in FILE_MAP.keys()} self.file_map = {key: self.tmp_directory + '/' + FILE_MAP[key] for key in FILE_MAP.keys()}
def import_file(self, file_name, file_key): def import_file(self, file_name, file_key):
@ -50,7 +61,7 @@ class Pipeline:
def do_translate_jos(self): def do_translate_jos(self):
input_file_name = self.file_map['classla-parsed'] input_file_name = self.file_map['classla-parsed']
dictionary_file_name = resource_directory + '/dict.xml' dictionary_file_name = self.file_map['dict']
output_file_name = self.file_map['classla-translated'] output_file_name = self.file_map['classla-translated']
translate_jos(input_file_name, dictionary_file_name, output_file_name) translate_jos(input_file_name, dictionary_file_name, output_file_name)
@ -59,6 +70,88 @@ class Pipeline:
output_file_name = self.file_map['tei-initial'] output_file_name = self.file_map['tei-initial']
conllu_to_tei(input_file_name, output_file_name) conllu_to_tei(input_file_name, output_file_name)
def do_split_tei(self):
    """Split the initial TEI file into single-token and multiple-token TEI files."""
    files = self.file_map
    split_tei(files['tei-initial'], files['tei-single'], files['tei-multiple'])
def do_assign_single(self):
    """Assign structure ids to single-token units using the old structure set."""
    files = self.file_map
    assign_single(files['tei-single'], files['structures-old'], files['tei-single-ids'])
def do_tei_to_dictionary_single(self):
    """Convert the id-annotated single-token TEI into a dictionary file."""
    tei_to_dictionary(self.file_map['tei-single-ids'],
                      self.file_map['dictionary-single'])
def do_tei_to_dictionary_multiple(self):
    """Convert the second-pass id-annotated multiple-token TEI into a dictionary file."""
    tei_to_dictionary(self.file_map['tei-multiple-ids-2'],
                      self.file_map['dictionary-multiple'])
def do_find_structure_units_first(self):
    """First wani pass: search multiple-token units with the old structure definitions."""
    files = self.file_map
    self._do_find_structure_units(files['structures-old'], files['tei-multiple'], files['mwes-1'])
def do_find_structure_units_second(self):
    """Second wani pass: search multiple-token units with the newly created structures."""
    files = self.file_map
    self._do_find_structure_units(files['structures-new'], files['tei-multiple'], files['mwes-2'])
def _do_find_structure_units(self, structure_file_name, tei_file_name, csv_file_name):
    """Run wani's MWE search over *tei_file_name* with the structures in
    *structure_file_name*, writing every match to *csv_file_name*.

    wani exposes no function API beyond its argparse-driven main, so we
    hand it a SimpleNamespace mimicking its parsed arguments.
    """
    # Deferred import: wani is only importable once the resource directory
    # has been placed on sys.path.
    from wani import main as wani_main
    arguments = SimpleNamespace(
        # values this pipeline actually varies
        structures=structure_file_name,
        input=[tei_file_name],
        all=csv_file_name,
        skip_id_check=True,
        fixed_restriction_order=True,
        new_tei=True,
        # wani's defaults, spelled out because no parser fills them in
        sloleks_db=None,
        out=None,
        out_no_stat=None,
        stats=None,
        no_msd_translate=False,
        min_freq=0,
        verbose='info',
        count_files=False,
        multiple_output=False,
        load_sloleks=False,
        sort_by=-1,
        sort_reversed=False,
        db=None,
        collocation_sentence_map_dest=None,
        new_db=False,
        pc_tag='pc',
        separator='\t',
        ignore_punctuations=False,
    )
    wani_main(arguments)
def do_assign_multiple_first(self):
    """Assign ids to multiple-token units from the first wani pass."""
    files = self.file_map
    assign_multiple(files['tei-multiple'], files['mwes-1'], files['tei-multiple-ids-1'])
def do_assign_multiple_second(self):
    """Assign ids to multiple-token units from the second wani pass."""
    files = self.file_map
    assign_multiple(files['tei-multiple'], files['mwes-2'], files['tei-multiple-ids-2'])
def do_create_structures(self):
    """Derive the new structure definitions from the first-pass id assignments."""
    create_structures(self.file_map['structures-old'],
                      self.file_map['tei-multiple-ids-1'],
                      self.file_map['structures-new'])
def do_merge_dictionaries(self):
    """Merge the single- and multiple-token dictionaries into the final dictionary."""
    merge_dictionaries(self.file_map['dictionary-single'],
                       self.file_map['dictionary-multiple'],
                       self.file_map['dictionary'])
def export_file(self, file_name, file_key): def export_file(self, file_name, file_key):
shutil.copyfile(self.file_map[file_key], file_name) shutil.copyfile(self.file_map[file_key], file_name)

View File

@ -0,0 +1,38 @@
import argparse
import lxml.etree as lxml
def xpath_find(element, expression):
    """Evaluate *expression* against *element* with the TEI namespace bound to 'tei'."""
    namespaces = {'tei': 'http://www.tei-c.org/ns/1.0'}
    return element.xpath(expression, namespaces=namespaces)
def count_tokens(paragraph):
    """Return the number of word (tei:w) and punctuation (tei:pc) tokens in *paragraph*."""
    tokens = xpath_find(paragraph, './/tei:w|.//tei:pc')
    return len(tokens)
def split(input_file_name, single_file_name, multiple_file_name):
    """Split a TEI file into single-token and multiple-token documents.

    Paragraphs with more than one token are removed from the single-token
    output; paragraphs with exactly one token are removed from the
    multiple-token output.  (As in the original, a zero-token paragraph —
    if one exists — survives both filters and appears in both outputs.)
    The input is re-parsed for each output so each filter starts from a
    pristine tree.
    """
    def write_subset(output_file_name, remove):
        # Drop every paragraph whose token count matches *remove*; keep the rest.
        tree = lxml.parse(input_file_name)
        for paragraph in xpath_find(tree.getroot(), './/tei:p'):
            if remove(count_tokens(paragraph)):
                paragraph.getparent().remove(paragraph)
        tree.write(output_file_name, encoding='UTF-8', pretty_print=True)

    write_subset(single_file_name, lambda count: count > 1)
    write_subset(multiple_file_name, lambda count: count == 1)
if __name__ == '__main__':
    # Command-line entry point for the TEI splitter.
    parser = argparse.ArgumentParser(description='Split input TEI into single and multiple token units.')
    parser.add_argument('-infile', type=str, required=True, help='Input TEI file')
    parser.add_argument('-single', type=str, required=True, help='Output single token TEI file')
    parser.add_argument('-multiple', type=str, required=True, help='Output multiple token TEI file')
    args = parser.parse_args()
    split(args.infile, args.single, args.multiple)

View File

@ -1,25 +0,0 @@
import argparse
import re
import lxml.etree as lxml
# Flat script: merge two dictionary XML files into one, ordered by the
# sentence number encoded in each entry's 'sid' attribute.
# NOTE: the description was copy-pasted from the split script and wrongly
# claimed to split TEI; corrected to describe the merge.
arg_parser = argparse.ArgumentParser(description='Merge single and multiple token dictionaries.')
arg_parser.add_argument('-single', type=str, required=True, help='Input single token dictionary')
arg_parser.add_argument('-multiple', type=str, required=True, help='Input multiple token dictionary')
arg_parser.add_argument('-outfile', type=str, required=True, help='Output merged dictionary')
arguments = arg_parser.parse_args()

single_file_name = arguments.single
multiple_file_name = arguments.multiple
output_file_name = arguments.outfile

def get_entries(input_file_name):
    """Return the entry elements found directly under the file's root."""
    return list(lxml.parse(input_file_name).getroot())

# Combine entries from both inputs and order them by the numeric part of
# 'sid' (expected format s<num>.<num>); raw string avoids the invalid
# escape sequence '\d' in a plain literal.
entries = get_entries(single_file_name) + get_entries(multiple_file_name)
entries.sort(key=lambda entry: int(re.search(r'^s(\d+)\.\d+$', entry.get('sid')).group(1)))
root = lxml.Element('dictionary')
for entry in entries:
    del entry.attrib['sid']  # only needed for ordering
    root.append(entry)
tree = lxml.ElementTree(root)
tree.write(output_file_name, encoding='UTF-8', pretty_print=True)

View File

@ -1,34 +0,0 @@
import argparse
import lxml.etree as lxml
# Flat script: split a TEI file into single-token and multiple-token
# documents.  The two output passes were copy-pasted duplicates; they are
# deduplicated into write_subset below, preserving behavior exactly.
arg_parser = argparse.ArgumentParser(description='Split input TEI into single and multiple token units.')
arg_parser.add_argument('-infile', type=str, required=True, help='Input TEI file')
arg_parser.add_argument('-single', type=str, required=True, help='Output single token TEI file')
arg_parser.add_argument('-multiple', type=str, required=True, help='Output multiple token TEI file')
arguments = arg_parser.parse_args()

input_file_name = arguments.infile
single_file_name = arguments.single
multiple_file_name = arguments.multiple

TEI_NAMESPACE = 'http://www.tei-c.org/ns/1.0'

def xpath_find(element, expression):
    """Evaluate *expression* against *element* with the TEI namespace bound to 'tei'."""
    return element.xpath(expression, namespaces={'tei': TEI_NAMESPACE})

def count_tokens(paragraph):
    """Return the number of word (tei:w) and punctuation (tei:pc) tokens in *paragraph*."""
    return len(xpath_find(paragraph, './/tei:w|.//tei:pc'))

def write_subset(output_file_name, remove):
    """Re-parse the input, drop paragraphs whose token count matches *remove*, write the rest."""
    tree = lxml.parse(input_file_name)
    for paragraph in xpath_find(tree.getroot(), './/tei:p'):
        if remove(count_tokens(paragraph)):
            paragraph.getparent().remove(paragraph)
    tree.write(output_file_name, encoding='UTF-8', pretty_print=True)

write_subset(single_file_name, lambda count: count > 1)
write_subset(multiple_file_name, lambda count: count == 1)