Redmine #1835: turned pipeline2 scripts into modules
This commit is contained in:
parent
5395d8def0
commit
f5d4a009ea
|
@ -1,5 +1,6 @@
|
||||||
import os
|
import os
|
||||||
|
|
||||||
|
|
||||||
import lxml.etree as lxml
|
import lxml.etree as lxml
|
||||||
|
|
||||||
from flask import Flask, Response
|
from flask import Flask, Response
|
||||||
|
@ -23,21 +24,21 @@ def test(string):
|
||||||
string_file.write(string + '\n')
|
string_file.write(string + '\n')
|
||||||
|
|
||||||
try:
|
try:
|
||||||
pipeline = Pipeline(nlp)
|
# pipeline = Pipeline(nlp)
|
||||||
pipeline.import_file(string_file_name, 'strings-list')
|
# pipeline.import_file(string_file_name, 'strings-list')
|
||||||
pipeline.do_tokenise()
|
# pipeline.do_tokenise()
|
||||||
pipeline.do_tweak_conllu()
|
# pipeline.do_tweak_conllu()
|
||||||
pipeline.do_parse()
|
# pipeline.do_parse()
|
||||||
pipeline.do_translate_jos()
|
# pipeline.do_translate_jos()
|
||||||
pipeline.do_conllu_to_tei()
|
# pipeline.do_conllu_to_tei()
|
||||||
pipeline.export_file(parse_file_name, 'tei-initial')
|
# pipeline.export_file(parse_file_name, 'tei-initial')
|
||||||
pipeline.cleanup()
|
# pipeline.cleanup()
|
||||||
|
import sys
|
||||||
|
sys.path.insert(0, resource_directory)
|
||||||
|
print(sys.path)
|
||||||
|
import wani
|
||||||
tei = lxml.parse(parse_file_name).getroot()
|
tei = lxml.parse(parse_file_name).getroot()
|
||||||
message = lxml.tostring(tei, encoding='UTF-8', pretty_print=True).decode()
|
message = lxml.tostring(tei, encoding='UTF-8', pretty_print=True).decode()
|
||||||
ok = True
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
message = str(e)
|
message = lxml.tostring('<error>' + str(e) + '</error>').decode()
|
||||||
ok = False
|
|
||||||
|
|
||||||
results = {'ok':ok, 'message':message}
|
|
||||||
return Response(message, mimetype='text/xml')
|
return Response(message, mimetype='text/xml')
|
||||||
|
|
|
@ -1,11 +1,7 @@
|
||||||
# scripts
|
# scripts
|
||||||
TEI_SPLIT_SCRIPT_NAME = 'split_tei.py'
|
|
||||||
MWE_EXTRACTION_SCRIPT_NAME = 'wani.py'
|
MWE_EXTRACTION_SCRIPT_NAME = 'wani.py'
|
||||||
STRUCTURE_SINGLE_ASSIGNMENT_SCRIPT_NAME = 'assign_single_structures.py'
|
|
||||||
STRUCTURE_ASSIGNMENT_SCRIPT_NAME = 'assign_structures.py'
|
STRUCTURE_ASSIGNMENT_SCRIPT_NAME = 'assign_structures.py'
|
||||||
STRUCTURE_CREATION_SCRIPT_NAME = 'create_structures.py'
|
STRUCTURE_CREATION_SCRIPT_NAME = 'create_structures.py'
|
||||||
STRUCTURE_CREATION_SCRIPT_NAME = 'create_structures.py'
|
|
||||||
TEI_DICTIONARY_SCRIPT_NAME = 'tei_to_dictionary.py'
|
|
||||||
DICTIONARY_MERGE_SCRIPT_NAME = 'merge_dictionaries.py'
|
DICTIONARY_MERGE_SCRIPT_NAME = 'merge_dictionaries.py'
|
||||||
|
|
||||||
# resources
|
# resources
|
||||||
|
@ -27,7 +23,7 @@ FILE_MAP = {'strings-list': 'strings.txt',
|
||||||
'tei-multiple-ids-2': 'tei_multiple_with_ids2.xml',
|
'tei-multiple-ids-2': 'tei_multiple_with_ids2.xml',
|
||||||
'mwes-1': 'mwes1.csv',
|
'mwes-1': 'mwes1.csv',
|
||||||
'mwes-2': 'mwes2.csv',
|
'mwes-2': 'mwes2.csv',
|
||||||
'structures-old': 'structures_old.xml',
|
'structures-old': 'structures.xml',
|
||||||
'structures-new': 'structures_new.xml',
|
'structures-new': 'structures_new.xml',
|
||||||
'dictionary-single': 'dictionary_single.xml',
|
'dictionary-single': 'dictionary_single.xml',
|
||||||
'dictionary-multiple': 'dictionary_multiple.xml',
|
'dictionary-multiple': 'dictionary_multiple.xml',
|
||||||
|
|
27
package/structure_assignment/merge_dictionaries.py
Normal file
27
package/structure_assignment/merge_dictionaries.py
Normal file
|
@ -0,0 +1,27 @@
|
||||||
|
import argparse
|
||||||
|
import re
|
||||||
|
import lxml.etree as lxml
|
||||||
|
|
||||||
|
def get_entries(input_file_name):
    """Return the dictionary entry elements, i.e. the children of the XML root."""
    document_root = lxml.parse(input_file_name).getroot()
    return [entry for entry in document_root]
|
||||||
|
|
||||||
|
|
||||||
|
def merge(single_file_name, multiple_file_name, output_file_name):
    """Merge two dictionary XML files into one ordered dictionary.

    Entries from both inputs are sorted by the sentence number embedded in
    their ``sid`` attribute (format ``s<sentence>.<index>``; only the
    sentence part is compared), the attribute is removed, and the entries
    are written under a fresh ``<dictionary>`` root.

    :param single_file_name: input single-token dictionary XML
    :param multiple_file_name: input multiple-token dictionary XML
    :param output_file_name: output merged dictionary XML
    """
    entries = get_entries(single_file_name) + get_entries(multiple_file_name)
    # Raw string: '\d' in a plain literal is an invalid escape sequence
    # (SyntaxWarning on Python 3.12+, error in future versions).
    entries.sort(key=lambda entry: int(re.search(r'^s(\d+)\.\d+$', entry.get('sid')).group(1)))

    root = lxml.Element('dictionary')
    for entry in entries:
        # sid was only needed for ordering; drop it from the merged output.
        del entry.attrib['sid']
        root.append(entry)
    tree = lxml.ElementTree(root)
    tree.write(output_file_name, encoding='UTF-8', pretty_print=True)
|
||||||
|
|
||||||
|
|
||||||
|
if (__name__ == '__main__'):
    # Command-line entry point.
    # Description fixed: it was copy-pasted from split_tei.py and wrongly
    # described this merge script as a TEI splitter.
    arg_parser = argparse.ArgumentParser(description='Merge single and multiple token dictionaries.')
    arg_parser.add_argument('-single', type=str, required=True, help='Input single token dictionary')
    arg_parser.add_argument('-multiple', type=str, required=True, help='Input multiple token dictionary')
    arg_parser.add_argument('-outfile', type=str, required=True, help='Output merged dictionary')
    arguments = arg_parser.parse_args()
    merge(arguments.single, arguments.multiple, arguments.outfile)
|
|
@ -1,19 +1,24 @@
|
||||||
import codecs
|
|
||||||
import shutil
|
|
||||||
import os
|
import os
|
||||||
|
import shutil
|
||||||
import tempfile
|
import tempfile
|
||||||
from copy import deepcopy
|
from types import SimpleNamespace
|
||||||
|
|
||||||
import obeliks
|
import obeliks
|
||||||
|
|
||||||
import classla
|
import classla
|
||||||
from classla import Document
|
from classla import Document
|
||||||
from classla.models.common.conll import CoNLLFile
|
#from classla.models.common.conll import CoNLLFile
|
||||||
|
|
||||||
from structure_assignment.constants import *
|
from structure_assignment.constants import *
|
||||||
from structure_assignment.tweak_conllu import tweak as tweak_conllu
|
from structure_assignment.tweak_conllu import tweak as tweak_conllu
|
||||||
from nova_slovnica.translate_jos import translate as translate_jos
|
from nova_slovnica.translate_jos import translate as translate_jos
|
||||||
from nova_slovnica.conllu_to_xml import convert_file as conllu_to_tei
|
from nova_slovnica.conllu_to_xml import convert_file as conllu_to_tei
|
||||||
|
from structure_assignment.split_tei import split as split_tei
|
||||||
|
from nova_slovnica.assign_single_structures import assign as assign_single
|
||||||
|
from nova_slovnica.assign_structures import assign as assign_multiple
|
||||||
|
from nova_slovnica.tei_to_dictionary import convert as tei_to_dictionary
|
||||||
|
from nova_slovnica.create_structures import create as create_structures
|
||||||
|
from structure_assignment.merge_dictionaries import merge as merge_dictionaries
|
||||||
|
|
||||||
def create_nlp(resource_directory):
|
def create_nlp(resource_directory):
|
||||||
NLP_CONFIG_MAP['dir'] = resource_directory + '/classla'
|
NLP_CONFIG_MAP['dir'] = resource_directory + '/classla'
|
||||||
|
@ -21,9 +26,15 @@ def create_nlp(resource_directory):
|
||||||
|
|
||||||
class Pipeline:
|
class Pipeline:
|
||||||
|
|
||||||
def __init__(self, nlp):
|
def __init__(self, nlp, resource_directory):
|
||||||
self.nlp = nlp
|
self.nlp = nlp
|
||||||
self.tmp_directory = tempfile.mkdtemp()
|
self.tmp_directory = tempfile.mkdtemp()
|
||||||
|
resource_file_names = [resource_directory + '/' + f for f in os.listdir(resource_directory)]
|
||||||
|
for resource_file_name in resource_file_names:
|
||||||
|
if (os.path.isfile(resource_file_name)):
|
||||||
|
shutil.copy(resource_file_name, self.tmp_directory)
|
||||||
|
import sys
|
||||||
|
sys.path.insert(0, self.tmp_directory)
|
||||||
self.file_map = {key: self.tmp_directory + '/' + FILE_MAP[key] for key in FILE_MAP.keys()}
|
self.file_map = {key: self.tmp_directory + '/' + FILE_MAP[key] for key in FILE_MAP.keys()}
|
||||||
|
|
||||||
def import_file(self, file_name, file_key):
|
def import_file(self, file_name, file_key):
|
||||||
|
@ -50,7 +61,7 @@ class Pipeline:
|
||||||
|
|
||||||
def do_translate_jos(self):
|
def do_translate_jos(self):
|
||||||
input_file_name = self.file_map['classla-parsed']
|
input_file_name = self.file_map['classla-parsed']
|
||||||
dictionary_file_name = resource_directory + '/dict.xml'
|
dictionary_file_name = self.file_map['dict']
|
||||||
output_file_name = self.file_map['classla-translated']
|
output_file_name = self.file_map['classla-translated']
|
||||||
translate_jos(input_file_name, dictionary_file_name, output_file_name)
|
translate_jos(input_file_name, dictionary_file_name, output_file_name)
|
||||||
|
|
||||||
|
@ -59,6 +70,88 @@ class Pipeline:
|
||||||
output_file_name = self.file_map['tei-initial']
|
output_file_name = self.file_map['tei-initial']
|
||||||
conllu_to_tei(input_file_name, output_file_name)
|
conllu_to_tei(input_file_name, output_file_name)
|
||||||
|
|
||||||
|
def do_split_tei(self):
    """Split the initial TEI into single-token and multiple-token TEI files."""
    source = self.file_map['tei-initial']
    single_target = self.file_map['tei-single']
    multiple_target = self.file_map['tei-multiple']
    split_tei(source, single_target, multiple_target)
|
||||||
|
|
||||||
|
def do_assign_single(self):
    """Assign structure ids from the old structure set to single-token units."""
    assign_single(self.file_map['tei-single'],
                  self.file_map['structures-old'],
                  self.file_map['tei-single-ids'])
|
||||||
|
|
||||||
|
def do_tei_to_dictionary_single(self):
    """Convert the id-annotated single-token TEI into a dictionary file."""
    source = self.file_map['tei-single-ids']
    target = self.file_map['dictionary-single']
    tei_to_dictionary(source, target)
|
||||||
|
|
||||||
|
def do_tei_to_dictionary_multiple(self):
    """Convert the second-pass id-annotated multi-token TEI into a dictionary file."""
    source = self.file_map['tei-multiple-ids-2']
    target = self.file_map['dictionary-multiple']
    tei_to_dictionary(source, target)
|
||||||
|
|
||||||
|
def do_find_structure_units_first(self):
    """First extraction pass: match multi-token units against the old structures."""
    self._do_find_structure_units(self.file_map['structures-old'],
                                  self.file_map['tei-multiple'],
                                  self.file_map['mwes-1'])
|
||||||
|
|
||||||
|
def do_find_structure_units_second(self):
    """Second extraction pass: match multi-token units against the new structures."""
    self._do_find_structure_units(self.file_map['structures-new'],
                                  self.file_map['tei-multiple'],
                                  self.file_map['mwes-2'])
|
||||||
|
|
||||||
|
def _do_find_structure_units(self, structure_file_name, tei_file_name, csv_file_name):
    """Run the wani MWE extractor on one TEI file.

    wani's main() expects the argparse namespace of its command-line
    interface, so an equivalent namespace is assembled by hand: the first
    group of keys carries the values relevant to this call, the second
    mirrors wani's command-line defaults.
    """
    # Imported lazily: wani only becomes importable after the resource
    # directory has been put on sys.path (see __init__).
    from wani import main as wani_main

    arguments = {
        # relevant values
        'structures': structure_file_name,
        'input': [tei_file_name],
        'all': csv_file_name,
        'skip_id_check': True,
        'fixed_restriction_order': True,
        'new_tei': True,
        # default values (as produced by wani's own argument parser)
        'sloleks_db': None,
        'out': None,
        'out_no_stat': None,
        'stats': None,
        'no_msd_translate': False,
        'min_freq': 0,
        'verbose': 'info',
        'count_files': False,
        'multiple_output': False,
        'load_sloleks': False,
        'sort_by': -1,
        'sort_reversed': False,
        'db': None,
        'collocation_sentence_map_dest': None,
        'new_db': False,
        'pc_tag': 'pc',
        'separator': '\t',
        'ignore_punctuations': False,
    }
    wani_main(SimpleNamespace(**arguments))
|
||||||
|
|
||||||
|
|
||||||
|
def do_assign_multiple_first(self):
    """Assign structure ids to multi-token units found by the first extraction pass."""
    assign_multiple(self.file_map['tei-multiple'],
                    self.file_map['mwes-1'],
                    self.file_map['tei-multiple-ids-1'])
|
||||||
|
|
||||||
|
def do_assign_multiple_second(self):
    """Assign structure ids to multi-token units found by the second extraction pass."""
    assign_multiple(self.file_map['tei-multiple'],
                    self.file_map['mwes-2'],
                    self.file_map['tei-multiple-ids-2'])
|
||||||
|
|
||||||
|
def do_create_structures(self):
    """Derive a new structure file from the old one plus the first-pass annotated TEI."""
    old_structures = self.file_map['structures-old']
    annotated_tei = self.file_map['tei-multiple-ids-1']
    new_structures = self.file_map['structures-new']
    create_structures(old_structures, annotated_tei, new_structures)
|
||||||
|
|
||||||
|
def do_merge_dictionaries(self):
    """Merge the single- and multiple-token dictionaries into the final dictionary."""
    merge_dictionaries(self.file_map['dictionary-single'],
                       self.file_map['dictionary-multiple'],
                       self.file_map['dictionary'])
|
||||||
|
|
||||||
def export_file(self, file_name, file_key):
|
def export_file(self, file_name, file_key):
|
||||||
shutil.copyfile(self.file_map[file_key], file_name)
|
shutil.copyfile(self.file_map[file_key], file_name)
|
||||||
|
|
||||||
|
|
38
package/structure_assignment/split_tei.py
Normal file
38
package/structure_assignment/split_tei.py
Normal file
|
@ -0,0 +1,38 @@
|
||||||
|
import argparse
|
||||||
|
import lxml.etree as lxml
|
||||||
|
|
||||||
|
|
||||||
|
def xpath_find(element, expression):
    """Evaluate *expression* on *element* with the TEI namespace bound to prefix ``tei``."""
    tei_namespace_map = {'tei': 'http://www.tei-c.org/ns/1.0'}
    return element.xpath(expression, namespaces=tei_namespace_map)
|
||||||
|
|
||||||
|
|
||||||
|
def count_tokens(paragraph):
    """Count the word (``w``) and punctuation (``pc``) tokens inside *paragraph*."""
    tokens = xpath_find(paragraph, './/tei:w|.//tei:pc')
    return len(tokens)
|
||||||
|
|
||||||
|
|
||||||
|
def split(input_file_name, single_file_name, multiple_file_name):
    """Split a TEI file into single-token and multiple-token copies.

    Two filtered copies of the input are written: the single-token output
    drops every paragraph with more than one token, the multiple-token
    output drops every paragraph with exactly one token.  The input is
    re-parsed for each output so both copies start from a pristine tree.
    """
    def write_without(discard, output_file_name):
        # Remove each paragraph whose token count matches the discard
        # predicate, then write the remaining tree.
        tree = lxml.parse(input_file_name)
        for paragraph in xpath_find(tree.getroot(), './/tei:p'):
            if discard(count_tokens(paragraph)):
                paragraph.getparent().remove(paragraph)
        tree.write(output_file_name, encoding='UTF-8', pretty_print=True)

    write_without(lambda token_count: token_count > 1, single_file_name)
    write_without(lambda token_count: token_count == 1, multiple_file_name)
|
||||||
|
|
||||||
|
|
||||||
|
if (__name__ == '__main__'):
    # Command-line entry point: split one TEI file into two outputs.
    parser = argparse.ArgumentParser(description='Split input TEI into single and multiple token units.')
    parser.add_argument('-infile', type=str, required=True, help='Input TEI file')
    parser.add_argument('-single', type=str, required=True, help='Output single token TEI file')
    parser.add_argument('-multiple', type=str, required=True, help='Output multiple token TEI file')
    options = parser.parse_args()
    split(options.infile, options.single, options.multiple)
|
|
@ -1,25 +0,0 @@
|
||||||
import argparse
|
|
||||||
import re
|
|
||||||
import lxml.etree as lxml
|
|
||||||
|
|
||||||
arg_parser = argparse.ArgumentParser(description='Split input TEI into single and multiple token units.')
|
|
||||||
arg_parser.add_argument('-single', type=str, required=True, help='Input single token dictionary')
|
|
||||||
arg_parser.add_argument('-multiple', type=str, required=True, help='Input multiple token dictionary')
|
|
||||||
arg_parser.add_argument('-outfile', type=str, required=True, help='Output merged dictionary')
|
|
||||||
arguments = arg_parser.parse_args()
|
|
||||||
single_file_name = arguments.single
|
|
||||||
multiple_file_name = arguments.multiple
|
|
||||||
output_file_name = arguments.outfile
|
|
||||||
|
|
||||||
def get_entries(input_file_name):
|
|
||||||
return list(lxml.parse(input_file_name).getroot())
|
|
||||||
|
|
||||||
entries = get_entries(single_file_name) + get_entries(multiple_file_name)
|
|
||||||
entries.sort(key=lambda entry: int(re.search('^s(\d+)\.\d+$', entry.get('sid')).group(1)))
|
|
||||||
|
|
||||||
root = lxml.Element('dictionary')
|
|
||||||
for entry in entries:
|
|
||||||
del entry.attrib['sid']
|
|
||||||
root.append(entry)
|
|
||||||
tree = lxml.ElementTree(root)
|
|
||||||
tree.write(output_file_name, encoding='UTF-8', pretty_print=True)
|
|
|
@ -1,34 +0,0 @@
|
||||||
import argparse
|
|
||||||
import lxml.etree as lxml
|
|
||||||
|
|
||||||
arg_parser = argparse.ArgumentParser(description='Split input TEI into single and multiple token units.')
|
|
||||||
arg_parser.add_argument('-infile', type=str, required=True, help='Input TEI file')
|
|
||||||
arg_parser.add_argument('-single', type=str, required=True, help='Output single token TEI file')
|
|
||||||
arg_parser.add_argument('-multiple', type=str, required=True, help='Output multiple token TEI file')
|
|
||||||
arguments = arg_parser.parse_args()
|
|
||||||
input_file_name = arguments.infile
|
|
||||||
single_file_name = arguments.single
|
|
||||||
multiple_file_name = arguments.multiple
|
|
||||||
|
|
||||||
TEI_NAMESPACE = 'http://www.tei-c.org/ns/1.0'
|
|
||||||
def xpath_find(element,expression):
|
|
||||||
return element.xpath(expression, namespaces={'tei':TEI_NAMESPACE})
|
|
||||||
|
|
||||||
def count_tokens(paragraph):
|
|
||||||
return len(xpath_find(paragraph, './/tei:w|.//tei:pc'))
|
|
||||||
|
|
||||||
tree = lxml.parse(input_file_name)
|
|
||||||
root = tree.getroot()
|
|
||||||
paragraphs = xpath_find(root, './/tei:p')
|
|
||||||
for paragraph in paragraphs:
|
|
||||||
if (count_tokens(paragraph) > 1):
|
|
||||||
paragraph.getparent().remove(paragraph)
|
|
||||||
tree.write(single_file_name, encoding='UTF-8', pretty_print=True)
|
|
||||||
|
|
||||||
tree = lxml.parse(input_file_name)
|
|
||||||
root = tree.getroot()
|
|
||||||
paragraphs = xpath_find(root, './/tei:p')
|
|
||||||
for paragraph in paragraphs:
|
|
||||||
if (count_tokens(paragraph) == 1):
|
|
||||||
paragraph.getparent().remove(paragraph)
|
|
||||||
tree.write(multiple_file_name, encoding='UTF-8', pretty_print=True)
|
|
Loading…
Reference in New Issue
Block a user