diff --git a/package/structure_assignment/api.py b/package/structure_assignment/api.py index 453804d..79a001e 100644 --- a/package/structure_assignment/api.py +++ b/package/structure_assignment/api.py @@ -1,5 +1,6 @@ import os + import lxml.etree as lxml from flask import Flask, Response @@ -23,21 +24,21 @@ def test(string): string_file.write(string + '\n') try: - pipeline = Pipeline(nlp) - pipeline.import_file(string_file_name, 'strings-list') - pipeline.do_tokenise() - pipeline.do_tweak_conllu() - pipeline.do_parse() - pipeline.do_translate_jos() - pipeline.do_conllu_to_tei() - pipeline.export_file(parse_file_name, 'tei-initial') - pipeline.cleanup() + # pipeline = Pipeline(nlp) + # pipeline.import_file(string_file_name, 'strings-list') + # pipeline.do_tokenise() + # pipeline.do_tweak_conllu() + # pipeline.do_parse() + # pipeline.do_translate_jos() + # pipeline.do_conllu_to_tei() + # pipeline.export_file(parse_file_name, 'tei-initial') + # pipeline.cleanup() + import sys + sys.path.insert(0, resource_directory) + print(sys.path) + import wani tei = lxml.parse(parse_file_name).getroot() message = lxml.tostring(tei, encoding='UTF-8', pretty_print=True).decode() - ok = True except Exception as e: - message = str(e) - ok = False - - results = {'ok':ok, 'message':message} + message = lxml.tostring('' + str(e) + '').decode() return Response(message, mimetype='text/xml') diff --git a/package/structure_assignment/constants.py b/package/structure_assignment/constants.py index 41aa8b8..cae1d4b 100644 --- a/package/structure_assignment/constants.py +++ b/package/structure_assignment/constants.py @@ -1,11 +1,7 @@ # scripts -TEI_SPLIT_SCRIPT_NAME = 'split_tei.py' MWE_EXTRACTION_SCRIPT_NAME = 'wani.py' -STRUCTURE_SINGLE_ASSIGNMENT_SCRIPT_NAME = 'assign_single_structures.py' STRUCTURE_ASSIGNMENT_SCRIPT_NAME = 'assign_structures.py' STRUCTURE_CREATION_SCRIPT_NAME = 'create_structures.py' -STRUCTURE_CREATION_SCRIPT_NAME = 'create_structures.py' -TEI_DICTIONARY_SCRIPT_NAME = 'tei_to_dictionary.py' DICTIONARY_MERGE_SCRIPT_NAME = 'merge_dictionaries.py' # resources @@ -27,7 +23,7 @@ FILE_MAP = {'strings-list': 'strings.txt', 'tei-multiple-ids-2': 'tei_multiple_with_ids2.xml', 'mwes-1': 'mwes1.csv', 'mwes-2': 'mwes2.csv', - 'structures-old': 'structures_old.xml', + 'structures-old': 'structures.xml', 'structures-new': 'structures_new.xml', 'dictionary-single': 'dictionary_single.xml', 'dictionary-multiple': 'dictionary_multiple.xml', diff --git a/package/structure_assignment/merge_dictionaries.py b/package/structure_assignment/merge_dictionaries.py new file mode 100644 index 0000000..7d3372c --- /dev/null +++ b/package/structure_assignment/merge_dictionaries.py @@ -0,0 +1,27 @@ +import argparse +import re +import lxml.etree as lxml + +def get_entries(input_file_name): + return list(lxml.parse(input_file_name).getroot()) + + +def merge(single_file_name, multiple_file_name, output_file_name): + entries = get_entries(single_file_name) + get_entries(multiple_file_name) + entries.sort(key=lambda entry: int(re.search('^s(\d+)\.\d+$', entry.get('sid')).group(1))) + + root = lxml.Element('dictionary') + for entry in entries: + del entry.attrib['sid'] + root.append(entry) + tree = lxml.ElementTree(root) + tree.write(output_file_name, encoding='UTF-8', pretty_print=True) + + +if (__name__ == '__main__'): + arg_parser = argparse.ArgumentParser(description='Split input TEI into single and multiple token units.') + arg_parser.add_argument('-single', type=str, required=True, help='Input single token dictionary') + arg_parser.add_argument('-multiple', type=str, required=True, help='Input multiple token dictionary') + arg_parser.add_argument('-outfile', type=str, required=True, help='Output merged dictionary') + arguments = arg_parser.parse_args() + merge(arguments.single, arguments.multiple, arguments.outfile) diff --git a/package/structure_assignment/pipeline.py b/package/structure_assignment/pipeline.py index 69a739e..6a5bbff 100644 --- a/package/structure_assignment/pipeline.py +++ b/package/structure_assignment/pipeline.py @@ -1,19 +1,24 @@ -import codecs -import shutil import os +import shutil import tempfile -from copy import deepcopy +from types import SimpleNamespace import obeliks import classla from classla import Document -from classla.models.common.conll import CoNLLFile +#from classla.models.common.conll import CoNLLFile from structure_assignment.constants import * from structure_assignment.tweak_conllu import tweak as tweak_conllu from nova_slovnica.translate_jos import translate as translate_jos from nova_slovnica.conllu_to_xml import convert_file as conllu_to_tei +from structure_assignment.split_tei import split as split_tei +from nova_slovnica.assign_single_structures import assign as assign_single +from nova_slovnica.assign_structures import assign as assign_multiple +from nova_slovnica.tei_to_dictionary import convert as tei_to_dictionary +from nova_slovnica.create_structures import create as create_structures +from structure_assignment.merge_dictionaries import merge as merge_dictionaries def create_nlp(resource_directory): NLP_CONFIG_MAP['dir'] = resource_directory + '/classla' @@ -21,9 +26,15 @@ def create_nlp(resource_directory): class Pipeline: - def __init__(self, nlp): + def __init__(self, nlp, resource_directory): self.nlp = nlp self.tmp_directory = tempfile.mkdtemp() + resource_file_names = [resource_directory + '/' + f for f in os.listdir(resource_directory)] + for resource_file_name in resource_file_names: + if (os.path.isfile(resource_file_name)): + shutil.copy(resource_file_name, self.tmp_directory) + import sys + sys.path.insert(0, self.tmp_directory) self.file_map = {key: self.tmp_directory + '/' + FILE_MAP[key] for key in FILE_MAP.keys()} def import_file(self, file_name, file_key): @@ -50,7 +61,7 @@ class Pipeline: def do_translate_jos(self): input_file_name = self.file_map['classla-parsed'] - dictionary_file_name = resource_directory + '/dict.xml' + dictionary_file_name = self.file_map['dict'] output_file_name = self.file_map['classla-translated'] translate_jos(input_file_name, dictionary_file_name, output_file_name) @@ -59,6 +70,88 @@ class Pipeline: output_file_name = self.file_map['tei-initial'] conllu_to_tei(input_file_name, output_file_name) + def do_split_tei(self): + input_file_name = self.file_map['tei-initial'] + output_single_file_name = self.file_map['tei-single'] + output_multiple_file_name = self.file_map['tei-multiple'] + split_tei(input_file_name, output_single_file_name, output_multiple_file_name) + + def do_assign_single(self): + input_file_name = self.file_map['tei-single'] + structure_file_name = self.file_map['structures-old'] + output_file_name = self.file_map['tei-single-ids'] + assign_single(input_file_name, structure_file_name, output_file_name) + + def do_tei_to_dictionary_single(self): + input_file_name = self.file_map['tei-single-ids'] + output_file_name = self.file_map['dictionary-single'] + tei_to_dictionary(input_file_name, output_file_name) + + def do_tei_to_dictionary_multiple(self): + input_file_name = self.file_map['tei-multiple-ids-2'] + output_file_name = self.file_map['dictionary-multiple'] + tei_to_dictionary(input_file_name, output_file_name) + + def do_find_structure_units_first(self): + self._do_find_structure_units(self.file_map['structures-old'], self.file_map['tei-multiple'], self.file_map['mwes-1']) + + def do_find_structure_units_second(self): + self._do_find_structure_units(self.file_map['structures-new'], self.file_map['tei-multiple'], self.file_map['mwes-2']) + + def _do_find_structure_units(self, structure_file_name, tei_file_name, csv_file_name): + + from wani import main as wani_main + namespace = SimpleNamespace() + + # relevant values + namespace.structures = structure_file_name + namespace.input = [tei_file_name] + namespace.all = csv_file_name + namespace.skip_id_check = True + namespace.fixed_restriction_order = True + namespace.new_tei = True + + # default values + namespace.sloleks_db = None + namespace.out = None + namespace.out_no_stat = None + namespace.stats = None + namespace.no_msd_translate = False + namespace.min_freq = 0 + namespace.verbose = 'info' + namespace.count_files = False + namespace.multiple_output = False + namespace.load_sloleks = False + namespace.sort_by = -1 + namespace.sort_reversed = False + namespace.db = None + namespace.collocation_sentence_map_dest = None + namespace.new_db = False + namespace.pc_tag = 'pc' + namespace.separator = '\t' + namespace.ignore_punctuations = False + + wani_main(namespace) + + + def do_assign_multiple_first(self): + assign_multiple(self.file_map['tei-multiple'], self.file_map['mwes-1'], self.file_map['tei-multiple-ids-1']) + + def do_assign_multiple_second(self): + assign_multiple(self.file_map['tei-multiple'], self.file_map['mwes-2'], self.file_map['tei-multiple-ids-2']) + + def do_create_structures(self): + input_file_name = self.file_map['structures-old'] + tei_file_name = self.file_map['tei-multiple-ids-1'] + output_file_name = self.file_map['structures-new'] + create_structures(input_file_name, tei_file_name, output_file_name) + + def do_merge_dictionaries(self): + single_file_name = self.file_map['dictionary-single'] + multiple_file_name = self.file_map['dictionary-multiple'] + output_file_name = self.file_map['dictionary'] + merge_dictionaries(single_file_name, multiple_file_name, output_file_name) + def export_file(self, file_name, file_key): shutil.copyfile(self.file_map[file_key], file_name) diff --git a/package/structure_assignment/split_tei.py b/package/structure_assignment/split_tei.py new file mode 100644 index 0000000..f1b36f7 --- /dev/null +++ b/package/structure_assignment/split_tei.py @@ -0,0 +1,38 @@ +import argparse +import lxml.etree as lxml + + +def xpath_find(element,expression): + return element.xpath(expression, namespaces={'tei':'http://www.tei-c.org/ns/1.0'}) + + +def count_tokens(paragraph): + return len(xpath_find(paragraph, './/tei:w|.//tei:pc')) + + +def split(input_file_name, single_file_name, multiple_file_name): + + tree = lxml.parse(input_file_name) + root = tree.getroot() + paragraphs = xpath_find(root, './/tei:p') + for paragraph in paragraphs: + if (count_tokens(paragraph) > 1): + paragraph.getparent().remove(paragraph) + tree.write(single_file_name, encoding='UTF-8', pretty_print=True) + + tree = lxml.parse(input_file_name) + root = tree.getroot() + paragraphs = xpath_find(root, './/tei:p') + for paragraph in paragraphs: + if (count_tokens(paragraph) == 1): + paragraph.getparent().remove(paragraph) + tree.write(multiple_file_name, encoding='UTF-8', pretty_print=True) + + +if (__name__ == '__main__'): + arg_parser = argparse.ArgumentParser(description='Split input TEI into single and multiple token units.') + arg_parser.add_argument('-infile', type=str, required=True, help='Input TEI file') + arg_parser.add_argument('-single', type=str, required=True, help='Output single token TEI file') + arg_parser.add_argument('-multiple', type=str, required=True, help='Output multiple token TEI file') + arguments = arg_parser.parse_args() + split(arguments.infile, arguments.single, arguments.multiple) diff --git a/scripts/merge_dictionaries.py b/scripts/merge_dictionaries.py deleted file mode 100644 index b6e9117..0000000 --- a/scripts/merge_dictionaries.py +++ /dev/null @@ -1,25 +0,0 @@ -import argparse -import re -import lxml.etree as lxml - -arg_parser = argparse.ArgumentParser(description='Split input TEI into single and multiple token units.') -arg_parser.add_argument('-single', type=str, required=True, help='Input single token dictionary') -arg_parser.add_argument('-multiple', type=str, required=True, help='Input multiple token dictionary') -arg_parser.add_argument('-outfile', type=str, required=True, help='Output merged dictionary') -arguments = arg_parser.parse_args() -single_file_name = arguments.single -multiple_file_name = arguments.multiple -output_file_name = arguments.outfile - -def get_entries(input_file_name): - return list(lxml.parse(input_file_name).getroot()) - -entries = get_entries(single_file_name) + get_entries(multiple_file_name) -entries.sort(key=lambda entry: int(re.search('^s(\d+)\.\d+$', entry.get('sid')).group(1))) - -root = lxml.Element('dictionary') -for entry in entries: - del entry.attrib['sid'] - root.append(entry) -tree = lxml.ElementTree(root) -tree.write(output_file_name, encoding='UTF-8', pretty_print=True) diff --git a/scripts/split_tei.py b/scripts/split_tei.py deleted file mode 100644 index d42599e..0000000 --- a/scripts/split_tei.py +++ /dev/null @@ -1,34 +0,0 @@ -import argparse -import lxml.etree as lxml - -arg_parser = argparse.ArgumentParser(description='Split input TEI into single and multiple token units.') -arg_parser.add_argument('-infile', type=str, required=True, help='Input TEI file') -arg_parser.add_argument('-single', type=str, required=True, help='Output single token TEI file') -arg_parser.add_argument('-multiple', type=str, required=True, help='Output multiple token TEI file') -arguments = arg_parser.parse_args() -input_file_name = arguments.infile -single_file_name = arguments.single -multiple_file_name = arguments.multiple - -TEI_NAMESPACE = 'http://www.tei-c.org/ns/1.0' -def xpath_find(element,expression): - return element.xpath(expression, namespaces={'tei':TEI_NAMESPACE}) - -def count_tokens(paragraph): - return len(xpath_find(paragraph, './/tei:w|.//tei:pc')) - -tree = lxml.parse(input_file_name) -root = tree.getroot() -paragraphs = xpath_find(root, './/tei:p') -for paragraph in paragraphs: - if (count_tokens(paragraph) > 1): - paragraph.getparent().remove(paragraph) -tree.write(single_file_name, encoding='UTF-8', pretty_print=True) - -tree = lxml.parse(input_file_name) -root = tree.getroot() -paragraphs = xpath_find(root, './/tei:p') -for paragraph in paragraphs: - if (count_tokens(paragraph) == 1): - paragraph.getparent().remove(paragraph) -tree.write(multiple_file_name, encoding='UTF-8', pretty_print=True)