Redmine #2198: Adapted pipeline to limit wani.py to collocation structures
This commit is contained in:
parent
3c38cdbcae
commit
616e7823df
|
@ -6,17 +6,11 @@ FILE_MAP = {'strings-list': 'strings.txt',
|
||||||
'dict': 'dict.xml',
|
'dict': 'dict.xml',
|
||||||
'structure-schema': 'structures.xsd',
|
'structure-schema': 'structures.xsd',
|
||||||
'tei-initial': 'tei_initial.xml',
|
'tei-initial': 'tei_initial.xml',
|
||||||
'tei-single': 'tei_single.xml',
|
'tei-ids-collocation': 'tei_ids_collocations.xml',
|
||||||
'tei-single-ids': 'tei_single_with_ids.xml',
|
'tei-ids-all': 'tei_ids_all.xml',
|
||||||
'tei-multiple': 'tei_multiple.xml',
|
'collocations': 'collocation_matches.csv',
|
||||||
'tei-multiple-ids-1': 'tei_multiple_with_ids1.xml',
|
|
||||||
'tei-multiple-ids-2': 'tei_multiple_with_ids2.xml',
|
|
||||||
'mwes-1': 'mwes1.csv',
|
|
||||||
'mwes-2': 'mwes2.csv',
|
|
||||||
'structures-old': 'structures_old.xml',
|
'structures-old': 'structures_old.xml',
|
||||||
'structures-new': 'structures_new.xml',
|
'structures-new': 'structures_new.xml',
|
||||||
'dictionary-single': 'dictionary_single.xml',
|
|
||||||
'dictionary-multiple': 'dictionary_multiple.xml',
|
|
||||||
'dictionary': 'dictionary.xml',
|
'dictionary': 'dictionary.xml',
|
||||||
'dictionary-schema': 'monolingual_dictionaries.xsd'
|
'dictionary-schema': 'monolingual_dictionaries.xsd'
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,27 +0,0 @@
|
||||||
import argparse
|
|
||||||
import re
|
|
||||||
import lxml.etree as lxml
|
|
||||||
|
|
||||||
def get_entries(input_file_name):
    """Parse the XML file and return the root element's direct children as a list."""
    document = lxml.parse(input_file_name)
    top_element = document.getroot()
    return [child for child in top_element]
|
|
||||||
|
|
||||||
|
|
||||||
# Matches sid values of the form "s<N>.<M>"; group 1 captures <N>.
# Raw string so that "\d" and "\." are regex escapes, not string escapes.
_SID_PATTERN = re.compile(r'^s(\d+)\.\d+$')


def _sid_sort_key(entry):
    """Return the integer <N> taken from the entry's 'sid' attribute ("s<N>.<M>").

    Raises:
        ValueError: if the attribute is absent or does not match the pattern.
    """
    sid = entry.get('sid')
    match = _SID_PATTERN.search(sid) if sid is not None else None
    if match is None:
        raise ValueError('Missing or malformed sid attribute: {!r}'.format(sid))
    return int(match.group(1))


def merge(single_file_name, multiple_file_name, output_file_name):
    """Combine the entries of two dictionary files into one sorted dictionary.

    The direct children of both input roots are collected, ordered by the
    first number in their 'sid' attribute, stripped of that attribute and
    written under a fresh <dictionary> root.

    Args:
        single_file_name: path of the first input XML file.
        multiple_file_name: path of the second input XML file.
        output_file_name: path the merged XML is written to (UTF-8,
            pretty-printed).

    Raises:
        ValueError: if any entry lacks a well-formed 'sid' attribute.
    """
    entries = get_entries(single_file_name) + get_entries(multiple_file_name)
    # list.sort is stable: entries sharing the same number keep their
    # concatenation order (first file's entries before the second's).
    entries.sort(key=_sid_sort_key)

    root = lxml.Element('dictionary')
    for entry in entries:
        # 'sid' is only needed for ordering; it is dropped from the output.
        del entry.attrib['sid']
        root.append(entry)

    lxml.ElementTree(root).write(output_file_name, encoding='UTF-8', pretty_print=True)
|
|
||||||
|
|
||||||
|
|
||||||
# Command-line entry point: merge two dictionary files into one.
if __name__ == '__main__':
    # The description previously read "Split input TEI ...", copied from the
    # split script; this script merges dictionaries.
    arg_parser = argparse.ArgumentParser(description='Merge single and multiple token dictionaries into one dictionary.')
    arg_parser.add_argument('-single', type=str, required=True, help='Input single token dictionary')
    arg_parser.add_argument('-multiple', type=str, required=True, help='Input multiple token dictionary')
    arg_parser.add_argument('-outfile', type=str, required=True, help='Output merged dictionary')
    arguments = arg_parser.parse_args()

    merge(arguments.single, arguments.multiple, arguments.outfile)
|
|
|
@ -10,12 +10,9 @@ from structure_assignment.constants import *
|
||||||
from structure_assignment.tweak_conllu import tweak as tweak_conllu
|
from structure_assignment.tweak_conllu import tweak as tweak_conllu
|
||||||
from nova_slovnica.translate_jos import translate as translate_jos
|
from nova_slovnica.translate_jos import translate as translate_jos
|
||||||
from nova_slovnica.conllu_to_xml import convert_file as conllu_to_tei
|
from nova_slovnica.conllu_to_xml import convert_file as conllu_to_tei
|
||||||
from structure_assignment.split_tei import split as split_tei
|
from nova_slovnica.assign_collocation_structures import assign as assign_collocation_structures
|
||||||
from nova_slovnica.assign_single_structures import assign as assign_single
|
from nova_slovnica.assign_other_structures import assign as assign_other_structures
|
||||||
from nova_slovnica.assign_structures import assign as assign_multiple
|
|
||||||
from nova_slovnica.tei_to_dictionary import convert as tei_to_dictionary
|
from nova_slovnica.tei_to_dictionary import convert as tei_to_dictionary
|
||||||
from nova_slovnica.create_structures import create as create_structures
|
|
||||||
from structure_assignment.merge_dictionaries import merge as merge_dictionaries
|
|
||||||
|
|
||||||
class Runner:
|
class Runner:
|
||||||
|
|
||||||
|
@ -83,17 +80,10 @@ class Runner:
|
||||||
pipeline.do_conllu_to_tei()
|
pipeline.do_conllu_to_tei()
|
||||||
|
|
||||||
def _parse_to_dictionary_sequence(self, pipeline):
|
def _parse_to_dictionary_sequence(self, pipeline):
|
||||||
pipeline.do_split_tei()
|
pipeline.do_find_collocation_structure_units()
|
||||||
pipeline.do_assign_single()
|
pipeline.do_assign_collocation_structures()
|
||||||
pipeline.do_tei_to_dictionary_single()
|
pipeline.do_assign_other_structures()
|
||||||
pipeline.do_find_structure_units_first()
|
pipeline.do_tei_to_dictionary()
|
||||||
pipeline.do_assign_multiple_first()
|
|
||||||
pipeline.do_create_structures()
|
|
||||||
pipeline.do_find_structure_units_second()
|
|
||||||
pipeline.do_assign_multiple_second()
|
|
||||||
pipeline.do_tei_to_dictionary_multiple()
|
|
||||||
pipeline.do_merge_dictionaries()
|
|
||||||
|
|
||||||
|
|
||||||
class Pipeline:
|
class Pipeline:
|
||||||
|
|
||||||
|
@ -152,49 +142,16 @@ class Pipeline:
|
||||||
output_file_name = self.file_map['tei-initial']
|
output_file_name = self.file_map['tei-initial']
|
||||||
conllu_to_tei(input_file_name, output_file_name)
|
conllu_to_tei(input_file_name, output_file_name)
|
||||||
|
|
||||||
def do_split_tei(self):
|
def do_find_collocation_structure_units(self):
|
||||||
print('Splitting TEI ...')
|
print('Finding units for existing collocation structures ...')
|
||||||
input_file_name = self.file_map['tei-initial']
|
|
||||||
output_single_file_name = self.file_map['tei-single']
|
|
||||||
output_multiple_file_name = self.file_map['tei-multiple']
|
|
||||||
split_tei(input_file_name, output_single_file_name, output_multiple_file_name)
|
|
||||||
|
|
||||||
def do_assign_single(self):
|
|
||||||
print('Assigning single structures ...')
|
|
||||||
input_file_name = self.file_map['tei-single']
|
|
||||||
structure_file_name = self.file_map['structures-old']
|
|
||||||
output_file_name = self.file_map['tei-single-ids']
|
|
||||||
assign_single(input_file_name, structure_file_name, output_file_name)
|
|
||||||
|
|
||||||
def do_tei_to_dictionary_single(self):
|
|
||||||
print('Converting single TEI to dictionary ...')
|
|
||||||
input_file_name = self.file_map['tei-single-ids']
|
|
||||||
output_file_name = self.file_map['dictionary-single']
|
|
||||||
tei_to_dictionary(input_file_name, output_file_name)
|
|
||||||
|
|
||||||
def do_tei_to_dictionary_multiple(self):
|
|
||||||
print('Converting multiple TEI to dictionary ...')
|
|
||||||
input_file_name = self.file_map['tei-multiple-ids-2']
|
|
||||||
output_file_name = self.file_map['dictionary-multiple']
|
|
||||||
tei_to_dictionary(input_file_name, output_file_name)
|
|
||||||
|
|
||||||
def do_find_structure_units_first(self):
|
|
||||||
print('Finding units for existing structures ...')
|
|
||||||
self._do_find_structure_units(self.file_map['structures-old'], self.file_map['tei-multiple'], self.file_map['mwes-1'])
|
|
||||||
|
|
||||||
def do_find_structure_units_second(self):
|
|
||||||
print('Finding units for extended structures ...')
|
|
||||||
self._do_find_structure_units(self.file_map['structures-new'], self.file_map['tei-multiple'], self.file_map['mwes-2'])
|
|
||||||
|
|
||||||
def _do_find_structure_units(self, structure_file_name, tei_file_name, csv_file_name):
|
|
||||||
|
|
||||||
from wani import main as wani_main
|
from wani import main as wani_main
|
||||||
namespace = SimpleNamespace()
|
namespace = SimpleNamespace()
|
||||||
|
|
||||||
# relevant values
|
# relevant values
|
||||||
namespace.structures = structure_file_name
|
namespace.structures = self.file_map['structures-old']
|
||||||
namespace.input = [tei_file_name]
|
namespace.input = [self.file_map['tei-initial']]
|
||||||
namespace.all = csv_file_name
|
namespace.all = self.file_map['collocations']
|
||||||
namespace.skip_id_check = True
|
namespace.skip_id_check = True
|
||||||
namespace.fixed_restriction_order = True
|
namespace.fixed_restriction_order = True
|
||||||
namespace.new_tei = True
|
namespace.new_tei = True
|
||||||
|
@ -221,39 +178,26 @@ class Pipeline:
|
||||||
|
|
||||||
wani_main(namespace)
|
wani_main(namespace)
|
||||||
|
|
||||||
|
def do_assign_collocation_structures(self):
|
||||||
|
print('Assigning ids of collocation structures ...')
|
||||||
|
input_file_name = self.file_map['tei-initial']
|
||||||
|
collocations_file_name = self.file_map['collocations']
|
||||||
|
output_file_name = self.file_map['tei-ids-collocation']
|
||||||
|
assign_collocation_structures(input_file_name, collocations_file_name, output_file_name)
|
||||||
|
|
||||||
def _find_min_other_id(self, key):
|
def do_assign_other_structures(self):
|
||||||
try:
|
print('Assigning ids of single and other structures, creating if necessary ...')
|
||||||
root = lxml.parse(self.file_map[key])
|
input_file_name = self.file_map['tei-ids-collocation']
|
||||||
other_ids = [int(oid) for oid in root.xpath('syntactic_structure[@type="other"]/@id')]
|
structure_old_file_name = self.file_map['structures-old']
|
||||||
min_id = min(other_ids)
|
output_file_name = self.file_map['tei-ids-all']
|
||||||
except:
|
structure_new_file_name = self.file_map['structures-new']
|
||||||
min_id = 109 # This is the current value in structures.xml, and is not expected to change. Ugly, but code shouldn't reach here ...
|
assign_other_structures(input_file_name, structure_old_file_name, output_file_name, structure_new_file_name)
|
||||||
return min_id
|
|
||||||
|
|
||||||
def do_assign_multiple_first(self):
|
def do_tei_to_dictionary(self):
|
||||||
print('Assigning ids based on existing structures ...')
|
print('Converting TEI to dictionary ...')
|
||||||
min_other_id = self._find_min_other_id('structures-old')
|
input_file_name = self.file_map['tei-ids-all']
|
||||||
assign_multiple(self.file_map['tei-multiple'], self.file_map['mwes-1'], self.file_map['tei-multiple-ids-1'], min_other_id)
|
|
||||||
|
|
||||||
def do_assign_multiple_second(self):
|
|
||||||
print('Assigning ids based on extended structures ...')
|
|
||||||
min_other_id = self._find_min_other_id('structures-new')
|
|
||||||
assign_multiple(self.file_map['tei-multiple'], self.file_map['mwes-2'], self.file_map['tei-multiple-ids-2'], min_other_id)
|
|
||||||
|
|
||||||
def do_create_structures(self):
|
|
||||||
print('Creating missing structures ...')
|
|
||||||
input_file_name = self.file_map['structures-old']
|
|
||||||
tei_file_name = self.file_map['tei-multiple-ids-1']
|
|
||||||
output_file_name = self.file_map['structures-new']
|
|
||||||
create_structures(input_file_name, tei_file_name, output_file_name)
|
|
||||||
|
|
||||||
def do_merge_dictionaries(self):
|
|
||||||
print('Merging single and multiple dictionaries ...')
|
|
||||||
single_file_name = self.file_map['dictionary-single']
|
|
||||||
multiple_file_name = self.file_map['dictionary-multiple']
|
|
||||||
output_file_name = self.file_map['dictionary']
|
output_file_name = self.file_map['dictionary']
|
||||||
merge_dictionaries(single_file_name, multiple_file_name, output_file_name)
|
tei_to_dictionary(input_file_name, output_file_name)
|
||||||
|
|
||||||
def _do_validate(self, schema_file_name, xml_file_name):
|
def _do_validate(self, schema_file_name, xml_file_name):
|
||||||
xml_schema = lxml.XMLSchema(lxml.parse(schema_file_name))
|
xml_schema = lxml.XMLSchema(lxml.parse(schema_file_name))
|
||||||
|
|
|
@ -1,38 +0,0 @@
|
||||||
import argparse
|
|
||||||
import lxml.etree as lxml
|
|
||||||
|
|
||||||
|
|
||||||
def xpath_find(element, expression):
    """Evaluate an XPath expression on element with the 'tei' prefix bound to the TEI namespace."""
    tei_namespaces = {'tei': 'http://www.tei-c.org/ns/1.0'}
    return element.xpath(expression, namespaces=tei_namespaces)
|
|
||||||
|
|
||||||
|
|
||||||
def count_tokens(paragraph):
    """Return how many tei:w and tei:pc elements are found anywhere below paragraph."""
    token_elements = xpath_find(paragraph, './/tei:w|.//tei:pc')
    return len(token_elements)
|
|
||||||
|
|
||||||
|
|
||||||
def _write_without(input_file_name, output_file_name, should_drop):
    """Write a copy of the input in which selected tei:p elements are removed.

    The input is parsed afresh on each call, so removals made for one output
    never affect another. A paragraph is removed when
    should_drop(token_count) is true.
    """
    tree = lxml.parse(input_file_name)
    # xpath_find returns a list, so removing elements while looping is safe.
    for paragraph in xpath_find(tree.getroot(), './/tei:p'):
        if should_drop(count_tokens(paragraph)):
            paragraph.getparent().remove(paragraph)
    tree.write(output_file_name, encoding='UTF-8', pretty_print=True)


def split(input_file_name, single_file_name, multiple_file_name):
    """Split a TEI file into a single-token file and a multiple-token file.

    Args:
        input_file_name: path of the TEI file to read.
        single_file_name: output path; paragraphs with more than one token
            are removed from this copy.
        multiple_file_name: output path; paragraphs with exactly one token
            are removed from this copy.

    Note:
        A paragraph with zero tokens matches neither removal condition and
        therefore remains in both outputs.
    """
    _write_without(input_file_name, single_file_name, lambda token_count: token_count > 1)
    _write_without(input_file_name, multiple_file_name, lambda token_count: token_count == 1)
|
|
||||||
|
|
||||||
|
|
||||||
# Command-line entry point: split one TEI file into two outputs.
if __name__ == '__main__':
    cli = argparse.ArgumentParser(description='Split input TEI into single and multiple token units.')
    option_specs = (
        ('-infile', 'Input TEI file'),
        ('-single', 'Output single token TEI file'),
        ('-multiple', 'Output multiple token TEI file'),
    )
    for flag, help_text in option_specs:
        cli.add_argument(flag, type=str, required=True, help=help_text)
    parsed = cli.parse_args()

    split(parsed.infile, parsed.single, parsed.multiple)
|
|
Loading…
Reference in New Issue
Block a user