Moved some files to this repository and between other repositories
This commit is contained in:
parent 6cf298855e
commit a83686b9eb
@@ -0,0 +1,57 @@
import argparse
import csv
import codecs
import re
import lxml.etree as lxml


def xpath_find(element, expression):
    return element.xpath(expression, namespaces={'tei':'http://www.tei-c.org/ns/1.0'})

def get_xml_id(element):
    return element.get('{http://www.w3.org/XML/1998/namespace}id')

def get_id_counter(xml_id):
    # Extract the unit counter from a TEI id of the form s<counter>.<index>[.<subindex>].
    return int(re.search(r'^s(\d+)\.\d+(?:\.\d+)?$', xml_id).group(1))

def assign(input_file_name, csv_file_name, output_file_name):

    # Map each unit counter to the (structure id, component count) pairs listed in the CSV.
    csv_file = codecs.open(csv_file_name, 'r')
    reader = csv.DictReader(csv_file, delimiter='\t')
    mwe_map = {}
    for row in reader:
        structure_id = row['Structure_ID']
        token_ids = [row[key] for key in sorted(row.keys()) if key.endswith('_Token_ID') and len(row[key]) > 0]
        index = get_id_counter(token_ids[0])
        component_count = len(token_ids)
        if (index not in mwe_map):
            mwe_map[index] = set()
        mwe_map[index].add((structure_id, component_count))
    csv_file.close()

    # Annotate each TEI unit whose token count matches exactly one candidate structure.
    xml_tree = lxml.parse(input_file_name)
    xml_root = xml_tree.getroot()
    mwes_xml = xpath_find(xml_root, './/tei:s')
    for mwe_xml in mwes_xml:
        index = get_id_counter(get_xml_id(mwe_xml))
        mwe_string = ''.join([e.text if e.get('join') == 'right' else e.text + ' ' for e in xpath_find(mwe_xml, 'tei:w|tei:pc')]).strip()
        token_count = len(xpath_find(mwe_xml, 'tei:w|tei:pc'))
        structure_ids = set()
        if (index in mwe_map):
            for (structure_id, component_count) in mwe_map[index]:
                if (component_count == token_count):
                    structure_ids.add(int(structure_id))
        if (len(structure_ids) > 1):
            print('MWE #' + str(index) + ': ' + mwe_string + ': MULTIPLE_COLLOCATION_STRUCTURES (' + str(structure_ids) + ')')
        elif (len(structure_ids) == 1):
            mwe_xml.set('structure_id', str(list(structure_ids)[0]))
    xml_tree.write(output_file_name, encoding='UTF-8', pretty_print=True)


if (__name__ == '__main__'):

    arg_parser = argparse.ArgumentParser(description='Assign collocation structure ids to parsed lexical units.')
    arg_parser.add_argument('-infile', type=str, help='Input TEI')
    arg_parser.add_argument('-csv', type=str, help='CSV file')
    arg_parser.add_argument('-outfile', type=str, help='Output TEI')
    arguments = arg_parser.parse_args()
    assign(arguments.infile, arguments.csv, arguments.outfile)
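The id convention that get_id_counter relies on is compact but easy to miss: TEI ids look like s<counter>.<token index> (optionally with a sub-index), and the counter is what groups a lexical unit with its CSV rows. A minimal standalone sketch of the matching step; the row contents and the C1/C2 column names are illustrative placeholders, only Structure_ID and the _Token_ID suffix come from the script:

import re

def get_id_counter(xml_id):
    # Same pattern as in the script: s<counter>.<index> with an optional sub-index.
    return int(re.search(r'^s(\d+)\.\d+(?:\.\d+)?$', xml_id).group(1))

# Hypothetical row as produced by csv.DictReader(csv_file, delimiter='\t').
row = {'Structure_ID': '17', 'C1_Token_ID': 's42.1', 'C2_Token_ID': 's42.2', 'C3_Token_ID': ''}
token_ids = [row[key] for key in sorted(row.keys()) if key.endswith('_Token_ID') and len(row[key]) > 0]
print(get_id_counter(token_ids[0]), len(token_ids))  # 42 2: unit counter and component count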
222  package/structure_assignment/assign_other_structures.py  Normal file
@@ -0,0 +1,222 @@
import argparse

import lxml.etree as lxml

from conversion_utils.jos_msds_and_properties import Converter, Msd, Properties

CONVERTER = Converter()

# Morphological features recorded per category when a structure has several components.
FEATURE_CATEGORY_MAP = {'noun':{'case'},
                        'adjective':{'case'},
                        'numeral':{'case', 'form'},
                        'verb':{'type'}}

# Single-letter codes appended to component labels for selected feature values.
LABEL_MAP = {'form':{'digit':'a', 'roman':'r', 'letter':'b'},
             'case':{'nominative':'1', 'genitive':'2', 'dative':'3', 'accusative':'4', 'locative':'5', 'instrumental':'6'},
             'type':{'main':'g', 'auxiliary':'p', 'reflexive':'p'}}

# Keyed by (space to the left, space to the right) of a punctuation token.
CONTACT_MAP = {(False, False): 'both', (False, True): 'left', (True, False): 'right', (True, True): 'neither'}

DEPENDENCY_ROOT_SYMBOL = '#'
DEPENDENCY_ROOT_LABEL = 'modra'


class SyntacticStructure:

    def __init__(self):
        self.id = None
        self.type = None
        self.components = []
        self.dependencies = []

    def set_components(self, component_maps):
        self.components.clear()
        for component_map in component_maps:
            contact = component_map.pop('contact')
            label = self._generate_label(component_map)
            self.components.append({'features': component_map, 'contact':contact, 'label':label})

    def set_dependencies(self, dependency_tuples):
        self.dependencies.clear()
        if (len(dependency_tuples) > 1):
            for (from_index, to_index, label) in dependency_tuples:
                self.dependencies.append({'from':from_index, 'to':to_index, 'label':label})

    def set_example(self, parsed_unit):
        elements = xpath_find(parsed_unit, 'tei:w|tei:pc')
        self.example = ''.join([e.text if e.get('join') == 'right' else e.text + ' ' for e in elements]).strip()

    def _generate_label(self, feature_map):
        category = feature_map['POS']
        label = CONVERTER.specifications.find_category_by_name(category, 'en').codes.get('sl').lower()
        if (self.type == 'other'):
            for name in ['form', 'case', 'type']:
                try:
                    label += LABEL_MAP[name][feature_map[name]]
                except KeyError:
                    pass
        return label

    def __str__(self):
        return str(self.components) + str(self.dependencies)

    def __eq__(self, other):
        return other.components == self.components and other.dependencies == self.dependencies


def assign(unit_input_file_name, structure_old_file_name, unit_output_file_name, structure_new_file_name):

    parser = lxml.XMLParser(remove_blank_text=True)
    unit_tree = lxml.parse(unit_input_file_name, parser=parser)
    unit_root = unit_tree.getroot()
    structure_tree = lxml.parse(structure_old_file_name, parser=parser)
    structure_root = structure_tree.getroot()

    find_and_assign_structures(unit_root, structure_root)

    unit_tree.write(unit_output_file_name, encoding='UTF-8', pretty_print=True)
    structure_tree.write(structure_new_file_name, encoding='UTF-8', pretty_print=True)

def find_and_assign_structures(unit_root, structure_root): # TODO: check and test
    syntactic_structures = parse_xml_structures(structure_root)
    last_id = get_max_id(structure_root)
    for unit in xpath_find(unit_root, 'tei:text/tei:body/tei:p/tei:s'):
        if (unit.get('structure_id') is None):
            # Build a candidate structure for the unit and reuse an existing one if it matches.
            new_syntactic_structure = SyntacticStructure()
            new_syntactic_structure.type = 'single' if len(xpath_find(unit, 'tei:w|tei:pc')) == 1 else 'other'
            component_maps = make_component_map(unit)
            new_syntactic_structure.set_components(component_maps)
            dependency_tuples = make_dependency_tuples(unit)
            new_syntactic_structure.set_dependencies(dependency_tuples)
            syntactic_structure = next((ss for ss in syntactic_structures if ss == new_syntactic_structure), None)
            if (syntactic_structure is None):
                syntactic_structure = new_syntactic_structure
                last_id += 1
                syntactic_structure.id = last_id
                syntactic_structure.set_example(unit)
                syntactic_structures.append(syntactic_structure)
                structure_element = create_xml_structure(syntactic_structure)
                structure_root.append(structure_element)
            unit.set('structure_id', str(syntactic_structure.id))

def parse_xml_structures(root):
    syntactic_structures = []
    for structure_element in root.xpath('syntactic_structure[@type!="collocation"]'):
        syntactic_structure = SyntacticStructure()
        syntactic_structure.id = int(structure_element.get('id'))
        syntactic_structure.type = structure_element.get('type')
        dependency_tuples = []
        for dependency in structure_element.xpath('system[@type="JOS"]/dependencies/dependency'):
            dependency_tuples.append((dependency.get('from'), dependency.get('to'), dependency.get('label')))
        syntactic_structure.set_dependencies(dependency_tuples)
        component_maps = []
        for component in structure_element.xpath('system[@type="JOS"]/definition/component'):
            morphology_features = component.xpath('restriction[@type="morphology"]/feature')
            component_map = {}
            for feature in morphology_features:
                key = feature.attrib.keys()[0]
                if (key == 'POS' or syntactic_structure.type != 'single'):
                    component_map[key] = feature.get(key)
            try:
                contact = component.xpath('restriction[@type="space"]/feature')[0].get('contact')
            except:
                contact = None
            component_map['contact'] = contact
            component_maps.append(component_map)
        syntactic_structure.set_components(component_maps)
        syntactic_structures.append(syntactic_structure)
    return syntactic_structures

def xpath_find(element, expression):
    return element.xpath(expression, namespaces={'tei':'http://www.tei-c.org/ns/1.0'})

def make_component_map(unit):
    component_maps = []
    tokens = xpath_find(unit, 'tei:w|tei:pc')
    for (index, token) in enumerate(tokens, start=1):
        component_map = {}
        msd = Msd(token.get('ana')[len('MTE:'):], 'sl')
        properties = CONVERTER.msd_to_properties(msd, 'en')
        component_map['POS'] = properties.category
        component_map['contact'] = get_contact(token, properties.category)
        if (len(tokens) > 1):
            if (msd.code[0] == 'Z'):
                # Specific pronoun MSDs are treated as reflexive clitics.
                if (msd.code in {'Zp------k', 'Zp---d--k'}):
                    component_map['type'] = 'reflexive'
                    component_map['clitic'] = 'yes'
            elif (properties.category in FEATURE_CATEGORY_MAP):
                feature_map = {**properties.lexeme_feature_map, **properties.form_feature_map}
                for name in FEATURE_CATEGORY_MAP[properties.category]:
                    if (name in feature_map):
                        component_map[name] = feature_map[name]
        component_maps.append(component_map)
    return component_maps

def make_dependency_tuples(unit):
    dependency_tuples = []
    dependencies = xpath_find(unit, 'tei:linkGrp[@type="JOS-SYN"]/tei:link')
    # Sort links by the index of the dependent token.
    dependencies.sort(key=lambda dep: int(dep.get('target')[dep.get('target').rindex('.')+1:]))
    for dependency in dependencies:
        relation = dependency.get('ana')[len('jos-syn:'):]
        [from_index, to_index] = [string[string.rindex('.')+1:] if string.count('.') == 2 else '#' for string in dependency.get('target').split(' ')]
        dependency_tuples.append((from_index, to_index, relation))
    return dependency_tuples

def has_space(token):
    return token.get('join') != 'right'

def get_contact(token, category):
    # Spacing restrictions are only recorded for punctuation tokens.
    if (category != 'punctuation'):
        contact = None
    else:
        left_space = token.getprevious() is not None and has_space(token.getprevious())
        right_space = has_space(token)
        contact = CONTACT_MAP[(left_space, right_space)]
    return contact

def get_max_id(root):
    return max([int(ss.get('id')) for ss in root.xpath('syntactic_structure')])

def create_xml_structure(syntactic_structure):
    structure_element = lxml.Element('syntactic_structure')
    structure_element.set('tempId', str(syntactic_structure.id))
    comment = lxml.Comment(' example: ' + syntactic_structure.example)
    structure_element.append(comment)
    structure_element.set('type', 'other')
    system = lxml.SubElement(structure_element, 'system')
    system.set('type', 'JOS')
    components = lxml.SubElement(system, 'components')
    components.set('order', 'fixed')
    for (index, component_map) in enumerate(syntactic_structure.components, start=1):
        component = lxml.SubElement(components, 'component')
        component.set('cid', str(index))
        component.set('type', 'core')
        component.set('label', component_map['label'])
    structure_element.set('label', '-'.join([c.get('label') for c in components]))
    dependencies = lxml.SubElement(system, 'dependencies')
    for dependency_map in syntactic_structure.dependencies:
        dependency = lxml.SubElement(dependencies, 'dependency')
        [from_index, label, to_index] = [dependency_map[key] for key in ['from', 'label', 'to']]
        if (label == DEPENDENCY_ROOT_LABEL):
            from_index = DEPENDENCY_ROOT_SYMBOL
        dependency.set('from', from_index)
        dependency.set('label', label)
        dependency.set('to', to_index)
    definition = lxml.SubElement(system, 'definition')
    for (index, component_map) in enumerate(syntactic_structure.components, start=1):
        component = lxml.SubElement(definition, 'component')
        component.set('cid', str(index))
        restriction = lxml.SubElement(component, 'restriction')
        restriction.set('type', 'morphology')
        for key in ['POS', 'type', 'form', 'case', 'clitic']:
            if (key in component_map['features']):
                feature = lxml.SubElement(restriction, 'feature')
                feature.set(key, component_map['features'][key])
        if (component_map['contact'] is not None):
            space_restriction = lxml.SubElement(component, 'restriction')
            space_restriction.set('type', 'space')
            space_feature = lxml.SubElement(space_restriction, 'feature')
            space_feature.set('contact', component_map['contact'])
    return structure_element


if (__name__ == '__main__'):

    arg_parser = argparse.ArgumentParser(description='Assign structure ids to single-component structures.')
    arg_parser.add_argument('-infile', type=str, help='Input TEI')
    arg_parser.add_argument('-instruct', type=str, help='Structures input file')
    arg_parser.add_argument('-outfile', type=str, help='Output TEI')
    arg_parser.add_argument('-outstruct', type=str, help='Structures output file')

    arguments = arg_parser.parse_args()
    assign(arguments.infile, arguments.instruct, arguments.outfile, arguments.outstruct)
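The spacing restriction around CONTACT_MAP is the least obvious part of this file: the key is the pair (space to the left, space to the right) of a punctuation token, and the value names the side(s) the token touches. A minimal standalone sketch on an inline TEI-like fragment; the sample words are placeholders, and only the join='right' convention and the map itself are taken from the code above:

import lxml.etree as lxml

CONTACT_MAP = {(False, False): 'both', (False, True): 'left', (True, False): 'right', (True, True): 'neither'}

def has_space(token):
    return token.get('join') != 'right'

# A comma glued to the preceding word but followed by a space.
fragment = lxml.fromstring('<s><w join="right">beseda</w><pc>,</pc><w>naprej</w></s>')
comma = fragment[1]
left_space = comma.getprevious() is not None and has_space(comma.getprevious())
right_space = has_space(comma)
print(CONTACT_MAP[(left_space, right_space)])  # 'left': the comma is in contact on its left side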
@@ -3,7 +3,6 @@ FILE_MAP = {'strings-list': 'strings.txt',
            'obeliks-tweaked': 'obeliks_tweaked.conllu',
            'classla-parsed': 'classla_raw.conllu',
            'classla-translated': 'classla_translated.conllu',
            'dict': 'dict.xml',
            'structure-schema': 'structures.xsd',
            'tei-initial': 'tei_initial.xml',
            'tei-ids-collocation': 'tei_ids_collocations.xml',
@@ -8,11 +8,11 @@ import classla

from structure_assignment.constants import *
from structure_assignment.tweak_conllu import tweak as tweak_conllu
from nova_slovnica.translate_jos import translate as translate_jos
from nova_slovnica.conllu_to_xml import convert_file as conllu_to_tei
from nova_slovnica.assign_collocation_structures import assign as assign_collocation_structures
from nova_slovnica.assign_other_structures import assign as assign_other_structures
from nova_slovnica.tei_to_dictionary import convert as tei_to_dictionary
from conversion_utils.translate_conllu_jos import translate as translate_jos
from conversion_utils.conllu_to_tei import convert_file as conllu_to_tei
from structure_assignment.assign_collocation_structures import assign as assign_collocation_structures
from structure_assignment.assign_other_structures import assign as assign_other_structures
from conversion_utils.tei_to_dictionary import convert as tei_to_dictionary

class Runner:

@@ -132,9 +132,8 @@ class Pipeline:
    def do_translate_jos(self):
        print('Translating JOS ...')
        input_file_name = self.file_map['classla-parsed']
        dictionary_file_name = self.file_map['dict']
        output_file_name = self.file_map['classla-translated']
        translate_jos(input_file_name, dictionary_file_name, output_file_name)
        translate_jos(input_file_name, output_file_name)

    def do_conllu_to_tei(self):
        print('Converting to TEI ...')
@@ -7,9 +7,7 @@ mkdir lib resources

## get dependencies
cd lib
git clone git@gitea.cjvt.si:redmine_projects/nova_slovnica.git
git clone git@gitea.cjvt.si:ozbolt/luscenje_struktur.git
git clone git@gitea.cjvt.si:generic/data_admin.git
git clone git@gitea.cjvt.si:generic/xml_schemas.git
git clone git@gitea.cjvt.si:generic/conversion_utils.git
cd ..

@@ -23,7 +21,6 @@ pip install psycopg2cffi
pip install sqlalchemy
pip install classla
python -c "import classla; classla.download('sl', type='standard_jos', dir='resources/classla')"
pip install lib/nova_slovnica/python/package/
pip install lib/luscenje_struktur/
pip install lib/conversion_utils/
pip install package/

@@ -32,8 +29,7 @@ deactivate
## put needed resources in place
cd resources
ln -s ../lib/luscenje_struktur/wani.py .
ln -s ../lib/nova_slovnica/resources/dict.xml .
ln -s ../lib/data_admin/resources/structures.xsd .
ln -s ../lib/xml_schemas/resources/schema/structures.xsd .
ln -s ../lib/xml_schemas/resources/schema/inventory.xsd .
ln -s ../lib/xml_schemas/resources/schema/monolingual_dictionaries.xsd .
cd ..
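For reference, the two new scripts can also be driven directly from Python rather than via their command-line interfaces; a minimal sketch, assuming the package above is installed (pip install package/) and using placeholder file names in place of the pipeline's real outputs:

from structure_assignment.assign_collocation_structures import assign as assign_collocation_structures
from structure_assignment.assign_other_structures import assign as assign_other_structures

# File names are illustrative; substitute the paths the pipeline's FILE_MAP actually produces.
assign_collocation_structures('tei_initial.xml', 'collocation_structures.tsv', 'tei_ids_collocations.xml')
assign_other_structures('tei_ids_collocations.xml', 'structures_old.xml', 'tei_ids_all.xml', 'structures_new.xml')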