Moved some files to this repository and between other repositories

This commit is contained in:
Cyprian Laskowski 2021-12-07 18:49:33 +01:00
parent 6cf298855e
commit a83686b9eb
5 changed files with 286 additions and 13 deletions

View File

@@ -0,0 +1,57 @@
import argparse
import csv
import codecs
import re

import lxml.etree as lxml


def xpath_find(element, expression):
    return element.xpath(expression, namespaces={'tei':'http://www.tei-c.org/ns/1.0'})

def get_xml_id(element):
    return element.get('{http://www.w3.org/XML/1998/namespace}id')

def get_id_counter(xml_id):
    return int(re.search(r'^s(\d+)\.\d+(?:\.\d+)?$', xml_id).group(1))

def assign(input_file_name, csv_file_name, output_file_name):

    # Map each unit index to the (structure id, component count) pairs listed in the CSV
    csv_file = codecs.open(csv_file_name, 'r')
    reader = csv.DictReader(csv_file, delimiter='\t')
    mwe_map = {}
    for row in reader:
        structure_id = row['Structure_ID']
        token_ids = [row[key] for key in sorted(row.keys()) if key.endswith('_Token_ID') and len(row[key]) > 0]
        index = get_id_counter(token_ids[0])
        component_count = len(token_ids)
        if (index not in mwe_map):
            mwe_map[index] = set()
        mwe_map[index].add((structure_id, component_count))
    csv_file.close()

    # Assign a structure id to each TEI unit whose token count matches exactly one candidate structure
    xml_tree = lxml.parse(input_file_name)
    xml_root = xml_tree.getroot()
    mwes_xml = xpath_find(xml_root, './/tei:s')
    for mwe_xml in mwes_xml:
        index = get_id_counter(get_xml_id(mwe_xml))
        mwe_string = ''.join([e.text if e.get('join') == 'right' else e.text + ' ' for e in xpath_find(mwe_xml, 'tei:w|tei:pc')]).strip()
        token_count = len(xpath_find(mwe_xml, 'tei:w|tei:pc'))
        structure_ids = set()
        if (index in mwe_map):
            for (structure_id, component_count) in mwe_map[index]:
                if (component_count == token_count):
                    structure_ids.add(int(structure_id))
        if (len(structure_ids) > 1):
            print('MWE #' + str(index) + ': ' + mwe_string + ': MULTIPLE_COLLOCATION_STRUCTURES (' + str(structure_ids) + ')')
        elif (len(structure_ids) == 1):
            mwe_xml.set('structure_id', str(list(structure_ids)[0]))
    xml_tree.write(output_file_name, encoding='UTF-8', pretty_print=True)


if (__name__ == '__main__'):
    arg_parser = argparse.ArgumentParser(description='Assign collocation structure ids to parsed lexical units.')
    arg_parser.add_argument('-infile', type=str, help='Input TEI')
    arg_parser.add_argument('-csv', type=str, help='CSV file')
    arg_parser.add_argument('-outfile', type=str, help='Output TEI')
    arguments = arg_parser.parse_args()
    assign(arguments.infile, arguments.csv, arguments.outfile)
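
A minimal usage sketch of this script as a library call (the module path matches the pipeline import further below; 'tei_initial.xml' and 'tei_ids_collocations.xml' come from the pipeline's FILE_MAP, while the CSV name is a hypothetical placeholder):

    from structure_assignment.assign_collocation_structures import assign
    # input TEI, tab-separated collocation structures file, output TEI
    assign('tei_initial.xml', 'collocation_structures.tsv', 'tei_ids_collocations.xml')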

View File

@@ -0,0 +1,222 @@
import argparse

import lxml.etree as lxml

from conversion_utils.jos_msds_and_properties import Converter, Msd, Properties

CONVERTER = Converter()

FEATURE_CATEGORY_MAP = {'noun':{'case'},
                        'adjective':{'case'},
                        'numeral':{'case', 'form'},
                        'verb':{'type'}}
LABEL_MAP = {'form':{'digit':'a', 'roman':'r', 'letter':'b'},
             'case':{'nominative':'1', 'genitive':'2', 'dative':'3', 'accusative':'4', 'locative':'5', 'instrumental':'6'},
             'type':{'main':'g', 'auxiliary':'p', 'reflexive':'p'}}
CONTACT_MAP = {(False, False): 'both', (False, True): 'left', (True, False): 'right', (True, True): 'neither'}
DEPENDENCY_ROOT_SYMBOL = '#'
DEPENDENCY_ROOT_LABEL = 'modra'


class SyntacticStructure:

    def __init__(self):
        self.id = None
        self.type = None
        self.components = []
        self.dependencies = []

    def set_components(self, component_maps):
        self.components.clear()
        for component_map in component_maps:
            contact = component_map.pop('contact')
            label = self._generate_label(component_map)
            self.components.append({'features': component_map, 'contact':contact, 'label':label})

    def set_dependencies(self, dependency_tuples):
        self.dependencies.clear()
        if (len(dependency_tuples) > 1):
            for (from_index, to_index, label) in dependency_tuples:
                self.dependencies.append({'from':from_index, 'to':to_index, 'label':label})

    def set_example(self, parsed_unit):
        elements = xpath_find(parsed_unit, 'tei:w|tei:pc')
        self.example = ''.join([e.text if e.get('join') == 'right' else e.text + ' ' for e in elements]).strip()

    def _generate_label(self, feature_map):
        category = feature_map['POS']
        label = CONVERTER.specifications.find_category_by_name(category, 'en').codes.get('sl').lower()
        if (self.type == 'other'):
            for name in ['form', 'case', 'type']:
                try:
                    label += LABEL_MAP[name][feature_map[name]]
                except KeyError:
                    pass
        return label

    def __str__(self):
        return str(self.components) + str(self.dependencies)

    def __eq__(self, other):
        return other.components == self.components and other.dependencies == self.dependencies


def assign(unit_input_file_name, structure_old_file_name, unit_output_file_name, structure_new_file_name):
    parser = lxml.XMLParser(remove_blank_text=True)
    unit_tree = lxml.parse(unit_input_file_name, parser=parser)
    unit_root = unit_tree.getroot()
    structure_tree = lxml.parse(structure_old_file_name, parser=parser)
    structure_root = structure_tree.getroot()
    find_and_assign_structures(unit_root, structure_root)
    unit_tree.write(unit_output_file_name, encoding='UTF-8', pretty_print=True)
    structure_tree.write(structure_new_file_name, encoding='UTF-8', pretty_print=True)

def find_and_assign_structures(unit_root, structure_root): # TODO: check and test
    syntactic_structures = parse_xml_structures(structure_root)
    last_id = get_max_id(structure_root)
    for unit in xpath_find(unit_root, 'tei:text/tei:body/tei:p/tei:s'):
        if (unit.get('structure_id') is None):
            # Build a candidate structure from the parsed unit and reuse an equivalent existing one if present
            new_syntactic_structure = SyntacticStructure()
            new_syntactic_structure.type = 'single' if len(xpath_find(unit, 'tei:w|tei:pc')) == 1 else 'other'
            component_maps = make_component_map(unit)
            new_syntactic_structure.set_components(component_maps)
            dependency_tuples = make_dependency_tuples(unit)
            new_syntactic_structure.set_dependencies(dependency_tuples)
            syntactic_structure = next((ss for ss in syntactic_structures if ss == new_syntactic_structure), None)
            if (syntactic_structure is None):
                # No equivalent structure yet: register a new one and add it to the inventory
                syntactic_structure = new_syntactic_structure
                last_id += 1
                syntactic_structure.id = last_id
                syntactic_structure.set_example(unit)
                syntactic_structures.append(syntactic_structure)
                structure_element = create_xml_structure(syntactic_structure)
                structure_root.append(structure_element)
            unit.set('structure_id', str(syntactic_structure.id))

def parse_xml_structures(root):
    # Only non-collocation structures from the existing inventory are candidates for reuse
    syntactic_structures = []
    for structure_element in root.xpath('syntactic_structure[@type!="collocation"]'):
        syntactic_structure = SyntacticStructure()
        syntactic_structure.id = int(structure_element.get('id'))
        syntactic_structure.type = structure_element.get('type')
        dependency_tuples = []
        for dependency in structure_element.xpath('system[@type="JOS"]/dependencies/dependency'):
            dependency_tuples.append((dependency.get('from'), dependency.get('to'), dependency.get('label')))
        syntactic_structure.set_dependencies(dependency_tuples)
        component_maps = []
        for component in structure_element.xpath('system[@type="JOS"]/definition/component'):
            morphology_features = component.xpath('restriction[@type="morphology"]/feature')
            component_map = {}
            for feature in morphology_features:
                key = feature.attrib.keys()[0]
                if (key == 'POS' or syntactic_structure.type != 'single'):
                    component_map[key] = feature.get(key)
            try:
                contact = component.xpath('restriction[@type="space"]/feature')[0].get('contact')
            except IndexError:
                contact = None
            component_map['contact'] = contact
            component_maps.append(component_map)
        syntactic_structure.set_components(component_maps)
        syntactic_structures.append(syntactic_structure)
    return syntactic_structures

def xpath_find(element, expression):
    return element.xpath(expression, namespaces={'tei':'http://www.tei-c.org/ns/1.0'})

def make_component_map(unit):
    component_maps = []
    tokens = xpath_find(unit, 'tei:w|tei:pc')
    for (index, token) in enumerate(tokens, start=1):
        component_map = {}
        msd = Msd(token.get('ana')[len('MTE:'):], 'sl')
        properties = CONVERTER.msd_to_properties(msd, 'en')
        component_map['POS'] = properties.category
        component_map['contact'] = get_contact(token, properties.category)
        if (len(tokens) > 1):
            if (msd.code[0] == 'Z'):
                # MSDs treated as reflexive clitic pronouns
                if (msd.code in {'Zp------k', 'Zp---d--k'}):
                    component_map['type'] = 'reflexive'
                    component_map['clitic'] = 'yes'
            elif (properties.category in FEATURE_CATEGORY_MAP):
                feature_map = {**properties.lexeme_feature_map, **properties.form_feature_map}
                for name in FEATURE_CATEGORY_MAP[properties.category]:
                    if (name in feature_map):
                        component_map[name] = feature_map[name]
        component_maps.append(component_map)
    return component_maps

def make_dependency_tuples(unit):
    dependency_tuples = []
    dependencies = xpath_find(unit, 'tei:linkGrp[@type="JOS-SYN"]/tei:link')
    dependencies.sort(key=lambda dep: int(dep.get('target')[dep.get('target').rindex('.')+1:]))
    for dependency in dependencies:
        relation = dependency.get('ana')[len('jos-syn:'):]
        [from_index, to_index] = [string[string.rindex('.')+1:] if string.count('.') == 2 else '#' for string in dependency.get('target').split(' ')]
        dependency_tuples.append((from_index, to_index, relation))
    return dependency_tuples

def has_space(token):
    return token.get('join') != 'right'

def get_contact(token, category):
    if (category != 'punctuation'):
        contact = None
    else:
        left_space = token.getprevious() is not None and has_space(token.getprevious())
        right_space = has_space(token)
        contact = CONTACT_MAP[(left_space, right_space)]
    return contact

def get_max_id(root):
    return max([int(ss.get('id')) for ss in root.xpath('syntactic_structure')])

def create_xml_structure(syntactic_structure):
    structure_element = lxml.Element('syntactic_structure')
    structure_element.set('tempId', str(syntactic_structure.id))
    comment = lxml.Comment(' example: ' + syntactic_structure.example)
    structure_element.append(comment)
    structure_element.set('type', 'other')
    system = lxml.SubElement(structure_element, 'system')
    system.set('type', 'JOS')
    components = lxml.SubElement(system, 'components')
    components.set('order', 'fixed')
    for (index, component_map) in enumerate(syntactic_structure.components, start=1):
        component = lxml.SubElement(components, 'component')
        component.set('cid', str(index))
        component.set('type', 'core')
        component.set('label', component_map['label'])
    structure_element.set('label', '-'.join([c.get('label') for c in components]))
    dependencies = lxml.SubElement(system, 'dependencies')
    for dependency_map in syntactic_structure.dependencies:
        dependency = lxml.SubElement(dependencies, 'dependency')
        [from_index, label, to_index] = [dependency_map[key] for key in ['from', 'label', 'to']]
        # The root dependency points from the dedicated root symbol rather than a component index
        if (label == DEPENDENCY_ROOT_LABEL):
            from_index = DEPENDENCY_ROOT_SYMBOL
        dependency.set('from', from_index)
        dependency.set('label', label)
        dependency.set('to', to_index)
    definition = lxml.SubElement(system, 'definition')
    for (index, component_map) in enumerate(syntactic_structure.components, start=1):
        component = lxml.SubElement(definition, 'component')
        component.set('cid', str(index))
        restriction = lxml.SubElement(component, 'restriction')
        restriction.set('type', 'morphology')
        for key in ['POS', 'type', 'form', 'case', 'clitic']:
            if (key in component_map['features']):
                feature = lxml.SubElement(restriction, 'feature')
                feature.set(key, component_map['features'][key])
        if (component_map['contact'] is not None):
            space_restriction = lxml.SubElement(component, 'restriction')
            space_restriction.set('type', 'space')
            space_feature = lxml.SubElement(space_restriction, 'feature')
            space_feature.set('contact', component_map['contact'])
    return structure_element


if (__name__ == '__main__'):
    arg_parser = argparse.ArgumentParser(description='Assign structure ids to single-component structures.')
    arg_parser.add_argument('-infile', type=str, help='Input TEI')
    arg_parser.add_argument('-instruct', type=str, help='Structures input file')
    arg_parser.add_argument('-outfile', type=str, help='Output TEI')
    arg_parser.add_argument('-outstruct', type=str, help='Structures output file')
    arguments = arg_parser.parse_args()
    assign(arguments.infile, arguments.instruct, arguments.outfile, arguments.outstruct)
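
A comparable usage sketch for this script (the module path matches the pipeline import below; all file names here are hypothetical placeholders):

    from structure_assignment.assign_other_structures import assign
    # input TEI, existing structure inventory, output TEI, updated structure inventory
    assign('tei_ids_collocations.xml', 'structures_old.xml', 'tei_ids_all.xml', 'structures_new.xml')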

View File

@@ -3,7 +3,6 @@ FILE_MAP = {'strings-list': 'strings.txt',
            'obeliks-tweaked': 'obeliks_tweaked.conllu',
            'classla-parsed': 'classla_raw.conllu',
            'classla-translated': 'classla_translated.conllu',
            'dict': 'dict.xml',
            'structure-schema': 'structures.xsd',
            'tei-initial': 'tei_initial.xml',
            'tei-ids-collocation': 'tei_ids_collocations.xml',

View File

@@ -8,11 +8,11 @@ import classla
from structure_assignment.constants import *
from structure_assignment.tweak_conllu import tweak as tweak_conllu
from nova_slovnica.translate_jos import translate as translate_jos
from nova_slovnica.conllu_to_xml import convert_file as conllu_to_tei
from nova_slovnica.assign_collocation_structures import assign as assign_collocation_structures
from nova_slovnica.assign_other_structures import assign as assign_other_structures
from nova_slovnica.tei_to_dictionary import convert as tei_to_dictionary
from conversion_utils.translate_conllu_jos import translate as translate_jos
from conversion_utils.conllu_to_tei import convert_file as conllu_to_tei
from structure_assignment.assign_collocation_structures import assign as assign_collocation_structures
from structure_assignment.assign_other_structures import assign as assign_other_structures
from conversion_utils.tei_to_dictionary import convert as tei_to_dictionary
class Runner:
@@ -132,9 +132,8 @@ class Pipeline:
    def do_translate_jos(self):
        print('Translating JOS ...')
        input_file_name = self.file_map['classla-parsed']
        dictionary_file_name = self.file_map['dict']
        output_file_name = self.file_map['classla-translated']
        translate_jos(input_file_name, dictionary_file_name, output_file_name)
        translate_jos(input_file_name, output_file_name)

    def do_conllu_to_tei(self):
        print('Converting to TEI ...')

View File

@@ -7,9 +7,7 @@ mkdir lib resources
## get dependencies
cd lib
git clone git@gitea.cjvt.si:redmine_projects/nova_slovnica.git
git clone git@gitea.cjvt.si:ozbolt/luscenje_struktur.git
git clone git@gitea.cjvt.si:generic/data_admin.git
git clone git@gitea.cjvt.si:generic/xml_schemas.git
git clone git@gitea.cjvt.si:generic/conversion_utils.git
cd ..
@@ -23,7 +21,6 @@ pip install psycopg2cffi
pip install sqlalchemy
pip install classla
python -c "import classla; classla.download('sl', type='standard_jos', dir='resources/classla')"
pip install lib/nova_slovnica/python/package/
pip install lib/luscenje_struktur/
pip install lib/conversion_utils/
pip install package/
@@ -32,8 +29,7 @@ deactivate
## put needed resources in place
cd resources
ln -s ../lib/luscenje_struktur/wani.py .
ln -s ../lib/nova_slovnica/resources/dict.xml .
ln -s ../lib/data_admin/resources/structures.xsd .
ln -s ../lib/xml_schemas/resources/schema/structures.xsd .
ln -s ../lib/xml_schemas/resources/schema/inventory.xsd .
ln -s ../lib/xml_schemas/resources/schema/monolingual_dictionaries.xsd .
cd ..