Redmine #1461: switched from luscenje_struktur to cordex

parent e9eff0658f
commit 24824cd459
@@ -3,7 +3,6 @@ import argparse
 from structure_assignment.pipeline import Runner
 
 classla_directory = '../resources/classla'
-wani_file_name = '../resources/wani.py' # TODO: remove once luscenje_struktur incorporates wani in package
 
 if (__name__ == '__main__'):
 
@@ -22,7 +21,7 @@ if (__name__ == '__main__'):
     output_structure_file_name = arguments.outstructs
 
     nlp_needed = mode in {'strings_to_parse', 'strings_to_dictionary', 'all'}
-    runner = Runner(nlp_needed, classla_directory, wani_file_name)
+    runner = Runner(nlp_needed, classla_directory)
     if (mode == 'strings_to_parse'):
         runner.strings_to_parse(input_file_name, output_file_name)
     elif (mode == 'strings_to_dictionary'):
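Note: the Runner constructor drops the wani_file_name parameter. A minimal usage sketch of the updated entry point, assuming hypothetical input and output paths (only the Runner and strings_to_parse calls come from the code above):

# Minimal sketch, assuming hypothetical file paths; nlp_needed is passed as True here.
from structure_assignment.pipeline import Runner

classla_directory = '../resources/classla'
runner = Runner(True, classla_directory)               # wani_file_name argument no longer exists
runner.strings_to_parse('strings.txt', 'parsed.xml')   # hypothetical input/output file names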
setup.py
@@ -10,8 +10,6 @@ setup(name='structure_assignment',
       install_requires=['lxml',
                         'classla',
                         'conversion_utils @ git+https://gitea.cjvt.si/generic/conversion_utils.git',
-                        'luscenje_struktur_loc @ git+https://gitea.cjvt.si/ozbolt/luscenje_struktur.git@i2198', # TODO: switch to master once luscenje_struktur's i2198 is merged into master
-                        'psycopg2cffi', # TODO: remove once luscenje_struktur takes care of it
-                        'sqlalchemy', # TODO: remove once luscenje_struktur takes care of it
+                        'cordex @ git+https://github.com/clarinsi/cordex.git',
                        ],
       zip_safe=True)
@@ -16,7 +16,7 @@ def get_id_counter(xml_id):
 def assign(input_file_name, csv_file_name, output_file_name):
 
     csv_file = codecs.open(csv_file_name, 'r')
-    reader = csv.DictReader(csv_file, delimiter='\t')
+    reader = csv.DictReader(csv_file, delimiter=',')
     mwe_map = {}
     for row in reader:
         structure_id = row['Structure_ID']
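Note: assign() now expects a comma-separated rather than tab-separated structure CSV. A small sketch of reading such a file, assuming a hypothetical file name (Structure_ID is the column used in the code above):

# Sketch of the expected CSV input after this change (hypothetical path; only Structure_ID
# is taken from the code above).
import codecs
import csv

csv_file = codecs.open('collocations.csv', 'r')            # hypothetical path
reader = csv.DictReader(csv_file, delimiter=',')            # comma-separated, as in the new code
structure_ids = [row['Structure_ID'] for row in reader]     # column consumed by assign()
csv_file.close()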
@@ -97,11 +97,11 @@ def parse_xml_structures(root):
         syntactic_structure.id = int(structure_element.get('id'))
         syntactic_structure.type = structure_element.get('type')
         dependency_tuples = []
-        for dependency in structure_element.xpath('system[@type="JOS"]/dependencies/dependency'):
+        for dependency in structure_element.xpath('dependencies/dependency'):
             dependency_tuples.append((dependency.get('from'), dependency.get('to'), dependency.get('label')))
         syntactic_structure.set_dependencies(dependency_tuples)
         component_maps = []
-        for component in structure_element.xpath('system[@type="JOS"]/definition/component'):
+        for component in structure_element.xpath('definition/component'):
             morphology_features = component.xpath('restriction[@type="morphology"]/feature')
             component_map = {}
             for feature in morphology_features:
@@ -174,9 +174,7 @@ def create_xml_structure(syntactic_structure):
     comment = lxml.Comment(' example: ' + syntactic_structure.example)
     structure_element.append(comment)
     structure_element.set('type', 'other')
-    system = lxml.SubElement(structure_element, 'system')
-    system.set('type', 'JOS')
-    components = lxml.SubElement(system, 'components')
+    components = lxml.SubElement(structure_element, 'components')
     components.set('order', 'fixed')
     for (index, component_map) in enumerate(syntactic_structure.components, start=1):
         component = lxml.SubElement(components, 'component')
@@ -184,7 +182,7 @@ def create_xml_structure(syntactic_structure):
         component.set('type', 'core')
         component.set('label', component_map['label'])
     structure_element.set('label', '-'.join([c.get('label') for c in components]))
-    dependencies = lxml.SubElement(system, 'dependencies')
+    dependencies = lxml.SubElement(structure_element, 'dependencies')
     for dependency_map in syntactic_structure.dependencies:
         dependency = lxml.SubElement(dependencies, 'dependency')
         [from_index, label, to_index] = [dependency_map[key] for key in ['from', 'label', 'to']]
@@ -193,7 +191,7 @@ def create_xml_structure(syntactic_structure):
         dependency.set('from', from_index)
         dependency.set('label', label)
         dependency.set('to', to_index)
-    definition = lxml.SubElement(system, 'definition')
+    definition = lxml.SubElement(structure_element, 'definition')
     for (index, component_map) in enumerate(syntactic_structure.components, start=1):
         component = lxml.SubElement(definition, 'component')
         component.set('cid', str(index))
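Note: together with the parse_xml_structures hunk above, these changes remove the <system type="JOS"> wrapper, so <components>, <dependencies> and <definition> become direct children of the structure element. A rough sketch of the resulting nesting (the 'structure' tag name and the empty elements are illustrative only):

# Rough sketch of the new element nesting after this change (illustrative values only).
import lxml.etree as lxml

structure_element = lxml.Element('structure')                       # illustrative tag name
components = lxml.SubElement(structure_element, 'components')       # previously under <system type="JOS">
dependencies = lxml.SubElement(structure_element, 'dependencies')   # now a direct child as well
definition = lxml.SubElement(structure_element, 'definition')       # matches the xpath 'definition/component' above
print(lxml.tostring(structure_element).decode())
# -> <structure><components/><dependencies/><definition/></structure>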
@@ -1,9 +1,9 @@
 import shutil
 import tempfile
-from types import SimpleNamespace
 import lxml.etree as lxml
 
 import classla
+import cordex
 
 from structure_assignment.constants import *
 from structure_assignment.tweak_conllu import tweak as tweak_conllu
@@ -15,19 +15,11 @@ from conversion_utils.tei_to_dictionary import convert as tei_to_dictionary
 
 class Runner:
 
-    def __init__(self, nlp_needed, classla_directory=None, wani_file_name=None):
+    def __init__(self, nlp_needed, classla_directory=None):
         self.classla_directory = classla_directory
         if (nlp_needed):
             NLP_CONFIG_MAP['dir'] = classla_directory
             self.nlp = classla.Pipeline('sl', **NLP_CONFIG_MAP)
-            if (wani_file_name is not None):
-                self._provide_wani(wani_file_name)
-
-    def _provide_wani(self, wani_file_name): # TODO: remove once wani is incorporated into luscenje_struktur package
-        self.wani_directory = tempfile.mkdtemp()
-        shutil.copy(wani_file_name, self.wani_directory)
-        import sys
-        sys.path.insert(0, self.wani_directory)
 
     def run_all(self, input_file_name, output_file_name, input_structure_file_name, output_structure_file_name):
         pipeline = Pipeline(self.nlp)
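Note: with _provide_wani gone, constructing a Runner only needs the classla model directory. A minimal lifecycle sketch under that assumption (file names are hypothetical; the run_all signature is taken from the diff above):

# Minimal sketch, assuming hypothetical file names.
runner = Runner(True, '../resources/classla')   # no wani_file_name argument anymore
runner.run_all('strings.txt', 'dictionary.xml', 'structures_old.xml', 'structures_new.xml')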
@@ -93,7 +85,6 @@ class Runner:
         pipeline.do_tei_to_dictionary()
 
     def cleanup(self, pipeline):
-        shutil.rmtree(self.wani_directory, True)
         pipeline.cleanup()
 
 
@@ -149,38 +140,13 @@ class Pipeline:
     def do_find_collocation_structure_units(self):
         print('Finding units for existing collocation structures ...')
 
-        from wani import main as wani_main
-        namespace = SimpleNamespace()
-
+        structure_file_name = self.file_map['structures-old']
+        input_file_name = self.file_map['tei-initial']
+        output_file_name = self.file_map['collocations']
 
-        # relevant values
-        namespace.structures = self.file_map['structures-old']
-        namespace.input = [self.file_map['tei-initial']]
-        namespace.all = self.file_map['collocations']
-        namespace.skip_id_check = True
-        namespace.fixed_restriction_order = True
-        namespace.new_tei = True
-
-        # default values
-        namespace.sloleks_db = None
-        namespace.out = None
-        namespace.out_no_stat = None
-        namespace.stats = None
-        namespace.no_msd_translate = False
-        namespace.min_freq = 0
-        namespace.verbose = 'info'
-        namespace.count_files = False
-        namespace.multiple_output = False
-        namespace.load_sloleks = False
-        namespace.sort_by = -1
-        namespace.sort_reversed = False
-        namespace.db = None
-        namespace.collocation_sentence_map_dest = None
-        namespace.new_db = False
-        namespace.pc_tag = 'pc'
-        namespace.separator = '\t'
-        namespace.ignore_punctuations = False
-
-        wani_main(namespace)
+        extractor = cordex.Pipeline(structure_file_name, fixed_restriction_order=True, statistics=False, collocation_sentence_map_dest=None)
+        extraction = extractor(input_file_name)
+        extraction.write(output_file_name, token_output=True)
 
     def do_assign_collocation_structures(self):
         print('Assigning ids of collocation structures ...')
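Note: the argparse-style namespace handed to wani's main() is replaced by the cordex calls shown above. A condensed sketch of that sequence, assuming hypothetical file names (the keyword arguments are exactly those used in the new code):

# Condensed sketch of the cordex calls introduced above (hypothetical file names).
import cordex

extractor = cordex.Pipeline('structures.xml',
                            fixed_restriction_order=True,
                            statistics=False,
                            collocation_sentence_map_dest=None)
extraction = extractor('corpus.tei.xml')                   # run extraction over the TEI input
extraction.write('collocations.csv', token_output=True)    # write units for the collocation structures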