12 Commits

19 changed files with 2715 additions and 55 deletions

5
.gitignore vendored
View File

@@ -1,2 +1,7 @@
*.pyc *.pyc
venv venv
data
.idea
build
dist
*.egg-info

22
LICENSE.txt Normal file
View File

@@ -0,0 +1,22 @@
MIT License
Copyright (c) 2023 CLARIN.SI
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

View File

@@ -1,3 +1,6 @@
include conversion_utils/resources/jos_specifications.pickle include conversion_utils/resources/jos_specifications.pickle
include conversion_utils/resources/dict.xml include conversion_utils/resources/dict.xml
include conversion_utils/resources/structure_conversions.csv include conversion_utils/resources/structure_conversions.csv
include conversion_utils/resources/jos-msd2features.tbl
include conversion_utils/resources/jos2ud-features.tbl
include conversion_utils/resources/jos2ud-pos.tbl

View File

@@ -1,7 +1,8 @@
## Conversion utilities ## CJVT conversion utilities
This repository is currently intended for common conversions needed by CJVT developers. For the This repository is intended for common conversions needed by CJVT developers. It can of course also
moment, this is limited to JOS msds and properties. be used more broadly, but most of the scripts (with the exception of `jos_msds_and_properties.py`)
were written with specific tasks in mind, and may not generalise as expected. Use at your own risk.
### JOS msds and properties ### JOS msds and properties

View File

@@ -1,23 +1,36 @@
"""Convert a series of CoNLL-U files to a TEI file.
This script was developed in the context of a specific task and may not generalise as expected. Use at your own risk.
"""
import argparse import argparse
import re import re
import sys import sys
from glob import glob
from lxml import etree from lxml import etree
class Sentence: class Sentence:
def __init__(self, _id, no_ud=False, system='jos'): def __init__(self, _id, no_ud=False, system='jos'):
self._id = _id self._id = _id
self.items = [] self.items = []
self.links = [] self.links = []
self.srl_links = []
self.no_ud = no_ud self.no_ud = no_ud
self.system = system self.system = system
def add_item(self, token, lemma, upos, upos_other, xpos, misc): def add_item(self, token, lemma, upos, upos_other, xpos, misc):
self.items.append([token, lemma, upos, upos_other, xpos, "SpaceAfter=No" in misc.split('|')]) no_space_after = 'SpaceAfter' in misc and misc['SpaceAfter'] == 'No'
ner = misc['NER'] if 'NER' in misc else 'O'
self.items.append([token, lemma, upos, upos_other, xpos, no_space_after, ner])
def add_link(self, link_ref, link_type): def add_link(self, link_ref, link_type):
self.links.append([link_ref, link_type]) self.links.append([link_ref, link_type])
def add_srl_link(self, link_ref, link_type):
    """Record one SRL link as a ``[link_ref, link_type]`` pair on this sentence."""
    srl_entry = [link_ref, link_type]
    self.srl_links.append(srl_entry)
def as_xml(self, id_prefix=None): def as_xml(self, id_prefix=None):
if id_prefix: if id_prefix:
xml_id = id_prefix + '.' + self._id xml_id = id_prefix + '.' + self._id
@@ -27,8 +40,24 @@ class Sentence:
set_xml_attr(base, 'id', xml_id) set_xml_attr(base, 'id', xml_id)
id_counter = 1 id_counter = 1
in_seg = False
sentence_base = base
for item in self.items: for item in self.items:
token, lemma, upos, upos_other, xpos, no_space_after = item token, lemma, upos, upos_other, xpos, no_space_after, ner = item
if ner[0] == 'B':
if in_seg:
sentence_base.append(base)
in_seg = True
base = etree.Element('seg')
base.set('type', 'name')
base.set('subtype', f'{ner[2:].lower()}')
elif ner[0] == 'O':
if in_seg:
sentence_base.append(base)
base = sentence_base
in_seg = False
if xpos in {'U', 'Z'}: # hmm, safe only as long as U is unused in English tagset and Z in Slovenian one if xpos in {'U', 'Z'}: # hmm, safe only as long as U is unused in English tagset and Z in Slovenian one
to_add = etree.Element('pc') to_add = etree.Element('pc')
@@ -53,6 +82,11 @@ class Sentence:
base.append(to_add) base.append(to_add)
if in_seg:
sentence_base.append(base)
base = sentence_base
# depparsing linkGrp
link_grp = etree.Element('linkGrp') link_grp = etree.Element('linkGrp')
link_grp.set('corresp', '#'+xml_id) link_grp.set('corresp', '#'+xml_id)
link_grp.set('targFunc', 'head argument') link_grp.set('targFunc', 'head argument')
@@ -67,6 +101,23 @@ class Sentence:
link.set('target', '#' + xml_id + '.' + link_ref + ' #' + xml_id + '.' + str(link_id + 1)) link.set('target', '#' + xml_id + '.' + link_ref + ' #' + xml_id + '.' + str(link_id + 1))
link_grp.append(link) link_grp.append(link)
base.append(link_grp) base.append(link_grp)
# srl linkGrp
if self.srl_links:
link_grp = etree.Element('linkGrp')
link_grp.set('corresp', '#' + xml_id)
link_grp.set('targFunc', 'head argument')
link_grp.set('type', 'SRL')
for link_id, item in enumerate(self.srl_links):
link_ref, link_type = item
link = etree.Element('link')
link.set('ana', 'srl:' + link_type.replace(':', '_'))
if link_ref == u'0':
link.set('target', '#' + xml_id + ' #' + xml_id + '.' + str(link_id + 1))
else:
link.set('target', '#' + xml_id + '.' + link_ref + ' #' + xml_id + '.' + str(link_id + 1))
link_grp.append(link)
base.append(link_grp)
return base return base
@@ -234,7 +285,7 @@ def construct_sentence(sent_id, lines):
upos_other = tokens[5] upos_other = tokens[5]
depparse_link = tokens[6] depparse_link = tokens[6]
depparse_link_name = tokens[7] depparse_link_name = tokens[7]
misc = tokens[9] misc = {el.split('=')[0]: el.split('=')[1] for el in tokens[9].split('|')} if tokens[9] != '_' else {}
sentence.add_item( sentence.add_item(
token, token,
@@ -247,6 +298,11 @@ def construct_sentence(sent_id, lines):
sentence.add_link( sentence.add_link(
depparse_link, depparse_link,
depparse_link_name) depparse_link_name)
if 'SRL' in misc:
sentence.add_srl_link(
depparse_link,
misc['SRL'])
return sentence return sentence
@@ -256,7 +312,7 @@ def construct_tei_etrees(conllu_lines):
def convert_file(input_file_name, output_file_name): def convert_file(input_file_name, output_file_name):
input_file = open(input_file_name, 'r') input_file = open(input_file_name, 'r', encoding='utf-8')
root = construct_tei_etrees(input_file)[0] root = construct_tei_etrees(input_file)[0]
tree = etree.ElementTree(root) tree = etree.ElementTree(root)
tree.write(output_file_name, encoding='UTF-8', pretty_print=True) tree.write(output_file_name, encoding='UTF-8', pretty_print=True)
@@ -267,19 +323,16 @@ def convert_file(input_file_name, output_file_name):
if __name__ == '__main__': if __name__ == '__main__':
import argparse
from glob import glob
parser = argparse.ArgumentParser(description='Convert CoNNL-U to TEI.') parser = argparse.ArgumentParser(description='Convert CoNNL-U to TEI.')
parser.add_argument('files', nargs='+', help='CoNNL-U file') parser.add_argument('files', nargs='+', help='CoNNL-U file')
parser.add_argument('-o', '--out-file', dest='out', default=None, parser.add_argument('-o', '--out-file', dest='out', default=None, help='Write output to file instead of stdout.')
help='Write output to file instead of stdout.')
parser.add_argument('-s', '--system', dest='system', default='jos', choices=['jos', 'ud']) parser.add_argument('-s', '--system', dest='system', default='jos', choices=['jos', 'ud'])
args = parser.parse_args() args = parser.parse_args()
if args.out: if args.out:
f_out = open(args.out, 'w') f_out = open(args.out, 'w', encoding='utf-8')
else: else:
f_out = sys.stdout f_out = sys.stdout
@@ -288,7 +341,7 @@ if __name__ == '__main__':
for arg in args.files: for arg in args.files:
filelist = glob(arg) filelist = glob(arg)
for f in filelist: for f in filelist:
with open(f, 'r') as conllu_f: with open(f, 'r', encoding='utf-8') as conllu_f:
tei_etrees = construct_tei_etrees(conllu_f) tei_etrees = construct_tei_etrees(conllu_f)
for tei_etree in tei_etrees: for tei_etree in tei_etrees:
f_out.write(etree.tostring(tei_etree, pretty_print=True, encoding='utf-8').decode()) f_out.write(etree.tostring(tei_etree, pretty_print=True, encoding='utf-8').decode())

View File

@@ -1,12 +1,21 @@
import lxml.etree as lxml
import re import re
import pickle import pickle
import importlib_resources as pkg_resources import lxml.etree as lxml
from collections import defaultdict
from importlib_resources import files
from enum import Enum
from conversion_utils.utils import xpath_find, get_xml_id from conversion_utils.utils import xpath_find, get_xml_id
JOS_SPECIFICATIONS_PICKLE_RESOURCE = 'jos_specifications.pickle' JOS_SPECIFICATIONS_PICKLE_RESOURCE = 'jos_specifications.pickle'
RESOURCES_DIR = "conversion_utils.resources"
MSD_TO_FEATURES = "jos-msd2features.tbl"
JOS_TO_UD_FEATURES_RULES = "jos2ud-features.tbl"
JOS_TO_UPOS_RULES = "jos2ud-pos.tbl"
## Positions of lexeme-level features for each category ## Positions of lexeme-level features for each category
LEXEME_FEATURE_MAP = {'noun':{1,2}, LEXEME_FEATURE_MAP = {'noun':{1,2},
'verb':{1,2}, 'verb':{1,2},
@@ -53,6 +62,14 @@ LEVEL_EXCEPTIONS = {('pronoun', 2, 'čezme'), ('zaimek', 2, 'čezme'),
('pronoun', 8, 'se'), ('zaimek', 8, 'se'), ('pronoun', 8, 'se'), ('zaimek', 8, 'se'),
('pronoun', 8, 'ti'), ('zaimek', 8, 'ti')} ('pronoun', 8, 'ti'), ('zaimek', 8, 'ti')}
class MsdState(Enum):
    """How an Msd code relates to the known codes for its language.

    The state is assigned by ``Converter.get_msd_state``.
    """
    # The code is itself one of the known codes.
    FULL = 1
    # The code is not known itself, but at least one known code starts with it.
    PARTIAL = 2
    # The code is neither known nor a prefix of any known code.
    UNKNOWN = 3
class MsdException(Exception):
    """Raised for an Msd code that is unknown, or partial where a full code is required."""
    pass
class Specifications: class Specifications:
"""JOS specifications with list of all word categories.""" """JOS specifications with list of all word categories."""
@@ -216,6 +233,36 @@ class Properties:
and self.language == obj.language and self.language == obj.language
class UD:
    """A Universal Dependencies part-of-speech tag together with its features.

    The features are held as a mapping from feature name to value and can be
    rendered as a ``|``-separated string of ``Feature=Value`` pairs, ordered
    case-insensitively by feature name.
    """

    def __init__(self, pos, features_map):
        self.features_map = features_map
        self.pos = pos

    def __str__(self):
        return "pos={}, features_map={}".format(self.pos, self.features_map)

    def to_features_string(self):
        """Return only the rendered features (empty string if there are none)."""
        return self._features_string()

    def to_full_string(self):
        """Return ``UposTag=<pos>``, followed by ``|`` and the features when any exist."""
        rendered = self._features_string()
        tag_part = "UposTag=" + self.pos
        return tag_part + "|" + rendered if rendered else tag_part

    def _features_string(self):
        # Render each (name, value) pair in sorted order, then glue them together.
        rendered_pairs = []
        for feature, value in self._sort_features(self.features_map):
            rendered_pairs.append(f"{feature}={value}")
        return "|".join(rendered_pairs)

    def _sort_features(self, features_map):
        # Order by lower-cased feature name, ascending.
        def lowered_name(entry):
            return entry[0].lower()

        return sorted(features_map.items(), key=lowered_name)
class Msd: class Msd:
"""JOS msd.""" """JOS msd."""
@@ -230,17 +277,15 @@ class Msd:
return isinstance(obj, Msd) and self.code == obj.code and self.language == obj.language return isinstance(obj, Msd) and self.code == obj.code and self.language == obj.language
class ConverterException(Exception):
    """Raised by the converter, e.g. when an msd code is not in the specifications' code map."""
    pass
class Converter: class Converter:
"""Converter between Msd and Properties objects.""" """Converter between Msd and Properties objects."""
def __init__(self, xml_file_name=None): def __init__(self, xml_file_name=None):
if (xml_file_name is None): if (xml_file_name is None):
if (pkg_resources.is_resource('conversion_utils.resources', JOS_SPECIFICATIONS_PICKLE_RESOURCE)): resource = files('conversion_utils.resources').joinpath(JOS_SPECIFICATIONS_PICKLE_RESOURCE)
if (resource.is_file()):
try: try:
with pkg_resources.open_binary('conversion_utils.resources', JOS_SPECIFICATIONS_PICKLE_RESOURCE) as pickle_file: with resource.open('rb') as pickle_file:
self.specifications = pickle.load(pickle_file) self.specifications = pickle.load(pickle_file)
except: except:
exit('Could not parse specifications pickle file installed.') exit('Could not parse specifications pickle file installed.')
@@ -253,17 +298,84 @@ class Converter:
except: except:
exit('Could not parse specifications xml file provided.') exit('Could not parse specifications xml file provided.')
def msd_to_properties(self, msd, language, lemma=None): self.mte_to_ud_features = self._parse_msd_ud_conversion(MSD_TO_FEATURES)
"""Convert Msd to Properties (possibly in the other language). self.mte_to_ud_features_rules = self._parse_ud_rules(JOS_TO_UD_FEATURES_RULES)
self.mte_to_upos_rules = self._parse_ud_rules(JOS_TO_UPOS_RULES)
def _parse_msd_ud_conversion(self, file_name):
    """Parse a resource file of direct conversions from English Msd codes to feature strings.

    Every non-blank line must hold exactly two tab-separated fields: an
    English Msd code and the feature string it converts to. The feature
    string is stored both under the English code and under the code obtained
    by translating it to Slovene with ``translate_msd``.

    Parameters:
        file_name(str): name of the file inside the ``RESOURCES_DIR`` package

    Returns:
        dict: Msd code (English or Slovene) -> feature string; a missing code
            raises ``KeyError`` on lookup

    Raises:
        ValueError: if a non-blank line does not split into exactly two fields
    """
    # A plain dict: the previous factory-less defaultdict behaved the same
    # way but suggested that missing keys were given a default value.
    conversion_map = {}
    with files(RESOURCES_DIR).joinpath(file_name).open("r", encoding="UTF-8") as conversion_file:
        # Stream the file line by line instead of materialising it first.
        for line in conversion_file:
            line = line.strip("\n")
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing one) rather than
                # failing on the two-field unpack below.
                continue
            mte_msd_en, mte_features_en = line.split("\t")
            mte_sl = self.translate_msd(Msd(mte_msd_en, "en"), "sl").code
            conversion_map[mte_msd_en] = mte_features_en
            conversion_map[mte_sl] = mte_features_en
    return conversion_map
def _parse_ud_rules(self, file_name):
    """Parse a resource file of prioritised rules applied when converting Msd to Universal Dependencies.

    Only lines whose first character is a digit are treated as rules; all
    other lines (headers, comments, separators) are ignored. A rule line is
    split on tabs: the first field is the priority, and the remaining fields
    form the rule, padded with empty strings up to six fields.

    Parameters:
        file_name(str): name of the file inside the ``RESOURCES_DIR`` package

    Returns:
        defaultdict(list): priority (as the string read from the file) -> list of rules
    """
    rules_by_priority = defaultdict(list)
    resource = files(RESOURCES_DIR).joinpath(file_name)
    with resource.open("r", encoding="UTF-8") as rules_file:
        for raw_line in rules_file:
            if not raw_line[0].isdigit():
                continue
            fields = raw_line.strip("\n").split("\t")
            priority = fields[0]
            rule = fields[1:]
            # Pad short rules so that callers can always unpack six fields.
            rule.extend("" for _ in range(6 - len(rule)))
            rules_by_priority[priority].append(rule)
    return rules_by_priority
def is_valid_msd(self, msd):
    """Return whether the Msd code is among the codes known for the Msd's language."""
    known_codes = self.specifications.codes_map[msd.language]
    return msd.code in known_codes
def get_msd_state(self, msd):
    """Classify the Msd code as full, partial or unknown.

    A code is FULL when it is among the known codes for its language, PARTIAL
    when it is not but some known code starts with it, and UNKNOWN otherwise.
    """
    known_codes = self.specifications.codes_map[msd.language]
    if msd.code in known_codes:
        return MsdState.FULL
    is_prefix = any(candidate.startswith(msd.code) for candidate in known_codes)
    return MsdState.PARTIAL if is_prefix else MsdState.UNKNOWN
def check_valid_msd(self, msd, require_valid_flag, allow_partial=True):
    """Validate the Msd code, raising or warning depending on the flags.

    An unknown code raises MsdException when require_valid_flag is true and
    otherwise only prints a warning. A partial code raises MsdException when
    allow_partial is false.
    """
    state = self.get_msd_state(msd)
    if state == MsdState.UNKNOWN:
        message = "The msd '{}' is unknown".format(msd.code)
        if not require_valid_flag:
            print('[WARN] ' + message)
            return
        raise MsdException(message)
    partial_rejected = state == MsdState.PARTIAL and not allow_partial
    if partial_rejected:
        raise MsdException(f"Partial msd '{msd.code}' is not allowed. Full msd is required.")
def msd_to_properties(self, msd, language, lemma=None, require_valid_flag=False, warn_level_flag=False):
"""Convert Msd to Properties.
The language of the generated Properties is specified and can differ from the Msd language.
If require_valid_flag is True, a MsdException is raised if the MSD is not in the standard
JOS set. Otherwise only a warning is given.
If you care about accurate level information (i.e., which properties are lexeme-level and
which are form-level), note that some features depend on the particular lemma. For such
features, if lemma is not provided and warn_level_flag is True, a warning will be given.
If a MSD has dashes in place of letters for certain features, they are skipped, so that
these features are not included in the generated Properties object.
Parameters:
msd(Msd): the JOS MSD to convert
language(str): the language for the Properties object to be generated: "en" (English) or "sl" (Slovene)
lemma(str): the lemma of the word form with the MSD
require_valid_flag(boolean): whether to raise a MsdException or only warn if a non-standard MSD is provided
warn_level_flag(boolean): whether to warn when the level of a property cannot be determined with certainty
Returns:
Properties: the result of the conversion of the Msd in the language requested
The level (lexeme vs form) of certain reflexive msd features
depends on the lemma, so set the lemma if you need accurate
level information.
""" """
self.check_valid_msd(msd, require_valid_flag)
if (msd.code not in self.specifications.codes_map[msd.language]):
raise ConverterException('The msd {} is unknown'.format(msd.code))
category_char = msd.code[0].lower() category_char = msd.code[0].lower()
value_chars = msd.code[1:] value_chars = msd.code[1:]
category = self.specifications.find_category_by_code(category_char, msd.language) category = self.specifications.find_category_by_code(category_char, msd.language)
@@ -277,8 +389,8 @@ class Converter:
value = feature.find_value_by_char(value_char, msd.language) value = feature.find_value_by_char(value_char, msd.language)
feature_name = feature.names.get(language) feature_name = feature.names.get(language)
feature_value = value.names.get(language) feature_value = value.names.get(language)
if (lemma is None and (category_name, index) in [(le[0], le[1]) for le in LEVEL_EXCEPTIONS]): if (warn_level_flag and lemma is None and (category_name, index) in [(le[0], le[1]) for le in LEVEL_EXCEPTIONS]):
print('[WARN] The level of feature (category={category}, position={position}) may be incorrect, as it is lemma-specific and no lemma has been specified.' print('[WARN] The level (lexeme vs form) of feature (category={category}, position={position}) may be incorrect, as it is lemma-specific and no lemma has been specified.'
.format(category=category_name, position=index)) .format(category=category_name, position=index))
level_exception_flag = (category_name, feature.position, lemma) in LEVEL_EXCEPTIONS level_exception_flag = (category_name, feature.position, lemma) in LEVEL_EXCEPTIONS
lexeme_level_flag = feature.lexeme_level_flag if not level_exception_flag else not feature.lexeme_level_flag lexeme_level_flag = feature.lexeme_level_flag if not level_exception_flag else not feature.lexeme_level_flag
@@ -289,8 +401,21 @@ class Converter:
form_feature_map[feature_name] = feature_value form_feature_map[feature_name] = feature_value
return Properties(category_name, lexeme_feature_map, form_feature_map, language) return Properties(category_name, lexeme_feature_map, form_feature_map, language)
def properties_to_msd(self, properties, language): def properties_to_msd(self, properties, language, require_valid_flag=False):
"""Convert Properties to msd (possibly in the other language).""" """Convert Properties to Msd.
The language of the generated Msd is specified and can differ from the Properties language.
If require_valid_flag is True, a MsdException is raised if the generated MSD is not in
the standard JOS set. Otherwise only a warning is given.
Any skipped positions among the Properties are represented as dashes in the MSD.
Parameters:
properties(Properties): the properties to convert
language(str): the language for the Msd object to be returned: "en" (English) or "sl" (Slovene)
require_valid_flag(boolean): whether to raise a MsdException or only warn if a non-standard MSD is generated
"""
category = self.specifications.find_category_by_name(properties.category, properties.language) category = self.specifications.find_category_by_name(properties.category, properties.language)
category_char = category.codes.get(language).upper() category_char = category.codes.get(language).upper()
feature_map = properties.lexeme_feature_map.copy() feature_map = properties.lexeme_feature_map.copy()
@@ -308,7 +433,51 @@ class Converter:
msd_code += '-' msd_code += '-'
i += 1 i += 1
msd_code += position_map[position] msd_code += position_map[position]
return Msd(msd_code, language) msd = Msd(msd_code, language)
self.check_valid_msd(msd, require_valid_flag)
return msd
def msd_to_ud(self, msd, lemma):
    """Convert Msd to Universal Dependencies object.

    Partial Msds are currently not supported.

    The conversion runs in three steps: the feature string stored for the
    Msd code is looked up and split into a category and its features, the
    PoS rules choose the UD part of speech, and the feature rules rewrite
    the features into UD features.

    Parameters:
        msd(Msd): the Msd to convert
        lemma(str): the lemma of the word form with the MSD

    Returns:
        UD: the chosen part of speech together with the resulting feature map

    Raises:
        MsdException: if the Msd is partial
    """
    # An unknown msd only produces a warning here; a partial one raises.
    self.check_valid_msd(msd, False, allow_partial=False)

    # The stored value is whitespace-separated: first the category, then the features.
    # NOTE(review): an unknown msd got past the check above with just a warning,
    # so this lookup is presumably where such input fails - confirm.
    upos_category, *upos_features = self.mte_to_ud_features[msd.code].split()

    # Choose the UD part of speech. Every matching rule overwrites the previous
    # match, and priority keys are visited in descending order, so a match under
    # the smallest key is applied last and wins. The keys are the strings read
    # by _parse_ud_rules, so the ordering is lexicographic.
    final_upos = ""
    for priority in sorted(self.mte_to_upos_rules, reverse=True):
        for rule in self.mte_to_upos_rules[priority]:
            rule_lemma, rule_category, rule_mte_features, _, rule_pos_ud, _ = rule
            # Skip the rule unless the category matches, the lemma matches
            # ("*" accepts any lemma, "*en" any lemma ending in "en"), and all
            # of the rule's "|"-separated features are present.
            if (rule_category != upos_category
                    or (rule_lemma not in ("*", "*en") and lemma != rule_lemma)
                    or (rule_lemma == "*en" and not lemma.endswith("en"))
                    or (rule_mte_features != "*" and not all(f in upos_features for f in rule_mte_features.split("|")))):
                continue
            final_upos = rule_pos_ud

    # Rewrite the features, visiting priority keys in ascending order.
    for priority in sorted(self.mte_to_ud_features_rules):
        for rule in self.mte_to_ud_features_rules[priority]:
            rule_lemma, rule_category, rule_mte_features, rule_pos_ud, rule_ud_features, _ = rule
            # Skip the rule when its lemma, category or UD part of speech is
            # given (not "*") and differs from the current one.
            if (rule_lemma != "*" and lemma != rule_lemma
                    or (rule_category != "*" and rule_category != upos_category)
                    or (rule_pos_ud != "*" and rule_pos_ud != final_upos)):
                continue
            # Replace the feature named by the rule with its UD counterpart.
            upos_features = [rule_ud_features if f == rule_mte_features else f for f in upos_features]
            # A rule with no source feature ("*") adds its UD feature
            # unconditionally, unless that feature is the "-" placeholder.
            if rule_mte_features == "*" and rule_ud_features != "-":
                upos_features.append(rule_ud_features)

    # A single replacement may hold several "|"-joined features, so join and
    # split again before building the map; empty and "-" entries are dropped.
    ud_features = dict(f.split("=", 1) for f in "|".join(upos_features).split("|") if f not in {"", "-"})
    return UD(final_upos, ud_features)
def translate_msd(self, msd, language): def translate_msd(self, msd, language):
return self.properties_to_msd(self.msd_to_properties(msd, language), language) return self.properties_to_msd(self.msd_to_properties(msd, language), language)

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,128 @@
# Mapping from JOS features to UD features
# Kaja Dobrovoljc, Tomaž Erjavec, Simon Krek
# 2018-11-23
#
#Prio Lemma Category Feats PoS-UD ->Feature-UD #Comment
----------------------------------------------------------------------------------------------------
1 * Noun Type=common * -
1 * Noun Type=proper * -
1 * Verb Negative=no * Polarity=Pos
1 * Verb Negative=yes * Polarity=Neg
1 * Verb Type=auxiliary * -
1 * Verb Type=main * -
1 * Verb VForm=present * VerbForm=Fin|Mood=Ind|Tense=Pres
1 * Verb VForm=future * VerbForm=Fin|Mood=Ind|Tense=Fut
1 * Verb VForm=conditional * VerbForm=Fin|Mood=Cnd
1 * Verb VForm=imperative * VerbForm=Fin|Mood=Imp
1 * Verb VForm=infinitive * VerbForm=Inf
1 * Verb VForm=supine * VerbForm=Sup
1 * Verb VForm=participle * VerbForm=Part
1 * Adjective Type=general * -
1 * Adjective Type=possessive * Poss=Yes
1 * Adjective Type=participle * VerbForm=Part
2 * Adverb Type=participle * VerbForm=Conv
2 * Adverb Type=general * -
1 nekaj Adverb Type=general DET PronType=Ind
1 več Adverb Type=general DET PronType=Ind
1 veliko Adverb Type=general DET PronType=Ind
1 manj Adverb Type=general DET PronType=Ind
1 dovolj Adverb Type=general DET PronType=Ind
1 pol Adverb Type=general DET PronType=Ind
1 malo Adverb Type=general DET PronType=Ind
1 toliko Adverb Type=general DET PronType=Dem
1 največ Adverb Type=general DET PronType=Ind
1 mnogo Adverb Type=general DET PronType=Ind
1 preveč Adverb Type=general DET PronType=Ind
1 par Adverb Type=general DET PronType=Ind
1 koliko Adverb Type=general DET PronType=Int
1 dosti Adverb Type=general DET PronType=Ind
1 nešteto Adverb Type=general DET PronType=Ind
1 četrt Adverb Type=general DET PronType=Ind
1 ogromno Adverb Type=general DET PronType=Ind
1 čimveč Adverb Type=general DET PronType=Ind
1 obilo Adverb Type=general DET PronType=Ind
1 premnogo Adverb Type=general DET PronType=Ind
1 enormno Adverb Type=general DET PronType=Ind
1 majčkeno Adverb Type=general DET PronType=Ind
2 * Pronoun Type=reflexive * PronType=Prs|Reflex=Yes
2 * Pronoun Type=personal * PronType=Prs
2 * Pronoun Type=possessive * PronType=Prs|Poss=Yes
2 * Pronoun Type=interrogative * PronType=Int
2 * Pronoun Type=relative * PronType=Rel
2 * Pronoun Type=demonstrative * PronType=Dem
2 * Pronoun Type=general * PronType=Tot
2 * Pronoun Type=negative * PronType=Neg
2 * Pronoun Type=indefinite * PronType=Ind
1 * Pronoun Type=personal DET PronType=Prs
1 * Pronoun Type=possessive DET PronType=Prs|Poss=Yes
1 * Pronoun Owner_Gender=masculine * Gender[psor]=Masc #lg.spec.feature
1 * Pronoun Owner_Gender=feminine * Gender[psor]=Fem #lg.spec.feature
1 * Pronoun Owner_Gender=neuter * Gender[psor]=Neut #lg.spec.feature
1 * Pronoun Owner_Number=singular * Number[psor]=Sing #lg.spec.feature
1 * Pronoun Owner_Number=plural * Number[psor]=Plur #lg.spec.feature
1 * Pronoun Owner_Number=dual * Number[psor]=Dual #lg.spec.feature
1 * Pronoun Clitic=yes * Variant=Short #lg.spec.feature
1 * Pronoun Clitic=bound * Variant=Bound #lg.spec.feature
1 svoj Pronoun Type=reflexive * PronType=Prs|Reflex=Yes|Poss=Yes
2 * Numeral Type=pronominal * -
2 * Numeral Form=letter * -
2 * Numeral Type=cardinal NUM NumType=Card
1 * Numeral Form=letter NUM NumForm=Word #lg.spec.feature
1 * Numeral Form=digit NUM NumForm=Digit #lg.spec.feature
1 * Numeral Form=roman NUM NumForm=Roman #lg.spec.feature
1 * Numeral Type=ordinal * NumType=Ord
1 * Numeral Type=special ADJ NumType=Mult
1 * Numeral Type=special NUM NumType=Sets
1 en Numeral Type=pronominal * NumType=Card
1 eden Numeral Type=pronominal * NumType=Card
1 * Conjunction Type=subordinating * -
1 * Conjunction Type=coordinating * -
2 * Particle * * -
1 ne Particle * * Polarity=Neg
1 * Interjection * * -
1 * Abbreviation * * Abbr=Yes
2 * Residual * * -
1 * Residual Type=foreign * Foreign=Yes
1 * Residual Type=typo * -
1 * Residual Type=program * -
1 * Punctuation * * -
2 * * Degree=positive * Degree=Pos
2 * * Degree=comparative * Degree=Cmp
2 * * Degree=superlative * Degree=Sup
1 * * Degree=positive DET -
1 * * Degree=comparative DET -
1 * * Degree=superlative DET -
1 * * Animate=no * Animacy=Inan
1 * * Animate=yes * Animacy=Anim
1 * * Aspect=perfective * Aspect=Perf
1 * * Aspect=progressive * Aspect=Imp
1 * * Aspect=biaspectual * -
1 * * Case=nominative * Case=Nom
1 * * Case=genitive * Case=Gen
1 * * Case=dative * Case=Dat
1 * * Case=accusative * Case=Acc
1 * * Case=locative * Case=Loc
1 * * Case=instrumental * Case=Ins
1 * * Definiteness=no * Definite=Ind
1 * * Definiteness=yes * Definite=Def
1 * * Gender=masculine * Gender=Masc
1 * * Gender=feminine * Gender=Fem
1 * * Gender=neuter * Gender=Neut
1 * * Number=singular * Number=Sing
1 * * Number=plural * Number=Plur
1 * * Number=dual * Number=Dual
1 * * Person=first * Person=1
1 * * Person=second * Person=2
1 * * Person=third * Person=3

View File

@@ -0,0 +1,282 @@
# Mapping from JOS PoS to UD 2.0 PoS
# Kaja Dobrovoljc, Tomaž Erjavec, Simon Krek
# 2019-02-04
#
#Prio Lemma Category Feats Deps ->PoS-UD #Comment
#-------------------------------------------------------------------------------------------------------
3 * Noun Type=common * NOUN
3 * Noun Type=proper * PROPN
3 * Verb * * VERB
2 * Verb Type=auxiliary * AUX #This one can in fact also be VERB, but this has to be determined by some other means
3 * Adjective * * ADJ
3 * Adverb * * ADV
1 četrt Adverb * * DET
1 čimmanj Adverb * * DET
1 čimveč Adverb * * DET
1 dosti Adverb * * DET
1 dovolj Adverb * * DET
1 enako Adverb * * ADV
1 enormno Adverb * * DET
1 ful Adverb * * ADV
1 koliko Adverb * * DET
1 majčkeno Adverb * * DET
1 maksimalno Adverb * * ADV
1 malce Adverb * * ADV
1 malo Adverb * * DET
1 manj Adverb * * DET
1 minimalno Adverb * * ADV
1 mnogo Adverb * * DET
1 najmanj Adverb * * ADV
1 največ Adverb * * DET
1 nekaj Adverb * * DET
1 nekoliko Adverb * * ADV
1 nemalo Adverb * * ADV
1 nešteto Adverb * * DET
1 nič Adverb * * ADV
1 ničkoliko Adverb * * DET
1 obilo Adverb * * DET
1 ogromno Adverb * * DET
1 par Adverb * * DET
1 pol Adverb * * DET
1 polno Adverb * * ADV
1 precej Adverb * * ADV
1 premalo Adverb * * ADV
1 premnogo Adverb * * DET
1 preveč Adverb * * DET
1 toliko Adverb * * DET
1 veliko Adverb * * DET
1 več Adverb * * DET
1 večidel Adverb * * ADV
1 vse Adverb * * ADV
1 zadosti Adverb * * ADV
##All Pronouns should be explicitly defined
##But are not because of jos1M wrong lemmatisations for e.g. "ti", "te" etc.
3 * Pronoun * * PRON
##2 * Pronoun Type=demonstrative * DET
##2 * Pronoun Type=possessive * DET
1 bogsigavedikakšen Pronoun Type=indefinite * DET
1 bogvedikaj Pronoun Type=indefinite * PRON
1 bogvedikateri Pronoun Type=indefinite * DET
1 bogvekaj Pronoun Type=indefinite * PRON
1 bogvekakšen Pronoun Type=indefinite * DET
1 bogvekateri Pronoun Type=indefinite * DET
1 bogvekolik Pronoun Type=indefinite * DET
1 bogvekolikšen Pronoun Type=indefinite * DET
1 čezme Pronoun Type=personal * PRON
1 čezse Pronoun Type=reflexive * PRON
1 čigar Pronoun Type=relative * DET
1 čigarkoli Pronoun Type=relative * DET
1 čigarsižebodi Pronoun Type=relative * DET
1 čigav Pronoun Type=interrogative * DET
1 čigaver Pronoun Type=relative * DET
1 čigaverkoli Pronoun Type=relative * DET
1 čigavršen Pronoun Type=relative * DET
1 čigavršnji Pronoun Type=relative * DET
1 enak Pronoun Type=indefinite * DET
1 enaki Pronoun Type=indefinite * DET
1 enakšen Pronoun Type=indefinite * DET
1 isti Pronoun Type=indefinite * DET
1 jaz Pronoun Type=personal * PRON
1 jest Pronoun Type=personal * PRON
1 kaj Pronoun Type=interrogative * PRON
1 kak Pronoun Type=interrogative * DET
1 kakov Pronoun Type=interrogative * DET
1 kakošen Pronoun Type=interrogative * DET
1 kakršen Pronoun Type=relative * DET
1 kakršenkoli Pronoun Type=relative * DET
1 kakršensižebodi Pronoun Type=relative * DET
1 kakšen Pronoun Type=interrogative * DET
1 kar Pronoun Type=relative * PRON
1 karkoli Pronoun Type=relative * PRON
1 karsibodi Pronoun Type=relative * PRON
1 karsižebodi Pronoun Type=relative * PRON
1 kateri Pronoun Type=interrogative * DET
1 katerikoli Pronoun Type=relative * DET
1 katerisibodi Pronoun Type=relative * DET
1 kdo Pronoun Type=interrogative * PRON
1 kdor Pronoun Type=relative * PRON
1 kdorkoli Pronoun Type=relative * PRON
1 kdorsibodi Pronoun Type=relative * PRON
1 kdorsižebodi Pronoun Type=relative * PRON
1 kdovekaj Pronoun Type=indefinite * PRON
1 kdovekak Pronoun Type=indefinite * DET
1 kdovekakšen Pronoun Type=indefinite * DET
1 kdovekateri Pronoun Type=indefinite * DET
1 kdovekdo Pronoun Type=indefinite * PRON
1 kdovekolik Pronoun Type=indefinite * DET
1 koji Pronoun Type=interrogative * DET
1 kolik Pronoun Type=interrogative * DET
1 kolik Pronoun Type=indefinite * DET
1 koliker Pronoun Type=interrogative * DET
1 kolikršen Pronoun Type=relative * DET
1 kolikšen Pronoun Type=interrogative * DET
1 malokaj Pronoun Type=indefinite * PRON
1 malokak Pronoun Type=indefinite * DET
1 malokakšen Pronoun Type=indefinite * DET
1 malokateri Pronoun Type=indefinite * DET
1 malokdo Pronoun Type=indefinite * PRON
1 marsikaj Pronoun Type=indefinite * PRON
1 marsikak Pronoun Type=indefinite * DET
1 marsikakšen Pronoun Type=indefinite * DET
1 marsikateri Pronoun Type=indefinite * DET
1 marsikdo Pronoun Type=indefinite * PRON
1 marsičigav Pronoun Type=indefinite * DET
1 medme Pronoun Type=personal * PRON
1 medse Pronoun Type=reflexive * PRON
1 mnog Pronoun Type=indefinite * DET
1 mnogokaj Pronoun Type=indefinite * PRON
1 mnogokateri Pronoun Type=indefinite * DET
1 mnogokdo Pronoun Type=indefinite * PRON
1 moj Pronoun Type=possessive * DET
1 nadme Pronoun Type=personal * PRON
1 nadse Pronoun Type=reflexive * PRON
1 najin Pronoun Type=possessive * DET
1 name Pronoun Type=personal * PRON
1 nase Pronoun Type=reflexive * PRON
1 naš Pronoun Type=possessive * DET
1 negdo Pronoun Type=indefinite * PRON
1 nek Pronoun Type=indefinite * DET
1 nekaj Pronoun Type=indefinite * PRON
1 nekak Pronoun Type=indefinite * DET
1 nekakov Pronoun Type=indefinite * DET
1 nekakšen Pronoun Type=indefinite * DET
1 nekateri Pronoun Type=indefinite * DET
1 nekdo Pronoun Type=indefinite * PRON
1 neki Pronoun Type=indefinite * DET
1 nekolik Pronoun Type=indefinite * DET
1 nekolikšen Pronoun Type=indefinite * DET
1 nekolikšnji Pronoun Type=indefinite * DET
1 nekov Pronoun Type=indefinite * DET
1 nekšen Pronoun Type=indefinite * DET
1 nevemkakšen Pronoun Type=indefinite * DET
1 nihče Pronoun Type=negative * PRON
1 nikak Pronoun Type=negative * DET
1 nikakršen Pronoun Type=negative * DET
1 nikakšen Pronoun Type=negative * DET
1 nikdo Pronoun Type=negative * PRON
1 nikogaršen Pronoun Type=negative * DET
1 nikogaršnji Pronoun Type=negative * DET
1 nič Pronoun Type=negative * PRON
1 njegov Pronoun Type=possessive * DET
1 njen Pronoun Type=possessive * DET
1 njihen Pronoun Type=possessive * DET
1 njihnji Pronoun Type=possessive * DET
1 njihov Pronoun Type=possessive * DET
1 njun Pronoun Type=possessive * DET
1 nobeden Pronoun Type=negative * PRON
1 noben Pronoun Type=negative * DET
1 oba Pronoun Type=general * DET
1 obadva Pronoun Type=general * PRON
1 obme Pronoun Type=personal * PRON
1 oboj Pronoun Type=general * DET
1 obojen Pronoun Type=general * DET
1 obse Pronoun Type=reflexive * PRON
1 on Pronoun Type=personal * PRON
1 oni Pronoun Type=demonstrative * DET
1 onile Pronoun Type=demonstrative * PRON
1 podme Pronoun Type=personal * PRON
1 podse Pronoun Type=reflexive * PRON
1 pome Pronoun Type=personal * PRON
1 predme Pronoun Type=personal * PRON
1 predse Pronoun Type=reflexive * PRON
1 premarsikateri Pronoun Type=indefinite * DET
1 premnog Pronoun Type=indefinite * DET
1 prenekaj Pronoun Type=indefinite * PRON
1 prenekateri Pronoun Type=indefinite * DET
1 prenekdo Pronoun Type=indefinite * PRON
1 redkokateri Pronoun Type=indefinite * DET
1 redkokdo Pronoun Type=indefinite * PRON
1 se Pronoun Type=reflexive * PRON
1 skozme Pronoun Type=personal * PRON
1 skozse Pronoun Type=reflexive * PRON
1 svoj Pronoun Type=reflexive * DET
1 ta Pronoun Type=demonstrative * DET
1 tadva Pronoun Type=demonstrative * PRON
1 taisti Pronoun Type=demonstrative * DET
1 tak Pronoun Type=demonstrative * DET
1 takisti Pronoun Type=demonstrative * DET
1 takle Pronoun Type=demonstrative * DET
1 takov Pronoun Type=demonstrative * DET
1 takošen Pronoun Type=demonstrative * DET
1 takšen Pronoun Type=demonstrative * DET
1 takšenle Pronoun Type=demonstrative * DET
1 tale Pronoun Type=demonstrative * DET
1 talele Pronoun Type=demonstrative * DET
1 teu Pronoun Type=personal * PRON
1 ti Pronoun Type=personal * PRON
1 tisti Pronoun Type=demonstrative * DET
1 tistile Pronoun Type=demonstrative * DET
1 tolik Pronoun Type=demonstrative * DET
1 toliker Pronoun Type=demonstrative * DET
1 tolikšen Pronoun Type=demonstrative * DET
1 tolikšnji Pronoun Type=demonstrative * DET
1 toti Pronoun Type=demonstrative * DET
1 tvoj Pronoun Type=possessive * DET
1 un Pronoun Type=demonstrative * DET
1 vajin Pronoun Type=possessive * DET
1 vame Pronoun Type=personal * PRON
1 vase Pronoun Type=reflexive * PRON
1 vaš Pronoun Type=possessive * DET
1 ves Pronoun Type=general * DET
1 vsak Pronoun Type=general * DET
1 vsakateri Pronoun Type=general * DET
1 vsakdo Pronoun Type=general * PRON
1 vsakogaršen Pronoun Type=general * DET
1 vsakogaršnji Pronoun Type=general * DET
1 vsakršen Pronoun Type=general * DET
1 vsakteri Pronoun Type=general * DET
1 zame Pronoun Type=personal * PRON
1 zase Pronoun Type=reflexive * PRON
3 * Numeral Form=digit * NUM
3 * Numeral Form=roman * NUM
3 * Numeral Form=letter|Type=special * NUM
3 * Numeral Form=letter|Type=cardinal * NUM
2 * Numeral Form=letter|Type=ordinal * ADJ
1 drug Numeral Form=letter|Type=pronominal * ADJ
1 en Numeral Form=letter|Type=pronominal * NUM
1 *en Numeral Form=letter|Type=special * ADJ #enojen, dvojen
1 eden Numeral Form=letter|Type=pronominal * NUM #Dodal E.T.
3 * Adposition * * ADP #MULTEXT-East name
3 * Preposition * * ADP #JOS name
3 * Conjunction Type=coordinating * CCONJ
3 * Conjunction Type=subordinating * SCONJ
3 * Particle * * PART
3 * Interjection * * INTJ
3 * Abbreviation * * X
3 * Residual * * X
2 * Residual Type=web * SYM
2 * Residual Type=emo * SYM
2 * Residual Type=hashtag * SYM #Better mapping?
2 * Residual Type=at * SYM #Better mapping?
2 * Residual Type=foreign * X #Better mapping?
3 * Punctuation * * PUNCT
1 # Punctuation * * SYM
1 % Punctuation * * SYM
1 & Punctuation * * SYM
1 < Punctuation * * SYM
1 > Punctuation * * SYM
1 + Punctuation * * SYM
1 = Punctuation * * SYM
1 ° Punctuation * * SYM
1 × Punctuation * * SYM
1 ÷ Punctuation * * SYM
1 $ Punctuation * * SYM
1 @ Punctuation * * SYM
1 µ Punctuation * * SYM
1 © Punctuation * * SYM
1 § Punctuation * * SYM
1 € Punctuation * * SYM
1 £ Punctuation * * SYM

View File

@@ -1,12 +1,19 @@
"""Convert a TEI file to a XML file of the CJVT standard schema.
This script was developed in the context of a specific task and may not generalise as expected. Use at your own risk.
"""
import argparse import argparse
import lxml.etree as lxml import lxml.etree as lxml
from conversion_utils.utils import xpath_find, get_xml_id, TEI_NAMESPACE_QUALIFIER from conversion_utils.utils import xpath_find, TEI_NAMESPACE_QUALIFIER
def get_parsed_unit_string(parsed_unit):
    """Return the text of the unit's tei:w and tei:pc elements joined into one string.

    A token whose ``join`` attribute equals 'right' is glued to the token that follows it;
    every other token is followed by a single space. The result is stripped of surrounding
    whitespace.
    """
    pieces = []
    for token in xpath_find(parsed_unit, 'tei:w|tei:pc'):
        pieces.append(token.text)
        if token.get('join') != 'right':
            pieces.append(' ')
    return ''.join(pieces).strip()
def convert(input_file_name, output_file_name): def convert(input_file_name, output_file_name):
output_root = lxml.Element('dictionary') output_root = lxml.Element('dictionary')
@@ -55,4 +62,6 @@ if (__name__ == '__main__'):
arg_parser.add_argument('-infile', type=str, help='Input TEI xml') arg_parser.add_argument('-infile', type=str, help='Input TEI xml')
arg_parser.add_argument('-outfile', type=str, help='Output xml in standard cjvt schema') arg_parser.add_argument('-outfile', type=str, help='Output xml in standard cjvt schema')
arguments = arg_parser.parse_args() arguments = arg_parser.parse_args()
input_file_name = arguments.infile
output_file_name = arguments.outfile
convert(input_file_name, output_file_name) convert(input_file_name, output_file_name)

View File

@@ -1,6 +1,6 @@
import unittest import unittest
from conversion_utils.jos_msds_and_properties import Converter, Msd from conversion_utils.jos_msds_and_properties import Converter, Msd, MsdException
class JosMsdToPropertiesTestCase(unittest.TestCase): class JosMsdToPropertiesTestCase(unittest.TestCase):
@@ -55,3 +55,25 @@ class JosMsdToPropertiesTestCase(unittest.TestCase):
self.assertEqual(properties.category, 'punctuation') self.assertEqual(properties.category, 'punctuation')
self.assertEqual(properties.lexeme_feature_map, {}) self.assertEqual(properties.lexeme_feature_map, {})
self.assertEqual(properties.form_feature_map, {}) self.assertEqual(properties.form_feature_map, {})
def test_good_msd_with_require_valid(self):
    # A fully specified MSD must convert without complaint when validation is requested.
    source_msd = Msd('Ncfpd', 'en')
    result = self.converter.msd_to_properties(source_msd, 'en', require_valid_flag=True)

    expected_lexeme_features = {'type': 'common', 'gender': 'feminine'}
    expected_form_features = {'number': 'plural', 'case': 'dative'}
    self.assertEqual(result.language, 'en')
    self.assertEqual(result.category, 'noun')
    self.assertEqual(result.lexeme_feature_map, expected_lexeme_features)
    self.assertEqual(result.form_feature_map, expected_form_features)
def test_bad_msd(self):
    # Without require_valid_flag an MSD with unfilled positions is still converted;
    # only the features that are present are expected in the result.
    source_msd = Msd('N---d', 'en')
    result = self.converter.msd_to_properties(source_msd, 'en')

    self.assertEqual(result.language, 'en')
    self.assertEqual(result.category, 'noun')
    self.assertEqual(result.lexeme_feature_map, {})
    self.assertEqual(result.form_feature_map, {'case': 'dative'})
def test_bad_msd_with_require_valid(self):
    # With require_valid_flag=True an MSD with unfilled positions must be rejected.
    # assertRaises fails the test with a descriptive message when MsdException is not
    # raised; any other exception type still propagates as an error, as before.
    with self.assertRaises(MsdException):
        self.converter.msd_to_properties(Msd('N---d', 'en'), 'en', require_valid_flag=True)

View File

@@ -1,6 +1,6 @@
import unittest import unittest
from conversion_utils.jos_msds_and_properties import Converter, Properties from conversion_utils.jos_msds_and_properties import Converter, Properties, MsdException, Msd
class JosPropertiesToMsdTestCase(unittest.TestCase): class JosPropertiesToMsdTestCase(unittest.TestCase):
@@ -41,3 +41,40 @@ class JosPropertiesToMsdTestCase(unittest.TestCase):
msd = self.converter.properties_to_msd(Properties('punctuation', {}, {}, 'en'), 'sl') msd = self.converter.properties_to_msd(Properties('punctuation', {}, {}, 'en'), 'sl')
self.assertEqual(msd.language, 'sl') self.assertEqual(msd.language, 'sl')
self.assertEqual(msd.code, 'U') self.assertEqual(msd.code, 'U')
def test_good_msd_with_require_valid(self):
    # Fully specified properties must convert without complaint when validation is requested.
    source_properties = Properties(
        'noun',
        {'type': 'common', 'gender': 'feminine'},
        {'number': 'dual', 'case': 'nominative'},
        'en',
    )
    result = self.converter.properties_to_msd(source_properties, 'en', require_valid_flag=True)

    self.assertEqual(result.language, 'en')
    self.assertEqual(result.code, 'Ncfdn')
def test_bad_msd(self):
    # Without require_valid_flag, incomplete properties are still converted
    # (here to the partial code 'Nc-d').
    source_properties = Properties('noun', {'type': 'common'}, {'number': 'dual'}, 'en')
    result = self.converter.properties_to_msd(source_properties, 'en')

    self.assertEqual(result.language, 'en')
    self.assertEqual(result.code, 'Nc-d')
def test_msd_to_jos(self):
    # NOTE: despite its name, this test exercises Converter.msd_to_ud.
    adjective_ud = self.converter.msd_to_ud(Msd('Ppnzei', 'sl'), 'slovenski')
    self.assertEqual(adjective_ud.pos, 'ADJ')
    self.assertEqual(adjective_ud.to_full_string(), 'UposTag=ADJ|Case=Nom|Degree=Pos|Gender=Fem|Number=Sing')
    self.assertEqual(adjective_ud.to_features_string(), 'Case=Nom|Degree=Pos|Gender=Fem|Number=Sing')

    noun_ud = self.converter.msd_to_ud(Msd('Sommr', 'sl'), 'dečko')
    self.assertEqual(noun_ud.pos, 'NOUN')
    self.assertEqual(noun_ud.to_full_string(), 'UposTag=NOUN|Case=Gen|Gender=Masc|Number=Plur')
    self.assertEqual(noun_ud.to_features_string(), 'Case=Gen|Gender=Masc|Number=Plur')
def test_msd_to_jos_partial_msd(self):
    # A truncated MSD ('Soz') is expected to be rejected by msd_to_ud with MsdException.
    # assertRaises fails the test with a descriptive message when the exception is not
    # raised; any other exception type still propagates as an error, as before.
    with self.assertRaises(MsdException):
        self.converter.msd_to_ud(Msd('Soz', 'sl'), 'vlada')
def test_bad_msd_with_require_valid(self):
    # With require_valid_flag=True, incomplete properties must be rejected.
    # assertRaises fails the test with a descriptive message when MsdException is not
    # raised; any other exception type still propagates as an error, as before.
    incomplete_properties = Properties('noun', {'type': 'common'}, {'number': 'dual'}, 'en')
    with self.assertRaises(MsdException):
        self.converter.properties_to_msd(incomplete_properties, 'en', require_valid_flag=True)

View File

@@ -1,26 +1,29 @@
#!/usr/bin/python3 """Convert the MSD and/or syntactic dependency tags in a CoNLL-U file from English to Slovene tags.
# -*- coding: utf-8 -*-
This script was developed in the context of a specific task and may not generalise as expected. Use at your own risk.
"""
import argparse import argparse
import codecs
import lxml.etree as lxml import lxml.etree as lxml
from importlib_resources import files from importlib_resources import files
from conversion_utils.jos_msds_and_properties import Converter, Msd from conversion_utils.jos_msds_and_properties import Converter, Msd
def get_syn_map():
    """Return a dict mapping English dependency labels to their Slovene counterparts.

    The mapping is read from the ``syns/syn`` elements of the packaged ``dict.xml``
    resource: each element's ``en`` attribute becomes a key and its ``sl`` attribute
    the corresponding value.
    """
    dict_file_name = files('conversion_utils.resources').joinpath('dict.xml')
    # The context manager closes the file even when parsing raises.
    with open(dict_file_name, 'r', encoding='utf-8') as dict_file:
        root = lxml.parse(dict_file).getroot()
    return {syn.get('en'): syn.get('sl') for syn in root.xpath('syns/syn')}
def translate(input_file_name, output_file_name):
def translate(input_file_name, scope, output_file_name):
syn_map = get_syn_map() syn_map = get_syn_map()
output_file = codecs.open(output_file_name, 'w') output_file = open(output_file_name, 'w', encoding='utf-8')
input_file = codecs.open(input_file_name, 'r') input_file = open(input_file_name, 'r', encoding='utf-8')
converter = Converter() converter = Converter()
@@ -29,8 +32,10 @@ def translate(input_file_name, output_file_name):
if (len(columns) != 10): if (len(columns) != 10):
output_file.write(line) output_file.write(line)
else: else:
columns[4] = converter.translate_msd(Msd(columns[4], 'en'), 'sl').code if (scope in {'msd', 'both'}):
columns[7] = syn_map[columns[7]] columns[4] = converter.translate_msd(Msd(columns[4], 'en'), 'sl').code
if (scope in {'dep', 'both'}):
columns[7] = syn_map[columns[7]]
output_file.write('\t'.join(columns) + '\n') output_file.write('\t'.join(columns) + '\n')
input_file.close() input_file.close()
@@ -41,6 +46,7 @@ if (__name__ == '__main__'):
arg_parser = argparse.ArgumentParser(description='Translate JOS msds and dependency labels.')
arg_parser.add_argument('-infile', type=str, help='Input conllu')
# argparse restricts an argument to a fixed set of values through `choices`; it has no
# `options` keyword, so passing one makes add_argument() raise a TypeError.
arg_parser.add_argument('-scope', type=str, choices=['msd', 'dep', 'both'], default='both',
                        help='Which tags to translate: msd, dep or both')
arg_parser.add_argument('-outfile', type=str, help='Output conllu')
arguments = arg_parser.parse_args()
input_file_name = arguments.infile

View File

@@ -1,11 +1,16 @@
"""A few convenience TEI/XML constants and functions."""
TEI_NAMESPACE = 'http://www.tei-c.org/ns/1.0' TEI_NAMESPACE = 'http://www.tei-c.org/ns/1.0'
TEI_NAMESPACE_QUALIFIER = '{' + TEI_NAMESPACE + '}' TEI_NAMESPACE_QUALIFIER = '{' + TEI_NAMESPACE + '}'
XML_ID_ATTRIBUTE_NAME = '{http://www.w3.org/XML/1998/namespace}id' XML_ID_ATTRIBUTE_NAME = '{http://www.w3.org/XML/1998/namespace}id'
def xpath_find(element, expression):
    """Evaluate an XPath expression on the element, with the 'tei' prefix bound to the TEI namespace."""
    tei_namespaces = {'tei': TEI_NAMESPACE}
    return element.xpath(expression, namespaces=tei_namespaces)
def get_xml_id(element):
    """Look up the element's @xml:id attribute and return what the lookup yields."""
    xml_id = element.get(XML_ID_ATTRIBUTE_NAME)
    return xml_id

View File

@@ -1,3 +1,13 @@
"""Parse source TEI specifications and save as a pickle.
You can use this script to create a new pickle file to replace the one stored at
../conversion_utils/resources/jos_specifications.pickle. The input file is expected to be a version
of https://github.com/clarinsi/mte-msd/blob/master/xml/msd-sl.spc.xml. However, the specifications
are not expected to change, and if they do, the package pickle there should be updated upstream, so
you probably should not have to use this script.
"""
import pickle import pickle
import argparse import argparse
from conversion_utils.jos_msds_and_properties import SpecificationsParser from conversion_utils.jos_msds_and_properties import SpecificationsParser

View File

@@ -1,12 +1,20 @@
from setuptools import setup from setuptools import setup
import os
# Locate README.md relative to this file so the build does not depend on the working directory.
project_dir = os.path.abspath(os.path.dirname(__file__))
readme_path = os.path.join(project_dir, 'README.md')
with open(readme_path, encoding='utf-8') as readme_file:
    long_description = readme_file.read()

# NOTE(review): zip_safe=True presumes package resources stay readable when the package is
# installed as a zip archive — confirm against how the scripts open their resource files.
setup(
    name='cjvt_conversion_utils',
    version='0.3',
    description='CJVT conversion utilities',
    long_description=long_description,
    long_description_content_type="text/markdown",
    url='https://gitea.cjvt.si/generic/conversion_utils',
    author='CJVT',
    author_email='pypi@cjvt.si',
    license='MIT',
    packages=['conversion_utils', 'conversion_utils.resources', 'conversion_utils.tests'],
    install_requires=['lxml', 'importlib_resources'],
    include_package_data=True,
    zip_safe=True,
)