Compare commits
12 Commits
2f74dfcab8
...
master
| Author | SHA1 | Date | |
|---|---|---|---|
| e62c096126 | |||
| 165f24c64c | |||
| 4d86631283 | |||
| b711fae3b5 | |||
| f43ea39f1b | |||
| 03ce9f8ac7 | |||
| f28b5a3a01 | |||
| 89be603103 | |||
| 99ac426e4b | |||
| 89bcde58aa | |||
| d7be39d894 | |||
| 4ca67ec8cc |
5
.gitignore
vendored
5
.gitignore
vendored
@@ -1,2 +1,7 @@
|
|||||||
*.pyc
|
*.pyc
|
||||||
venv
|
venv
|
||||||
|
data
|
||||||
|
.idea
|
||||||
|
build
|
||||||
|
dist
|
||||||
|
*.egg-info
|
||||||
|
|||||||
22
LICENSE.txt
Normal file
22
LICENSE.txt
Normal file
@@ -0,0 +1,22 @@
|
|||||||
|
MIT License
|
||||||
|
|
||||||
|
Copyright (c) 2023 CLARIN.SI
|
||||||
|
|
||||||
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||||
|
of this software and associated documentation files (the "Software"), to deal
|
||||||
|
in the Software without restriction, including without limitation the rights
|
||||||
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
copies of the Software, and to permit persons to whom the Software is
|
||||||
|
furnished to do so, subject to the following conditions:
|
||||||
|
|
||||||
|
The above copyright notice and this permission notice shall be included in all
|
||||||
|
copies or substantial portions of the Software.
|
||||||
|
|
||||||
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||||
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||||
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||||
|
SOFTWARE.
|
||||||
|
|
||||||
@@ -1,3 +1,6 @@
|
|||||||
include conversion_utils/resources/jos_specifications.pickle
|
include conversion_utils/resources/jos_specifications.pickle
|
||||||
include conversion_utils/resources/dict.xml
|
include conversion_utils/resources/dict.xml
|
||||||
include conversion_utils/resources/structure_conversions.csv
|
include conversion_utils/resources/structure_conversions.csv
|
||||||
|
include conversion_utils/resources/jos-msd2features.tbl
|
||||||
|
include conversion_utils/resources/jos2ud-features.tbl
|
||||||
|
include conversion_utils/resources/jos2ud-pos.tbl
|
||||||
|
|||||||
@@ -1,7 +1,8 @@
|
|||||||
## Conversion utilities
|
## CJVT conversion utilities
|
||||||
|
|
||||||
This repository is currently intended for common conversions needed by CJVT developers. For the
|
This repository is intended for common conversions needed by CJVT developers. It can of course also
|
||||||
moment, this is limited to JOS msds and properties.
|
be used more broadly, but most of the scripts (with the exception of `jos_msds_and_properties.py`)
|
||||||
|
were written with specific tasks in mind, and may not generalise as expected. Use at your own risk.
|
||||||
|
|
||||||
### JOS msds and properties
|
### JOS msds and properties
|
||||||
|
|
||||||
|
|||||||
@@ -1,23 +1,36 @@
|
|||||||
|
"""Convert a series of CoNLL-U files to a TEI file.
|
||||||
|
|
||||||
|
This script was developed in the context of a specific task and may not generalise as expected. Use at your own risk.
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
import re
|
import re
|
||||||
import sys
|
import sys
|
||||||
|
from glob import glob
|
||||||
from lxml import etree
|
from lxml import etree
|
||||||
|
|
||||||
|
|
||||||
class Sentence:
|
class Sentence:
|
||||||
def __init__(self, _id, no_ud=False, system='jos'):
|
def __init__(self, _id, no_ud=False, system='jos'):
|
||||||
self._id = _id
|
self._id = _id
|
||||||
self.items = []
|
self.items = []
|
||||||
self.links = []
|
self.links = []
|
||||||
|
self.srl_links = []
|
||||||
self.no_ud = no_ud
|
self.no_ud = no_ud
|
||||||
self.system = system
|
self.system = system
|
||||||
|
|
||||||
def add_item(self, token, lemma, upos, upos_other, xpos, misc):
|
def add_item(self, token, lemma, upos, upos_other, xpos, misc):
|
||||||
self.items.append([token, lemma, upos, upos_other, xpos, "SpaceAfter=No" in misc.split('|')])
|
no_space_after = 'SpaceAfter' in misc and misc['SpaceAfter'] == 'No'
|
||||||
|
ner = misc['NER'] if 'NER' in misc else 'O'
|
||||||
|
self.items.append([token, lemma, upos, upos_other, xpos, no_space_after, ner])
|
||||||
|
|
||||||
def add_link(self, link_ref, link_type):
|
def add_link(self, link_ref, link_type):
|
||||||
self.links.append([link_ref, link_type])
|
self.links.append([link_ref, link_type])
|
||||||
|
|
||||||
|
def add_srl_link(self, link_ref, link_type):
|
||||||
|
self.srl_links.append([link_ref, link_type])
|
||||||
|
|
||||||
def as_xml(self, id_prefix=None):
|
def as_xml(self, id_prefix=None):
|
||||||
if id_prefix:
|
if id_prefix:
|
||||||
xml_id = id_prefix + '.' + self._id
|
xml_id = id_prefix + '.' + self._id
|
||||||
@@ -27,8 +40,24 @@ class Sentence:
|
|||||||
set_xml_attr(base, 'id', xml_id)
|
set_xml_attr(base, 'id', xml_id)
|
||||||
id_counter = 1
|
id_counter = 1
|
||||||
|
|
||||||
|
in_seg = False
|
||||||
|
sentence_base = base
|
||||||
|
|
||||||
for item in self.items:
|
for item in self.items:
|
||||||
token, lemma, upos, upos_other, xpos, no_space_after = item
|
token, lemma, upos, upos_other, xpos, no_space_after, ner = item
|
||||||
|
|
||||||
|
if ner[0] == 'B':
|
||||||
|
if in_seg:
|
||||||
|
sentence_base.append(base)
|
||||||
|
in_seg = True
|
||||||
|
base = etree.Element('seg')
|
||||||
|
base.set('type', 'name')
|
||||||
|
base.set('subtype', f'{ner[2:].lower()}')
|
||||||
|
elif ner[0] == 'O':
|
||||||
|
if in_seg:
|
||||||
|
sentence_base.append(base)
|
||||||
|
base = sentence_base
|
||||||
|
in_seg = False
|
||||||
|
|
||||||
if xpos in {'U', 'Z'}: # hmm, safe only as long as U is unused in English tagset and Z in Slovenian one
|
if xpos in {'U', 'Z'}: # hmm, safe only as long as U is unused in English tagset and Z in Slovenian one
|
||||||
to_add = etree.Element('pc')
|
to_add = etree.Element('pc')
|
||||||
@@ -53,6 +82,11 @@ class Sentence:
|
|||||||
|
|
||||||
base.append(to_add)
|
base.append(to_add)
|
||||||
|
|
||||||
|
if in_seg:
|
||||||
|
sentence_base.append(base)
|
||||||
|
base = sentence_base
|
||||||
|
|
||||||
|
# depparsing linkGrp
|
||||||
link_grp = etree.Element('linkGrp')
|
link_grp = etree.Element('linkGrp')
|
||||||
link_grp.set('corresp', '#'+xml_id)
|
link_grp.set('corresp', '#'+xml_id)
|
||||||
link_grp.set('targFunc', 'head argument')
|
link_grp.set('targFunc', 'head argument')
|
||||||
@@ -67,6 +101,23 @@ class Sentence:
|
|||||||
link.set('target', '#' + xml_id + '.' + link_ref + ' #' + xml_id + '.' + str(link_id + 1))
|
link.set('target', '#' + xml_id + '.' + link_ref + ' #' + xml_id + '.' + str(link_id + 1))
|
||||||
link_grp.append(link)
|
link_grp.append(link)
|
||||||
base.append(link_grp)
|
base.append(link_grp)
|
||||||
|
|
||||||
|
# srl linkGrp
|
||||||
|
if self.srl_links:
|
||||||
|
link_grp = etree.Element('linkGrp')
|
||||||
|
link_grp.set('corresp', '#' + xml_id)
|
||||||
|
link_grp.set('targFunc', 'head argument')
|
||||||
|
link_grp.set('type', 'SRL')
|
||||||
|
for link_id, item in enumerate(self.srl_links):
|
||||||
|
link_ref, link_type = item
|
||||||
|
link = etree.Element('link')
|
||||||
|
link.set('ana', 'srl:' + link_type.replace(':', '_'))
|
||||||
|
if link_ref == u'0':
|
||||||
|
link.set('target', '#' + xml_id + ' #' + xml_id + '.' + str(link_id + 1))
|
||||||
|
else:
|
||||||
|
link.set('target', '#' + xml_id + '.' + link_ref + ' #' + xml_id + '.' + str(link_id + 1))
|
||||||
|
link_grp.append(link)
|
||||||
|
base.append(link_grp)
|
||||||
return base
|
return base
|
||||||
|
|
||||||
|
|
||||||
@@ -234,7 +285,7 @@ def construct_sentence(sent_id, lines):
|
|||||||
upos_other = tokens[5]
|
upos_other = tokens[5]
|
||||||
depparse_link = tokens[6]
|
depparse_link = tokens[6]
|
||||||
depparse_link_name = tokens[7]
|
depparse_link_name = tokens[7]
|
||||||
misc = tokens[9]
|
misc = {el.split('=')[0]: el.split('=')[1] for el in tokens[9].split('|')} if tokens[9] != '_' else {}
|
||||||
|
|
||||||
sentence.add_item(
|
sentence.add_item(
|
||||||
token,
|
token,
|
||||||
@@ -247,6 +298,11 @@ def construct_sentence(sent_id, lines):
|
|||||||
sentence.add_link(
|
sentence.add_link(
|
||||||
depparse_link,
|
depparse_link,
|
||||||
depparse_link_name)
|
depparse_link_name)
|
||||||
|
|
||||||
|
if 'SRL' in misc:
|
||||||
|
sentence.add_srl_link(
|
||||||
|
depparse_link,
|
||||||
|
misc['SRL'])
|
||||||
return sentence
|
return sentence
|
||||||
|
|
||||||
|
|
||||||
@@ -256,7 +312,7 @@ def construct_tei_etrees(conllu_lines):
|
|||||||
|
|
||||||
|
|
||||||
def convert_file(input_file_name, output_file_name):
|
def convert_file(input_file_name, output_file_name):
|
||||||
input_file = open(input_file_name, 'r')
|
input_file = open(input_file_name, 'r', encoding='utf-8')
|
||||||
root = construct_tei_etrees(input_file)[0]
|
root = construct_tei_etrees(input_file)[0]
|
||||||
tree = etree.ElementTree(root)
|
tree = etree.ElementTree(root)
|
||||||
tree.write(output_file_name, encoding='UTF-8', pretty_print=True)
|
tree.write(output_file_name, encoding='UTF-8', pretty_print=True)
|
||||||
@@ -267,19 +323,16 @@ def convert_file(input_file_name, output_file_name):
|
|||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
import argparse
|
|
||||||
from glob import glob
|
|
||||||
|
|
||||||
parser = argparse.ArgumentParser(description='Convert CoNNL-U to TEI.')
|
parser = argparse.ArgumentParser(description='Convert CoNNL-U to TEI.')
|
||||||
parser.add_argument('files', nargs='+', help='CoNNL-U file')
|
parser.add_argument('files', nargs='+', help='CoNNL-U file')
|
||||||
parser.add_argument('-o', '--out-file', dest='out', default=None,
|
parser.add_argument('-o', '--out-file', dest='out', default=None, help='Write output to file instead of stdout.')
|
||||||
help='Write output to file instead of stdout.')
|
|
||||||
parser.add_argument('-s', '--system', dest='system', default='jos', choices=['jos', 'ud'])
|
parser.add_argument('-s', '--system', dest='system', default='jos', choices=['jos', 'ud'])
|
||||||
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
if args.out:
|
if args.out:
|
||||||
f_out = open(args.out, 'w')
|
f_out = open(args.out, 'w', encoding='utf-8')
|
||||||
else:
|
else:
|
||||||
f_out = sys.stdout
|
f_out = sys.stdout
|
||||||
|
|
||||||
@@ -288,7 +341,7 @@ if __name__ == '__main__':
|
|||||||
for arg in args.files:
|
for arg in args.files:
|
||||||
filelist = glob(arg)
|
filelist = glob(arg)
|
||||||
for f in filelist:
|
for f in filelist:
|
||||||
with open(f, 'r') as conllu_f:
|
with open(f, 'r', encoding='utf-8') as conllu_f:
|
||||||
tei_etrees = construct_tei_etrees(conllu_f)
|
tei_etrees = construct_tei_etrees(conllu_f)
|
||||||
for tei_etree in tei_etrees:
|
for tei_etree in tei_etrees:
|
||||||
f_out.write(etree.tostring(tei_etree, pretty_print=True, encoding='utf-8').decode())
|
f_out.write(etree.tostring(tei_etree, pretty_print=True, encoding='utf-8').decode())
|
||||||
|
|||||||
@@ -1,12 +1,21 @@
|
|||||||
import lxml.etree as lxml
|
|
||||||
import re
|
import re
|
||||||
import pickle
|
import pickle
|
||||||
import importlib_resources as pkg_resources
|
import lxml.etree as lxml
|
||||||
|
from collections import defaultdict
|
||||||
|
from importlib_resources import files
|
||||||
|
|
||||||
|
from enum import Enum
|
||||||
|
|
||||||
from conversion_utils.utils import xpath_find, get_xml_id
|
from conversion_utils.utils import xpath_find, get_xml_id
|
||||||
|
|
||||||
JOS_SPECIFICATIONS_PICKLE_RESOURCE = 'jos_specifications.pickle'
|
JOS_SPECIFICATIONS_PICKLE_RESOURCE = 'jos_specifications.pickle'
|
||||||
|
|
||||||
|
RESOURCES_DIR = "conversion_utils.resources"
|
||||||
|
|
||||||
|
MSD_TO_FEATURES = "jos-msd2features.tbl"
|
||||||
|
JOS_TO_UD_FEATURES_RULES = "jos2ud-features.tbl"
|
||||||
|
JOS_TO_UPOS_RULES = "jos2ud-pos.tbl"
|
||||||
|
|
||||||
## Positions of lexeme-level features for each category
|
## Positions of lexeme-level features for each category
|
||||||
LEXEME_FEATURE_MAP = {'noun':{1,2},
|
LEXEME_FEATURE_MAP = {'noun':{1,2},
|
||||||
'verb':{1,2},
|
'verb':{1,2},
|
||||||
@@ -53,6 +62,14 @@ LEVEL_EXCEPTIONS = {('pronoun', 2, 'čezme'), ('zaimek', 2, 'čezme'),
|
|||||||
('pronoun', 8, 'se'), ('zaimek', 8, 'se'),
|
('pronoun', 8, 'se'), ('zaimek', 8, 'se'),
|
||||||
('pronoun', 8, 'ti'), ('zaimek', 8, 'ti')}
|
('pronoun', 8, 'ti'), ('zaimek', 8, 'ti')}
|
||||||
|
|
||||||
|
class MsdState(Enum):
|
||||||
|
FULL = 1
|
||||||
|
PARTIAL = 2
|
||||||
|
UNKNOWN = 3
|
||||||
|
|
||||||
|
class MsdException(Exception):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
class Specifications:
|
class Specifications:
|
||||||
"""JOS specifications with list of all word categories."""
|
"""JOS specifications with list of all word categories."""
|
||||||
@@ -216,6 +233,36 @@ class Properties:
|
|||||||
and self.language == obj.language
|
and self.language == obj.language
|
||||||
|
|
||||||
|
|
||||||
|
class UD:
|
||||||
|
"""Universal Dependencies object.
|
||||||
|
|
||||||
|
Can be converted to a valid UD features string.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, pos, features_map):
|
||||||
|
self.pos = pos
|
||||||
|
self.features_map = features_map
|
||||||
|
|
||||||
|
def to_features_string(self):
|
||||||
|
return self._features_string()
|
||||||
|
|
||||||
|
def to_full_string(self):
|
||||||
|
features = self._features_string()
|
||||||
|
if features:
|
||||||
|
return "UposTag=" + self.pos + "|" + features
|
||||||
|
else:
|
||||||
|
return "UposTag=" + self.pos
|
||||||
|
|
||||||
|
def _features_string(self):
|
||||||
|
return "|".join([f"{feature}={value}" for feature, value in self._sort_features(self.features_map)])
|
||||||
|
|
||||||
|
def _sort_features(self, features_map):
|
||||||
|
return sorted(features_map.items(), key=lambda x: x[0].lower(), reverse=False)
|
||||||
|
|
||||||
|
def __str__(self):
|
||||||
|
return f"pos={self.pos}, features_map={self.features_map}"
|
||||||
|
|
||||||
|
|
||||||
class Msd:
|
class Msd:
|
||||||
"""JOS msd."""
|
"""JOS msd."""
|
||||||
|
|
||||||
@@ -230,17 +277,15 @@ class Msd:
|
|||||||
return isinstance(obj, Msd) and self.code == obj.code and self.language == obj.language
|
return isinstance(obj, Msd) and self.code == obj.code and self.language == obj.language
|
||||||
|
|
||||||
|
|
||||||
class ConverterException(Exception):
|
|
||||||
pass
|
|
||||||
|
|
||||||
class Converter:
|
class Converter:
|
||||||
"""Converter between Msd and Properties objects."""
|
"""Converter between Msd and Properties objects."""
|
||||||
|
|
||||||
def __init__(self, xml_file_name=None):
|
def __init__(self, xml_file_name=None):
|
||||||
if (xml_file_name is None):
|
if (xml_file_name is None):
|
||||||
if (pkg_resources.is_resource('conversion_utils.resources', JOS_SPECIFICATIONS_PICKLE_RESOURCE)):
|
resource = files('conversion_utils.resources').joinpath(JOS_SPECIFICATIONS_PICKLE_RESOURCE)
|
||||||
|
if (resource.is_file()):
|
||||||
try:
|
try:
|
||||||
with pkg_resources.open_binary('conversion_utils.resources', JOS_SPECIFICATIONS_PICKLE_RESOURCE) as pickle_file:
|
with resource.open('rb') as pickle_file:
|
||||||
self.specifications = pickle.load(pickle_file)
|
self.specifications = pickle.load(pickle_file)
|
||||||
except:
|
except:
|
||||||
exit('Could not parse specifications pickle file installed.')
|
exit('Could not parse specifications pickle file installed.')
|
||||||
@@ -253,17 +298,84 @@ class Converter:
|
|||||||
except:
|
except:
|
||||||
exit('Could not parse specifications xml file provided.')
|
exit('Could not parse specifications xml file provided.')
|
||||||
|
|
||||||
def msd_to_properties(self, msd, language, lemma=None):
|
self.mte_to_ud_features = self._parse_msd_ud_conversion(MSD_TO_FEATURES)
|
||||||
"""Convert Msd to Properties (possibly in the other language).
|
self.mte_to_ud_features_rules = self._parse_ud_rules(JOS_TO_UD_FEATURES_RULES)
|
||||||
|
self.mte_to_upos_rules = self._parse_ud_rules(JOS_TO_UPOS_RULES)
|
||||||
|
|
||||||
|
def _parse_msd_ud_conversion(self, file_name):
|
||||||
|
"""Parse file with direct conversions from English Msd to Universal Dependencies."""
|
||||||
|
conversion_map = defaultdict()
|
||||||
|
with files(RESOURCES_DIR).joinpath(file_name).open("r", encoding="UTF-8") as conversion_file:
|
||||||
|
for line in conversion_file.readlines():
|
||||||
|
mte_msd_en, mte_features_en = line.strip("\n").split("\t")
|
||||||
|
mte_sl = self.translate_msd(Msd(mte_msd_en, "en"), "sl").code
|
||||||
|
conversion_map[mte_msd_en] = mte_features_en
|
||||||
|
conversion_map[mte_sl] = mte_features_en
|
||||||
|
return conversion_map
|
||||||
|
|
||||||
|
def _parse_ud_rules(self, file_name):
|
||||||
|
"""Parse file with additional rules that are applied to the conversion from English Msd to Universal Dependencies."""
|
||||||
|
all_rules = defaultdict(list)
|
||||||
|
with files(RESOURCES_DIR).joinpath(file_name).open("r", encoding="UTF-8") as rules_file:
|
||||||
|
for line in [l for l in rules_file.readlines() if l[0].isdigit()]:
|
||||||
|
priority, *current_rules = line.strip("\n").split("\t")
|
||||||
|
current_rules += [""] * (6 - len(current_rules))
|
||||||
|
all_rules[priority].append(current_rules)
|
||||||
|
return all_rules
|
||||||
|
|
||||||
|
def is_valid_msd(self, msd):
|
||||||
|
"""Verify if the Msd code is in the standard JOS set."""
|
||||||
|
return msd.code in self.specifications.codes_map[msd.language]
|
||||||
|
|
||||||
|
def get_msd_state(self, msd):
|
||||||
|
"""Determine if the Msd code is full, partial or unknown."""
|
||||||
|
code_map = self.specifications.codes_map[msd.language]
|
||||||
|
if msd.code in code_map:
|
||||||
|
return MsdState.FULL
|
||||||
|
for msd_code in code_map:
|
||||||
|
if msd_code.startswith(msd.code):
|
||||||
|
return MsdState.PARTIAL
|
||||||
|
return MsdState.UNKNOWN
|
||||||
|
|
||||||
|
def check_valid_msd(self, msd, require_valid_flag, allow_partial=True):
|
||||||
|
"""If the Msd code is not valid, raise an exception or give a warning."""
|
||||||
|
msd_state = self.get_msd_state(msd)
|
||||||
|
if msd_state == MsdState.UNKNOWN:
|
||||||
|
message = f"The msd '{msd.code}' is unknown"
|
||||||
|
if require_valid_flag:
|
||||||
|
raise MsdException(message)
|
||||||
|
else:
|
||||||
|
print('[WARN] ' + message)
|
||||||
|
if msd_state == MsdState.PARTIAL and not allow_partial:
|
||||||
|
raise MsdException(f"Partial msd '{msd.code}' is not allowed. Full msd is required.")
|
||||||
|
|
||||||
|
def msd_to_properties(self, msd, language, lemma=None, require_valid_flag=False, warn_level_flag=False):
|
||||||
|
"""Convert Msd to Properties.
|
||||||
|
|
||||||
|
The language of the generated Properties is specified and can differ from the Msd language.
|
||||||
|
|
||||||
|
If require_valid_flag is True, a MsdException is raised if the MSD is not in the standard
|
||||||
|
JOS set. Otherwise only a warning is given.
|
||||||
|
|
||||||
|
If you care about accurate level information (i.e., which properties are lexeme-level and
|
||||||
|
which are form-level), note that some features depend on the particular lemma. For such
|
||||||
|
features, if lemma is not provided and warn_level_flag is True, a warning will be given.
|
||||||
|
|
||||||
|
If a MSD has dashes in place of letters for certain features, they are skipped, so that
|
||||||
|
these features are not included in the generated Properties object.
|
||||||
|
|
||||||
|
Parameters:
|
||||||
|
msd(Msd): the JOS MSD to convert
|
||||||
|
language(str): the language for the Properties object to be generated: "en" (English) or "sl" (Slovene)
|
||||||
|
lemma(str): the lemma of the word form with the MSD
|
||||||
|
require_valid_flag(boolean): whether to raise a MsdException or only warn if a non-standard MSD is provided
|
||||||
|
warn_level_flag(boolean): whether to warn if the level of a property cannot be determined with certainty
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Properties: the result of the conversion of the Msd in the language requested
|
||||||
|
|
||||||
The level (lexeme vs form) of certain reflexive msd features
|
|
||||||
depends on the lemma, so set the lemma if you need accurate
|
|
||||||
level information.
|
|
||||||
"""
|
"""
|
||||||
|
self.check_valid_msd(msd, require_valid_flag)
|
||||||
if (msd.code not in self.specifications.codes_map[msd.language]):
|
|
||||||
raise ConverterException('The msd {} is unknown'.format(msd.code))
|
|
||||||
|
|
||||||
category_char = msd.code[0].lower()
|
category_char = msd.code[0].lower()
|
||||||
value_chars = msd.code[1:]
|
value_chars = msd.code[1:]
|
||||||
category = self.specifications.find_category_by_code(category_char, msd.language)
|
category = self.specifications.find_category_by_code(category_char, msd.language)
|
||||||
@@ -277,8 +389,8 @@ class Converter:
|
|||||||
value = feature.find_value_by_char(value_char, msd.language)
|
value = feature.find_value_by_char(value_char, msd.language)
|
||||||
feature_name = feature.names.get(language)
|
feature_name = feature.names.get(language)
|
||||||
feature_value = value.names.get(language)
|
feature_value = value.names.get(language)
|
||||||
if (lemma is None and (category_name, index) in [(le[0], le[1]) for le in LEVEL_EXCEPTIONS]):
|
if (warn_level_flag and lemma is None and (category_name, index) in [(le[0], le[1]) for le in LEVEL_EXCEPTIONS]):
|
||||||
print('[WARN] The level of feature (category={category}, position={position}) may be incorrect, as it is lemma-specific and no lemma has been specified.'
|
print('[WARN] The level (lexeme vs form) of feature (category={category}, position={position}) may be incorrect, as it is lemma-specific and no lemma has been specified.'
|
||||||
.format(category=category_name, position=index))
|
.format(category=category_name, position=index))
|
||||||
level_exception_flag = (category_name, feature.position, lemma) in LEVEL_EXCEPTIONS
|
level_exception_flag = (category_name, feature.position, lemma) in LEVEL_EXCEPTIONS
|
||||||
lexeme_level_flag = feature.lexeme_level_flag if not level_exception_flag else not feature.lexeme_level_flag
|
lexeme_level_flag = feature.lexeme_level_flag if not level_exception_flag else not feature.lexeme_level_flag
|
||||||
@@ -289,8 +401,21 @@ class Converter:
|
|||||||
form_feature_map[feature_name] = feature_value
|
form_feature_map[feature_name] = feature_value
|
||||||
return Properties(category_name, lexeme_feature_map, form_feature_map, language)
|
return Properties(category_name, lexeme_feature_map, form_feature_map, language)
|
||||||
|
|
||||||
def properties_to_msd(self, properties, language):
|
def properties_to_msd(self, properties, language, require_valid_flag=False):
|
||||||
"""Convert Properties to msd (possibly in the other language)."""
|
"""Convert Properties to Msd.
|
||||||
|
|
||||||
|
The language of the generated Msd is specified and can differ from the Properties language.
|
||||||
|
|
||||||
|
If require_valid_flag is True, a MsdException is raised if the generated MSD is not in
|
||||||
|
the standard JOS set. Otherwise only a warning is given.
|
||||||
|
|
||||||
|
Any skipped positions among the Properties are represented as dashes in the MSD.
|
||||||
|
|
||||||
|
Parameters:
|
||||||
|
properties(Properties): the properties to convert
|
||||||
|
language(str): the language for the Msd object to be returned: "en" (English) or "sl" (Slovene)
|
||||||
|
require_valid_flag(boolean): whether to raise a MsdException or only warn if a non-standard MSD is generated
|
||||||
|
"""
|
||||||
category = self.specifications.find_category_by_name(properties.category, properties.language)
|
category = self.specifications.find_category_by_name(properties.category, properties.language)
|
||||||
category_char = category.codes.get(language).upper()
|
category_char = category.codes.get(language).upper()
|
||||||
feature_map = properties.lexeme_feature_map.copy()
|
feature_map = properties.lexeme_feature_map.copy()
|
||||||
@@ -308,7 +433,51 @@ class Converter:
|
|||||||
msd_code += '-'
|
msd_code += '-'
|
||||||
i += 1
|
i += 1
|
||||||
msd_code += position_map[position]
|
msd_code += position_map[position]
|
||||||
return Msd(msd_code, language)
|
msd = Msd(msd_code, language)
|
||||||
|
self.check_valid_msd(msd, require_valid_flag)
|
||||||
|
return msd
|
||||||
|
|
||||||
|
def msd_to_ud(self, msd, lemma):
|
||||||
|
"""Convert Msd to Universal Dependencies object.
|
||||||
|
|
||||||
|
Partial Msds are currently not supported.
|
||||||
|
|
||||||
|
Parameters:
|
||||||
|
msd(Msd): the Msd to convert
|
||||||
|
lemma(str): the lemma of the word form with the MSD
|
||||||
|
"""
|
||||||
|
|
||||||
|
self.check_valid_msd(msd, False, allow_partial=False)
|
||||||
|
upos_category, *upos_features = self.mte_to_ud_features[msd.code].split()
|
||||||
|
final_upos = ""
|
||||||
|
|
||||||
|
for priority in sorted(self.mte_to_upos_rules, reverse=True):
|
||||||
|
for rule in self.mte_to_upos_rules[priority]:
|
||||||
|
rule_lemma, rule_category, rule_mte_features, _, rule_pos_ud, _ = rule
|
||||||
|
|
||||||
|
if (rule_category != upos_category
|
||||||
|
or (rule_lemma not in ("*", "*en") and lemma != rule_lemma)
|
||||||
|
or (rule_lemma == "*en" and not lemma.endswith("en"))
|
||||||
|
or (rule_mte_features != "*" and not all(f in upos_features for f in rule_mte_features.split("|")))):
|
||||||
|
continue
|
||||||
|
|
||||||
|
final_upos = rule_pos_ud
|
||||||
|
|
||||||
|
for priority in sorted(self.mte_to_ud_features_rules):
|
||||||
|
for rule in self.mte_to_ud_features_rules[priority]:
|
||||||
|
rule_lemma, rule_category, rule_mte_features, rule_pos_ud, rule_ud_features, _ = rule
|
||||||
|
|
||||||
|
if (rule_lemma != "*" and lemma != rule_lemma
|
||||||
|
or (rule_category != "*" and rule_category != upos_category)
|
||||||
|
or (rule_pos_ud != "*" and rule_pos_ud != final_upos)):
|
||||||
|
continue
|
||||||
|
|
||||||
|
upos_features = [rule_ud_features if f == rule_mte_features else f for f in upos_features]
|
||||||
|
if rule_mte_features == "*" and rule_ud_features != "-":
|
||||||
|
upos_features.append(rule_ud_features)
|
||||||
|
|
||||||
|
ud_features = dict(f.split("=", 1) for f in "|".join(upos_features).split("|") if f not in {"", "-"})
|
||||||
|
return UD(final_upos, ud_features)
|
||||||
|
|
||||||
def translate_msd(self, msd, language):
|
def translate_msd(self, msd, language):
|
||||||
return self.properties_to_msd(self.msd_to_properties(msd, language), language)
|
return self.properties_to_msd(self.msd_to_properties(msd, language), language)
|
||||||
|
|||||||
1900
conversion_utils/resources/jos-msd2features.tbl
Normal file
1900
conversion_utils/resources/jos-msd2features.tbl
Normal file
File diff suppressed because it is too large
Load Diff
128
conversion_utils/resources/jos2ud-features.tbl
Normal file
128
conversion_utils/resources/jos2ud-features.tbl
Normal file
@@ -0,0 +1,128 @@
|
|||||||
|
# Mapping from JOS features to UD features
|
||||||
|
# Kaja Dobrovoljc, Tomaž Erjavec, Simon Krek
|
||||||
|
# 2018-11-23
|
||||||
|
#
|
||||||
|
#Prio Lemma Category Feats PoS-UD ->Feature-UD #Comment
|
||||||
|
----------------------------------------------------------------------------------------------------
|
||||||
|
1 * Noun Type=common * -
|
||||||
|
1 * Noun Type=proper * -
|
||||||
|
|
||||||
|
1 * Verb Negative=no * Polarity=Pos
|
||||||
|
1 * Verb Negative=yes * Polarity=Neg
|
||||||
|
1 * Verb Type=auxiliary * -
|
||||||
|
1 * Verb Type=main * -
|
||||||
|
1 * Verb VForm=present * VerbForm=Fin|Mood=Ind|Tense=Pres
|
||||||
|
1 * Verb VForm=future * VerbForm=Fin|Mood=Ind|Tense=Fut
|
||||||
|
1 * Verb VForm=conditional * VerbForm=Fin|Mood=Cnd
|
||||||
|
1 * Verb VForm=imperative * VerbForm=Fin|Mood=Imp
|
||||||
|
1 * Verb VForm=infinitive * VerbForm=Inf
|
||||||
|
1 * Verb VForm=supine * VerbForm=Sup
|
||||||
|
1 * Verb VForm=participle * VerbForm=Part
|
||||||
|
|
||||||
|
1 * Adjective Type=general * -
|
||||||
|
1 * Adjective Type=possessive * Poss=Yes
|
||||||
|
1 * Adjective Type=participle * VerbForm=Part
|
||||||
|
|
||||||
|
2 * Adverb Type=participle * VerbForm=Conv
|
||||||
|
2 * Adverb Type=general * -
|
||||||
|
1 nekaj Adverb Type=general DET PronType=Ind
|
||||||
|
1 več Adverb Type=general DET PronType=Ind
|
||||||
|
1 veliko Adverb Type=general DET PronType=Ind
|
||||||
|
1 manj Adverb Type=general DET PronType=Ind
|
||||||
|
1 dovolj Adverb Type=general DET PronType=Ind
|
||||||
|
1 pol Adverb Type=general DET PronType=Ind
|
||||||
|
1 malo Adverb Type=general DET PronType=Ind
|
||||||
|
1 toliko Adverb Type=general DET PronType=Dem
|
||||||
|
1 največ Adverb Type=general DET PronType=Ind
|
||||||
|
1 mnogo Adverb Type=general DET PronType=Ind
|
||||||
|
1 preveč Adverb Type=general DET PronType=Ind
|
||||||
|
1 par Adverb Type=general DET PronType=Ind
|
||||||
|
1 koliko Adverb Type=general DET PronType=Int
|
||||||
|
1 dosti Adverb Type=general DET PronType=Ind
|
||||||
|
1 nešteto Adverb Type=general DET PronType=Ind
|
||||||
|
1 četrt Adverb Type=general DET PronType=Ind
|
||||||
|
1 ogromno Adverb Type=general DET PronType=Ind
|
||||||
|
1 čimveč Adverb Type=general DET PronType=Ind
|
||||||
|
1 obilo Adverb Type=general DET PronType=Ind
|
||||||
|
1 premnogo Adverb Type=general DET PronType=Ind
|
||||||
|
1 enormno Adverb Type=general DET PronType=Ind
|
||||||
|
1 majčkeno Adverb Type=general DET PronType=Ind
|
||||||
|
|
||||||
|
2 * Pronoun Type=reflexive * PronType=Prs|Reflex=Yes
|
||||||
|
2 * Pronoun Type=personal * PronType=Prs
|
||||||
|
2 * Pronoun Type=possessive * PronType=Prs|Poss=Yes
|
||||||
|
2 * Pronoun Type=interrogative * PronType=Int
|
||||||
|
2 * Pronoun Type=relative * PronType=Rel
|
||||||
|
2 * Pronoun Type=demonstrative * PronType=Dem
|
||||||
|
2 * Pronoun Type=general * PronType=Tot
|
||||||
|
2 * Pronoun Type=negative * PronType=Neg
|
||||||
|
2 * Pronoun Type=indefinite * PronType=Ind
|
||||||
|
1 * Pronoun Type=personal DET PronType=Prs
|
||||||
|
1 * Pronoun Type=possessive DET PronType=Prs|Poss=Yes
|
||||||
|
1 * Pronoun Owner_Gender=masculine * Gender[psor]=Masc #lg.spec.feature
|
||||||
|
1 * Pronoun Owner_Gender=feminine * Gender[psor]=Fem #lg.spec.feature
|
||||||
|
1 * Pronoun Owner_Gender=neuter * Gender[psor]=Neut #lg.spec.feature
|
||||||
|
1 * Pronoun Owner_Number=singular * Number[psor]=Sing #lg.spec.feature
|
||||||
|
1 * Pronoun Owner_Number=plural * Number[psor]=Plur #lg.spec.feature
|
||||||
|
1 * Pronoun Owner_Number=dual * Number[psor]=Dual #lg.spec.feature
|
||||||
|
1 * Pronoun Clitic=yes * Variant=Short #lg.spec.feature
|
||||||
|
1 * Pronoun Clitic=bound * Variant=Bound #lg.spec.feature
|
||||||
|
1 svoj Pronoun Type=reflexive * PronType=Prs|Reflex=Yes|Poss=Yes
|
||||||
|
|
||||||
|
2 * Numeral Type=pronominal * -
|
||||||
|
2 * Numeral Form=letter * -
|
||||||
|
2 * Numeral Type=cardinal NUM NumType=Card
|
||||||
|
1 * Numeral Form=letter NUM NumForm=Word #lg.spec.feature
|
||||||
|
1 * Numeral Form=digit NUM NumForm=Digit #lg.spec.feature
|
||||||
|
1 * Numeral Form=roman NUM NumForm=Roman #lg.spec.feature
|
||||||
|
1 * Numeral Type=ordinal * NumType=Ord
|
||||||
|
1 * Numeral Type=special ADJ NumType=Mult
|
||||||
|
1 * Numeral Type=special NUM NumType=Sets
|
||||||
|
1 en Numeral Type=pronominal * NumType=Card
|
||||||
|
1 eden Numeral Type=pronominal * NumType=Card
|
||||||
|
|
||||||
|
1 * Conjunction Type=subordinating * -
|
||||||
|
1 * Conjunction Type=coordinating * -
|
||||||
|
|
||||||
|
2 * Particle * * -
|
||||||
|
1 ne Particle * * Polarity=Neg
|
||||||
|
|
||||||
|
1 * Interjection * * -
|
||||||
|
|
||||||
|
1 * Abbreviation * * Abbr=Yes
|
||||||
|
|
||||||
|
2 * Residual * * -
|
||||||
|
1 * Residual Type=foreign * Foreign=Yes
|
||||||
|
1 * Residual Type=typo * -
|
||||||
|
1 * Residual Type=program * -
|
||||||
|
|
||||||
|
1 * Punctuation * * -
|
||||||
|
|
||||||
|
2 * * Degree=positive * Degree=Pos
|
||||||
|
2 * * Degree=comparative * Degree=Cmp
|
||||||
|
2 * * Degree=superlative * Degree=Sup
|
||||||
|
1 * * Degree=positive DET -
|
||||||
|
1 * * Degree=comparative DET -
|
||||||
|
1 * * Degree=superlative DET -
|
||||||
|
1 * * Animate=no * Animacy=Inan
|
||||||
|
1 * * Animate=yes * Animacy=Anim
|
||||||
|
1 * * Aspect=perfective * Aspect=Perf
|
||||||
|
1 * * Aspect=progressive * Aspect=Imp
|
||||||
|
1 * * Aspect=biaspectual * -
|
||||||
|
1 * * Case=nominative * Case=Nom
|
||||||
|
1 * * Case=genitive * Case=Gen
|
||||||
|
1 * * Case=dative * Case=Dat
|
||||||
|
1 * * Case=accusative * Case=Acc
|
||||||
|
1 * * Case=locative * Case=Loc
|
||||||
|
1 * * Case=instrumental * Case=Ins
|
||||||
|
1 * * Definiteness=no * Definite=Ind
|
||||||
|
1 * * Definiteness=yes * Definite=Def
|
||||||
|
1 * * Gender=masculine * Gender=Masc
|
||||||
|
1 * * Gender=feminine * Gender=Fem
|
||||||
|
1 * * Gender=neuter * Gender=Neut
|
||||||
|
1 * * Number=singular * Number=Sing
|
||||||
|
1 * * Number=plural * Number=Plur
|
||||||
|
1 * * Number=dual * Number=Dual
|
||||||
|
1 * * Person=first * Person=1
|
||||||
|
1 * * Person=second * Person=2
|
||||||
|
1 * * Person=third * Person=3
|
||||||
282
conversion_utils/resources/jos2ud-pos.tbl
Normal file
282
conversion_utils/resources/jos2ud-pos.tbl
Normal file
@@ -0,0 +1,282 @@
|
|||||||
|
# Mapping from JOS PoS to UD 2.0 PoS
|
||||||
|
# Kaja Dobrovoljc, Tomaž Erjavec, Simon Krek
|
||||||
|
# 2019-02-04
|
||||||
|
#
|
||||||
|
#Prio Lemma Category Feats Deps ->PoS-UD #Comment
|
||||||
|
#-------------------------------------------------------------------------------------------------------
|
||||||
|
3 * Noun Type=common * NOUN
|
||||||
|
3 * Noun Type=proper * PROPN
|
||||||
|
|
||||||
|
3 * Verb * * VERB
|
||||||
|
|
||||||
|
2 * Verb Type=auxiliary * AUX #This one can in fact also be VERB, but this has to be determined by some other means
|
||||||
|
|
||||||
|
3 * Adjective * * ADJ
|
||||||
|
|
||||||
|
3 * Adverb * * ADV
|
||||||
|
1 četrt Adverb * * DET
|
||||||
|
1 čimmanj Adverb * * DET
|
||||||
|
1 čimveč Adverb * * DET
|
||||||
|
1 dosti Adverb * * DET
|
||||||
|
1 dovolj Adverb * * DET
|
||||||
|
1 enako Adverb * * ADV
|
||||||
|
1 enormno Adverb * * DET
|
||||||
|
1 ful Adverb * * ADV
|
||||||
|
1 koliko Adverb * * DET
|
||||||
|
1 majčkeno Adverb * * DET
|
||||||
|
1 maksimalno Adverb * * ADV
|
||||||
|
1 malce Adverb * * ADV
|
||||||
|
1 malo Adverb * * DET
|
||||||
|
1 manj Adverb * * DET
|
||||||
|
1 minimalno Adverb * * ADV
|
||||||
|
1 mnogo Adverb * * DET
|
||||||
|
1 najmanj Adverb * * ADV
|
||||||
|
1 največ Adverb * * DET
|
||||||
|
1 nekaj Adverb * * DET
|
||||||
|
1 nekoliko Adverb * * ADV
|
||||||
|
1 nemalo Adverb * * ADV
|
||||||
|
1 nešteto Adverb * * DET
|
||||||
|
1 nič Adverb * * ADV
|
||||||
|
1 ničkoliko Adverb * * DET
|
||||||
|
1 obilo Adverb * * DET
|
||||||
|
1 ogromno Adverb * * DET
|
||||||
|
1 par Adverb * * DET
|
||||||
|
1 pol Adverb * * DET
|
||||||
|
1 polno Adverb * * ADV
|
||||||
|
1 precej Adverb * * ADV
|
||||||
|
1 premalo Adverb * * ADV
|
||||||
|
1 premnogo Adverb * * DET
|
||||||
|
1 preveč Adverb * * DET
|
||||||
|
1 toliko Adverb * * DET
|
||||||
|
1 veliko Adverb * * DET
|
||||||
|
1 več Adverb * * DET
|
||||||
|
1 večidel Adverb * * ADV
|
||||||
|
1 vse Adverb * * ADV
|
||||||
|
1 zadosti Adverb * * ADV
|
||||||
|
|
||||||
|
##All Pronouns should be explicitly defined
|
||||||
|
##But they are not, because jos1M contains wrong lemmatisations, e.g. for "ti", "te" etc.
|
||||||
|
3 * Pronoun * * PRON
|
||||||
|
##2 * Pronoun Type=demonstrative * DET
|
||||||
|
##2 * Pronoun Type=possessive * DET
|
||||||
|
1 bogsigavedikakšen Pronoun Type=indefinite * DET
|
||||||
|
1 bogvedikaj Pronoun Type=indefinite * PRON
|
||||||
|
1 bogvedikateri Pronoun Type=indefinite * DET
|
||||||
|
1 bogvekaj Pronoun Type=indefinite * PRON
|
||||||
|
1 bogvekakšen Pronoun Type=indefinite * DET
|
||||||
|
1 bogvekateri Pronoun Type=indefinite * DET
|
||||||
|
1 bogvekolik Pronoun Type=indefinite * DET
|
||||||
|
1 bogvekolikšen Pronoun Type=indefinite * DET
|
||||||
|
1 čezme Pronoun Type=personal * PRON
|
||||||
|
1 čezse Pronoun Type=reflexive * PRON
|
||||||
|
1 čigar Pronoun Type=relative * DET
|
||||||
|
1 čigarkoli Pronoun Type=relative * DET
|
||||||
|
1 čigarsižebodi Pronoun Type=relative * DET
|
||||||
|
1 čigav Pronoun Type=interrogative * DET
|
||||||
|
1 čigaver Pronoun Type=relative * DET
|
||||||
|
1 čigaverkoli Pronoun Type=relative * DET
|
||||||
|
1 čigavršen Pronoun Type=relative * DET
|
||||||
|
1 čigavršnji Pronoun Type=relative * DET
|
||||||
|
1 enak Pronoun Type=indefinite * DET
|
||||||
|
1 enaki Pronoun Type=indefinite * DET
|
||||||
|
1 enakšen Pronoun Type=indefinite * DET
|
||||||
|
1 isti Pronoun Type=indefinite * DET
|
||||||
|
1 jaz Pronoun Type=personal * PRON
|
||||||
|
1 jest Pronoun Type=personal * PRON
|
||||||
|
1 kaj Pronoun Type=interrogative * PRON
|
||||||
|
1 kak Pronoun Type=interrogative * DET
|
||||||
|
1 kakov Pronoun Type=interrogative * DET
|
||||||
|
1 kakošen Pronoun Type=interrogative * DET
|
||||||
|
1 kakršen Pronoun Type=relative * DET
|
||||||
|
1 kakršenkoli Pronoun Type=relative * DET
|
||||||
|
1 kakršensižebodi Pronoun Type=relative * DET
|
||||||
|
1 kakšen Pronoun Type=interrogative * DET
|
||||||
|
1 kar Pronoun Type=relative * PRON
|
||||||
|
1 karkoli Pronoun Type=relative * PRON
|
||||||
|
1 karsibodi Pronoun Type=relative * PRON
|
||||||
|
1 karsižebodi Pronoun Type=relative * PRON
|
||||||
|
1 kateri Pronoun Type=interrogative * DET
|
||||||
|
1 katerikoli Pronoun Type=relative * DET
|
||||||
|
1 katerisibodi Pronoun Type=relative * DET
|
||||||
|
1 kdo Pronoun Type=interrogative * PRON
|
||||||
|
1 kdor Pronoun Type=relative * PRON
|
||||||
|
1 kdorkoli Pronoun Type=relative * PRON
|
||||||
|
1 kdorsibodi Pronoun Type=relative * PRON
|
||||||
|
1 kdorsižebodi Pronoun Type=relative * PRON
|
||||||
|
1 kdovekaj Pronoun Type=indefinite * PRON
|
||||||
|
1 kdovekak Pronoun Type=indefinite * DET
|
||||||
|
1 kdovekakšen Pronoun Type=indefinite * DET
|
||||||
|
1 kdovekateri Pronoun Type=indefinite * DET
|
||||||
|
1 kdovekdo Pronoun Type=indefinite * PRON
|
||||||
|
1 kdovekolik Pronoun Type=indefinite * DET
|
||||||
|
1 koji Pronoun Type=interrogative * DET
|
||||||
|
1 kolik Pronoun Type=interrogative * DET
|
||||||
|
1 kolik Pronoun Type=indefinite * DET
|
||||||
|
1 koliker Pronoun Type=interrogative * DET
|
||||||
|
1 kolikršen Pronoun Type=relative * DET
|
||||||
|
1 kolikšen Pronoun Type=interrogative * DET
|
||||||
|
1 malokaj Pronoun Type=indefinite * PRON
|
||||||
|
1 malokak Pronoun Type=indefinite * DET
|
||||||
|
1 malokakšen Pronoun Type=indefinite * DET
|
||||||
|
1 malokateri Pronoun Type=indefinite * DET
|
||||||
|
1 malokdo Pronoun Type=indefinite * PRON
|
||||||
|
1 marsikaj Pronoun Type=indefinite * PRON
|
||||||
|
1 marsikak Pronoun Type=indefinite * DET
|
||||||
|
1 marsikakšen Pronoun Type=indefinite * DET
|
||||||
|
1 marsikateri Pronoun Type=indefinite * DET
|
||||||
|
1 marsikdo Pronoun Type=indefinite * PRON
|
||||||
|
1 marsičigav Pronoun Type=indefinite * DET
|
||||||
|
1 medme Pronoun Type=personal * PRON
|
||||||
|
1 medse Pronoun Type=reflexive * PRON
|
||||||
|
1 mnog Pronoun Type=indefinite * DET
|
||||||
|
1 mnogokaj Pronoun Type=indefinite * PRON
|
||||||
|
1 mnogokateri Pronoun Type=indefinite * DET
|
||||||
|
1 mnogokdo Pronoun Type=indefinite * PRON
|
||||||
|
1 moj Pronoun Type=possessive * DET
|
||||||
|
1 nadme Pronoun Type=personal * PRON
|
||||||
|
1 nadse Pronoun Type=reflexive * PRON
|
||||||
|
1 najin Pronoun Type=possessive * DET
|
||||||
|
1 name Pronoun Type=personal * PRON
|
||||||
|
1 nase Pronoun Type=reflexive * PRON
|
||||||
|
1 naš Pronoun Type=possessive * DET
|
||||||
|
1 negdo Pronoun Type=indefinite * PRON
|
||||||
|
1 nek Pronoun Type=indefinite * DET
|
||||||
|
1 nekaj Pronoun Type=indefinite * PRON
|
||||||
|
1 nekak Pronoun Type=indefinite * DET
|
||||||
|
1 nekakov Pronoun Type=indefinite * DET
|
||||||
|
1 nekakšen Pronoun Type=indefinite * DET
|
||||||
|
1 nekateri Pronoun Type=indefinite * DET
|
||||||
|
1 nekdo Pronoun Type=indefinite * PRON
|
||||||
|
1 neki Pronoun Type=indefinite * DET
|
||||||
|
1 nekolik Pronoun Type=indefinite * DET
|
||||||
|
1 nekolikšen Pronoun Type=indefinite * DET
|
||||||
|
1 nekolikšnji Pronoun Type=indefinite * DET
|
||||||
|
1 nekov Pronoun Type=indefinite * DET
|
||||||
|
1 nekšen Pronoun Type=indefinite * DET
|
||||||
|
1 nevemkakšen Pronoun Type=indefinite * DET
|
||||||
|
1 nihče Pronoun Type=negative * PRON
|
||||||
|
1 nikak Pronoun Type=negative * DET
|
||||||
|
1 nikakršen Pronoun Type=negative * DET
|
||||||
|
1 nikakšen Pronoun Type=negative * DET
|
||||||
|
1 nikdo Pronoun Type=negative * PRON
|
||||||
|
1 nikogaršen Pronoun Type=negative * DET
|
||||||
|
1 nikogaršnji Pronoun Type=negative * DET
|
||||||
|
1 nič Pronoun Type=negative * PRON
|
||||||
|
1 njegov Pronoun Type=possessive * DET
|
||||||
|
1 njen Pronoun Type=possessive * DET
|
||||||
|
1 njihen Pronoun Type=possessive * DET
|
||||||
|
1 njihnji Pronoun Type=possessive * DET
|
||||||
|
1 njihov Pronoun Type=possessive * DET
|
||||||
|
1 njun Pronoun Type=possessive * DET
|
||||||
|
1 nobeden Pronoun Type=negative * PRON
|
||||||
|
1 noben Pronoun Type=negative * DET
|
||||||
|
1 oba Pronoun Type=general * DET
|
||||||
|
1 obadva Pronoun Type=general * PRON
|
||||||
|
1 obme Pronoun Type=personal * PRON
|
||||||
|
1 oboj Pronoun Type=general * DET
|
||||||
|
1 obojen Pronoun Type=general * DET
|
||||||
|
1 obse Pronoun Type=reflexive * PRON
|
||||||
|
1 on Pronoun Type=personal * PRON
|
||||||
|
1 oni Pronoun Type=demonstrative * DET
|
||||||
|
1 onile Pronoun Type=demonstrative * PRON
|
||||||
|
1 podme Pronoun Type=personal * PRON
|
||||||
|
1 podse Pronoun Type=reflexive * PRON
|
||||||
|
1 pome Pronoun Type=personal * PRON
|
||||||
|
1 predme Pronoun Type=personal * PRON
|
||||||
|
1 predse Pronoun Type=reflexive * PRON
|
||||||
|
1 premarsikateri Pronoun Type=indefinite * DET
|
||||||
|
1 premnog Pronoun Type=indefinite * DET
|
||||||
|
1 prenekaj Pronoun Type=indefinite * PRON
|
||||||
|
1 prenekateri Pronoun Type=indefinite * DET
|
||||||
|
1 prenekdo Pronoun Type=indefinite * PRON
|
||||||
|
1 redkokateri Pronoun Type=indefinite * DET
|
||||||
|
1 redkokdo Pronoun Type=indefinite * PRON
|
||||||
|
1 se Pronoun Type=reflexive * PRON
|
||||||
|
1 skozme Pronoun Type=personal * PRON
|
||||||
|
1 skozse Pronoun Type=reflexive * PRON
|
||||||
|
1 svoj Pronoun Type=reflexive * DET
|
||||||
|
1 ta Pronoun Type=demonstrative * DET
|
||||||
|
1 tadva Pronoun Type=demonstrative * PRON
|
||||||
|
1 taisti Pronoun Type=demonstrative * DET
|
||||||
|
1 tak Pronoun Type=demonstrative * DET
|
||||||
|
1 takisti Pronoun Type=demonstrative * DET
|
||||||
|
1 takle Pronoun Type=demonstrative * DET
|
||||||
|
1 takov Pronoun Type=demonstrative * DET
|
||||||
|
1 takošen Pronoun Type=demonstrative * DET
|
||||||
|
1 takšen Pronoun Type=demonstrative * DET
|
||||||
|
1 takšenle Pronoun Type=demonstrative * DET
|
||||||
|
1 tale Pronoun Type=demonstrative * DET
|
||||||
|
1 talele Pronoun Type=demonstrative * DET
|
||||||
|
1 teu Pronoun Type=personal * PRON
|
||||||
|
1 ti Pronoun Type=personal * PRON
|
||||||
|
1 tisti Pronoun Type=demonstrative * DET
|
||||||
|
1 tistile Pronoun Type=demonstrative * DET
|
||||||
|
1 tolik Pronoun Type=demonstrative * DET
|
||||||
|
1 toliker Pronoun Type=demonstrative * DET
|
||||||
|
1 tolikšen Pronoun Type=demonstrative * DET
|
||||||
|
1 tolikšnji Pronoun Type=demonstrative * DET
|
||||||
|
1 toti Pronoun Type=demonstrative * DET
|
||||||
|
1 tvoj Pronoun Type=possessive * DET
|
||||||
|
1 un Pronoun Type=demonstrative * DET
|
||||||
|
1 vajin Pronoun Type=possessive * DET
|
||||||
|
1 vame Pronoun Type=personal * PRON
|
||||||
|
1 vase Pronoun Type=reflexive * PRON
|
||||||
|
1 vaš Pronoun Type=possessive * DET
|
||||||
|
1 ves Pronoun Type=general * DET
|
||||||
|
1 vsak Pronoun Type=general * DET
|
||||||
|
1 vsakateri Pronoun Type=general * DET
|
||||||
|
1 vsakdo Pronoun Type=general * PRON
|
||||||
|
1 vsakogaršen Pronoun Type=general * DET
|
||||||
|
1 vsakogaršnji Pronoun Type=general * DET
|
||||||
|
1 vsakršen Pronoun Type=general * DET
|
||||||
|
1 vsakteri Pronoun Type=general * DET
|
||||||
|
1 zame Pronoun Type=personal * PRON
|
||||||
|
1 zase Pronoun Type=reflexive * PRON
|
||||||
|
|
||||||
|
3 * Numeral Form=digit * NUM
|
||||||
|
3 * Numeral Form=roman * NUM
|
||||||
|
3 * Numeral Form=letter|Type=special * NUM
|
||||||
|
3 * Numeral Form=letter|Type=cardinal * NUM
|
||||||
|
2 * Numeral Form=letter|Type=ordinal * ADJ
|
||||||
|
1 drug Numeral Form=letter|Type=pronominal * ADJ
|
||||||
|
1 en Numeral Form=letter|Type=pronominal * NUM
|
||||||
|
1 *en Numeral Form=letter|Type=special * ADJ #enojen, dvojen
|
||||||
|
1 eden Numeral Form=letter|Type=pronominal * NUM #Added by E.T.
|
||||||
|
|
||||||
|
3 * Adposition * * ADP #MULTEXT-East name
|
||||||
|
3 * Preposition * * ADP #JOS name
|
||||||
|
|
||||||
|
3 * Conjunction Type=coordinating * CCONJ
|
||||||
|
3 * Conjunction Type=subordinating * SCONJ
|
||||||
|
|
||||||
|
3 * Particle * * PART
|
||||||
|
|
||||||
|
3 * Interjection * * INTJ
|
||||||
|
|
||||||
|
3 * Abbreviation * * X
|
||||||
|
|
||||||
|
3 * Residual * * X
|
||||||
|
2 * Residual Type=web * SYM
|
||||||
|
2 * Residual Type=emo * SYM
|
||||||
|
2 * Residual Type=hashtag * SYM #Better mapping?
|
||||||
|
2 * Residual Type=at * SYM #Better mapping?
|
||||||
|
2 * Residual Type=foreign * X #Better mapping?
|
||||||
|
|
||||||
|
3 * Punctuation * * PUNCT
|
||||||
|
1 # Punctuation * * SYM
|
||||||
|
1 % Punctuation * * SYM
|
||||||
|
1 & Punctuation * * SYM
|
||||||
|
1 < Punctuation * * SYM
|
||||||
|
1 > Punctuation * * SYM
|
||||||
|
1 + Punctuation * * SYM
|
||||||
|
1 = Punctuation * * SYM
|
||||||
|
1 ° Punctuation * * SYM
|
||||||
|
1 × Punctuation * * SYM
|
||||||
|
1 ÷ Punctuation * * SYM
|
||||||
|
1 $ Punctuation * * SYM
|
||||||
|
1 @ Punctuation * * SYM
|
||||||
|
1 µ Punctuation * * SYM
|
||||||
|
1 © Punctuation * * SYM
|
||||||
|
1 § Punctuation * * SYM
|
||||||
|
1 € Punctuation * * SYM
|
||||||
|
1 £ Punctuation * * SYM
|
||||||
@@ -1,12 +1,19 @@
|
|||||||
|
"""Convert a TEI file to a XML file of the CJVT standard schema.
|
||||||
|
|
||||||
|
This script was developed in the context of a specific task and may not generalise as expected. Use at your own risk.
|
||||||
|
"""
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
import lxml.etree as lxml
|
import lxml.etree as lxml
|
||||||
|
|
||||||
from conversion_utils.utils import xpath_find, get_xml_id, TEI_NAMESPACE_QUALIFIER
|
from conversion_utils.utils import xpath_find, TEI_NAMESPACE_QUALIFIER
|
||||||
|
|
||||||
|
|
||||||
def get_parsed_unit_string(parsed_unit):
|
def get_parsed_unit_string(parsed_unit):
|
||||||
elements = xpath_find(parsed_unit, 'tei:w|tei:pc')
|
elements = xpath_find(parsed_unit, 'tei:w|tei:pc')
|
||||||
return ''.join([e.text if e.get('join') == 'right' else e.text + ' ' for e in elements]).strip()
|
return ''.join([e.text if e.get('join') == 'right' else e.text + ' ' for e in elements]).strip()
|
||||||
|
|
||||||
|
|
||||||
def convert(input_file_name, output_file_name):
|
def convert(input_file_name, output_file_name):
|
||||||
|
|
||||||
output_root = lxml.Element('dictionary')
|
output_root = lxml.Element('dictionary')
|
||||||
@@ -55,4 +62,6 @@ if (__name__ == '__main__'):
|
|||||||
arg_parser.add_argument('-infile', type=str, help='Input TEI xml')
|
arg_parser.add_argument('-infile', type=str, help='Input TEI xml')
|
||||||
arg_parser.add_argument('-outfile', type=str, help='Output xml in standard cjvt schema')
|
arg_parser.add_argument('-outfile', type=str, help='Output xml in standard cjvt schema')
|
||||||
arguments = arg_parser.parse_args()
|
arguments = arg_parser.parse_args()
|
||||||
|
input_file_name = arguments.infile
|
||||||
|
output_file_name = arguments.outfile
|
||||||
convert(input_file_name, output_file_name)
|
convert(input_file_name, output_file_name)
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
import unittest
|
import unittest
|
||||||
|
|
||||||
from conversion_utils.jos_msds_and_properties import Converter, Msd
|
from conversion_utils.jos_msds_and_properties import Converter, Msd, MsdException
|
||||||
|
|
||||||
class JosMsdToPropertiesTestCase(unittest.TestCase):
|
class JosMsdToPropertiesTestCase(unittest.TestCase):
|
||||||
|
|
||||||
@@ -55,3 +55,25 @@ class JosMsdToPropertiesTestCase(unittest.TestCase):
|
|||||||
self.assertEqual(properties.category, 'punctuation')
|
self.assertEqual(properties.category, 'punctuation')
|
||||||
self.assertEqual(properties.lexeme_feature_map, {})
|
self.assertEqual(properties.lexeme_feature_map, {})
|
||||||
self.assertEqual(properties.form_feature_map, {})
|
self.assertEqual(properties.form_feature_map, {})
|
||||||
|
|
||||||
|
def test_good_msd_with_require_valid(self):
|
||||||
|
properties = self.converter.msd_to_properties(Msd('Ncfpd', 'en'), 'en', require_valid_flag=True)
|
||||||
|
self.assertEqual(properties.language, 'en')
|
||||||
|
self.assertEqual(properties.category, 'noun')
|
||||||
|
self.assertEqual(properties.lexeme_feature_map, {'type':'common', 'gender':'feminine'})
|
||||||
|
self.assertEqual(properties.form_feature_map, {'number':'plural', 'case':'dative'})
|
||||||
|
|
||||||
|
def test_bad_msd(self):
|
||||||
|
properties = self.converter.msd_to_properties(Msd('N---d', 'en'), 'en')
|
||||||
|
self.assertEqual(properties.language, 'en')
|
||||||
|
self.assertEqual(properties.category, 'noun')
|
||||||
|
self.assertEqual(properties.lexeme_feature_map, {})
|
||||||
|
self.assertEqual(properties.form_feature_map, {'case':'dative'})
|
||||||
|
|
||||||
|
def test_bad_msd_with_require_valid(self):
|
||||||
|
try:
|
||||||
|
self.converter.msd_to_properties(Msd('N---d', 'en'), 'en', require_valid_flag=True)
|
||||||
|
fails = False
|
||||||
|
except MsdException:
|
||||||
|
fails = True
|
||||||
|
self.assertEqual(fails, True)
|
||||||
@@ -1,6 +1,6 @@
|
|||||||
import unittest
|
import unittest
|
||||||
|
|
||||||
from conversion_utils.jos_msds_and_properties import Converter, Properties
|
from conversion_utils.jos_msds_and_properties import Converter, Properties, MsdException, Msd
|
||||||
|
|
||||||
class JosPropertiesToMsdTestCase(unittest.TestCase):
|
class JosPropertiesToMsdTestCase(unittest.TestCase):
|
||||||
|
|
||||||
@@ -41,3 +41,40 @@ class JosPropertiesToMsdTestCase(unittest.TestCase):
|
|||||||
msd = self.converter.properties_to_msd(Properties('punctuation', {}, {}, 'en'), 'sl')
|
msd = self.converter.properties_to_msd(Properties('punctuation', {}, {}, 'en'), 'sl')
|
||||||
self.assertEqual(msd.language, 'sl')
|
self.assertEqual(msd.language, 'sl')
|
||||||
self.assertEqual(msd.code, 'U')
|
self.assertEqual(msd.code, 'U')
|
||||||
|
|
||||||
|
def test_good_msd_with_require_valid(self):
|
||||||
|
msd = self.converter.properties_to_msd(Properties('noun', {'type':'common', 'gender':'feminine'}, {'number':'dual', 'case':'nominative'}, 'en'), 'en', require_valid_flag=True)
|
||||||
|
self.assertEqual(msd.language, 'en')
|
||||||
|
self.assertEqual(msd.code, 'Ncfdn')
|
||||||
|
|
||||||
|
def test_bad_msd(self):
|
||||||
|
msd = self.converter.properties_to_msd(Properties('noun', {'type':'common'}, {'number':'dual'}, 'en'), 'en')
|
||||||
|
self.assertEqual(msd.language, 'en')
|
||||||
|
self.assertEqual(msd.code, 'Nc-d')
|
||||||
|
|
||||||
|
def test_msd_to_jos(self):
|
||||||
|
ud = self.converter.msd_to_ud(Msd('Ppnzei', 'sl'), 'slovenski')
|
||||||
|
self.assertEqual(ud.pos, 'ADJ')
|
||||||
|
self.assertEqual(ud.to_full_string(), 'UposTag=ADJ|Case=Nom|Degree=Pos|Gender=Fem|Number=Sing')
|
||||||
|
self.assertEqual(ud.to_features_string(), 'Case=Nom|Degree=Pos|Gender=Fem|Number=Sing')
|
||||||
|
|
||||||
|
ud = self.converter.msd_to_ud(Msd('Sommr', 'sl'), 'dečko')
|
||||||
|
self.assertEqual(ud.pos, 'NOUN')
|
||||||
|
self.assertEqual(ud.to_full_string(), 'UposTag=NOUN|Case=Gen|Gender=Masc|Number=Plur')
|
||||||
|
self.assertEqual(ud.to_features_string(), 'Case=Gen|Gender=Masc|Number=Plur')
|
||||||
|
|
||||||
|
def test_msd_to_jos_partial_msd(self):
|
||||||
|
try:
|
||||||
|
self.converter.msd_to_ud(Msd('Soz', 'sl'), 'vlada')
|
||||||
|
fails = False
|
||||||
|
except MsdException:
|
||||||
|
fails = True
|
||||||
|
self.assertEqual(fails, True)
|
||||||
|
|
||||||
|
def test_bad_msd_with_require_valid(self):
|
||||||
|
try:
|
||||||
|
self.converter.properties_to_msd(Properties('noun', {'type':'common'}, {'number':'dual'}, 'en'), 'en', require_valid_flag=True)
|
||||||
|
fails = False
|
||||||
|
except MsdException:
|
||||||
|
fails = True
|
||||||
|
self.assertEqual(fails, True)
|
||||||
@@ -1,26 +1,29 @@
|
|||||||
#!/usr/bin/python3
|
"""Convert the MSD and/or syntactic dependency tags in a CoNLL-U file from English to Slovene tags.
|
||||||
# -*- coding: utf-8 -*-
|
|
||||||
|
This script was developed in the context of a specific task and may not generalise as expected. Use at your own risk.
|
||||||
|
"""
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
import codecs
|
|
||||||
import lxml.etree as lxml
|
import lxml.etree as lxml
|
||||||
from importlib_resources import files
|
from importlib_resources import files
|
||||||
|
|
||||||
from conversion_utils.jos_msds_and_properties import Converter, Msd
|
from conversion_utils.jos_msds_and_properties import Converter, Msd
|
||||||
|
|
||||||
|
|
||||||
def get_syn_map():
|
def get_syn_map():
|
||||||
dict_file_name = files('conversion_utils.resources').joinpath('dict.xml')
|
dict_file_name = files('conversion_utils.resources').joinpath('dict.xml')
|
||||||
dict_file = codecs.open(dict_file_name, 'r')
|
dict_file = open(dict_file_name, 'r', encoding='utf-8')
|
||||||
root = lxml.parse(dict_file).getroot()
|
root = lxml.parse(dict_file).getroot()
|
||||||
dict_file.close()
|
dict_file.close()
|
||||||
return {syn.get('en'):syn.get('sl') for syn in root.xpath('syns/syn')}
|
return {syn.get('en'):syn.get('sl') for syn in root.xpath('syns/syn')}
|
||||||
|
|
||||||
def translate(input_file_name, output_file_name):
|
|
||||||
|
def translate(input_file_name, scope, output_file_name):
|
||||||
|
|
||||||
syn_map = get_syn_map()
|
syn_map = get_syn_map()
|
||||||
|
|
||||||
output_file = codecs.open(output_file_name, 'w')
|
output_file = open(output_file_name, 'w', encoding='utf-8')
|
||||||
input_file = codecs.open(input_file_name, 'r')
|
input_file = open(input_file_name, 'r', encoding='utf-8')
|
||||||
|
|
||||||
converter = Converter()
|
converter = Converter()
|
||||||
|
|
||||||
@@ -29,8 +32,10 @@ def translate(input_file_name, output_file_name):
|
|||||||
if (len(columns) != 10):
|
if (len(columns) != 10):
|
||||||
output_file.write(line)
|
output_file.write(line)
|
||||||
else:
|
else:
|
||||||
columns[4] = converter.translate_msd(Msd(columns[4], 'en'), 'sl').code
|
if (scope in {'msd', 'both'}):
|
||||||
columns[7] = syn_map[columns[7]]
|
columns[4] = converter.translate_msd(Msd(columns[4], 'en'), 'sl').code
|
||||||
|
if (scope in {'dep', 'both'}):
|
||||||
|
columns[7] = syn_map[columns[7]]
|
||||||
output_file.write('\t'.join(columns) + '\n')
|
output_file.write('\t'.join(columns) + '\n')
|
||||||
|
|
||||||
input_file.close()
|
input_file.close()
|
||||||
@@ -41,6 +46,7 @@ if (__name__ == '__main__'):
|
|||||||
|
|
||||||
arg_parser = argparse.ArgumentParser(description='Translate JOS msds and dependency labels.')
|
arg_parser = argparse.ArgumentParser(description='Translate JOS msds and dependency labels.')
|
||||||
arg_parser.add_argument('-infile', type=str, help='Input conllu')
|
arg_parser.add_argument('-infile', type=str, help='Input conllu')
|
||||||
|
# Restrict -scope to the supported values. argparse spells this keyword `choices`;
# passing `options=` raises TypeError when the argument is registered.
# 'msd' translates the MSD column, 'dep' the dependency-label column, 'both' does both.
arg_parser.add_argument('-scope', type=str, choices=['msd', 'dep', 'both'], default='both', help='What to translate: MSDs (msd), dependency labels (dep), or both')
|
||||||
arg_parser.add_argument('-outfile', type=str, help='Output conllu')
|
arg_parser.add_argument('-outfile', type=str, help='Output conllu')
|
||||||
arguments = arg_parser.parse_args()
|
arguments = arg_parser.parse_args()
|
||||||
input_file_name = arguments.infile
|
input_file_name = arguments.infile
|
||||||
|
|||||||
@@ -1,11 +1,16 @@
|
|||||||
|
"""A few convenience TEI/XML constants and functions."""
|
||||||
|
|
||||||
|
|
||||||
TEI_NAMESPACE = 'http://www.tei-c.org/ns/1.0'
|
TEI_NAMESPACE = 'http://www.tei-c.org/ns/1.0'
|
||||||
TEI_NAMESPACE_QUALIFIER = '{' + TEI_NAMESPACE + '}'
|
TEI_NAMESPACE_QUALIFIER = '{' + TEI_NAMESPACE + '}'
|
||||||
XML_ID_ATTRIBUTE_NAME = '{http://www.w3.org/XML/1998/namespace}id'
|
XML_ID_ATTRIBUTE_NAME = '{http://www.w3.org/XML/1998/namespace}id'
|
||||||
|
|
||||||
|
|
||||||
def xpath_find(element,expression):
|
def xpath_find(element,expression):
|
||||||
"""Executes XPath expression, with TEI namespace."""
|
"""Executes XPath expression, with TEI namespace."""
|
||||||
return element.xpath(expression, namespaces={'tei':TEI_NAMESPACE})
|
return element.xpath(expression, namespaces={'tei':TEI_NAMESPACE})
|
||||||
|
|
||||||
|
|
||||||
def get_xml_id(element):
|
def get_xml_id(element):
|
||||||
"""Returns the element's @xml:id attribute."""
|
"""Returns the element's @xml:id attribute."""
|
||||||
return element.get(XML_ID_ATTRIBUTE_NAME)
|
return element.get(XML_ID_ATTRIBUTE_NAME)
|
||||||
|
|||||||
@@ -1,3 +1,13 @@
|
|||||||
|
"""Parse source TEI specifications and save as a pickle.
|
||||||
|
|
||||||
|
You can use this script to create a new pickle file to replace the one stored at
|
||||||
|
../conversion_utils/resources/jos_specifications.pickle. The input file is expected to be a version
|
||||||
|
of https://github.com/clarinsi/mte-msd/blob/master/xml/msd-sl.spc.xml. However, the specifications
|
||||||
|
are not expected to change, and if they do, the package pickle there should be updated upstream, so
|
||||||
|
you probably should not have to use this script.
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
import pickle
|
import pickle
|
||||||
import argparse
|
import argparse
|
||||||
from conversion_utils.jos_msds_and_properties import SpecificationsParser
|
from conversion_utils.jos_msds_and_properties import SpecificationsParser
|
||||||
|
|||||||
20
setup.py
20
setup.py
@@ -1,12 +1,20 @@
|
|||||||
from setuptools import setup
|
from setuptools import setup
|
||||||
|
import os
|
||||||
|
|
||||||
setup(name='conversion_utils',
|
here = os.path.abspath(os.path.dirname(__file__))
|
||||||
version='0.1',
|
with open(os.path.join(here, 'README.md'), encoding='utf-8') as f:
|
||||||
|
long_description = f.read()
|
||||||
|
|
||||||
|
setup(name='cjvt_conversion_utils',
|
||||||
|
version='0.3',
|
||||||
description='CJVT conversion utilities',
|
description='CJVT conversion utilities',
|
||||||
|
long_description=long_description,
|
||||||
|
long_description_content_type="text/markdown",
|
||||||
url='https://gitea.cjvt.si/generic/conversion_utils',
|
url='https://gitea.cjvt.si/generic/conversion_utils',
|
||||||
author='Cyprian Laskowski',
|
author='CJVT',
|
||||||
author_email='cyp@cjvt.si',
|
author_email='pypi@cjvt.si',
|
||||||
packages=['conversion_utils', 'conversion_utils.resources'],
|
license='MIT',
|
||||||
install_requires=['importlib_resources'],
|
packages=['conversion_utils', 'conversion_utils.resources', 'conversion_utils.tests'],
|
||||||
|
install_requires=['lxml', 'importlib_resources'],
|
||||||
include_package_data=True,
|
include_package_data=True,
|
||||||
zip_safe=True)
|
zip_safe=True)
|
||||||
|
|||||||
Reference in New Issue
Block a user