Small bug fix regarding UD features conversion to string

Added conversion from msd to universal dependencies based on Jaka's implementation
No warning messages for partial msds
2026-04-01 12:54:09 +02:00 · 2026-03-30 22:31:08 +02:00 · 2026-03-30 10:03:49 +02:00 · 2025-11-28 16:55:04 +01:00 · 2023-10-31 10:39:19 +01:00 · 2023-10-26 17:13:54 +02:00
19 changed files with 2715 additions and 55 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,7 @@
 *.pyc
 venv
 data
 .idea
 build
 dist
 *.egg-info
--- a/LICENSE.txt
+++ b/LICENSE.txt
@@ -0,0 +1,22 @@
 MIT License
 Copyright (c) 2023 CLARIN.SI
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
 in the Software without restriction, including without limitation the rights
 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 copies of the Software, and to permit persons to whom the Software is
 furnished to do so, subject to the following conditions:
 The above copyright notice and this permission notice shall be included in all
 copies or substantial portions of the Software.
 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 SOFTWARE.
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,3 +1,6 @@
 include conversion_utils/resources/jos_specifications.pickle
 include conversion_utils/resources/dict.xml
 include conversion_utils/resources/structure_conversions.csv
 include conversion_utils/resources/jos-msd2features.tbl
 include conversion_utils/resources/jos2ud-features.tbl
 include conversion_utils/resources/jos2ud-pos.tbl
--- a/README.md
+++ b/README.md
@@ -1,7 +1,8 @@
-## Conversion utilities
+## CJVT conversion utilities
-This repository is currently intended for common conversions needed by CJVT developers. For the
+This repository is intended for common conversions needed by CJVT developers. It can of course also
-moment, this is limited to JOS msds and properties.
+be used more broadly, but most of the scripts (with the exception of `jos_msds_and_properties.py`)
 were written with specific tasks in mind, and may not generalise as expected. Use at your own risk.
 ### JOS msds and properties
--- a/conversion_utils/conllu_to_tei.py
+++ b/conversion_utils/conllu_to_tei.py
@@ -1,23 +1,36 @@
 """Convert a series of CoNNL-U files to a TEI file.
 This script was developed in the context of a specific task and may not generalise as expected. Use at your own risk.
 """
 import argparse
 import re
 import sys
-
+from glob import glob
 from lxml import etree
 class Sentence:
    def __init__(self, _id, no_ud=False, system='jos'):
        self._id = _id
        self.items = []
        self.links = []
        self.srl_links = []
        self.no_ud = no_ud
        self.system = system
    def add_item(self, token, lemma, upos, upos_other, xpos, misc):
-        self.items.append([token, lemma, upos, upos_other, xpos, "SpaceAfter=No" in misc.split('|')])
+        no_space_after = 'SpaceAfter' in misc and misc['SpaceAfter'] == 'No'
        ner = misc['NER'] if 'NER' in misc else 'O'
        self.items.append([token, lemma, upos, upos_other, xpos, no_space_after, ner])
    def add_link(self, link_ref, link_type):
        """Record one dependency link as a [link_ref, link_type] pair, in call order."""
        link = [link_ref, link_type]
        self.links.append(link)
    def add_srl_link(self, link_ref, link_type):
        """Record one SRL link as a [link_ref, link_type] pair, in call order."""
        srl_link = [link_ref, link_type]
        self.srl_links.append(srl_link)
    def as_xml(self, id_prefix=None):
        if id_prefix:
            xml_id = id_prefix + '.' + self._id
@@ -27,8 +40,24 @@ class Sentence:
        set_xml_attr(base, 'id', xml_id)
        id_counter = 1
        in_seg = False
        sentence_base = base
        for item in self.items:
-            token, lemma, upos, upos_other, xpos, no_space_after = item
+            token, lemma, upos, upos_other, xpos, no_space_after, ner = item
            if ner[0] == 'B':
                if in_seg:
                    sentence_base.append(base)
                in_seg = True
                base = etree.Element('seg')
                base.set('type', 'name')
                base.set('subtype', f'{ner[2:].lower()}')
            elif ner[0] == 'O':
                if in_seg:
                    sentence_base.append(base)
                    base = sentence_base
                in_seg = False
            if xpos in {'U', 'Z'}: # hmm, safe only as long as U is unused in English tagset and Z in Slovenian one
                to_add = etree.Element('pc')
@@ -53,6 +82,11 @@ class Sentence:
            base.append(to_add)
        if in_seg:
            sentence_base.append(base)
            base = sentence_base
        # depparsing linkGrp
        link_grp = etree.Element('linkGrp')
        link_grp.set('corresp', '#'+xml_id)
        link_grp.set('targFunc', 'head argument')
@@ -67,6 +101,23 @@ class Sentence:
                link.set('target', '#' + xml_id + '.' + link_ref + ' #' + xml_id + '.' + str(link_id + 1))
            link_grp.append(link)
        base.append(link_grp)
        # srl linkGrp
        if self.srl_links:
            link_grp = etree.Element('linkGrp')
            link_grp.set('corresp', '#' + xml_id)
            link_grp.set('targFunc', 'head argument')
            link_grp.set('type', 'SRL')
            for link_id, item in enumerate(self.srl_links):
                link_ref, link_type = item
                link = etree.Element('link')
                link.set('ana', 'srl:' + link_type.replace(':', '_'))
                if link_ref == u'0':
                    link.set('target', '#' + xml_id + ' #' + xml_id + '.' + str(link_id + 1))
                else:
                    link.set('target', '#' + xml_id + '.' + link_ref + ' #' + xml_id + '.' + str(link_id + 1))
                link_grp.append(link)
            base.append(link_grp)
        return base
@@ -234,7 +285,7 @@ def construct_sentence(sent_id, lines):
        upos_other = tokens[5]
        depparse_link = tokens[6]
        depparse_link_name = tokens[7]
-        misc = tokens[9]
+        misc = {el.split('=')[0]: el.split('=')[1] for el in tokens[9].split('|')} if tokens[9] != '_' else {}
        sentence.add_item(
                token,
@@ -247,6 +298,11 @@ def construct_sentence(sent_id, lines):
        sentence.add_link(
            depparse_link,
            depparse_link_name)
        if 'SRL' in misc:
            sentence.add_srl_link(
                depparse_link,
                misc['SRL'])
    return sentence
@@ -256,7 +312,7 @@ def construct_tei_etrees(conllu_lines):
 def convert_file(input_file_name, output_file_name):
-    input_file = open(input_file_name, 'r')
+    input_file = open(input_file_name, 'r', encoding='utf-8')
    root = construct_tei_etrees(input_file)[0]
    tree = etree.ElementTree(root)
    tree.write(output_file_name, encoding='UTF-8', pretty_print=True)
@@ -267,19 +323,16 @@ def convert_file(input_file_name, output_file_name):
 if __name__ == '__main__':
    import argparse
    from glob import glob
    parser = argparse.ArgumentParser(description='Convert CoNNL-U to TEI.')
    parser.add_argument('files', nargs='+', help='CoNNL-U file')
-    parser.add_argument('-o', '--out-file', dest='out', default=None,
+    parser.add_argument('-o', '--out-file', dest='out', default=None, help='Write output to file instead of stdout.')
                help='Write output to file instead of stdout.')
    parser.add_argument('-s', '--system', dest='system', default='jos', choices=['jos', 'ud'])
    args = parser.parse_args()
    if args.out:
-        f_out = open(args.out, 'w')
+        f_out = open(args.out, 'w', encoding='utf-8')
    else:
        f_out = sys.stdout
@@ -288,7 +341,7 @@ if __name__ == '__main__':
    for arg in args.files:
        filelist = glob(arg)
        for f in filelist:
-            with open(f, 'r') as conllu_f:
+            with open(f, 'r', encoding='utf-8') as conllu_f:
                tei_etrees = construct_tei_etrees(conllu_f)
            for tei_etree in tei_etrees:
                f_out.write(etree.tostring(tei_etree, pretty_print=True, encoding='utf-8').decode())
--- a/conversion_utils/jos_msds_and_properties.py
+++ b/conversion_utils/jos_msds_and_properties.py
@@ -1,12 +1,21 @@
 import lxml.etree as lxml
 import re
 import pickle
-import importlib_resources as pkg_resources
+import lxml.etree as lxml
 from collections import defaultdict
 from importlib_resources import files
 from enum import Enum
 from conversion_utils.utils import xpath_find, get_xml_id
 JOS_SPECIFICATIONS_PICKLE_RESOURCE = 'jos_specifications.pickle'
 RESOURCES_DIR = "conversion_utils.resources"
 MSD_TO_FEATURES = "jos-msd2features.tbl"
 JOS_TO_UD_FEATURES_RULES = "jos2ud-features.tbl"
 JOS_TO_UPOS_RULES = "jos2ud-pos.tbl"
 ## Positions of lexeme-level features for each category
 LEXEME_FEATURE_MAP = {'noun':{1,2},
                      'verb':{1,2},
@@ -53,6 +62,14 @@ LEVEL_EXCEPTIONS = {('pronoun', 2, 'čezme'), ('zaimek', 2, 'čezme'),
                    ('pronoun', 8, 'se'), ('zaimek', 8, 'se'),
                    ('pronoun', 8, 'ti'), ('zaimek', 8, 'ti')}
 class MsdState(Enum):
    """Relation of an Msd code to the codes listed in the JOS specifications."""
    # The code itself is listed in the specifications for its language.
    FULL = 1
    # The code is not listed, but at least one listed code starts with it.
    PARTIAL = 2
    # The code is neither listed nor a prefix of a listed code.
    UNKNOWN = 3
 class MsdException(Exception):
    """Raised when an Msd code is rejected by validation (unknown, or partial where a full one is required)."""
 class Specifications:
    """JOS specifications with list of all word categories."""
@@ -216,6 +233,36 @@ class Properties:
            and self.language == obj.language
 class UD:
    """A UPOS tag together with a map of Universal Dependencies features.

    The object can be rendered either as the bare features string or as a
    string that additionally carries the UPOS tag.
    """

    def __init__(self, pos, features_map):
        """Keep the UPOS tag and the feature-name -> value mapping."""
        self.features_map = features_map
        self.pos = pos

    def __str__(self):
        return f"pos={self.pos}, features_map={self.features_map}"

    def to_features_string(self):
        """Return the features as 'Name=Value' items joined with '|'."""
        return self._features_string()

    def to_full_string(self):
        """Return 'UposTag=<pos>' followed, when there are features, by '|' and the features string."""
        features = self._features_string()
        tagged = "UposTag=" + self.pos
        if not features:
            return tagged
        return tagged + "|" + features

    def _features_string(self):
        """Serialise the feature map in the order given by _sort_features."""
        ordered = self._sort_features(self.features_map)
        return "|".join(f"{name}={val}" for name, val in ordered)

    def _sort_features(self, features_map):
        """Return the (name, value) pairs ordered by lower-cased feature name."""
        def lowered_name(pair):
            return pair[0].lower()

        return sorted(features_map.items(), key=lowered_name)
 class Msd:
    """JOS msd."""  
@@ -230,17 +277,15 @@ class Msd:
        return isinstance(obj, Msd) and self.code == obj.code and self.language == obj.language
 class ConverterException(Exception):
    """Error type reserved for failures reported by the Converter."""
 class Converter:
    """Converter between Msd and Properties objects."""
    def __init__(self, xml_file_name=None):
        if (xml_file_name is None):
-            if (pkg_resources.is_resource('conversion_utils.resources', JOS_SPECIFICATIONS_PICKLE_RESOURCE)):
+            resource = files('conversion_utils.resources').joinpath(JOS_SPECIFICATIONS_PICKLE_RESOURCE)
            if (resource.is_file()):
                try:
-                    with pkg_resources.open_binary('conversion_utils.resources', JOS_SPECIFICATIONS_PICKLE_RESOURCE) as pickle_file:
+                    with resource.open('rb') as pickle_file:
                        self.specifications = pickle.load(pickle_file)
                except:
                    exit('Could not parse specifications pickle file installed.')
@@ -253,17 +298,84 @@ class Converter:
            except:
                exit('Could not parse specifications xml file provided.')
-    def msd_to_properties(self, msd, language, lemma=None):
+        self.mte_to_ud_features = self._parse_msd_ud_conversion(MSD_TO_FEATURES)
-        """Convert Msd to Properties (possibly in the other language).
+        self.mte_to_ud_features_rules = self._parse_ud_rules(JOS_TO_UD_FEATURES_RULES)
        self.mte_to_upos_rules = self._parse_ud_rules(JOS_TO_UPOS_RULES)
    def _parse_msd_ud_conversion(self, file_name):
        """Load the direct Msd -> feature-string table from a packaged resource.

        Every non-blank line of the resource must hold an English Msd code and
        its feature string separated by a single tab. The feature string is
        registered under the English code and under the Slovene code obtained
        through translate_msd, so either code can be used for lookup.

        Parameters:
        file_name(str): name of the table inside the RESOURCES_DIR package

        Returns:
        dict: Msd code (English or Slovene) mapped to its feature string
        """
        # A plain dict: the former defaultdict() had no default factory, so it
        # already raised KeyError on missing codes exactly like a dict does.
        conversion_map = {}
        resource = files(RESOURCES_DIR).joinpath(file_name)
        with resource.open("r", encoding="UTF-8") as conversion_file:
            # Iterate the file lazily instead of materialising it with readlines().
            for line in conversion_file:
                if not line.strip():
                    # A blank (e.g. trailing) line carries no mapping; without
                    # this guard the two-way unpacking below raises ValueError.
                    continue
                mte_msd_en, mte_features_en = line.strip("\n").split("\t")
                mte_msd_sl = self.translate_msd(Msd(mte_msd_en, "en"), "sl").code
                conversion_map[mte_msd_en] = mte_features_en
                conversion_map[mte_msd_sl] = mte_features_en
        return conversion_map
    def _parse_ud_rules(self, file_name):
        """Read a prioritised rule table used when converting an Msd to Universal Dependencies.

        Only lines whose first character is a digit are treated as rules; all
        other lines are ignored. A rule line is split on tabs: the first field
        is the priority (kept as a string) and the remaining fields are padded
        with empty strings up to six entries.

        Parameters:
        file_name(str): name of the rule table inside the RESOURCES_DIR package

        Returns:
        defaultdict(list): priority string mapped to the list of its rules, in file order
        """
        rules_by_priority = defaultdict(list)
        rules_path = files(RESOURCES_DIR).joinpath(file_name)
        with rules_path.open("r", encoding="UTF-8") as rules_file:
            for raw_line in rules_file.readlines():
                if not raw_line[0].isdigit():
                    continue
                fields = raw_line.strip("\n").split("\t")
                priority, rule_fields = fields[0], fields[1:]
                # Trailing columns may be missing in the table; pad so every rule has six fields.
                rule_fields.extend([""] * (6 - len(rule_fields)))
                rules_by_priority[priority].append(rule_fields)
        return rules_by_priority
    def is_valid_msd(self, msd):
        """Tell whether the Msd code is listed in the specifications for the Msd's own language."""
        code = msd.code
        known_codes = self.specifications.codes_map[msd.language]
        return code in known_codes
    def get_msd_state(self, msd):
        """Classify the Msd code against the specifications for its language.

        Returns:
        MsdState: FULL if the code is listed, PARTIAL if some listed code starts
        with it, UNKNOWN otherwise
        """
        known_codes = self.specifications.codes_map[msd.language]
        if msd.code in known_codes:
            return MsdState.FULL
        is_prefix = any(known.startswith(msd.code) for known in known_codes)
        return MsdState.PARTIAL if is_prefix else MsdState.UNKNOWN
    def check_valid_msd(self, msd, require_valid_flag, allow_partial=True):
        """Validate the Msd code, raising or warning depending on the flags.

        Parameters:
        msd(Msd): the Msd to check
        require_valid_flag(boolean): for an unknown code, raise if True, otherwise print a warning
        allow_partial(boolean): if False, a partial code is rejected with an exception

        Raises:
        MsdException: for an unknown code when require_valid_flag is True, or for a
        partial code when allow_partial is False
        """
        state = self.get_msd_state(msd)
        if state == MsdState.PARTIAL:
            if not allow_partial:
                raise MsdException(f"Partial msd '{msd.code}' is not allowed. Full msd is required.")
            return
        if state != MsdState.UNKNOWN:
            return
        message = f"The msd '{msd.code}' is unknown"
        if require_valid_flag:
            raise MsdException(message)
        print('[WARN] ' + message)
    def msd_to_properties(self, msd, language, lemma=None, require_valid_flag=False, warn_level_flag=False):
        """Convert Msd to Properties.
        The language of the generated Properties is specified and can differ from the Msd language.
        If require_valid_flag is True, a MsdException is raised if the MSD is not in the standard
        JOS set. Otherwise only a warning is given.
        If you care about accurate level information (i.e., which properties are lexeme-level and
        which are form-level), note that some features depends on the particular lemma. For such
        features, if lemma is not provided and warn_level_flag is True, a warning will be given.
        If a MSD has dashes in place of letters for certain features, they are skipped, so that
        these features are not included in the generated Properties object.
        Parameters:
        msd(Msd): the JOS MSD to convert
        language(str): the language for the Properties object to be generated: "en" (English) or "sl" (Slovene)
        lemma(str): the lemma of the word form with the MSD
        require_valid_flag(boolean): whether to raise a MsdException or only warn if a non-standard MSD is provided
        warn_level_flag(boolean): whether to warn if cannot be sure of level of a property
        Returns:
        Properties: the result of the conversion of the Msd in the language requested
        The level (lexeme vs form) of certain reflexive msd features
        depends on the lemma, so set the lemma if you need accurate
        level information.
        """
-
+        self.check_valid_msd(msd, require_valid_flag)
        if (msd.code not in self.specifications.codes_map[msd.language]):
            raise ConverterException('The msd {} is unknown'.format(msd.code))
        category_char = msd.code[0].lower()
        value_chars = msd.code[1:]
        category = self.specifications.find_category_by_code(category_char, msd.language)
@@ -277,8 +389,8 @@ class Converter:
                value = feature.find_value_by_char(value_char, msd.language)
                feature_name = feature.names.get(language)
                feature_value = value.names.get(language)
-                if (lemma is None and (category_name, index) in [(le[0], le[1]) for le in LEVEL_EXCEPTIONS]):
+                if (warn_level_flag and lemma is None and (category_name, index) in [(le[0], le[1]) for le in LEVEL_EXCEPTIONS]):
-                    print('[WARN] The level of feature (category={category}, position={position}) may be incorrect, as it is lemma-specific and no lemma has been specified.'
+                    print('[WARN] The level (lexeme vs form) of feature (category={category}, position={position}) may be incorrect, as it is lemma-specific and no lemma has been specified.'
                          .format(category=category_name, position=index))
                level_exception_flag = (category_name, feature.position, lemma) in LEVEL_EXCEPTIONS
                lexeme_level_flag = feature.lexeme_level_flag if not level_exception_flag else not feature.lexeme_level_flag
@@ -289,8 +401,21 @@ class Converter:
                    form_feature_map[feature_name] = feature_value
        return Properties(category_name, lexeme_feature_map, form_feature_map, language)
-    def properties_to_msd(self, properties, language):
+    def properties_to_msd(self, properties, language, require_valid_flag=False):
-        """Convert Properties to msd (possibly in the other language)."""
+        """Convert Properties to Msd.
        The language of the generated Msd is specified and can differ from the Properties language.
        If require_valid_flag is True, a MsdException is raised if the generated MSD is not in
        the standard JOS set. Otherwise only a warning is given.
        Any skipped positions among the Properties are represented as dashes in the MSD.
        Parameters:
        properties(Properties): the properties to convert
        language(str): the language for the Msd object to be returned: "en" (English) or "sl" (Slovene)
        require_valid_flag(boolean): whether to raise a MsdException or only warn if a non-standard MSD is generated
        """
        category = self.specifications.find_category_by_name(properties.category, properties.language)
        category_char = category.codes.get(language).upper()
        feature_map = properties.lexeme_feature_map.copy()
@@ -308,7 +433,51 @@ class Converter:
                msd_code += '-'
                i += 1
            msd_code += position_map[position]
-        return Msd(msd_code, language)
+        msd = Msd(msd_code, language)
        self.check_valid_msd(msd, require_valid_flag)
        return msd
    def msd_to_ud(self, msd, lemma):
        """Convert Msd to Universal Dependencies object.
        Partial Msds are currently not supported.
        The Msd code is first looked up in the direct Msd -> features table; the
        UPOS tag is then chosen by the UPOS rules and the features are rewritten
        by the feature rules.
        Parameters:
        msd(Msd): the Msd to convert
        lemma(str): the lemma of the word form with the MSD
        Returns:
        UD: the UPOS tag (empty string if no UPOS rule matched) and the UD feature map
        Raises:
        MsdException: if the Msd is partial
        KeyError: if the Msd code is missing from the direct conversion table
        """
        # Unknown codes only produce a warning here (require_valid_flag=False);
        # partial codes raise because allow_partial=False.
        self.check_valid_msd(msd, False, allow_partial=False)
        # The table value is whitespace-separated: the category first, then its features.
        upos_category, *upos_features = self.mte_to_ud_features[msd.code].split()
        final_upos = ""
        # Priorities are visited in descending sorted order and every matching rule
        # overwrites final_upos, so the last match in the lowest-sorted priority wins.
        # NOTE(review): the priority keys are strings, so the order is lexicographic
        # ("10" sorts before "2") -- fine while priorities are single digits; confirm.
        for priority in sorted(self.mte_to_upos_rules, reverse=True):
            for rule in self.mte_to_upos_rules[priority]:
                rule_lemma, rule_category, rule_mte_features, _, rule_pos_ud, _ = rule
                # Skip the rule unless the category matches, the lemma matches ("*" matches
                # any lemma, "*en" any lemma ending in "en") and all of the rule's
                # "|"-separated features are present ("*" matches any features).
                # NOTE(review): a "*en" rule calls lemma.endswith, which fails for lemma=None.
                if (rule_category != upos_category
                or (rule_lemma not in ("*", "*en") and lemma != rule_lemma)
                or (rule_lemma == "*en" and not lemma.endswith("en"))
                or (rule_mte_features != "*" and not all(f in upos_features for f in rule_mte_features.split("|")))):
                    continue
                final_upos = rule_pos_ud
        # Feature rules are applied in ascending sorted priority order. A feature that an
        # earlier rule has already replaced no longer equals its original text, so later
        # rules for the same original feature do not match it again.
        for priority in sorted(self.mte_to_ud_features_rules):
            for rule in self.mte_to_ud_features_rules[priority]:
                rule_lemma, rule_category, rule_mte_features, rule_pos_ud, rule_ud_features, _ = rule
                # Skip the rule when lemma, category or the chosen UPOS tag does not match
                # ("*" in a rule field matches anything).
                if (rule_lemma != "*" and lemma != rule_lemma
                or (rule_category != "*" and rule_category != upos_category)
                or (rule_pos_ud != "*" and rule_pos_ud != final_upos)):
                    continue
                # Replace each feature equal to the rule's source feature with the rule's
                # target, which may be "-" or several "|"-joined UD features.
                upos_features = [rule_ud_features if f == rule_mte_features else f for f in upos_features]
                # A rule with "*" as source feature adds its target instead, unless the target is "-".
                if rule_mte_features == "*" and rule_ud_features != "-":
                    upos_features.append(rule_ud_features)
        # Flatten "|"-joined replacements, drop empty and "-" entries, and split
        # each remaining item into name and value at the first "=".
        ud_features = dict(f.split("=", 1) for f in "|".join(upos_features).split("|") if f not in {"", "-"})
        return UD(final_upos, ud_features)
    def translate_msd(self, msd, language):
        """Return the Msd re-expressed in the given language.

        The Msd is converted to Properties in the target language and those
        Properties are converted back to an Msd in that same language.
        """
        properties = self.msd_to_properties(msd, language)
        return self.properties_to_msd(properties, language)
--- a/conversion_utils/resources/jos-msd2features.tbl
+++ b/conversion_utils/resources/jos-msd2features.tbl
--- a/conversion_utils/resources/jos2ud-features.tbl
+++ b/conversion_utils/resources/jos2ud-features.tbl
@@ -0,0 +1,128 @@
 # Mapping from JOS features to UD features						
 # Kaja Dobrovoljc, Tomaž Erjavec, Simon Krek						
 # 2018-11-23						
 #						
 #Prio	Lemma	Category	Feats	PoS-UD	->Feature-UD	#Comment
 ----------------------------------------------------------------------------------------------------						
 1	*	Noun	Type=common	*	-	
 1	*	Noun	Type=proper	*	-	
 1	*	Verb	Negative=no	*	Polarity=Pos	
 1	*	Verb	Negative=yes	*	Polarity=Neg	
 1	*	Verb	Type=auxiliary	*	-	
 1	*	Verb	Type=main	*	-	
 1	*	Verb	VForm=present	*	VerbForm=Fin|Mood=Ind|Tense=Pres	
 1	*	Verb	VForm=future	*	VerbForm=Fin|Mood=Ind|Tense=Fut	
 1	*	Verb	VForm=conditional	*	VerbForm=Fin|Mood=Cnd	
 1	*	Verb	VForm=imperative	*	VerbForm=Fin|Mood=Imp	
 1	*	Verb	VForm=infinitive	*	VerbForm=Inf	
 1	*	Verb	VForm=supine	*	VerbForm=Sup	
 1	*	Verb	VForm=participle	*	VerbForm=Part	
 1	*	Adjective	Type=general	*	-	
 1	*	Adjective	Type=possessive	*	Poss=Yes	
 1	*	Adjective	Type=participle	*	VerbForm=Part	
 2	*	Adverb	Type=participle	*	VerbForm=Conv	
 2	*	Adverb	Type=general	*	-	
 1	nekaj	Adverb	Type=general	DET	PronType=Ind	
 1	več	Adverb	Type=general	DET	PronType=Ind	
 1	veliko	Adverb	Type=general	DET	PronType=Ind	
 1	manj	Adverb	Type=general	DET	PronType=Ind	
 1	dovolj	Adverb	Type=general	DET	PronType=Ind	
 1	pol	Adverb	Type=general	DET	PronType=Ind	
 1	malo	Adverb	Type=general	DET	PronType=Ind	
 1	toliko	Adverb	Type=general	DET	PronType=Dem	
 1	največ	Adverb	Type=general	DET	PronType=Ind	
 1	mnogo	Adverb	Type=general	DET	PronType=Ind	
 1	preveč	Adverb	Type=general	DET	PronType=Ind	
 1	par	Adverb	Type=general	DET	PronType=Ind	
 1	koliko	Adverb	Type=general	DET	PronType=Int	
 1	dosti	Adverb	Type=general	DET	PronType=Ind	
 1	nešteto	Adverb	Type=general	DET	PronType=Ind	
 1	četrt	Adverb	Type=general	DET	PronType=Ind	
 1	ogromno	Adverb	Type=general	DET	PronType=Ind	
 1	čimveč	Adverb	Type=general	DET	PronType=Ind	
 1	obilo	Adverb	Type=general	DET	PronType=Ind	
 1	premnogo	Adverb	Type=general	DET	PronType=Ind	
 1	enormno	Adverb	Type=general	DET	PronType=Ind
 1	majčkeno	Adverb	Type=general	DET	PronType=Ind	
 2	*	Pronoun	Type=reflexive	*	PronType=Prs|Reflex=Yes	
 2	*	Pronoun	Type=personal	*	PronType=Prs	
 2	*	Pronoun	Type=possessive	*	PronType=Prs|Poss=Yes	
 2	*	Pronoun	Type=interrogative	*	PronType=Int	
 2	*	Pronoun	Type=relative	*	PronType=Rel	
 2	*	Pronoun	Type=demonstrative	*	PronType=Dem	
 2	*	Pronoun	Type=general	*	PronType=Tot	
 2	*	Pronoun	Type=negative	*	PronType=Neg	
 2	*	Pronoun	Type=indefinite	*	PronType=Ind	
 1	*	Pronoun	Type=personal	DET	PronType=Prs	
 1	*	Pronoun	Type=possessive	DET	PronType=Prs|Poss=Yes	
 1	*	Pronoun	Owner_Gender=masculine	*	Gender[psor]=Masc	#lg.spec.feature
 1	*	Pronoun	Owner_Gender=feminine	*	Gender[psor]=Fem	#lg.spec.feature
 1	*	Pronoun	Owner_Gender=neuter	*	Gender[psor]=Neut	#lg.spec.feature
 1	*	Pronoun	Owner_Number=singular	*	Number[psor]=Sing	#lg.spec.feature
 1	*	Pronoun	Owner_Number=plural	*	Number[psor]=Plur	#lg.spec.feature
 1	*	Pronoun	Owner_Number=dual	*	Number[psor]=Dual	#lg.spec.feature
 1	*	Pronoun	Clitic=yes	*	Variant=Short	#lg.spec.feature
 1	*	Pronoun	Clitic=bound	*	Variant=Bound	#lg.spec.feature
 1	svoj	Pronoun	Type=reflexive	*	PronType=Prs|Reflex=Yes|Poss=Yes	
 2	*	Numeral	Type=pronominal	*	-	
 2	*	Numeral	Form=letter	*	-	
 2	*	Numeral	Type=cardinal	NUM	NumType=Card	
 1	*	Numeral	Form=letter	NUM	NumForm=Word	#lg.spec.feature
 1	*	Numeral	Form=digit	NUM	NumForm=Digit	#lg.spec.feature
 1	*	Numeral	Form=roman	NUM	NumForm=Roman	#lg.spec.feature
 1	*	Numeral	Type=ordinal	*	NumType=Ord	
 1	*	Numeral	Type=special	ADJ	NumType=Mult	
 1	*	Numeral	Type=special	NUM	NumType=Sets	
 1	en	Numeral	Type=pronominal	*	NumType=Card	
 1	eden	Numeral	Type=pronominal	*	NumType=Card	
 1	*	Conjunction	Type=subordinating	*	-	
 1	*	Conjunction	Type=coordinating	*	-	
 2	*	Particle	*	*	-	
 1	ne	Particle	*	*	Polarity=Neg	
 1	*	Interjection	*	*	-	
 1	*	Abbreviation	*	*	Abbr=Yes	
 2	*	Residual	*	*	-	
 1	*	Residual	Type=foreign	*	Foreign=Yes	
 1	*	Residual	Type=typo	*	-	
 1	*	Residual	Type=program	*	-	
 1	*	Punctuation	*	*	-	
 2	*	*	Degree=positive	*	Degree=Pos	
 2	*	*	Degree=comparative	*	Degree=Cmp	
 2	*	*	Degree=superlative	*	Degree=Sup	
 1	*	*	Degree=positive	DET	-	
 1	*	*	Degree=comparative	DET	-	
 1	*	*	Degree=superlative	DET	-	
 1	*	*	Animate=no	*	Animacy=Inan	
 1	*	*	Animate=yes	*	Animacy=Anim	
 1	*	*	Aspect=perfective	*	Aspect=Perf	
 1	*	*	Aspect=progressive	*	Aspect=Imp	
 1	*	*	Aspect=biaspectual	*	-	
 1	*	*	Case=nominative	*	Case=Nom	
 1	*	*	Case=genitive	*	Case=Gen	
 1	*	*	Case=dative	*	Case=Dat	
 1	*	*	Case=accusative	*	Case=Acc	
 1	*	*	Case=locative	*	Case=Loc	
 1	*	*	Case=instrumental	*	Case=Ins	
 1	*	*	Definiteness=no	*	Definite=Ind	
 1	*	*	Definiteness=yes	*	Definite=Def	
 1	*	*	Gender=masculine	*	Gender=Masc	
 1	*	*	Gender=feminine	*	Gender=Fem	
 1	*	*	Gender=neuter	*	Gender=Neut	
 1	*	*	Number=singular	*	Number=Sing	
 1	*	*	Number=plural	*	Number=Plur	
 1	*	*	Number=dual	*	Number=Dual	
 1	*	*	Person=first	*	Person=1	
 1	*	*	Person=second	*	Person=2	
 1	*	*	Person=third	*	Person=3	
--- a/conversion_utils/resources/jos2ud-pos.tbl
+++ b/conversion_utils/resources/jos2ud-pos.tbl
@@ -0,0 +1,282 @@
 # Mapping from JOS PoS to UD 2.0 PoS						
 # Kaja Dobrovoljc, Tomaž Erjavec, Simon Krek						
 # 2019-02-04						
 #						
 #Prio	Lemma	Category	Feats	Deps	->PoS-UD	#Comment
 #-------------------------------------------------------------------------------------------------------						
 3	*	Noun	Type=common	*	NOUN	
 3	*	Noun	Type=proper	*	PROPN	
 3	*	Verb	*	*	VERB	
 2	*	Verb	Type=auxiliary	*	AUX	#This one can in fact also be VERB, but this has to be determined by some other means
 3	*	Adjective	*	*	ADJ	
 3	*	Adverb	*	*	ADV	
 1	četrt	Adverb	*	*	DET	
 1	čimmanj	Adverb	*	*	DET	
 1	čimveč	Adverb	*	*	DET	
 1	dosti	Adverb	*	*	DET	
 1	dovolj	Adverb	*	*	DET	
 1	enako	Adverb	*	*	ADV	
 1	enormno	Adverb	*	*	DET	
 1	ful	Adverb	*	*	ADV	
 1	koliko	Adverb	*	*	DET	
 1	majčkeno	Adverb	*	*	DET	
 1	maksimalno	Adverb	*	*	ADV	
 1	malce	Adverb	*	*	ADV	
 1	malo	Adverb	*	*	DET	
 1	manj	Adverb	*	*	DET	
 1	minimalno	Adverb	*	*	ADV	
 1	mnogo	Adverb	*	*	DET	
 1	najmanj	Adverb	*	*	ADV	
 1	največ	Adverb	*	*	DET	
 1	nekaj	Adverb	*	*	DET	
 1	nekoliko	Adverb	*	*	ADV	
 1	nemalo	Adverb	*	*	ADV	
 1	nešteto	Adverb	*	*	DET	
 1	nič	Adverb	*	*	ADV	
 1	ničkoliko	Adverb	*	*	DET	
 1	obilo	Adverb	*	*	DET	
 1	ogromno	Adverb	*	*	DET	
 1	par	Adverb	*	*	DET	
 1	pol	Adverb	*	*	DET	
 1	polno	Adverb	*	*	ADV	
 1	precej	Adverb	*	*	ADV	
 1	premalo	Adverb	*	*	ADV	
 1	premnogo	Adverb	*	*	DET	
 1	preveč	Adverb	*	*	DET	
 1	toliko	Adverb	*	*	DET	
 1	veliko	Adverb	*	*	DET	
 1	več	Adverb	*	*	DET	
 1	večidel	Adverb	*	*	ADV	
 1	vse	Adverb	*	*	ADV	
 1	zadosti	Adverb	*	*	ADV	
 ##All Pronouns should be explicitly defined						
 ##But are not because of jos1M wrong lemmatisations for e.g. "ti", "te" etc.						
 3	*	Pronoun	*	*	PRON	
 ##2	*	Pronoun	Type=demonstrative	*	DET	
 ##2	*	Pronoun	Type=possessive	*	DET	
 1	bogsigavedikakšen	Pronoun	Type=indefinite	*	DET	
 1	bogvedikaj	Pronoun	Type=indefinite	*	PRON	
 1	bogvedikateri	Pronoun	Type=indefinite	*	DET	
 1	bogvekaj	Pronoun	Type=indefinite	*	PRON	
 1	bogvekakšen	Pronoun	Type=indefinite	*	DET	
 1	bogvekateri	Pronoun	Type=indefinite	*	DET	
 1	bogvekolik	Pronoun	Type=indefinite	*	DET	
 1	bogvekolikšen	Pronoun	Type=indefinite	*	DET	
 1	čezme	Pronoun	Type=personal	*	PRON	
 1	čezse	Pronoun	Type=reflexive	*	PRON	
 1	čigar	Pronoun	Type=relative	*	DET	
 1	čigarkoli	Pronoun	Type=relative	*	DET	
 1	čigarsižebodi	Pronoun	Type=relative	*	DET	
 1	čigav	Pronoun	Type=interrogative	*	DET	
 1	čigaver	Pronoun	Type=relative	*	DET	
 1	čigaverkoli	Pronoun	Type=relative	*	DET	
 1	čigavršen	Pronoun	Type=relative	*	DET	
 1	čigavršnji	Pronoun	Type=relative	*	DET	
 1	enak	Pronoun	Type=indefinite	*	DET	
 1	enaki	Pronoun	Type=indefinite	*	DET	
 1	enakšen	Pronoun	Type=indefinite	*	DET	
 1	isti	Pronoun	Type=indefinite	*	DET	
 1	jaz	Pronoun	Type=personal	*	PRON	
 1	jest	Pronoun	Type=personal	*	PRON	
 1	kaj	Pronoun	Type=interrogative	*	PRON	
 1	kak	Pronoun	Type=interrogative	*	DET	
 1	kakov	Pronoun	Type=interrogative	*	DET	
 1	kakošen	Pronoun	Type=interrogative	*	DET	
 1	kakršen	Pronoun	Type=relative	*	DET	
 1	kakršenkoli	Pronoun	Type=relative	*	DET	
 1	kakršensižebodi	Pronoun	Type=relative	*	DET	
 1	kakšen	Pronoun	Type=interrogative	*	DET	
 1	kar	Pronoun	Type=relative	*	PRON	
 1	karkoli	Pronoun	Type=relative	*	PRON	
 1	karsibodi	Pronoun	Type=relative	*	PRON	
 1	karsižebodi	Pronoun	Type=relative	*	PRON	
 1	kateri	Pronoun	Type=interrogative	*	DET	
 1	katerikoli	Pronoun	Type=relative	*	DET	
 1	katerisibodi	Pronoun	Type=relative	*	DET	
 1	kdo	Pronoun	Type=interrogative	*	PRON	
 1	kdor	Pronoun	Type=relative	*	PRON	
 1	kdorkoli	Pronoun	Type=relative	*	PRON	
 1	kdorsibodi	Pronoun	Type=relative	*	PRON	
 1	kdorsižebodi	Pronoun	Type=relative	*	PRON	
 1	kdovekaj	Pronoun	Type=indefinite	*	PRON	
 1	kdovekak	Pronoun	Type=indefinite	*	DET	
 1	kdovekakšen	Pronoun	Type=indefinite	*	DET	
 1	kdovekateri	Pronoun	Type=indefinite	*	DET	
 1	kdovekdo	Pronoun	Type=indefinite	*	PRON	
 1	kdovekolik	Pronoun	Type=indefinite	*	DET	
 1	koji	Pronoun	Type=interrogative	*	DET	
 1	kolik	Pronoun	Type=interrogative	*	DET	
 1	kolik	Pronoun	Type=indefinite	*	DET	
 1	koliker	Pronoun	Type=interrogative	*	DET	
 1	kolikršen	Pronoun	Type=relative	*	DET	
 1	kolikšen	Pronoun	Type=interrogative	*	DET	
 1	malokaj	Pronoun	Type=indefinite	*	PRON	
 1	malokak	Pronoun	Type=indefinite	*	DET	
 1	malokakšen	Pronoun	Type=indefinite	*	DET	
 1	malokateri	Pronoun	Type=indefinite	*	DET	
 1	malokdo	Pronoun	Type=indefinite	*	PRON	
 1	marsikaj	Pronoun	Type=indefinite	*	PRON	
 1	marsikak	Pronoun	Type=indefinite	*	DET	
 1	marsikakšen	Pronoun	Type=indefinite	*	DET	
 1	marsikateri	Pronoun	Type=indefinite	*	DET	
 1	marsikdo	Pronoun	Type=indefinite	*	PRON	
 1	marsičigav	Pronoun	Type=indefinite	*	DET	
 1	medme	Pronoun	Type=personal	*	PRON	
 1	medse	Pronoun	Type=reflexive	*	PRON	
 1	mnog	Pronoun	Type=indefinite	*	DET	
 1	mnogokaj	Pronoun	Type=indefinite	*	PRON	
 1	mnogokateri	Pronoun	Type=indefinite	*	DET	
 1	mnogokdo	Pronoun	Type=indefinite	*	PRON	
 1	moj	Pronoun	Type=possessive	*	DET	
 1	nadme	Pronoun	Type=personal	*	PRON	
 1	nadse	Pronoun	Type=reflexive	*	PRON	
 1	najin	Pronoun	Type=possessive	*	DET	
 1	name	Pronoun	Type=personal	*	PRON	
 1	nase	Pronoun	Type=reflexive	*	PRON	
 1	naš	Pronoun	Type=possessive	*	DET	
 1	negdo	Pronoun	Type=indefinite	*	PRON	
 1	nek	Pronoun	Type=indefinite	*	DET	
 1	nekaj	Pronoun	Type=indefinite	*	PRON	
 1	nekak	Pronoun	Type=indefinite	*	DET	
 1	nekakov	Pronoun	Type=indefinite	*	DET	
 1	nekakšen	Pronoun	Type=indefinite	*	DET	
 1	nekateri	Pronoun	Type=indefinite	*	DET	
 1	nekdo	Pronoun	Type=indefinite	*	PRON	
 1	neki	Pronoun	Type=indefinite	*	DET	
 1	nekolik	Pronoun	Type=indefinite	*	DET	
 1	nekolikšen	Pronoun	Type=indefinite	*	DET	
 1	nekolikšnji	Pronoun	Type=indefinite	*	DET	
 1	nekov	Pronoun	Type=indefinite	*	DET	
 1	nekšen	Pronoun	Type=indefinite	*	DET	
 1	nevemkakšen	Pronoun	Type=indefinite	*	DET	
 1	nihče	Pronoun	Type=negative	*	PRON	
 1	nikak	Pronoun	Type=negative	*	DET	
 1	nikakršen	Pronoun	Type=negative	*	DET	
 1	nikakšen	Pronoun	Type=negative	*	DET	
 1	nikdo	Pronoun	Type=negative	*	PRON	
 1	nikogaršen	Pronoun	Type=negative	*	DET	
 1	nikogaršnji	Pronoun	Type=negative	*	DET	
 1	nič	Pronoun	Type=negative	*	PRON	
 1	njegov	Pronoun	Type=possessive	*	DET	
 1	njen	Pronoun	Type=possessive	*	DET	
 1	njihen	Pronoun	Type=possessive	*	DET	
 1	njihnji	Pronoun	Type=possessive	*	DET	
 1	njihov	Pronoun	Type=possessive	*	DET	
 1	njun	Pronoun	Type=possessive	*	DET	
 1	nobeden	Pronoun	Type=negative	*	PRON	
 1	noben	Pronoun	Type=negative	*	DET	
 1	oba	Pronoun	Type=general	*	DET	
 1	obadva	Pronoun	Type=general	*	PRON	
 1	obme	Pronoun	Type=personal	*	PRON	
 1	oboj	Pronoun	Type=general	*	DET	
 1	obojen	Pronoun	Type=general	*	DET	
 1	obse	Pronoun	Type=reflexive	*	PRON	
 1	on	Pronoun	Type=personal	*	PRON	
 1	oni	Pronoun	Type=demonstrative	*	DET	
 1	onile	Pronoun	Type=demonstrative	*	PRON	
 1	podme	Pronoun	Type=personal	*	PRON	
 1	podse	Pronoun	Type=reflexive	*	PRON	
 1	pome	Pronoun	Type=personal	*	PRON	
 1	predme	Pronoun	Type=personal	*	PRON	
 1	predse	Pronoun	Type=reflexive	*	PRON	
 1	premarsikateri	Pronoun	Type=indefinite	*	DET	
 1	premnog	Pronoun	Type=indefinite	*	DET	
 1	prenekaj	Pronoun	Type=indefinite	*	PRON	
 1	prenekateri	Pronoun	Type=indefinite	*	DET	
 1	prenekdo	Pronoun	Type=indefinite	*	PRON	
 1	redkokateri	Pronoun	Type=indefinite	*	DET	
 1	redkokdo	Pronoun	Type=indefinite	*	PRON	
 1	se	Pronoun	Type=reflexive	*	PRON	
 1	skozme	Pronoun	Type=personal	*	PRON	
 1	skozse	Pronoun	Type=reflexive	*	PRON	
 1	svoj	Pronoun	Type=reflexive	*	DET	
 1	ta	Pronoun	Type=demonstrative	*	DET	
 1	tadva	Pronoun	Type=demonstrative	*	PRON	
 1	taisti	Pronoun	Type=demonstrative	*	DET	
 1	tak	Pronoun	Type=demonstrative	*	DET	
 1	takisti	Pronoun	Type=demonstrative	*	DET	
 1	takle	Pronoun	Type=demonstrative	*	DET	
 1	takov	Pronoun	Type=demonstrative	*	DET	
 1	takošen	Pronoun	Type=demonstrative	*	DET	
 1	takšen	Pronoun	Type=demonstrative	*	DET	
 1	takšenle	Pronoun	Type=demonstrative	*	DET	
 1	tale	Pronoun	Type=demonstrative	*	DET	
 1	talele	Pronoun	Type=demonstrative	*	DET	
 1	teu	Pronoun	Type=personal	*	PRON	
 1	ti	Pronoun	Type=personal	*	PRON	
 1	tisti	Pronoun	Type=demonstrative	*	DET	
 1	tistile	Pronoun	Type=demonstrative	*	DET	
 1	tolik	Pronoun	Type=demonstrative	*	DET	
 1	toliker	Pronoun	Type=demonstrative	*	DET	
 1	tolikšen	Pronoun	Type=demonstrative	*	DET	
 1	tolikšnji	Pronoun	Type=demonstrative	*	DET	
 1	toti	Pronoun	Type=demonstrative	*	DET	
 1	tvoj	Pronoun	Type=possessive	*	DET	
 1	un	Pronoun	Type=demonstrative	*	DET	
 1	vajin	Pronoun	Type=possessive	*	DET	
 1	vame	Pronoun	Type=personal	*	PRON	
 1	vase	Pronoun	Type=reflexive	*	PRON	
 1	vaš	Pronoun	Type=possessive	*	DET	
 1	ves	Pronoun	Type=general	*	DET	
 1	vsak	Pronoun	Type=general	*	DET	
 1	vsakateri	Pronoun	Type=general	*	DET	
 1	vsakdo	Pronoun	Type=general	*	PRON	
 1	vsakogaršen	Pronoun	Type=general	*	DET	
 1	vsakogaršnji	Pronoun	Type=general	*	DET	
 1	vsakršen	Pronoun	Type=general	*	DET	
 1	vsakteri	Pronoun	Type=general	*	DET	
 1	zame	Pronoun	Type=personal	*	PRON	
 1	zase	Pronoun	Type=reflexive	*	PRON	
 3	*	Numeral	Form=digit	*	NUM	
 3	*	Numeral	Form=roman	*	NUM	
 3	*	Numeral	Form=letter|Type=special	*	NUM	
 3	*	Numeral	Form=letter|Type=cardinal	*	NUM	
 2	*	Numeral	Form=letter|Type=ordinal	*	ADJ	
 1	drug	Numeral	Form=letter|Type=pronominal	*	ADJ	
 1	en	Numeral	Form=letter|Type=pronominal	*	NUM	
 1	*en	Numeral	Form=letter|Type=special	*	ADJ	#enojen, dvojen
 1	eden	Numeral	Form=letter|Type=pronominal	*	NUM	#Dodal E.T.
 3	*	Adposition	*	*	ADP	#MULTEXT-East name
 3	*	Preposition	*	*	ADP	#JOS name
 3	*	Conjunction	Type=coordinating	*	CCONJ	
 3	*	Conjunction	Type=subordinating	*	SCONJ	
 3	*	Particle	*	*	PART	
 3	*	Interjection	*	*	INTJ	
 3	*	Abbreviation	*	*	X	
 3	*	Residual	*	*	X	
 2	*	Residual	Type=web	*	SYM	
 2	*	Residual	Type=emo	*	SYM	
 2	*	Residual	Type=hashtag	*	SYM	#Better mapping?
 2	*	Residual	Type=at	*	SYM	#Better mapping?
 2	*	Residual	Type=foreign	*	X	#Better mapping?
 3	*	Punctuation	*	*	PUNCT	
 1	#	Punctuation	*	*	SYM	
 1	%	Punctuation	*	*	SYM	
 1	&	Punctuation	*	*	SYM	
 1	<	Punctuation	*	*	SYM	
 1	>	Punctuation	*	*	SYM	
 1	+	Punctuation	*	*	SYM	
 1	=	Punctuation	*	*	SYM	
 1	°	Punctuation	*	*	SYM	
 1	×	Punctuation	*	*	SYM	
 1	÷	Punctuation	*	*	SYM	
 1	$	Punctuation	*	*	SYM	
 1	@	Punctuation	*	*	SYM	
 1	µ	Punctuation	*	*	SYM	
 1	©	Punctuation	*	*	SYM	
 1	§	Punctuation	*	*	SYM	
 1	€	Punctuation	*	*	SYM
 1	£	Punctuation	*	*	SYM
--- a/conversion_utils/tei_to_dictionary.py
+++ b/conversion_utils/tei_to_dictionary.py
@@ -1,12 +1,19 @@
 """Convert a TEI file to an XML file of the CJVT standard schema.
 This script was developed in the context of a specific task and may not generalise as expected. Use at your own risk.
 """
 import argparse
 import lxml.etree as lxml
-from conversion_utils.utils import xpath_find, get_xml_id, TEI_NAMESPACE_QUALIFIER
+from conversion_utils.utils import xpath_find, TEI_NAMESPACE_QUALIFIER
 def get_parsed_unit_string(parsed_unit):
    """Join the text of the unit's tei:w and tei:pc children into one string.

    A single space is appended after every token except those whose ``join``
    attribute equals ``'right'``; surrounding whitespace is stripped from the
    final result.
    """
    pieces = []
    for token in xpath_find(parsed_unit, 'tei:w|tei:pc'):
        if token.get('join') == 'right':
            # Token is glued to whatever follows it: no separating space.
            pieces.append(token.text)
        else:
            pieces.append(token.text + ' ')
    return ''.join(pieces).strip()
 def convert(input_file_name, output_file_name):
    output_root = lxml.Element('dictionary')
@@ -55,4 +62,6 @@ if (__name__ == '__main__'):
    arg_parser.add_argument('-infile', type=str, help='Input TEI xml')
    arg_parser.add_argument('-outfile', type=str, help='Output xml in standard cjvt schema')
    arguments = arg_parser.parse_args()
    input_file_name = arguments.infile
    output_file_name = arguments.outfile
    convert(input_file_name, output_file_name)
--- a/conversion_utils/tests/__init__.py
+++ b/conversion_utils/tests/__init__.py
--- a/conversion_utils/tests/test_jos_msd_to_properties.py
+++ b/conversion_utils/tests/test_jos_msd_to_properties.py
@@ -1,6 +1,6 @@
 import unittest
-from conversion_utils.jos_msds_and_properties import Converter, Msd
+from conversion_utils.jos_msds_and_properties import Converter, Msd, MsdException
 class JosMsdToPropertiesTestCase(unittest.TestCase):
@@ -55,3 +55,25 @@ class JosMsdToPropertiesTestCase(unittest.TestCase):
        self.assertEqual(properties.category, 'punctuation')
        self.assertEqual(properties.lexeme_feature_map, {})
        self.assertEqual(properties.form_feature_map, {})
    def test_good_msd_with_require_valid(self):
        # The MSD 'Ncfpd' must convert without error even when validation is requested.
        source_msd = Msd('Ncfpd', 'en')
        result = self.converter.msd_to_properties(source_msd, 'en', require_valid_flag=True)
        expected_lexeme_features = {'type': 'common', 'gender': 'feminine'}
        expected_form_features = {'number': 'plural', 'case': 'dative'}
        self.assertEqual(result.language, 'en')
        self.assertEqual(result.category, 'noun')
        self.assertEqual(result.lexeme_feature_map, expected_lexeme_features)
        self.assertEqual(result.form_feature_map, expected_form_features)
    def test_bad_msd(self):
        # Without require_valid_flag, the MSD 'N---d' is still converted:
        # only the case feature is expected to come out.
        partial_msd = Msd('N---d', 'en')
        result = self.converter.msd_to_properties(partial_msd, 'en')
        self.assertEqual(result.language, 'en')
        self.assertEqual(result.category, 'noun')
        self.assertEqual(result.lexeme_feature_map, {})
        self.assertEqual(result.form_feature_map, {'case': 'dative'})
    def test_bad_msd_with_require_valid(self):
        # With require_valid_flag=True the MSD 'N---d' must be rejected with MsdException.
        # assertRaises replaces the manual try/except/flag bookkeeping and reports
        # "MsdException not raised" instead of an opaque "False != True" on failure.
        with self.assertRaises(MsdException):
            self.converter.msd_to_properties(Msd('N---d', 'en'), 'en', require_valid_flag=True)
--- a/conversion_utils/tests/test_jos_properties_to_msd.py
+++ b/conversion_utils/tests/test_jos_properties_to_msd.py
@@ -1,6 +1,6 @@
 import unittest
-from conversion_utils.jos_msds_and_properties import Converter, Properties
+from conversion_utils.jos_msds_and_properties import Converter, Properties, MsdException, Msd
 class JosPropertiesToMsdTestCase(unittest.TestCase):
@@ -41,3 +41,40 @@ class JosPropertiesToMsdTestCase(unittest.TestCase):
        msd = self.converter.properties_to_msd(Properties('punctuation', {}, {}, 'en'), 'sl')
        self.assertEqual(msd.language, 'sl')
        self.assertEqual(msd.code, 'U')
    def test_good_msd_with_require_valid(self):
        # These noun properties must yield the MSD code 'Ncfdn' even when validation is requested.
        lexeme_features = {'type': 'common', 'gender': 'feminine'}
        form_features = {'number': 'dual', 'case': 'nominative'}
        noun_properties = Properties('noun', lexeme_features, form_features, 'en')
        result = self.converter.properties_to_msd(noun_properties, 'en', require_valid_flag=True)
        self.assertEqual(result.language, 'en')
        self.assertEqual(result.code, 'Ncfdn')
    def test_bad_msd(self):
        # Without require_valid_flag, properties that give only type and number are
        # still converted; the expected code is 'Nc-d'.
        partial_properties = Properties('noun', {'type': 'common'}, {'number': 'dual'}, 'en')
        result = self.converter.properties_to_msd(partial_properties, 'en')
        self.assertEqual(result.language, 'en')
        self.assertEqual(result.code, 'Nc-d')
    def test_msd_to_jos(self):
        # NOTE: despite its name, this test exercises Converter.msd_to_ud.
        # First case: MSD 'Ppnzei' with lemma 'slovenski' is expected to come out as ADJ.
        adjective_ud = self.converter.msd_to_ud(Msd('Ppnzei', 'sl'), 'slovenski')
        self.assertEqual(adjective_ud.pos, 'ADJ')
        self.assertEqual(adjective_ud.to_full_string(), 'UposTag=ADJ|Case=Nom|Degree=Pos|Gender=Fem|Number=Sing')
        self.assertEqual(adjective_ud.to_features_string(), 'Case=Nom|Degree=Pos|Gender=Fem|Number=Sing')

        # Second case: MSD 'Sommr' with lemma 'dečko' is expected to come out as NOUN.
        noun_ud = self.converter.msd_to_ud(Msd('Sommr', 'sl'), 'dečko')
        self.assertEqual(noun_ud.pos, 'NOUN')
        self.assertEqual(noun_ud.to_full_string(), 'UposTag=NOUN|Case=Gen|Gender=Masc|Number=Plur')
        self.assertEqual(noun_ud.to_features_string(), 'Case=Gen|Gender=Masc|Number=Plur')
    def test_msd_to_jos_partial_msd(self):
        # Converting the MSD 'Soz' to UD must raise MsdException.
        # assertRaises replaces the manual try/except/flag bookkeeping and gives a
        # descriptive failure message if no exception is raised.
        with self.assertRaises(MsdException):
            self.converter.msd_to_ud(Msd('Soz', 'sl'), 'vlada')
    def test_bad_msd_with_require_valid(self):
        # With require_valid_flag=True, properties that give only type and number
        # must be rejected with MsdException.
        # assertRaises replaces the manual try/except/flag bookkeeping and gives a
        # descriptive failure message if no exception is raised.
        incomplete_properties = Properties('noun', {'type': 'common'}, {'number': 'dual'}, 'en')
        with self.assertRaises(MsdException):
            self.converter.properties_to_msd(incomplete_properties, 'en', require_valid_flag=True)
--- a/conversion_utils/tests/test_jos_translate_msd.py
+++ b/conversion_utils/tests/test_jos_translate_msd.py
--- a/conversion_utils/tests/test_jos_translate_properties.py
+++ b/conversion_utils/tests/test_jos_translate_properties.py
--- a/conversion_utils/translate_conllu_jos.py
+++ b/conversion_utils/translate_conllu_jos.py
@@ -1,26 +1,29 @@
-#!/usr/bin/python3
+"""Convert the MSD and/or syntactic dependency tags in a CoNLL-U file from English to Slovene tags.
-# -*- coding: utf-8 -*-
+
 This script was developed in the context of a specific task and may not generalise as expected. Use at your own risk.
 """
 import argparse
 import codecs
 import lxml.etree as lxml
 from importlib_resources import files
 from conversion_utils.jos_msds_and_properties import Converter, Msd
 def get_syn_map():
    dict_file_name = files('conversion_utils.resources').joinpath('dict.xml')
-    dict_file = codecs.open(dict_file_name, 'r')
+    dict_file = open(dict_file_name, 'r', encoding='utf-8')
    root = lxml.parse(dict_file).getroot()
    dict_file.close() 
    return {syn.get('en'):syn.get('sl') for syn in root.xpath('syns/syn')}
-def translate(input_file_name, output_file_name):
+
 def translate(input_file_name, scope, output_file_name):
    syn_map = get_syn_map()
-    output_file = codecs.open(output_file_name, 'w')
+    output_file = open(output_file_name, 'w', encoding='utf-8')
-    input_file = codecs.open(input_file_name, 'r')
+    input_file = open(input_file_name, 'r', encoding='utf-8')
    converter = Converter()
@@ -29,8 +32,10 @@ def translate(input_file_name, output_file_name):
        if (len(columns) != 10):
            output_file.write(line)
        else:
-            columns[4] = converter.translate_msd(Msd(columns[4], 'en'), 'sl').code
+            if (scope in {'msd', 'both'}):
-            columns[7] = syn_map[columns[7]]
+                columns[4] = converter.translate_msd(Msd(columns[4], 'en'), 'sl').code
            if (scope in {'dep', 'both'}):
                columns[7] = syn_map[columns[7]]
            output_file.write('\t'.join(columns) + '\n')
    input_file.close()
@@ -41,6 +46,7 @@ if (__name__ == '__main__'):
    # Command-line interface: input file, translation scope and output file.
    arg_parser = argparse.ArgumentParser(description='Translate JOS msds and dependency labels.')
    arg_parser.add_argument('-infile', type=str, help='Input conllu')
    # argparse restricts the accepted values through ``choices``; ``options`` is not a
    # valid add_argument keyword and raises TypeError as soon as the script starts.
    arg_parser.add_argument('-scope', type=str, choices=['msd', 'dep', 'both'], default='both',
                            help='Which tags to translate: msd, dep (dependency labels) or both')
    arg_parser.add_argument('-outfile', type=str, help='Output conllu')
    arguments = arg_parser.parse_args()
    input_file_name = arguments.infile
--- a/conversion_utils/utils.py
+++ b/conversion_utils/utils.py
@@ -1,11 +1,16 @@
 """A few convenience TEI/XML constants and functions."""
 TEI_NAMESPACE = 'http://www.tei-c.org/ns/1.0'
 TEI_NAMESPACE_QUALIFIER = '{' + TEI_NAMESPACE + '}'
 XML_ID_ATTRIBUTE_NAME = '{http://www.w3.org/XML/1998/namespace}id'
 def xpath_find(element,expression):
    """Evaluate an XPath expression on the element with the ``tei`` prefix bound to the TEI namespace."""
    tei_prefix_map = {'tei': TEI_NAMESPACE}
    return element.xpath(expression, namespaces=tei_prefix_map)
 def get_xml_id(element):
    """Look up the element's @xml:id attribute and return whatever ``element.get`` yields for it."""
    xml_id = element.get(XML_ID_ATTRIBUTE_NAME)
    return xml_id
--- a/scripts/install_jos_specifications.py
+++ b/scripts/install_jos_specifications.py
@@ -1,3 +1,13 @@
 """Parse source TEI specifications and save as a pickle.
 You can use this script to create a new pickle file to replace the one stored at
 ../conversion_utils/resources/jos_specifications.pickle. The input file is expected to be a version
 of https://github.com/clarinsi/mte-msd/blob/master/xml/msd-sl.spc.xml.  However, the specifications
 are not expected to change, and if they do, the package pickle there should be updated upstream, so
 you probably should not have to use this script.
 """
 import pickle
 import argparse
 from conversion_utils.jos_msds_and_properties import SpecificationsParser
--- a/setup.py
+++ b/setup.py
@@ -1,12 +1,20 @@
 from setuptools import setup
 import os
-setup(name='conversion_utils',
+here = os.path.abspath(os.path.dirname(__file__))
-      version='0.1',
+with open(os.path.join(here, 'README.md'), encoding='utf-8') as f:
    long_description = f.read()
 setup(name='cjvt_conversion_utils',
      version='0.3',
      description='CJVT conversion utilities',
      long_description=long_description,
      long_description_content_type="text/markdown",
      url='https://gitea.cjvt.si/generic/conversion_utils',
-      author='Cyprian Laskowski',
+      author='CJVT',
-      author_email='cyp@cjvt.si',
+      author_email='pypi@cjvt.si',
-      packages=['conversion_utils', 'conversion_utils.resources'],
+      license='MIT',
-      install_requires=['importlib_resources'],
+      packages=['conversion_utils', 'conversion_utils.resources', 'conversion_utils.tests'],
      install_requires=['lxml', 'importlib_resources'],
      include_package_data=True,
      zip_safe=True)
Author	SHA1	Message	Date
Luka Dragar	e62c096126	Small bug fix regarding UD features conversion to string	2026-04-01 12:54:09 +02:00
Luka Dragar	165f24c64c	Added conversion from msd to universal dependencies based on Jaka's implementation	2026-03-30 22:31:08 +02:00
Luka Dragar	4d86631283	No warning messages for partial msds	2026-03-30 10:03:49 +02:00
Luka Dragar	b711fae3b5	UTF-8 encoding fix	2025-11-28 16:55:04 +01:00
lkrsnik	f43ea39f1b	Updated setup.py and licence	2023-10-31 10:39:19 +01:00
Cyprian Laskowski	03ce9f8ac7	Added rudimentary module documentation and made a couple of basic fixes	2023-10-26 17:13:54 +02:00
Cyprian Laskowski	f28b5a3a01	Allowed for restricting of JOS translation to one column	2023-10-18 21:54:00 +02:00
Cyprian Laskowski	89be603103	Allowed for empty misc conllu column	2023-08-16 16:41:02 +02:00
Cyprian Laskowski	99ac426e4b	Replace deprecated code and add missing dependency	2023-08-09 18:08:21 +02:00
Luka	89bcde58aa	Added NER + SRL to conllu_to_tei script	2023-02-17 16:24:02 +01:00
Cyprian Laskowski	d7be39d894	Made msd and feature-level checking optional, added docstrings	2022-09-15 11:01:05 +02:00
Cyprian Laskowski	4ca67ec8cc	Turned unit test directory into package	2022-09-15 10:57:58 +02:00