Small bug fix regarding UD features conversion to string

Added conversion from msd to universal dependencies based on Jaka's implementation
No warning messages for partial msds
2026-04-01 12:54:09 +02:00 · 2026-03-30 22:31:08 +02:00 · 2026-03-30 10:03:49 +02:00 · 2025-11-28 16:55:04 +01:00 · 2023-10-31 10:39:19 +01:00 · 2023-10-26 17:13:54 +02:00
19 changed files with 2715 additions and 55 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,7 @@
 *.pyc
 venv
+data
+.idea
+build
+dist
+*.egg-info
--- a/LICENSE.txt
+++ b/LICENSE.txt
@@ -0,0 +1,22 @@
+MIT License
+
+Copyright (c) 2023 CLARIN.SI
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,3 +1,6 @@
 include conversion_utils/resources/jos_specifications.pickle
 include conversion_utils/resources/dict.xml
 include conversion_utils/resources/structure_conversions.csv
+include conversion_utils/resources/jos-msd2features.tbl
+include conversion_utils/resources/jos2ud-features.tbl
+include conversion_utils/resources/jos2ud-pos.tbl
--- a/README.md
+++ b/README.md
@@ -1,7 +1,8 @@
-## Conversion utilities
+## CJVT conversion utilities

-This repository is currently intended for common conversions needed by CJVT developers. For the
-moment, this is limited to JOS msds and properties.
+This repository is intended for common conversions needed by CJVT developers. It can of course also
+be used more broadly, but most of the scripts (with the exception of `jos_msds_and_properties.py`)
+were written with specific tasks in mind, and may not generalise as expected. Use at your own risk.

 ### JOS msds and properties

--- a/conversion_utils/conllu_to_tei.py
+++ b/conversion_utils/conllu_to_tei.py
@@ -1,23 +1,36 @@
+"""Convert a series of CoNLL-U files to a TEI file.
+
+This script was developed in the context of a specific task and may not generalise as expected. Use at your own risk.
+"""
+
+
 import argparse
 import re
 import sys
-
+from glob import glob
 from lxml import etree

+
 class Sentence:
    def __init__(self, _id, no_ud=False, system='jos'):
        self._id = _id
        self.items = []
        self.links = []
+        self.srl_links = []
        self.no_ud = no_ud
        self.system = system

    def add_item(self, token, lemma, upos, upos_other, xpos, misc):
-        self.items.append([token, lemma, upos, upos_other, xpos, "SpaceAfter=No" in misc.split('|')])
+        no_space_after = 'SpaceAfter' in misc and misc['SpaceAfter'] == 'No'
+        ner = misc['NER'] if 'NER' in misc else 'O'
+        self.items.append([token, lemma, upos, upos_other, xpos, no_space_after, ner])

    def add_link(self, link_ref, link_type):
        self.links.append([link_ref, link_type])

+    def add_srl_link(self, link_ref, link_type):
+        self.srl_links.append([link_ref, link_type])
+
    def as_xml(self, id_prefix=None):
        if id_prefix:
            xml_id = id_prefix + '.' + self._id
@@ -27,8 +40,24 @@ class Sentence:
        set_xml_attr(base, 'id', xml_id)
        id_counter = 1

+        in_seg = False
+        sentence_base = base
+
        for item in self.items:
-            token, lemma, upos, upos_other, xpos, no_space_after = item
+            token, lemma, upos, upos_other, xpos, no_space_after, ner = item
+
+            if ner[0] == 'B':
+                if in_seg:
+                    sentence_base.append(base)
+                in_seg = True
+                base = etree.Element('seg')
+                base.set('type', 'name')
+                base.set('subtype', f'{ner[2:].lower()}')
+            elif ner[0] == 'O':
+                if in_seg:
+                    sentence_base.append(base)
+                    base = sentence_base
+                in_seg = False

            if xpos in {'U', 'Z'}: # hmm, safe only as long as U is unused in English tagset and Z in Slovenian one
                to_add = etree.Element('pc')
@@ -53,6 +82,11 @@ class Sentence:

            base.append(to_add)

+        if in_seg:
+            sentence_base.append(base)
+            base = sentence_base
+
+        # depparsing linkGrp
        link_grp = etree.Element('linkGrp')
        link_grp.set('corresp', '#'+xml_id)
        link_grp.set('targFunc', 'head argument')
@@ -67,6 +101,23 @@ class Sentence:
                link.set('target', '#' + xml_id + '.' + link_ref + ' #' + xml_id + '.' + str(link_id + 1))
            link_grp.append(link)
        base.append(link_grp)
+
+        # srl linkGrp
+        if self.srl_links:
+            link_grp = etree.Element('linkGrp')
+            link_grp.set('corresp', '#' + xml_id)
+            link_grp.set('targFunc', 'head argument')
+            link_grp.set('type', 'SRL')
+            for link_id, item in enumerate(self.srl_links):
+                link_ref, link_type = item
+                link = etree.Element('link')
+                link.set('ana', 'srl:' + link_type.replace(':', '_'))
+                if link_ref == u'0':
+                    link.set('target', '#' + xml_id + ' #' + xml_id + '.' + str(link_id + 1))
+                else:
+                    link.set('target', '#' + xml_id + '.' + link_ref + ' #' + xml_id + '.' + str(link_id + 1))
+                link_grp.append(link)
+            base.append(link_grp)
        return base


@@ -234,7 +285,7 @@ def construct_sentence(sent_id, lines):
        upos_other = tokens[5]
        depparse_link = tokens[6]
        depparse_link_name = tokens[7]
-        misc = tokens[9]
+        misc = {el.split('=')[0]: el.split('=')[1] for el in tokens[9].split('|')} if tokens[9] != '_' else {}

        sentence.add_item(
                token,
@@ -247,6 +298,11 @@ def construct_sentence(sent_id, lines):
        sentence.add_link(
            depparse_link,
            depparse_link_name)
+
+        if 'SRL' in misc:
+            sentence.add_srl_link(
+                depparse_link,
+                misc['SRL'])
    return sentence


@@ -256,7 +312,7 @@ def construct_tei_etrees(conllu_lines):


 def convert_file(input_file_name, output_file_name):
-    input_file = open(input_file_name, 'r')
+    input_file = open(input_file_name, 'r', encoding='utf-8')
    root = construct_tei_etrees(input_file)[0]
    tree = etree.ElementTree(root)
    tree.write(output_file_name, encoding='UTF-8', pretty_print=True)
@@ -267,19 +323,16 @@ def convert_file(input_file_name, output_file_name):


 if __name__ == '__main__':
-    import argparse
-    from glob import glob

    parser = argparse.ArgumentParser(description='Convert CoNNL-U to TEI.')
    parser.add_argument('files', nargs='+', help='CoNNL-U file')
-    parser.add_argument('-o', '--out-file', dest='out', default=None,
-                help='Write output to file instead of stdout.')
+    parser.add_argument('-o', '--out-file', dest='out', default=None, help='Write output to file instead of stdout.')
    parser.add_argument('-s', '--system', dest='system', default='jos', choices=['jos', 'ud'])

    args = parser.parse_args()

    if args.out:
-        f_out = open(args.out, 'w')
+        f_out = open(args.out, 'w', encoding='utf-8')
    else:
        f_out = sys.stdout

@@ -288,7 +341,7 @@ if __name__ == '__main__':
    for arg in args.files:
        filelist = glob(arg)
        for f in filelist:
-            with open(f, 'r') as conllu_f:
+            with open(f, 'r', encoding='utf-8') as conllu_f:
                tei_etrees = construct_tei_etrees(conllu_f)
            for tei_etree in tei_etrees:
                f_out.write(etree.tostring(tei_etree, pretty_print=True, encoding='utf-8').decode())
--- a/conversion_utils/jos_msds_and_properties.py
+++ b/conversion_utils/jos_msds_and_properties.py
@@ -1,12 +1,21 @@
-import lxml.etree as lxml
 import re
 import pickle
-import importlib_resources as pkg_resources
+import lxml.etree as lxml
+from collections import defaultdict
+from importlib_resources import files
+
+from enum import Enum

 from conversion_utils.utils import xpath_find, get_xml_id

 JOS_SPECIFICATIONS_PICKLE_RESOURCE = 'jos_specifications.pickle'

+RESOURCES_DIR = "conversion_utils.resources"
+
+MSD_TO_FEATURES = "jos-msd2features.tbl"
+JOS_TO_UD_FEATURES_RULES = "jos2ud-features.tbl"
+JOS_TO_UPOS_RULES = "jos2ud-pos.tbl"
+
 ## Positions of lexeme-level features for each category
 LEXEME_FEATURE_MAP = {'noun':{1,2},
                      'verb':{1,2},
@@ -53,6 +62,14 @@ LEVEL_EXCEPTIONS = {('pronoun', 2, 'čezme'), ('zaimek', 2, 'čezme'),
                    ('pronoun', 8, 'se'), ('zaimek', 8, 'se'),
                    ('pronoun', 8, 'ti'), ('zaimek', 8, 'ti')}

+class MsdState(Enum):
+    FULL = 1
+    PARTIAL = 2
+    UNKNOWN = 3
+
+class MsdException(Exception):
+    pass
+

 class Specifications:
    """JOS specifications with list of all word categories."""
@@ -214,7 +231,37 @@ class Properties:
            and self.lexeme_feature_map == obj.lexeme_feature_map\
            and self.form_feature_map == obj.form_feature_map\
            and self.language == obj.language
-            
+          
+
+class UD:
+    """Universal Dependencies object.
+    
+    Can be converted to a valid UD features string.
+    """
+    
+    def __init__(self, pos, features_map):
+        self.pos = pos
+        self.features_map = features_map
+
+    def to_features_string(self):
+        return self._features_string()
+
+    def to_full_string(self):
+        features = self._features_string()
+        if features:
+            return "UposTag=" + self.pos + "|" + features
+        else:
+            return "UposTag=" + self.pos
+
+    def _features_string(self):
+        return "|".join([f"{feature}={value}" for feature, value in self._sort_features(self.features_map)])
+
+    def _sort_features(self, features_map):
+        return sorted(features_map.items(), key=lambda x: x[0].lower(), reverse=False)
+
+    def __str__(self):
+        return f"pos={self.pos}, features_map={self.features_map}"
+

 class Msd:
    """JOS msd."""  
@@ -228,19 +275,17 @@ class Msd:

    def __eq__(self, obj):
        return isinstance(obj, Msd) and self.code == obj.code and self.language == obj.language
-
-
-class ConverterException(Exception):
-    pass
+    

 class Converter:
    """Converter between Msd and Properties objects."""

    def __init__(self, xml_file_name=None):
        if (xml_file_name is None):
-            if (pkg_resources.is_resource('conversion_utils.resources', JOS_SPECIFICATIONS_PICKLE_RESOURCE)):
+            resource = files('conversion_utils.resources').joinpath(JOS_SPECIFICATIONS_PICKLE_RESOURCE)
+            if (resource.is_file()):
                try:
-                    with pkg_resources.open_binary('conversion_utils.resources', JOS_SPECIFICATIONS_PICKLE_RESOURCE) as pickle_file:
+                    with resource.open('rb') as pickle_file:
                        self.specifications = pickle.load(pickle_file)
                except:
                    exit('Could not parse specifications pickle file installed.')
@@ -252,18 +297,85 @@ class Converter:
                self.specifications = parser.parse(xml_file_name)
            except:
                exit('Could not parse specifications xml file provided.')
+     
+        self.mte_to_ud_features = self._parse_msd_ud_conversion(MSD_TO_FEATURES)
+        self.mte_to_ud_features_rules = self._parse_ud_rules(JOS_TO_UD_FEATURES_RULES)
+        self.mte_to_upos_rules = self._parse_ud_rules(JOS_TO_UPOS_RULES)

-    def msd_to_properties(self, msd, language, lemma=None):
-        """Convert Msd to Properties (possibly in the other language).
+    def _parse_msd_ud_conversion(self, file_name):
+        """Parse file with direct conversions from English Msd to Universal Dependencies."""
+        conversion_map = defaultdict()
+        with files(RESOURCES_DIR).joinpath(file_name).open("r", encoding="UTF-8") as conversion_file:
+            for line in conversion_file.readlines():
+                mte_msd_en, mte_features_en = line.strip("\n").split("\t")
+                mte_sl = self.translate_msd(Msd(mte_msd_en, "en"), "sl").code
+                conversion_map[mte_msd_en] = mte_features_en
+                conversion_map[mte_sl] = mte_features_en
+        return conversion_map
+
+    def _parse_ud_rules(self, file_name):
+        """Parse file with additional rules applied to the conversion from English Msd to Universal Dependencies."""
+        all_rules = defaultdict(list)
+        with files(RESOURCES_DIR).joinpath(file_name).open("r", encoding="UTF-8") as rules_file:
+            for line in [l for l in rules_file.readlines() if l[0].isdigit()]:
+                priority, *current_rules = line.strip("\n").split("\t")
+                current_rules += [""] * (6 - len(current_rules))
+                all_rules[priority].append(current_rules)
+        return all_rules
+
+    def is_valid_msd(self, msd):
+        """Verify if the Msd code is in the standard JOS set."""
+        return msd.code in self.specifications.codes_map[msd.language]
+
+    def get_msd_state(self, msd):
+        """Determine if the Msd code is full, partial or unknown."""
+        code_map = self.specifications.codes_map[msd.language]
+        if msd.code in code_map:
+            return MsdState.FULL
+        for msd_code in code_map:
+            if msd_code.startswith(msd.code):
+                return MsdState.PARTIAL
+        return MsdState.UNKNOWN
+
+    def check_valid_msd(self, msd, require_valid_flag, allow_partial=True):
+        """If the Msd code is not valid, raise an exception or give a warning."""
+        msd_state = self.get_msd_state(msd)
+        if msd_state == MsdState.UNKNOWN:
+            message = f"The msd '{msd.code}' is unknown"
+            if require_valid_flag:
+                raise MsdException(message)
+            else:
+                print('[WARN] ' + message)
+        if msd_state == MsdState.PARTIAL and not allow_partial:
+            raise MsdException(f"Partial msd '{msd.code}' is not allowed. Full msd is required.")
+
+    def msd_to_properties(self, msd, language, lemma=None, require_valid_flag=False, warn_level_flag=False):
+        """Convert Msd to Properties.
+
+        The language of the generated Properties is specified and can differ from the Msd language.
+
+        If require_valid_flag is True, a MsdException is raised if the MSD is not in the standard
+        JOS set. Otherwise only a warning is given.
+
+        If you care about accurate level information (i.e., which properties are lexeme-level and
+        which are form-level), note that some features depend on the particular lemma. For such
+        features, if lemma is not provided and warn_level_flag is True, a warning will be given.
+
+        If a MSD has dashes in place of letters for certain features, they are skipped, so that
+        these features are not included in the generated Properties object.
+
+        Parameters:
+        msd(Msd): the JOS MSD to convert
+        language(str): the language for the Properties object to be generated: "en" (English) or "sl" (Slovene)
+        lemma(str): the lemma of the word form with the MSD
+        require_valid_flag(boolean): whether to raise a MsdException or only warn if a non-standard MSD is provided
+        warn_level_flag(boolean): whether to warn when the level of a property cannot be determined reliably
+
+        Returns:
+        Properties: the result of the conversion of the Msd in the language requested

-        The level (lexeme vs form) of certain reflexive msd features
-        depends on the lemma, so set the lemma if you need accurate
-        level information.
        """
-
-        if (msd.code not in self.specifications.codes_map[msd.language]):
-            raise ConverterException('The msd {} is unknown'.format(msd.code))
-
+        self.check_valid_msd(msd, require_valid_flag)
        category_char = msd.code[0].lower()
        value_chars = msd.code[1:]
        category = self.specifications.find_category_by_code(category_char, msd.language)
@@ -277,8 +389,8 @@ class Converter:
                value = feature.find_value_by_char(value_char, msd.language)
                feature_name = feature.names.get(language)
                feature_value = value.names.get(language)
-                if (lemma is None and (category_name, index) in [(le[0], le[1]) for le in LEVEL_EXCEPTIONS]):
-                    print('[WARN] The level of feature (category={category}, position={position}) may be incorrect, as it is lemma-specific and no lemma has been specified.'
+                if (warn_level_flag and lemma is None and (category_name, index) in [(le[0], le[1]) for le in LEVEL_EXCEPTIONS]):
+                    print('[WARN] The level (lexeme vs form) of feature (category={category}, position={position}) may be incorrect, as it is lemma-specific and no lemma has been specified.'
                          .format(category=category_name, position=index))
                level_exception_flag = (category_name, feature.position, lemma) in LEVEL_EXCEPTIONS
                lexeme_level_flag = feature.lexeme_level_flag if not level_exception_flag else not feature.lexeme_level_flag
@@ -289,8 +401,21 @@ class Converter:
                    form_feature_map[feature_name] = feature_value
        return Properties(category_name, lexeme_feature_map, form_feature_map, language)

-    def properties_to_msd(self, properties, language):
-        """Convert Properties to msd (possibly in the other language)."""
+    def properties_to_msd(self, properties, language, require_valid_flag=False):
+        """Convert Properties to Msd.
+
+        The language of the generated Msd is specified and can differ from the Properties language.
+
+        If require_valid_flag is True, a MsdException is raised if the generated MSD is not in
+        the standard JOS set. Otherwise only a warning is given.
+
+        Any skipped positions among the Properties are represented as dashes in the MSD.
+
+        Parameters:
+        properties(Properties): the properties to convert
+        language(str): the language for the Msd object to be returned: "en" (English) or "sl" (Slovene)
+        require_valid_flag(boolean): whether to raise a MsdException or only warn if a non-standard MSD is generated
+        """
        category = self.specifications.find_category_by_name(properties.category, properties.language)
        category_char = category.codes.get(language).upper()
        feature_map = properties.lexeme_feature_map.copy()
@@ -308,7 +433,51 @@ class Converter:
                msd_code += '-'
                i += 1
            msd_code += position_map[position]
-        return Msd(msd_code, language)
+        msd = Msd(msd_code, language)
+        self.check_valid_msd(msd, require_valid_flag)
+        return msd
+
+    def msd_to_ud(self, msd, lemma):
+        """Convert Msd to Universal Dependencies object.
+
+        Partial Msds are currently not supported.
+
+        Parameters:
+        msd(Msd): the Msd to convert
+        lemma(str): the lemma of the word form with the MSD
+        """
+
+        self.check_valid_msd(msd, False, allow_partial=False)
+        upos_category, *upos_features = self.mte_to_ud_features[msd.code].split()
+        final_upos = ""
+
+        for priority in sorted(self.mte_to_upos_rules, reverse=True):
+            for rule in self.mte_to_upos_rules[priority]:
+                rule_lemma, rule_category, rule_mte_features, _, rule_pos_ud, _ = rule
+
+                if (rule_category != upos_category
+                or (rule_lemma not in ("*", "*en") and lemma != rule_lemma)
+                or (rule_lemma == "*en" and not lemma.endswith("en"))
+                or (rule_mte_features != "*" and not all(f in upos_features for f in rule_mte_features.split("|")))):
+                    continue
+
+                final_upos = rule_pos_ud
+
+        for priority in sorted(self.mte_to_ud_features_rules):
+            for rule in self.mte_to_ud_features_rules[priority]:
+                rule_lemma, rule_category, rule_mte_features, rule_pos_ud, rule_ud_features, _ = rule
+
+                if (rule_lemma != "*" and lemma != rule_lemma
+                or (rule_category != "*" and rule_category != upos_category)
+                or (rule_pos_ud != "*" and rule_pos_ud != final_upos)):
+                    continue
+
+                upos_features = [rule_ud_features if f == rule_mte_features else f for f in upos_features]
+                if rule_mte_features == "*" and rule_ud_features != "-":
+                    upos_features.append(rule_ud_features)
+
+        ud_features = dict(f.split("=", 1) for f in "|".join(upos_features).split("|") if f not in {"", "-"})
+        return UD(final_upos, ud_features)

    def translate_msd(self, msd, language):
        return self.properties_to_msd(self.msd_to_properties(msd, language), language)
--- a/conversion_utils/resources/jos-msd2features.tbl
+++ b/conversion_utils/resources/jos-msd2features.tbl
--- a/conversion_utils/resources/jos2ud-features.tbl
+++ b/conversion_utils/resources/jos2ud-features.tbl
@@ -0,0 +1,128 @@
+# Mapping from JOS features to UD features						
+# Kaja Dobrovoljc, Tomaž Erjavec, Simon Krek						
+# 2018-11-23						
+#						
+#Prio	Lemma	Category	Feats	PoS-UD	->Feature-UD	#Comment
+----------------------------------------------------------------------------------------------------						
+1	*	Noun	Type=common	*	-	
+1	*	Noun	Type=proper	*	-	
+						
+1	*	Verb	Negative=no	*	Polarity=Pos	
+1	*	Verb	Negative=yes	*	Polarity=Neg	
+1	*	Verb	Type=auxiliary	*	-	
+1	*	Verb	Type=main	*	-	
+1	*	Verb	VForm=present	*	VerbForm=Fin|Mood=Ind|Tense=Pres	
+1	*	Verb	VForm=future	*	VerbForm=Fin|Mood=Ind|Tense=Fut	
+1	*	Verb	VForm=conditional	*	VerbForm=Fin|Mood=Cnd	
+1	*	Verb	VForm=imperative	*	VerbForm=Fin|Mood=Imp	
+1	*	Verb	VForm=infinitive	*	VerbForm=Inf	
+1	*	Verb	VForm=supine	*	VerbForm=Sup	
+1	*	Verb	VForm=participle	*	VerbForm=Part	
+						
+1	*	Adjective	Type=general	*	-	
+1	*	Adjective	Type=possessive	*	Poss=Yes	
+1	*	Adjective	Type=participle	*	VerbForm=Part	
+						
+2	*	Adverb	Type=participle	*	VerbForm=Conv	
+2	*	Adverb	Type=general	*	-	
+1	nekaj	Adverb	Type=general	DET	PronType=Ind	
+1	več	Adverb	Type=general	DET	PronType=Ind	
+1	veliko	Adverb	Type=general	DET	PronType=Ind	
+1	manj	Adverb	Type=general	DET	PronType=Ind	
+1	dovolj	Adverb	Type=general	DET	PronType=Ind	
+1	pol	Adverb	Type=general	DET	PronType=Ind	
+1	malo	Adverb	Type=general	DET	PronType=Ind	
+1	toliko	Adverb	Type=general	DET	PronType=Dem	
+1	največ	Adverb	Type=general	DET	PronType=Ind	
+1	mnogo	Adverb	Type=general	DET	PronType=Ind	
+1	preveč	Adverb	Type=general	DET	PronType=Ind	
+1	par	Adverb	Type=general	DET	PronType=Ind	
+1	koliko	Adverb	Type=general	DET	PronType=Int	
+1	dosti	Adverb	Type=general	DET	PronType=Ind	
+1	nešteto	Adverb	Type=general	DET	PronType=Ind	
+1	četrt	Adverb	Type=general	DET	PronType=Ind	
+1	ogromno	Adverb	Type=general	DET	PronType=Ind	
+1	čimveč	Adverb	Type=general	DET	PronType=Ind	
+1	obilo	Adverb	Type=general	DET	PronType=Ind	
+1	premnogo	Adverb	Type=general	DET	PronType=Ind	
+1	enormno	Adverb	Type=general	DET	PronType=Ind
+1	majčkeno	Adverb	Type=general	DET	PronType=Ind	
+						
+2	*	Pronoun	Type=reflexive	*	PronType=Prs|Reflex=Yes	
+2	*	Pronoun	Type=personal	*	PronType=Prs	
+2	*	Pronoun	Type=possessive	*	PronType=Prs|Poss=Yes	
+2	*	Pronoun	Type=interrogative	*	PronType=Int	
+2	*	Pronoun	Type=relative	*	PronType=Rel	
+2	*	Pronoun	Type=demonstrative	*	PronType=Dem	
+2	*	Pronoun	Type=general	*	PronType=Tot	
+2	*	Pronoun	Type=negative	*	PronType=Neg	
+2	*	Pronoun	Type=indefinite	*	PronType=Ind	
+1	*	Pronoun	Type=personal	DET	PronType=Prs	
+1	*	Pronoun	Type=possessive	DET	PronType=Prs|Poss=Yes	
+1	*	Pronoun	Owner_Gender=masculine	*	Gender[psor]=Masc	#lg.spec.feature
+1	*	Pronoun	Owner_Gender=feminine	*	Gender[psor]=Fem	#lg.spec.feature
+1	*	Pronoun	Owner_Gender=neuter	*	Gender[psor]=Neut	#lg.spec.feature
+1	*	Pronoun	Owner_Number=singular	*	Number[psor]=Sing	#lg.spec.feature
+1	*	Pronoun	Owner_Number=plural	*	Number[psor]=Plur	#lg.spec.feature
+1	*	Pronoun	Owner_Number=dual	*	Number[psor]=Dual	#lg.spec.feature
+1	*	Pronoun	Clitic=yes	*	Variant=Short	#lg.spec.feature
+1	*	Pronoun	Clitic=bound	*	Variant=Bound	#lg.spec.feature
+1	svoj	Pronoun	Type=reflexive	*	PronType=Prs|Reflex=Yes|Poss=Yes	
+						
+2	*	Numeral	Type=pronominal	*	-	
+2	*	Numeral	Form=letter	*	-	
+2	*	Numeral	Type=cardinal	NUM	NumType=Card	
+1	*	Numeral	Form=letter	NUM	NumForm=Word	#lg.spec.feature
+1	*	Numeral	Form=digit	NUM	NumForm=Digit	#lg.spec.feature
+1	*	Numeral	Form=roman	NUM	NumForm=Roman	#lg.spec.feature
+1	*	Numeral	Type=ordinal	*	NumType=Ord	
+1	*	Numeral	Type=special	ADJ	NumType=Mult	
+1	*	Numeral	Type=special	NUM	NumType=Sets	
+1	en	Numeral	Type=pronominal	*	NumType=Card	
+1	eden	Numeral	Type=pronominal	*	NumType=Card	
+						
+1	*	Conjunction	Type=subordinating	*	-	
+1	*	Conjunction	Type=coordinating	*	-	
+						
+2	*	Particle	*	*	-	
+1	ne	Particle	*	*	Polarity=Neg	
+						
+1	*	Interjection	*	*	-	
+						
+1	*	Abbreviation	*	*	Abbr=Yes	
+						
+2	*	Residual	*	*	-	
+1	*	Residual	Type=foreign	*	Foreign=Yes	
+1	*	Residual	Type=typo	*	-	
+1	*	Residual	Type=program	*	-	
+						
+1	*	Punctuation	*	*	-	
+						
+2	*	*	Degree=positive	*	Degree=Pos	
+2	*	*	Degree=comparative	*	Degree=Cmp	
+2	*	*	Degree=superlative	*	Degree=Sup	
+1	*	*	Degree=positive	DET	-	
+1	*	*	Degree=comparative	DET	-	
+1	*	*	Degree=superlative	DET	-	
+1	*	*	Animate=no	*	Animacy=Inan	
+1	*	*	Animate=yes	*	Animacy=Anim	
+1	*	*	Aspect=perfective	*	Aspect=Perf	
+1	*	*	Aspect=progressive	*	Aspect=Imp	
+1	*	*	Aspect=biaspectual	*	-	
+1	*	*	Case=nominative	*	Case=Nom	
+1	*	*	Case=genitive	*	Case=Gen	
+1	*	*	Case=dative	*	Case=Dat	
+1	*	*	Case=accusative	*	Case=Acc	
+1	*	*	Case=locative	*	Case=Loc	
+1	*	*	Case=instrumental	*	Case=Ins	
+1	*	*	Definiteness=no	*	Definite=Ind	
+1	*	*	Definiteness=yes	*	Definite=Def	
+1	*	*	Gender=masculine	*	Gender=Masc	
+1	*	*	Gender=feminine	*	Gender=Fem	
+1	*	*	Gender=neuter	*	Gender=Neut	
+1	*	*	Number=singular	*	Number=Sing	
+1	*	*	Number=plural	*	Number=Plur	
+1	*	*	Number=dual	*	Number=Dual	
+1	*	*	Person=first	*	Person=1	
+1	*	*	Person=second	*	Person=2	
+1	*	*	Person=third	*	Person=3	
--- a/conversion_utils/resources/jos2ud-pos.tbl
+++ b/conversion_utils/resources/jos2ud-pos.tbl
@@ -0,0 +1,282 @@
+# Mapping from JOS PoS to UD 2.0 PoS						
+# Kaja Dobrovoljc, Tomaž Erjavec, Simon Krek						
+# 2019-02-04						
+#						
+#Prio	Lemma	Category	Feats	Deps	->PoS-UD	#Comment
+#-------------------------------------------------------------------------------------------------------						
+3	*	Noun	Type=common	*	NOUN	
+3	*	Noun	Type=proper	*	PROPN	
+						
+3	*	Verb	*	*	VERB	
+						
+2	*	Verb	Type=auxiliary	*	AUX	#This one can in fact also be VERB, but this has to be determined by some other means
+						
+3	*	Adjective	*	*	ADJ	
+						
+3	*	Adverb	*	*	ADV	
+1	četrt	Adverb	*	*	DET	
+1	čimmanj	Adverb	*	*	DET	
+1	čimveč	Adverb	*	*	DET	
+1	dosti	Adverb	*	*	DET	
+1	dovolj	Adverb	*	*	DET	
+1	enako	Adverb	*	*	ADV	
+1	enormno	Adverb	*	*	DET	
+1	ful	Adverb	*	*	ADV	
+1	koliko	Adverb	*	*	DET	
+1	majčkeno	Adverb	*	*	DET	
+1	maksimalno	Adverb	*	*	ADV	
+1	malce	Adverb	*	*	ADV	
+1	malo	Adverb	*	*	DET	
+1	manj	Adverb	*	*	DET	
+1	minimalno	Adverb	*	*	ADV	
+1	mnogo	Adverb	*	*	DET	
+1	najmanj	Adverb	*	*	ADV	
+1	največ	Adverb	*	*	DET	
+1	nekaj	Adverb	*	*	DET	
+1	nekoliko	Adverb	*	*	ADV	
+1	nemalo	Adverb	*	*	ADV	
+1	nešteto	Adverb	*	*	DET	
+1	nič	Adverb	*	*	ADV	
+1	ničkoliko	Adverb	*	*	DET	
+1	obilo	Adverb	*	*	DET	
+1	ogromno	Adverb	*	*	DET	
+1	par	Adverb	*	*	DET	
+1	pol	Adverb	*	*	DET	
+1	polno	Adverb	*	*	ADV	
+1	precej	Adverb	*	*	ADV	
+1	premalo	Adverb	*	*	ADV	
+1	premnogo	Adverb	*	*	DET	
+1	preveč	Adverb	*	*	DET	
+1	toliko	Adverb	*	*	DET	
+1	veliko	Adverb	*	*	DET	
+1	več	Adverb	*	*	DET	
+1	večidel	Adverb	*	*	ADV	
+1	vse	Adverb	*	*	ADV	
+1	zadosti	Adverb	*	*	ADV	
+						
+##All Pronouns should be explicitly defined						
+##But are not because of jos1M wrong lemmatisations for e.g. "ti", "te" etc.						
+3	*	Pronoun	*	*	PRON	
+##2	*	Pronoun	Type=demonstrative	*	DET	
+##2	*	Pronoun	Type=possessive	*	DET	
+1	bogsigavedikakšen	Pronoun	Type=indefinite	*	DET	
+1	bogvedikaj	Pronoun	Type=indefinite	*	PRON	
+1	bogvedikateri	Pronoun	Type=indefinite	*	DET	
+1	bogvekaj	Pronoun	Type=indefinite	*	PRON	
+1	bogvekakšen	Pronoun	Type=indefinite	*	DET	
+1	bogvekateri	Pronoun	Type=indefinite	*	DET	
+1	bogvekolik	Pronoun	Type=indefinite	*	DET	
+1	bogvekolikšen	Pronoun	Type=indefinite	*	DET	
+1	čezme	Pronoun	Type=personal	*	PRON	
+1	čezse	Pronoun	Type=reflexive	*	PRON	
+1	čigar	Pronoun	Type=relative	*	DET	
+1	čigarkoli	Pronoun	Type=relative	*	DET	
+1	čigarsižebodi	Pronoun	Type=relative	*	DET	
+1	čigav	Pronoun	Type=interrogative	*	DET	
+1	čigaver	Pronoun	Type=relative	*	DET	
+1	čigaverkoli	Pronoun	Type=relative	*	DET	
+1	čigavršen	Pronoun	Type=relative	*	DET	
+1	čigavršnji	Pronoun	Type=relative	*	DET	
+1	enak	Pronoun	Type=indefinite	*	DET	
+1	enaki	Pronoun	Type=indefinite	*	DET	
+1	enakšen	Pronoun	Type=indefinite	*	DET	
+1	isti	Pronoun	Type=indefinite	*	DET	
+1	jaz	Pronoun	Type=personal	*	PRON	
+1	jest	Pronoun	Type=personal	*	PRON	
+1	kaj	Pronoun	Type=interrogative	*	PRON	
+1	kak	Pronoun	Type=interrogative	*	DET	
+1	kakov	Pronoun	Type=interrogative	*	DET	
+1	kakošen	Pronoun	Type=interrogative	*	DET	
+1	kakršen	Pronoun	Type=relative	*	DET	
+1	kakršenkoli	Pronoun	Type=relative	*	DET	
+1	kakršensižebodi	Pronoun	Type=relative	*	DET	
+1	kakšen	Pronoun	Type=interrogative	*	DET	
+1	kar	Pronoun	Type=relative	*	PRON	
+1	karkoli	Pronoun	Type=relative	*	PRON	
+1	karsibodi	Pronoun	Type=relative	*	PRON	
+1	karsižebodi	Pronoun	Type=relative	*	PRON	
+1	kateri	Pronoun	Type=interrogative	*	DET	
+1	katerikoli	Pronoun	Type=relative	*	DET	
+1	katerisibodi	Pronoun	Type=relative	*	DET	
+1	kdo	Pronoun	Type=interrogative	*	PRON	
+1	kdor	Pronoun	Type=relative	*	PRON	
+1	kdorkoli	Pronoun	Type=relative	*	PRON	
+1	kdorsibodi	Pronoun	Type=relative	*	PRON	
+1	kdorsižebodi	Pronoun	Type=relative	*	PRON	
+1	kdovekaj	Pronoun	Type=indefinite	*	PRON	
+1	kdovekak	Pronoun	Type=indefinite	*	DET	
+1	kdovekakšen	Pronoun	Type=indefinite	*	DET	
+1	kdovekateri	Pronoun	Type=indefinite	*	DET	
+1	kdovekdo	Pronoun	Type=indefinite	*	PRON	
+1	kdovekolik	Pronoun	Type=indefinite	*	DET	
+1	koji	Pronoun	Type=interrogative	*	DET	
+1	kolik	Pronoun	Type=interrogative	*	DET	
+1	kolik	Pronoun	Type=indefinite	*	DET	
+1	koliker	Pronoun	Type=interrogative	*	DET	
+1	kolikršen	Pronoun	Type=relative	*	DET	
+1	kolikšen	Pronoun	Type=interrogative	*	DET	
+1	malokaj	Pronoun	Type=indefinite	*	PRON	
+1	malokak	Pronoun	Type=indefinite	*	DET	
+1	malokakšen	Pronoun	Type=indefinite	*	DET	
+1	malokateri	Pronoun	Type=indefinite	*	DET	
+1	malokdo	Pronoun	Type=indefinite	*	PRON	
+1	marsikaj	Pronoun	Type=indefinite	*	PRON	
+1	marsikak	Pronoun	Type=indefinite	*	DET	
+1	marsikakšen	Pronoun	Type=indefinite	*	DET	
+1	marsikateri	Pronoun	Type=indefinite	*	DET	
+1	marsikdo	Pronoun	Type=indefinite	*	PRON	
+1	marsičigav	Pronoun	Type=indefinite	*	DET	
+1	medme	Pronoun	Type=personal	*	PRON	
+1	medse	Pronoun	Type=reflexive	*	PRON	
+1	mnog	Pronoun	Type=indefinite	*	DET	
+1	mnogokaj	Pronoun	Type=indefinite	*	PRON	
+1	mnogokateri	Pronoun	Type=indefinite	*	DET	
+1	mnogokdo	Pronoun	Type=indefinite	*	PRON	
+1	moj	Pronoun	Type=possessive	*	DET	
+1	nadme	Pronoun	Type=personal	*	PRON	
+1	nadse	Pronoun	Type=reflexive	*	PRON	
+1	najin	Pronoun	Type=possessive	*	DET	
+1	name	Pronoun	Type=personal	*	PRON	
+1	nase	Pronoun	Type=reflexive	*	PRON	
+1	naš	Pronoun	Type=possessive	*	DET	
+1	negdo	Pronoun	Type=indefinite	*	PRON	
+1	nek	Pronoun	Type=indefinite	*	DET	
+1	nekaj	Pronoun	Type=indefinite	*	PRON	
+1	nekak	Pronoun	Type=indefinite	*	DET	
+1	nekakov	Pronoun	Type=indefinite	*	DET	
+1	nekakšen	Pronoun	Type=indefinite	*	DET	
+1	nekateri	Pronoun	Type=indefinite	*	DET	
+1	nekdo	Pronoun	Type=indefinite	*	PRON	
+1	neki	Pronoun	Type=indefinite	*	DET	
+1	nekolik	Pronoun	Type=indefinite	*	DET	
+1	nekolikšen	Pronoun	Type=indefinite	*	DET	
+1	nekolikšnji	Pronoun	Type=indefinite	*	DET	
+1	nekov	Pronoun	Type=indefinite	*	DET	
+1	nekšen	Pronoun	Type=indefinite	*	DET	
+1	nevemkakšen	Pronoun	Type=indefinite	*	DET	
+1	nihče	Pronoun	Type=negative	*	PRON	
+1	nikak	Pronoun	Type=negative	*	DET	
+1	nikakršen	Pronoun	Type=negative	*	DET	
+1	nikakšen	Pronoun	Type=negative	*	DET	
+1	nikdo	Pronoun	Type=negative	*	PRON	
+1	nikogaršen	Pronoun	Type=negative	*	DET	
+1	nikogaršnji	Pronoun	Type=negative	*	DET	
+1	nič	Pronoun	Type=negative	*	PRON	
+1	njegov	Pronoun	Type=possessive	*	DET	
+1	njen	Pronoun	Type=possessive	*	DET	
+1	njihen	Pronoun	Type=possessive	*	DET	
+1	njihnji	Pronoun	Type=possessive	*	DET	
+1	njihov	Pronoun	Type=possessive	*	DET	
+1	njun	Pronoun	Type=possessive	*	DET	
+1	nobeden	Pronoun	Type=negative	*	PRON	
+1	noben	Pronoun	Type=negative	*	DET	
+1	oba	Pronoun	Type=general	*	DET	
+1	obadva	Pronoun	Type=general	*	PRON	
+1	obme	Pronoun	Type=personal	*	PRON	
+1	oboj	Pronoun	Type=general	*	DET	
+1	obojen	Pronoun	Type=general	*	DET	
+1	obse	Pronoun	Type=reflexive	*	PRON	
+1	on	Pronoun	Type=personal	*	PRON	
+1	oni	Pronoun	Type=demonstrative	*	DET	
+1	onile	Pronoun	Type=demonstrative	*	PRON	
+1	podme	Pronoun	Type=personal	*	PRON	
+1	podse	Pronoun	Type=reflexive	*	PRON	
+1	pome	Pronoun	Type=personal	*	PRON	
+1	predme	Pronoun	Type=personal	*	PRON	
+1	predse	Pronoun	Type=reflexive	*	PRON	
+1	premarsikateri	Pronoun	Type=indefinite	*	DET	
+1	premnog	Pronoun	Type=indefinite	*	DET	
+1	prenekaj	Pronoun	Type=indefinite	*	PRON	
+1	prenekateri	Pronoun	Type=indefinite	*	DET	
+1	prenekdo	Pronoun	Type=indefinite	*	PRON	
+1	redkokateri	Pronoun	Type=indefinite	*	DET	
+1	redkokdo	Pronoun	Type=indefinite	*	PRON	
+1	se	Pronoun	Type=reflexive	*	PRON	
+1	skozme	Pronoun	Type=personal	*	PRON	
+1	skozse	Pronoun	Type=reflexive	*	PRON	
+1	svoj	Pronoun	Type=reflexive	*	DET	
+1	ta	Pronoun	Type=demonstrative	*	DET	
+1	tadva	Pronoun	Type=demonstrative	*	PRON	
+1	taisti	Pronoun	Type=demonstrative	*	DET	
+1	tak	Pronoun	Type=demonstrative	*	DET	
+1	takisti	Pronoun	Type=demonstrative	*	DET	
+1	takle	Pronoun	Type=demonstrative	*	DET	
+1	takov	Pronoun	Type=demonstrative	*	DET	
+1	takošen	Pronoun	Type=demonstrative	*	DET	
+1	takšen	Pronoun	Type=demonstrative	*	DET	
+1	takšenle	Pronoun	Type=demonstrative	*	DET	
+1	tale	Pronoun	Type=demonstrative	*	DET	
+1	talele	Pronoun	Type=demonstrative	*	DET	
+1	teu	Pronoun	Type=personal	*	PRON	
+1	ti	Pronoun	Type=personal	*	PRON	
+1	tisti	Pronoun	Type=demonstrative	*	DET	
+1	tistile	Pronoun	Type=demonstrative	*	DET	
+1	tolik	Pronoun	Type=demonstrative	*	DET	
+1	toliker	Pronoun	Type=demonstrative	*	DET	
+1	tolikšen	Pronoun	Type=demonstrative	*	DET	
+1	tolikšnji	Pronoun	Type=demonstrative	*	DET	
+1	toti	Pronoun	Type=demonstrative	*	DET	
+1	tvoj	Pronoun	Type=possessive	*	DET	
+1	un	Pronoun	Type=demonstrative	*	DET	
+1	vajin	Pronoun	Type=possessive	*	DET	
+1	vame	Pronoun	Type=personal	*	PRON	
+1	vase	Pronoun	Type=reflexive	*	PRON	
+1	vaš	Pronoun	Type=possessive	*	DET	
+1	ves	Pronoun	Type=general	*	DET	
+1	vsak	Pronoun	Type=general	*	DET	
+1	vsakateri	Pronoun	Type=general	*	DET	
+1	vsakdo	Pronoun	Type=general	*	PRON	
+1	vsakogaršen	Pronoun	Type=general	*	DET	
+1	vsakogaršnji	Pronoun	Type=general	*	DET	
+1	vsakršen	Pronoun	Type=general	*	DET	
+1	vsakteri	Pronoun	Type=general	*	DET	
+1	zame	Pronoun	Type=personal	*	PRON	
+1	zase	Pronoun	Type=reflexive	*	PRON	
+						
+3	*	Numeral	Form=digit	*	NUM	
+3	*	Numeral	Form=roman	*	NUM	
+3	*	Numeral	Form=letter|Type=special	*	NUM	
+3	*	Numeral	Form=letter|Type=cardinal	*	NUM	
+2	*	Numeral	Form=letter|Type=ordinal	*	ADJ	
+1	drug	Numeral	Form=letter|Type=pronominal	*	ADJ	
+1	en	Numeral	Form=letter|Type=pronominal	*	NUM	
+1	*en	Numeral	Form=letter|Type=special	*	ADJ	#enojen, dvojen
+1	eden	Numeral	Form=letter|Type=pronominal	*	NUM	#Dodal E.T.
+						
+3	*	Adposition	*	*	ADP	#MULTEXT-East name
+3	*	Preposition	*	*	ADP	#JOS name
+						
+3	*	Conjunction	Type=coordinating	*	CCONJ	
+3	*	Conjunction	Type=subordinating	*	SCONJ	
+						
+3	*	Particle	*	*	PART	
+						
+3	*	Interjection	*	*	INTJ	
+						
+3	*	Abbreviation	*	*	X	
+						
+3	*	Residual	*	*	X	
+2	*	Residual	Type=web	*	SYM	
+2	*	Residual	Type=emo	*	SYM	
+2	*	Residual	Type=hashtag	*	SYM	#Better mapping?
+2	*	Residual	Type=at	*	SYM	#Better mapping?
+2	*	Residual	Type=foreign	*	X	#Better mapping?
+						
+3	*	Punctuation	*	*	PUNCT	
+1	#	Punctuation	*	*	SYM	
+1	%	Punctuation	*	*	SYM	
+1	&	Punctuation	*	*	SYM	
+1	<	Punctuation	*	*	SYM	
+1	>	Punctuation	*	*	SYM	
+1	+	Punctuation	*	*	SYM	
+1	=	Punctuation	*	*	SYM	
+1	°	Punctuation	*	*	SYM	
+1	×	Punctuation	*	*	SYM	
+1	÷	Punctuation	*	*	SYM	
+1	$	Punctuation	*	*	SYM	
+1	@	Punctuation	*	*	SYM	
+1	µ	Punctuation	*	*	SYM	
+1	©	Punctuation	*	*	SYM	
+1	§	Punctuation	*	*	SYM	
+1	€	Punctuation	*	*	SYM
+1	£	Punctuation	*	*	SYM
--- a/conversion_utils/tei_to_dictionary.py
+++ b/conversion_utils/tei_to_dictionary.py
@@ -1,12 +1,19 @@
+"""Convert a TEI file to an XML file of the CJVT standard schema.
+
+This script was developed in the context of a specific task and may not generalise as expected. Use at your own risk.
+"""
+
 import argparse
 import lxml.etree as lxml

-from conversion_utils.utils import xpath_find, get_xml_id, TEI_NAMESPACE_QUALIFIER
+from conversion_utils.utils import xpath_find, TEI_NAMESPACE_QUALIFIER
+

 def get_parsed_unit_string(parsed_unit):
    elements = xpath_find(parsed_unit, 'tei:w|tei:pc')
    return ''.join([e.text if e.get('join') == 'right' else e.text + ' ' for e in elements]).strip()

+
 def convert(input_file_name, output_file_name):

    output_root = lxml.Element('dictionary')
@@ -55,4 +62,6 @@ if (__name__ == '__main__'):
    arg_parser.add_argument('-infile', type=str, help='Input TEI xml')
    arg_parser.add_argument('-outfile', type=str, help='Output xml in standard cjvt schema')
    arguments = arg_parser.parse_args()
+    input_file_name = arguments.infile
+    output_file_name = arguments.outfile
    convert(input_file_name, output_file_name)
--- a/conversion_utils/tests/init.py
+++ b/conversion_utils/tests/init.py
--- a/conversion_utils/tests/test_jos_msd_to_properties.py
+++ b/conversion_utils/tests/test_jos_msd_to_properties.py
@@ -1,6 +1,6 @@
 import unittest

-from conversion_utils.jos_msds_and_properties import Converter, Msd
+from conversion_utils.jos_msds_and_properties import Converter, Msd, MsdException

 class JosMsdToPropertiesTestCase(unittest.TestCase):

@@ -55,3 +55,25 @@ class JosMsdToPropertiesTestCase(unittest.TestCase):
        self.assertEqual(properties.category, 'punctuation')
        self.assertEqual(properties.lexeme_feature_map, {})
        self.assertEqual(properties.form_feature_map, {})
+
+    def test_good_msd_with_require_valid(self):
+        properties = self.converter.msd_to_properties(Msd('Ncfpd', 'en'), 'en', require_valid_flag=True)
+        self.assertEqual(properties.language, 'en')
+        self.assertEqual(properties.category, 'noun')
+        self.assertEqual(properties.lexeme_feature_map, {'type':'common', 'gender':'feminine'})
+        self.assertEqual(properties.form_feature_map, {'number':'plural', 'case':'dative'})
+
+    def test_bad_msd(self):
+        properties = self.converter.msd_to_properties(Msd('N---d', 'en'), 'en')
+        self.assertEqual(properties.language, 'en')
+        self.assertEqual(properties.category, 'noun')
+        self.assertEqual(properties.lexeme_feature_map, {})
+        self.assertEqual(properties.form_feature_map, {'case':'dative'})
+
+    def test_bad_msd_with_require_valid(self):
+        try:
+            self.converter.msd_to_properties(Msd('N---d', 'en'), 'en', require_valid_flag=True)
+            fails = False
+        except MsdException:
+            fails = True
+        self.assertEqual(fails, True)
--- a/conversion_utils/tests/test_jos_properties_to_msd.py
+++ b/conversion_utils/tests/test_jos_properties_to_msd.py
@@ -1,6 +1,6 @@
 import unittest

-from conversion_utils.jos_msds_and_properties import Converter, Properties
+from conversion_utils.jos_msds_and_properties import Converter, Properties, MsdException, Msd

 class JosPropertiesToMsdTestCase(unittest.TestCase):

@@ -41,3 +41,40 @@ class JosPropertiesToMsdTestCase(unittest.TestCase):
        msd = self.converter.properties_to_msd(Properties('punctuation', {}, {}, 'en'), 'sl')
        self.assertEqual(msd.language, 'sl')
        self.assertEqual(msd.code, 'U')
+
+    def test_good_msd_with_require_valid(self):
+        msd = self.converter.properties_to_msd(Properties('noun', {'type':'common', 'gender':'feminine'}, {'number':'dual', 'case':'nominative'}, 'en'), 'en', require_valid_flag=True)
+        self.assertEqual(msd.language, 'en')
+        self.assertEqual(msd.code, 'Ncfdn')
+
+    def test_bad_msd(self):
+        msd = self.converter.properties_to_msd(Properties('noun', {'type':'common'}, {'number':'dual'}, 'en'), 'en')
+        self.assertEqual(msd.language, 'en')
+        self.assertEqual(msd.code, 'Nc-d')
+
+    def test_msd_to_jos(self):
+        ud = self.converter.msd_to_ud(Msd('Ppnzei', 'sl'), 'slovenski')
+        self.assertEqual(ud.pos, 'ADJ')
+        self.assertEqual(ud.to_full_string(), 'UposTag=ADJ|Case=Nom|Degree=Pos|Gender=Fem|Number=Sing')
+        self.assertEqual(ud.to_features_string(), 'Case=Nom|Degree=Pos|Gender=Fem|Number=Sing')
+        
+        ud = self.converter.msd_to_ud(Msd('Sommr', 'sl'), 'dečko')
+        self.assertEqual(ud.pos, 'NOUN')
+        self.assertEqual(ud.to_full_string(), 'UposTag=NOUN|Case=Gen|Gender=Masc|Number=Plur')
+        self.assertEqual(ud.to_features_string(), 'Case=Gen|Gender=Masc|Number=Plur')
+        
+    def test_msd_to_jos_partial_msd(self):
+        try:
+            self.converter.msd_to_ud(Msd('Soz', 'sl'), 'vlada')
+            fails = False
+        except MsdException:
+            fails = True
+        self.assertEqual(fails, True)
+
+    def test_bad_msd_with_require_valid(self):
+        try:
+            self.converter.properties_to_msd(Properties('noun', {'type':'common'}, {'number':'dual'}, 'en'), 'en', require_valid_flag=True)
+            fails = False
+        except MsdException:
+            fails = True
+        self.assertEqual(fails, True)
--- a/conversion_utils/tests/test_jos_translate_msd.py
+++ b/conversion_utils/tests/test_jos_translate_msd.py
--- a/conversion_utils/tests/test_jos_translate_properties.py
+++ b/conversion_utils/tests/test_jos_translate_properties.py
--- a/conversion_utils/translate_conllu_jos.py
+++ b/conversion_utils/translate_conllu_jos.py
@@ -1,26 +1,29 @@
-#!/usr/bin/python3
-# -*- coding: utf-8 -*-
+"""Convert the MSD and/or syntactic dependency tags in a CoNLL-U file from English to Slovene tags.
+
+This script was developed in the context of a specific task and may not generalise as expected. Use at your own risk.
+"""

 import argparse
-import codecs
 import lxml.etree as lxml
 from importlib_resources import files

 from conversion_utils.jos_msds_and_properties import Converter, Msd

+
 def get_syn_map():
    dict_file_name = files('conversion_utils.resources').joinpath('dict.xml')
-    dict_file = codecs.open(dict_file_name, 'r')
+    dict_file = open(dict_file_name, 'r', encoding='utf-8')
    root = lxml.parse(dict_file).getroot()
    dict_file.close() 
    return {syn.get('en'):syn.get('sl') for syn in root.xpath('syns/syn')}
    
-def translate(input_file_name, output_file_name):
+
+def translate(input_file_name, scope, output_file_name):

    syn_map = get_syn_map()

-    output_file = codecs.open(output_file_name, 'w')
-    input_file = codecs.open(input_file_name, 'r')
+    output_file = open(output_file_name, 'w', encoding='utf-8')
+    input_file = open(input_file_name, 'r', encoding='utf-8')

    converter = Converter()

@@ -29,8 +32,10 @@ def translate(input_file_name, output_file_name):
        if (len(columns) != 10):
            output_file.write(line)
        else:
-            columns[4] = converter.translate_msd(Msd(columns[4], 'en'), 'sl').code
-            columns[7] = syn_map[columns[7]]
+            if (scope in {'msd', 'both'}):
+                columns[4] = converter.translate_msd(Msd(columns[4], 'en'), 'sl').code
+            if (scope in {'dep', 'both'}):
+                columns[7] = syn_map[columns[7]]
            output_file.write('\t'.join(columns) + '\n')

    input_file.close()
@@ -41,6 +46,7 @@ if (__name__ == '__main__'):

    arg_parser = argparse.ArgumentParser(description='Translate JOS msds and dependency labels.')
    arg_parser.add_argument('-infile', type=str, help='Input conllu')
+    arg_parser.add_argument('-scope', type=str, choices=['msd', 'dep', 'both'], default='both', help='Which tags to translate: msd, dep, or both')
    arg_parser.add_argument('-outfile', type=str, help='Output conllu')
    arguments = arg_parser.parse_args()
    input_file_name = arguments.infile
--- a/conversion_utils/utils.py
+++ b/conversion_utils/utils.py
@@ -1,11 +1,16 @@
+"""A few convenience TEI/XML constants and functions."""
+
+
 TEI_NAMESPACE = 'http://www.tei-c.org/ns/1.0'
 TEI_NAMESPACE_QUALIFIER = '{' + TEI_NAMESPACE + '}'
 XML_ID_ATTRIBUTE_NAME = '{http://www.w3.org/XML/1998/namespace}id'

+
 def xpath_find(element,expression):
    """Executes XPath expression, with TEI namespace."""
    return element.xpath(expression, namespaces={'tei':TEI_NAMESPACE})

+
 def get_xml_id(element):
    """Returns the element's @xml:id attribute."""
    return element.get(XML_ID_ATTRIBUTE_NAME)
--- a/scripts/install_jos_specifications.py
+++ b/scripts/install_jos_specifications.py
@@ -1,3 +1,13 @@
+"""Parse source TEI specifications and save as a pickle.
+
+You can use this script to create a new pickle file to replace the one stored at
+../conversion_utils/resources/jos_specifications.pickle. The input file is expected to be a version
+of https://github.com/clarinsi/mte-msd/blob/master/xml/msd-sl.spc.xml.  However, the specifications
+are not expected to change, and if they do, the package pickle there should be updated upstream, so
+you probably should not have to use this script.
+"""
+
+
 import pickle
 import argparse
 from conversion_utils.jos_msds_and_properties import SpecificationsParser
--- a/setup.py
+++ b/setup.py
@@ -1,12 +1,20 @@
 from setuptools import setup
+import os

-setup(name='conversion_utils',
-      version='0.1',
+here = os.path.abspath(os.path.dirname(__file__))
+with open(os.path.join(here, 'README.md'), encoding='utf-8') as f:
+    long_description = f.read()
+
+setup(name='cjvt_conversion_utils',
+      version='0.3',
      description='CJVT conversion utilities',
+      long_description=long_description,
+      long_description_content_type="text/markdown",
      url='https://gitea.cjvt.si/generic/conversion_utils',
-      author='Cyprian Laskowski',
-      author_email='cyp@cjvt.si',
-      packages=['conversion_utils', 'conversion_utils.resources'],
-      install_requires=['importlib_resources'],
+      author='CJVT',
+      author_email='pypi@cjvt.si',
+      license='MIT',
+      packages=['conversion_utils', 'conversion_utils.resources', 'conversion_utils.tests'],
+      install_requires=['lxml', 'importlib_resources'],
      include_package_data=True,
      zip_safe=True)
Author	SHA1	Message	Date
Luka Dragar	e62c096126	Small bug fix regarding UD features conversion to string	2026-04-01 12:54:09 +02:00
Luka Dragar	165f24c64c	Added conversion from msd to universal dependencies based on Jaka's implementation	2026-03-30 22:31:08 +02:00
Luka Dragar	4d86631283	No warning messages for partial msds	2026-03-30 10:03:49 +02:00
Luka Dragar	b711fae3b5	UTF-8 encoding fix	2025-11-28 16:55:04 +01:00
lkrsnik	f43ea39f1b	Updated setup.py and licence	2023-10-31 10:39:19 +01:00
Cyprian Laskowski	03ce9f8ac7	Added rudimentary module documentation and made a couple of basic fixes	2023-10-26 17:13:54 +02:00
Cyprian Laskowski	f28b5a3a01	Allowed for restricting of JOS translation to one column	2023-10-18 21:54:00 +02:00
Cyprian Laskowski	89be603103	Allowed for empty misc conllu column	2023-08-16 16:41:02 +02:00
Cyprian Laskowski	99ac426e4b	Replace deprecated code and add missing dependency	2023-08-09 18:08:21 +02:00
Luka	89bcde58aa	Added NER + SRL to conllu_to_tei script	2023-02-17 16:24:02 +01:00
Cyprian Laskowski	d7be39d894	Made msd and feature-level checking optional, added docstrings	2022-09-15 11:01:05 +02:00
Cyprian Laskowski	4ca67ec8cc	Turned unit test directory into package	2022-09-15 10:57:58 +02:00