Updated to version 0.4

Specifications are loaded on import
Added static list of partial msds inside jos_specifications.pickle
2026-05-08 08:08:10 +02:00 · 2026-04-15 08:23:06 +02:00 · 2026-04-08 15:19:40 +02:00 · 2026-04-01 12:54:09 +02:00 · 2026-03-30 22:31:08 +02:00 · 2026-03-30 10:03:49 +02:00
10 changed files with 2514 additions and 44 deletions
@@ -1,3 +1,6 @@
 include conversion_utils/resources/jos_specifications.pickle
 include conversion_utils/resources/dict.xml
 include conversion_utils/resources/structure_conversions.csv
 include conversion_utils/resources/jos-msd2features.tbl
 include conversion_utils/resources/jos2ud-features.tbl
 include conversion_utils/resources/jos2ud-pos.tbl
@@ -312,7 +312,7 @@ def construct_tei_etrees(conllu_lines):
 def convert_file(input_file_name, output_file_name):
-    input_file = open(input_file_name, 'r')
+    input_file = open(input_file_name, 'r', encoding='utf-8')
    root = construct_tei_etrees(input_file)[0]
    tree = etree.ElementTree(root)
    tree.write(output_file_name, encoding='UTF-8', pretty_print=True)
@@ -332,7 +332,7 @@ if __name__ == '__main__':
    args = parser.parse_args()
    if args.out:
-        f_out = open(args.out, 'w')
+        f_out = open(args.out, 'w', encoding='utf-8')
    else:
        f_out = sys.stdout
@@ -341,7 +341,7 @@ if __name__ == '__main__':
    for arg in args.files:
        filelist = glob(arg)
        for f in filelist:
-            with open(f, 'r') as conllu_f:
+            with open(f, 'r', encoding='utf-8') as conllu_f:
                tei_etrees = construct_tei_etrees(conllu_f)
            for tei_etree in tei_etrees:
                f_out.write(etree.tostring(tei_etree, pretty_print=True, encoding='utf-8').decode())
@@ -1,12 +1,21 @@
 import lxml.etree as lxml
 import re
 import pickle
 import lxml.etree as lxml
 from collections import defaultdict
 from importlib_resources import files
 from enum import IntEnum
 from conversion_utils.utils import xpath_find, get_xml_id
 JOS_SPECIFICATIONS_PICKLE_RESOURCE = 'jos_specifications.pickle'
 RESOURCES_DIR = "conversion_utils.resources"
 MSD_TO_FEATURES = "jos-msd2features.tbl"
 JOS_TO_UD_FEATURES_RULES = "jos2ud-features.tbl"
 JOS_TO_UPOS_RULES = "jos2ud-pos.tbl"
 ## Positions of lexeme-level features for each category
 LEXEME_FEATURE_MAP = {'noun':{1,2},
                      'verb':{1,2},
@@ -53,6 +62,14 @@ LEVEL_EXCEPTIONS = {('pronoun', 2, 'čezme'), ('zaimek', 2, 'čezme'),
                    ('pronoun', 8, 'se'), ('zaimek', 8, 'se'),
                    ('pronoun', 8, 'ti'), ('zaimek', 8, 'ti')}
 class MsdState(IntEnum):
    """Integer-valued states of an msd code (UNKNOWN < PARTIAL < FULL).
    NOTE(review): the members are the same as those of the nested Msd.State enum; no use of MsdState is visible in this view - confirm whether both are needed.
    """
    UNKNOWN = -1
    PARTIAL = 1
    FULL = 2
 class MsdException(Exception):
    """Error raised for an msd that cannot be used for the requested operation."""
 class Specifications:
    """JOS specifications with list of all word categories."""
@@ -214,42 +231,84 @@ class Properties:
            and self.lexeme_feature_map == obj.lexeme_feature_map\
            and self.form_feature_map == obj.form_feature_map\
            and self.language == obj.language
-            
+          
 class UD:
    """Universal Dependencies annotation: a UPOS tag together with a feature map.
    Renders either as a bare features string (``Case=Nom|Number=Sing``) or as
    a full string prefixed with ``UposTag=<pos>``.
    """
    def __init__(self, pos, features_map):
        # pos is the UPOS tag; features_map maps feature names to their values.
        self.features_map = features_map
        self.pos = pos
    def to_features_string(self):
        """Return the features as ``Name=Value`` pairs joined by ``|``."""
        return self._features_string()
    def to_full_string(self):
        """Return ``UposTag=<pos>``, followed by ``|`` and the features when there are any."""
        rendered = self._features_string()
        prefix = "UposTag=" + self.pos
        return prefix + "|" + rendered if rendered else prefix
    def _features_string(self):
        pairs = []
        for name, value in self._sort_features(self.features_map):
            pairs.append(f"{name}={value}")
        return "|".join(pairs)
    def _sort_features(self, features_map):
        # Features are ordered by name, ignoring letter case.
        def lowered_name(item):
            return item[0].lower()
        return sorted(features_map.items(), key=lowered_name)
    def __str__(self):
        return f"pos={self.pos}, features_map={self.features_map}"
 class Msd:
    """JOS msd."""  
-    def __init__(self, code, language):
+    class State(IntEnum):
        UNKNOWN = -1
        PARTIAL = 1
        FULL = 2
    def __init__(self, code, language, expected_state=State.FULL, require_valid=False):
        """Store the msd code and its language, then determine the state of the code.
        Parameters:
        code(str): the msd code
        language(str): the language of the msd code
        expected_state(Msd.State): the state the caller expects the code to have
        require_valid(bool): stored for the validation step, which decides between raising and warning on a mismatch
        """
        self.code = code
        self.language = language
        self.expected_state = expected_state
        self.require_valid = require_valid
        # Must run last: the validation reads the four attributes assigned above.
        self.state = self._validate_and_get_state()
    def _validate_and_get_state(self):
        """Look the code up in the default specifications and return its state.
        A code can be listed as a full msd, as a partial msd, or as both; when
        it is listed in neither, its only state is UNKNOWN.
        Returns:
        Msd.State: the highest state found for the code
        Raises:
        MsdException: if the expected state is not among the states found and require_valid is set
        """
        states = set()
        if self.code in DEFAULT_SPECIFICATIONS.codes_map[self.language]:
            states.add(self.State.FULL)
        if self.code in DEFAULT_SPECIFICATIONS.partial_codes_map[self.language]:
            states.add(self.State.PARTIAL)
        if not states:
            states.add(self.State.UNKNOWN)
        # self.state is only assigned once this method returns, so the
        # messages below must use the locally computed value.
        state = max(states)
        if self.expected_state not in states:
            if self.require_valid:
                raise MsdException(f"Given msd '{self.code}' is '{state.name}', but expected state is '{self.expected_state.name}'.")
            if state == self.State.UNKNOWN:
                print(f"[WARN] The Msd '{self.code}' is unknown.")
            else:
                print(f"[WARN] The Msd '{self.code}' is unknown for expected state '{self.expected_state.name}'.")
        return state
    def __str__(self):
        """Return a readable ``code=..., language=...`` description."""
        return f'code={self.code}, language={self.language}'
    def __eq__(self, obj):
        """Two Msds are equal when both their code and their language match."""
        if not isinstance(obj, Msd):
            return False
        if not (self.code == obj.code):
            return False
        return self.language == obj.language
-
+    
 class CustomException(Exception):
    """Base class for the exceptions defined in this module."""
 class MsdException(CustomException):
    """Error raised for an msd that cannot be used for the requested operation."""
 class Converter:
    """Converter between Msd and Properties objects."""
    def __init__(self, xml_file_name=None):
        if (xml_file_name is None):
-            resource = files('conversion_utils.resources').joinpath(JOS_SPECIFICATIONS_PICKLE_RESOURCE)
+            self.specifications = DEFAULT_SPECIFICATIONS
            if (resource.is_file()):
                try:
                    with resource.open('rb') as pickle_file:
                        self.specifications = pickle.load(pickle_file)
                except:
                    exit('Could not parse specifications pickle file installed.')
            else:
                exit('No pickle installed or xml provided.')
        else:
            parser = SpecificationsParser()
            try:
@@ -257,20 +316,45 @@ class Converter:
            except:
                exit('Could not parse specifications xml file provided.')
-    def is_valid_msd(self, msd):
+        self.mte_to_ud_features = self._parse_msd_ud_conversion(MSD_TO_FEATURES)
-        """Verify if the Msd code is in the standard JOS set."""
+        self.mte_to_ud_features_rules = self._parse_ud_rules(JOS_TO_UD_FEATURES_RULES)
-        return msd.code in self.specifications.codes_map[msd.language]
+        self.mte_to_upos_rules = self._parse_ud_rules(JOS_TO_UPOS_RULES)
-    def check_valid_msd(self, msd, require_valid_flag):
+    def _get_partial_msd(self, msd):
-        """If the Msd code is not valid, raise an exception or give a warning."""
+        properties = self.msd_to_properties(msd, msd.language)
-        if (not self.is_valid_msd(msd)):
+        category_char = msd.code[0].lower()
-            message = 'The msd {} is unknown'.format(msd.code)
+        category = self.specifications.find_category_by_code(category_char, msd.language)
-            if (require_valid_flag):
+        category_name = category.names.get(msd.language)
-                raise MsdException(message)
+        properties = Properties(
-            else:
+            category=category_name,
-                print('[WARN] ' + message)
+            lexeme_feature_map=properties.lexeme_feature_map,
            form_feature_map={},
            language=msd.language
        )
        return self.properties_to_msd(properties, msd.language, expected_state=Msd.State.PARTIAL).code
-    def msd_to_properties(self, msd, language, lemma=None, require_valid_flag=False, warn_level_flag=False):
+    def _parse_msd_ud_conversion(self, file_name):
        """Parse file with direct conversions from English Msd to Universal Dependencies."""
        conversion_map = defaultdict()
        with files(RESOURCES_DIR).joinpath(file_name).open("r", encoding="UTF-8") as conversion_file:
            for line in conversion_file.readlines():
                mte_msd_en, mte_features_en = line.strip("\n").split("\t")
                mte_sl = self.translate_msd(Msd(mte_msd_en, "en"), "sl").code
                conversion_map[mte_msd_en] = mte_features_en
                conversion_map[mte_sl] = mte_features_en
        return conversion_map
    def _parse_ud_rules(self, file_name):
        """Parse a tab-separated file of additional rules for the conversion from Msd to Universal Dependencies.
        Only lines starting with a digit are rules; everything else is skipped.
        The first column is the priority; the remaining columns are padded
        with empty strings up to six entries.
        Returns:
        defaultdict: maps each priority (as read from the file, a string) to the list of its rules
        """
        rules_by_priority = defaultdict(list)
        resource = files(RESOURCES_DIR).joinpath(file_name)
        with resource.open("r", encoding="UTF-8") as rules_file:
            for raw_line in rules_file:
                if not raw_line[0].isdigit():
                    continue
                fields = raw_line.strip("\n").split("\t")
                priority = fields[0]
                columns = fields[1:]
                columns.extend([""] * (6 - len(columns)))
                rules_by_priority[priority].append(columns)
        return rules_by_priority
    def msd_to_properties(self, msd, language, lemma=None, warn_level_flag=False):
        """Convert Msd to Properties.
        The language of the generated Properties is specified and can differ from the Msd language.
@@ -296,7 +380,6 @@ class Converter:
        Properties: the result of the conversion of the Msd in the language requested
        """
        self.check_valid_msd(msd, require_valid_flag)
        category_char = msd.code[0].lower()
        value_chars = msd.code[1:]
        category = self.specifications.find_category_by_code(category_char, msd.language)
@@ -322,7 +405,7 @@ class Converter:
                    form_feature_map[feature_name] = feature_value
        return Properties(category_name, lexeme_feature_map, form_feature_map, language)
-    def properties_to_msd(self, properties, language, require_valid_flag=False):
+    def properties_to_msd(self, properties, language, expected_state=Msd.State.FULL):
        """Convert Properties to Msd.
        The language of the generated Msd is specified and can differ from the Properties language.
@@ -354,12 +437,68 @@ class Converter:
                msd_code += '-'
                i += 1
            msd_code += position_map[position]
-        msd = Msd(msd_code, language)
+        msd = Msd(msd_code, language, expected_state=expected_state)
        self.check_valid_msd(msd, require_valid_flag)
        return msd
    def msd_to_ud(self, msd, lemma):
        """Convert Msd to Universal Dependencies object.
        Partial Msds are currently not supported.
        The UPOS tag is chosen with the PoS rules, then the features are
        rewritten with the feature rules.
        Parameters:
        msd(Msd): the Msd to convert
        lemma(str): the lemma of the word form with the MSD
        Returns:
        UD: the UPOS tag and the feature map derived from the Msd and the lemma
        Raises:
        MsdException: if the state of the Msd is not Msd.State.FULL
        """
        if msd.state != Msd.State.FULL:
            raise MsdException(f"Msd must be full to be converted to UD.")
        # Direct lookup: the first whitespace-separated token is the category, the rest are its features.
        # NOTE(review): indexing with a code that is missing from the table appears to raise KeyError - confirm.
        upos_category, *upos_features = self.mte_to_ud_features[msd.code].split()
        final_upos = ""
        # Priorities are visited in descending order and the loop never breaks,
        # so the last matching rule decides the UPOS tag.
        # NOTE(review): the priorities are strings as read from the rule file, so the order is lexicographic.
        for priority in sorted(self.mte_to_upos_rules, reverse=True):
            for rule in self.mte_to_upos_rules[priority]:
                rule_lemma, rule_category, rule_mte_features, _, rule_pos_ud, _ = rule
                # Skip the rule unless the category matches, the lemma matches
                # ("*" accepts any lemma, "*en" any lemma ending in "en") and
                # every feature required by the rule is present.
                if (rule_category != upos_category
                or (rule_lemma not in ("*", "*en") and lemma != rule_lemma)
                or (rule_lemma == "*en" and not lemma.endswith("en"))
                or (rule_mte_features != "*" and not all(f in upos_features for f in rule_mte_features.split("|")))):
                    continue
                final_upos = rule_pos_ud
        # Feature rules run in ascending priority order. Once a feature has
        # been rewritten it no longer equals the source feature of later rules.
        for priority in sorted(self.mte_to_ud_features_rules):
            for rule in self.mte_to_ud_features_rules[priority]:
                rule_lemma, rule_category, rule_mte_features, rule_pos_ud, rule_ud_features, _ = rule
                # Skip the rule unless its lemma, category and UPOS constraints
                # all hold ("*" accepts anything).
                if (rule_lemma != "*" and lemma != rule_lemma
                or (rule_category != "*" and rule_category != upos_category)
                or (rule_pos_ud != "*" and rule_pos_ud != final_upos)):
                    continue
                upos_features = [rule_ud_features if f == rule_mte_features else f for f in upos_features]
                # A rule with a wildcard feature adds its UD feature instead of replacing one.
                if rule_mte_features == "*" and rule_ud_features != "-":
                    upos_features.append(rule_ud_features)
        # Join and re-split on "|" so that a replacement holding several
        # features becomes separate entries; "" and "-" entries are dropped.
        ud_features = dict(f.split("=", 1) for f in "|".join(upos_features).split("|") if f not in {"", "-"})
        return UD(final_upos, ud_features)
    def translate_msd(self, msd, language):
        """Translate an Msd into the given language by converting it to Properties and back."""
        properties = self.msd_to_properties(msd, language)
        return self.properties_to_msd(properties, language)
    def translate_properties(self, properties, language):
        """Translate Properties into the given language by converting them to an Msd and back."""
        msd = self.properties_to_msd(properties, language)
        return self.msd_to_properties(msd, language)
 def _load_default_specifications():
    """Load the packaged specifications pickle into the module-level DEFAULT_SPECIFICATIONS.
    Raises:
    SystemExit: if the pickle resource is missing or cannot be loaded
    """
    global DEFAULT_SPECIFICATIONS
    resource = files('conversion_utils.resources').joinpath(JOS_SPECIFICATIONS_PICKLE_RESOURCE)
    # SystemExit is raised directly: the exit() helper is injected by the site
    # module for interactive use and is not guaranteed to exist.
    if not resource.is_file():
        raise SystemExit("Default specifications not found.")
    try:
        with resource.open('rb') as pickle_file:
            # NOTE: unpickling can execute arbitrary code; this is acceptable
            # only because the file is a resource shipped with the package.
            DEFAULT_SPECIFICATIONS = pickle.load(pickle_file)
    except Exception as error:
        # Keep the underlying error attached as the cause for debugging.
        raise SystemExit('Could not parse specifications pickle file installed.') from error
 _load_default_specifications()
@@ -0,0 +1,128 @@
 # Mapping from JOS features to UD features						
 # Kaja Dobrovoljc, Tomaž Erjavec, Simon Krek						
 # 2018-11-23						
 #						
 #Prio	Lemma	Category	Feats	PoS-UD	->Feature-UD	#Comment
 ----------------------------------------------------------------------------------------------------						
 1	*	Noun	Type=common	*	-	
 1	*	Noun	Type=proper	*	-	
 1	*	Verb	Negative=no	*	Polarity=Pos	
 1	*	Verb	Negative=yes	*	Polarity=Neg	
 1	*	Verb	Type=auxiliary	*	-	
 1	*	Verb	Type=main	*	-	
 1	*	Verb	VForm=present	*	VerbForm=Fin|Mood=Ind|Tense=Pres	
 1	*	Verb	VForm=future	*	VerbForm=Fin|Mood=Ind|Tense=Fut	
 1	*	Verb	VForm=conditional	*	VerbForm=Fin|Mood=Cnd	
 1	*	Verb	VForm=imperative	*	VerbForm=Fin|Mood=Imp	
 1	*	Verb	VForm=infinitive	*	VerbForm=Inf	
 1	*	Verb	VForm=supine	*	VerbForm=Sup	
 1	*	Verb	VForm=participle	*	VerbForm=Part	
 1	*	Adjective	Type=general	*	-	
 1	*	Adjective	Type=possessive	*	Poss=Yes	
 1	*	Adjective	Type=participle	*	VerbForm=Part	
 2	*	Adverb	Type=participle	*	VerbForm=Conv	
 2	*	Adverb	Type=general	*	-	
 1	nekaj	Adverb	Type=general	DET	PronType=Ind	
 1	več	Adverb	Type=general	DET	PronType=Ind	
 1	veliko	Adverb	Type=general	DET	PronType=Ind	
 1	manj	Adverb	Type=general	DET	PronType=Ind	
 1	dovolj	Adverb	Type=general	DET	PronType=Ind	
 1	pol	Adverb	Type=general	DET	PronType=Ind	
 1	malo	Adverb	Type=general	DET	PronType=Ind	
 1	toliko	Adverb	Type=general	DET	PronType=Dem	
 1	največ	Adverb	Type=general	DET	PronType=Ind	
 1	mnogo	Adverb	Type=general	DET	PronType=Ind	
 1	preveč	Adverb	Type=general	DET	PronType=Ind	
 1	par	Adverb	Type=general	DET	PronType=Ind	
 1	koliko	Adverb	Type=general	DET	PronType=Int	
 1	dosti	Adverb	Type=general	DET	PronType=Ind	
 1	nešteto	Adverb	Type=general	DET	PronType=Ind	
 1	četrt	Adverb	Type=general	DET	PronType=Ind	
 1	ogromno	Adverb	Type=general	DET	PronType=Ind	
 1	čimveč	Adverb	Type=general	DET	PronType=Ind	
 1	obilo	Adverb	Type=general	DET	PronType=Ind	
 1	premnogo	Adverb	Type=general	DET	PronType=Ind	
 1	enormno	Adverb	Type=general	DET	PronType=Ind
 1	majčkeno	Adverb	Type=general	DET	PronType=Ind	
 2	*	Pronoun	Type=reflexive	*	PronType=Prs|Reflex=Yes	
 2	*	Pronoun	Type=personal	*	PronType=Prs	
 2	*	Pronoun	Type=possessive	*	PronType=Prs|Poss=Yes	
 2	*	Pronoun	Type=interrogative	*	PronType=Int	
 2	*	Pronoun	Type=relative	*	PronType=Rel	
 2	*	Pronoun	Type=demonstrative	*	PronType=Dem	
 2	*	Pronoun	Type=general	*	PronType=Tot	
 2	*	Pronoun	Type=negative	*	PronType=Neg	
 2	*	Pronoun	Type=indefinite	*	PronType=Ind	
 1	*	Pronoun	Type=personal	DET	PronType=Prs	
 1	*	Pronoun	Type=possessive	DET	PronType=Prs|Poss=Yes	
 1	*	Pronoun	Owner_Gender=masculine	*	Gender[psor]=Masc	#lg.spec.feature
 1	*	Pronoun	Owner_Gender=feminine	*	Gender[psor]=Fem	#lg.spec.feature
 1	*	Pronoun	Owner_Gender=neuter	*	Gender[psor]=Neut	#lg.spec.feature
 1	*	Pronoun	Owner_Number=singular	*	Number[psor]=Sing	#lg.spec.feature
 1	*	Pronoun	Owner_Number=plural	*	Number[psor]=Plur	#lg.spec.feature
 1	*	Pronoun	Owner_Number=dual	*	Number[psor]=Dual	#lg.spec.feature
 1	*	Pronoun	Clitic=yes	*	Variant=Short	#lg.spec.feature
 1	*	Pronoun	Clitic=bound	*	Variant=Bound	#lg.spec.feature
 1	svoj	Pronoun	Type=reflexive	*	PronType=Prs|Reflex=Yes|Poss=Yes	
 2	*	Numeral	Type=pronominal	*	-	
 2	*	Numeral	Form=letter	*	-	
 2	*	Numeral	Type=cardinal	NUM	NumType=Card	
 1	*	Numeral	Form=letter	NUM	NumForm=Word	#lg.spec.feature
 1	*	Numeral	Form=digit	NUM	NumForm=Digit	#lg.spec.feature
 1	*	Numeral	Form=roman	NUM	NumForm=Roman	#lg.spec.feature
 1	*	Numeral	Type=ordinal	*	NumType=Ord	
 1	*	Numeral	Type=special	ADJ	NumType=Mult	
 1	*	Numeral	Type=special	NUM	NumType=Sets	
 1	en	Numeral	Type=pronominal	*	NumType=Card	
 1	eden	Numeral	Type=pronominal	*	NumType=Card	
 1	*	Conjunction	Type=subordinating	*	-	
 1	*	Conjunction	Type=coordinating	*	-	
 2	*	Particle	*	*	-	
 1	ne	Particle	*	*	Polarity=Neg	
 1	*	Interjection	*	*	-	
 1	*	Abbreviation	*	*	Abbr=Yes	
 2	*	Residual	*	*	-	
 1	*	Residual	Type=foreign	*	Foreign=Yes	
 1	*	Residual	Type=typo	*	-	
 1	*	Residual	Type=program	*	-	
 1	*	Punctuation	*	*	-	
 2	*	*	Degree=positive	*	Degree=Pos	
 2	*	*	Degree=comparative	*	Degree=Cmp	
 2	*	*	Degree=superlative	*	Degree=Sup	
 1	*	*	Degree=positive	DET	-	
 1	*	*	Degree=comparative	DET	-	
 1	*	*	Degree=superlative	DET	-	
 1	*	*	Animate=no	*	Animacy=Inan	
 1	*	*	Animate=yes	*	Animacy=Anim	
 1	*	*	Aspect=perfective	*	Aspect=Perf	
 1	*	*	Aspect=progressive	*	Aspect=Imp	
 1	*	*	Aspect=biaspectual	*	-	
 1	*	*	Case=nominative	*	Case=Nom	
 1	*	*	Case=genitive	*	Case=Gen	
 1	*	*	Case=dative	*	Case=Dat	
 1	*	*	Case=accusative	*	Case=Acc	
 1	*	*	Case=locative	*	Case=Loc	
 1	*	*	Case=instrumental	*	Case=Ins	
 1	*	*	Definiteness=no	*	Definite=Ind	
 1	*	*	Definiteness=yes	*	Definite=Def	
 1	*	*	Gender=masculine	*	Gender=Masc	
 1	*	*	Gender=feminine	*	Gender=Fem	
 1	*	*	Gender=neuter	*	Gender=Neut	
 1	*	*	Number=singular	*	Number=Sing	
 1	*	*	Number=plural	*	Number=Plur	
 1	*	*	Number=dual	*	Number=Dual	
 1	*	*	Person=first	*	Person=1	
 1	*	*	Person=second	*	Person=2	
 1	*	*	Person=third	*	Person=3	
@@ -0,0 +1,282 @@
 # Mapping from JOS PoS to UD 2.0 PoS						
 # Kaja Dobrovoljc, Tomaž Erjavec, Simon Krek						
 # 2019-02-04						
 #						
 #Prio	Lemma	Category	Feats	Deps	->PoS-UD	#Comment
 #-------------------------------------------------------------------------------------------------------						
 3	*	Noun	Type=common	*	NOUN	
 3	*	Noun	Type=proper	*	PROPN	
 3	*	Verb	*	*	VERB	
 2	*	Verb	Type=auxiliary	*	AUX	#This one can in fact also be VERB, but this has to be determined by some other means
 3	*	Adjective	*	*	ADJ	
 3	*	Adverb	*	*	ADV	
 1	četrt	Adverb	*	*	DET	
 1	čimmanj	Adverb	*	*	DET	
 1	čimveč	Adverb	*	*	DET	
 1	dosti	Adverb	*	*	DET	
 1	dovolj	Adverb	*	*	DET	
 1	enako	Adverb	*	*	ADV	
 1	enormno	Adverb	*	*	DET	
 1	ful	Adverb	*	*	ADV	
 1	koliko	Adverb	*	*	DET	
 1	majčkeno	Adverb	*	*	DET	
 1	maksimalno	Adverb	*	*	ADV	
 1	malce	Adverb	*	*	ADV	
 1	malo	Adverb	*	*	DET	
 1	manj	Adverb	*	*	DET	
 1	minimalno	Adverb	*	*	ADV	
 1	mnogo	Adverb	*	*	DET	
 1	najmanj	Adverb	*	*	ADV	
 1	največ	Adverb	*	*	DET	
 1	nekaj	Adverb	*	*	DET	
 1	nekoliko	Adverb	*	*	ADV	
 1	nemalo	Adverb	*	*	ADV	
 1	nešteto	Adverb	*	*	DET	
 1	nič	Adverb	*	*	ADV	
 1	ničkoliko	Adverb	*	*	DET	
 1	obilo	Adverb	*	*	DET	
 1	ogromno	Adverb	*	*	DET	
 1	par	Adverb	*	*	DET	
 1	pol	Adverb	*	*	DET	
 1	polno	Adverb	*	*	ADV	
 1	precej	Adverb	*	*	ADV	
 1	premalo	Adverb	*	*	ADV	
 1	premnogo	Adverb	*	*	DET	
 1	preveč	Adverb	*	*	DET	
 1	toliko	Adverb	*	*	DET	
 1	veliko	Adverb	*	*	DET	
 1	več	Adverb	*	*	DET	
 1	večidel	Adverb	*	*	ADV	
 1	vse	Adverb	*	*	ADV	
 1	zadosti	Adverb	*	*	ADV	
 ##All Pronouns should be explicitly defined						
 ##But are not because of jos1M wrong lemmatisations for e.g. "ti", "te" etc.						
 3	*	Pronoun	*	*	PRON	
 ##2	*	Pronoun	Type=demonstrative	*	DET	
 ##2	*	Pronoun	Type=possessive	*	DET	
 1	bogsigavedikakšen	Pronoun	Type=indefinite	*	DET	
 1	bogvedikaj	Pronoun	Type=indefinite	*	PRON	
 1	bogvedikateri	Pronoun	Type=indefinite	*	DET	
 1	bogvekaj	Pronoun	Type=indefinite	*	PRON	
 1	bogvekakšen	Pronoun	Type=indefinite	*	DET	
 1	bogvekateri	Pronoun	Type=indefinite	*	DET	
 1	bogvekolik	Pronoun	Type=indefinite	*	DET	
 1	bogvekolikšen	Pronoun	Type=indefinite	*	DET	
 1	čezme	Pronoun	Type=personal	*	PRON	
 1	čezse	Pronoun	Type=reflexive	*	PRON	
 1	čigar	Pronoun	Type=relative	*	DET	
 1	čigarkoli	Pronoun	Type=relative	*	DET	
 1	čigarsižebodi	Pronoun	Type=relative	*	DET	
 1	čigav	Pronoun	Type=interrogative	*	DET	
 1	čigaver	Pronoun	Type=relative	*	DET	
 1	čigaverkoli	Pronoun	Type=relative	*	DET	
 1	čigavršen	Pronoun	Type=relative	*	DET	
 1	čigavršnji	Pronoun	Type=relative	*	DET	
 1	enak	Pronoun	Type=indefinite	*	DET	
 1	enaki	Pronoun	Type=indefinite	*	DET	
 1	enakšen	Pronoun	Type=indefinite	*	DET	
 1	isti	Pronoun	Type=indefinite	*	DET	
 1	jaz	Pronoun	Type=personal	*	PRON	
 1	jest	Pronoun	Type=personal	*	PRON	
 1	kaj	Pronoun	Type=interrogative	*	PRON	
 1	kak	Pronoun	Type=interrogative	*	DET	
 1	kakov	Pronoun	Type=interrogative	*	DET	
 1	kakošen	Pronoun	Type=interrogative	*	DET	
 1	kakršen	Pronoun	Type=relative	*	DET	
 1	kakršenkoli	Pronoun	Type=relative	*	DET	
 1	kakršensižebodi	Pronoun	Type=relative	*	DET	
 1	kakšen	Pronoun	Type=interrogative	*	DET	
 1	kar	Pronoun	Type=relative	*	PRON	
 1	karkoli	Pronoun	Type=relative	*	PRON	
 1	karsibodi	Pronoun	Type=relative	*	PRON	
 1	karsižebodi	Pronoun	Type=relative	*	PRON	
 1	kateri	Pronoun	Type=interrogative	*	DET	
 1	katerikoli	Pronoun	Type=relative	*	DET	
 1	katerisibodi	Pronoun	Type=relative	*	DET	
 1	kdo	Pronoun	Type=interrogative	*	PRON	
 1	kdor	Pronoun	Type=relative	*	PRON	
 1	kdorkoli	Pronoun	Type=relative	*	PRON	
 1	kdorsibodi	Pronoun	Type=relative	*	PRON	
 1	kdorsižebodi	Pronoun	Type=relative	*	PRON	
 1	kdovekaj	Pronoun	Type=indefinite	*	PRON	
 1	kdovekak	Pronoun	Type=indefinite	*	DET	
 1	kdovekakšen	Pronoun	Type=indefinite	*	DET	
 1	kdovekateri	Pronoun	Type=indefinite	*	DET	
 1	kdovekdo	Pronoun	Type=indefinite	*	PRON	
 1	kdovekolik	Pronoun	Type=indefinite	*	DET	
 1	koji	Pronoun	Type=interrogative	*	DET	
 1	kolik	Pronoun	Type=interrogative	*	DET	
 1	kolik	Pronoun	Type=indefinite	*	DET	
 1	koliker	Pronoun	Type=interrogative	*	DET	
 1	kolikršen	Pronoun	Type=relative	*	DET	
 1	kolikšen	Pronoun	Type=interrogative	*	DET	
 1	malokaj	Pronoun	Type=indefinite	*	PRON	
 1	malokak	Pronoun	Type=indefinite	*	DET	
 1	malokakšen	Pronoun	Type=indefinite	*	DET	
 1	malokateri	Pronoun	Type=indefinite	*	DET	
 1	malokdo	Pronoun	Type=indefinite	*	PRON	
 1	marsikaj	Pronoun	Type=indefinite	*	PRON	
 1	marsikak	Pronoun	Type=indefinite	*	DET	
 1	marsikakšen	Pronoun	Type=indefinite	*	DET	
 1	marsikateri	Pronoun	Type=indefinite	*	DET	
 1	marsikdo	Pronoun	Type=indefinite	*	PRON	
 1	marsičigav	Pronoun	Type=indefinite	*	DET	
 1	medme	Pronoun	Type=personal	*	PRON	
 1	medse	Pronoun	Type=reflexive	*	PRON	
 1	mnog	Pronoun	Type=indefinite	*	DET	
 1	mnogokaj	Pronoun	Type=indefinite	*	PRON	
 1	mnogokateri	Pronoun	Type=indefinite	*	DET	
 1	mnogokdo	Pronoun	Type=indefinite	*	PRON	
 1	moj	Pronoun	Type=possessive	*	DET	
 1	nadme	Pronoun	Type=personal	*	PRON	
 1	nadse	Pronoun	Type=reflexive	*	PRON	
 1	najin	Pronoun	Type=possessive	*	DET	
 1	name	Pronoun	Type=personal	*	PRON	
 1	nase	Pronoun	Type=reflexive	*	PRON	
 1	naš	Pronoun	Type=possessive	*	DET	
 1	negdo	Pronoun	Type=indefinite	*	PRON	
 1	nek	Pronoun	Type=indefinite	*	DET	
 1	nekaj	Pronoun	Type=indefinite	*	PRON	
 1	nekak	Pronoun	Type=indefinite	*	DET	
 1	nekakov	Pronoun	Type=indefinite	*	DET	
 1	nekakšen	Pronoun	Type=indefinite	*	DET	
 1	nekateri	Pronoun	Type=indefinite	*	DET	
 1	nekdo	Pronoun	Type=indefinite	*	PRON	
 1	neki	Pronoun	Type=indefinite	*	DET	
 1	nekolik	Pronoun	Type=indefinite	*	DET	
 1	nekolikšen	Pronoun	Type=indefinite	*	DET	
 1	nekolikšnji	Pronoun	Type=indefinite	*	DET	
 1	nekov	Pronoun	Type=indefinite	*	DET	
 1	nekšen	Pronoun	Type=indefinite	*	DET	
 1	nevemkakšen	Pronoun	Type=indefinite	*	DET	
 1	nihče	Pronoun	Type=negative	*	PRON	
 1	nikak	Pronoun	Type=negative	*	DET	
 1	nikakršen	Pronoun	Type=negative	*	DET	
 1	nikakšen	Pronoun	Type=negative	*	DET	
 1	nikdo	Pronoun	Type=negative	*	PRON	
 1	nikogaršen	Pronoun	Type=negative	*	DET	
 1	nikogaršnji	Pronoun	Type=negative	*	DET	
 1	nič	Pronoun	Type=negative	*	PRON	
 1	njegov	Pronoun	Type=possessive	*	DET	
 1	njen	Pronoun	Type=possessive	*	DET	
 1	njihen	Pronoun	Type=possessive	*	DET	
 1	njihnji	Pronoun	Type=possessive	*	DET	
 1	njihov	Pronoun	Type=possessive	*	DET	
 1	njun	Pronoun	Type=possessive	*	DET	
 1	nobeden	Pronoun	Type=negative	*	PRON	
 1	noben	Pronoun	Type=negative	*	DET	
 1	oba	Pronoun	Type=general	*	DET	
 1	obadva	Pronoun	Type=general	*	PRON	
 1	obme	Pronoun	Type=personal	*	PRON	
 1	oboj	Pronoun	Type=general	*	DET	
 1	obojen	Pronoun	Type=general	*	DET	
 1	obse	Pronoun	Type=reflexive	*	PRON	
 1	on	Pronoun	Type=personal	*	PRON	
 1	oni	Pronoun	Type=demonstrative	*	DET	
 1	onile	Pronoun	Type=demonstrative	*	PRON	
 1	podme	Pronoun	Type=personal	*	PRON	
 1	podse	Pronoun	Type=reflexive	*	PRON	
 1	pome	Pronoun	Type=personal	*	PRON	
 1	predme	Pronoun	Type=personal	*	PRON	
 1	predse	Pronoun	Type=reflexive	*	PRON	
 1	premarsikateri	Pronoun	Type=indefinite	*	DET	
 1	premnog	Pronoun	Type=indefinite	*	DET	
 1	prenekaj	Pronoun	Type=indefinite	*	PRON	
 1	prenekateri	Pronoun	Type=indefinite	*	DET	
 1	prenekdo	Pronoun	Type=indefinite	*	PRON	
 1	redkokateri	Pronoun	Type=indefinite	*	DET	
 1	redkokdo	Pronoun	Type=indefinite	*	PRON	
 1	se	Pronoun	Type=reflexive	*	PRON	
 1	skozme	Pronoun	Type=personal	*	PRON	
 1	skozse	Pronoun	Type=reflexive	*	PRON	
 1	svoj	Pronoun	Type=reflexive	*	DET	
 1	ta	Pronoun	Type=demonstrative	*	DET	
 1	tadva	Pronoun	Type=demonstrative	*	PRON	
 1	taisti	Pronoun	Type=demonstrative	*	DET	
 1	tak	Pronoun	Type=demonstrative	*	DET	
 1	takisti	Pronoun	Type=demonstrative	*	DET	
 1	takle	Pronoun	Type=demonstrative	*	DET	
 1	takov	Pronoun	Type=demonstrative	*	DET	
 1	takošen	Pronoun	Type=demonstrative	*	DET	
 1	takšen	Pronoun	Type=demonstrative	*	DET	
 1	takšenle	Pronoun	Type=demonstrative	*	DET	
 1	tale	Pronoun	Type=demonstrative	*	DET	
 1	talele	Pronoun	Type=demonstrative	*	DET	
 1	teu	Pronoun	Type=personal	*	PRON	
 1	ti	Pronoun	Type=personal	*	PRON	
 1	tisti	Pronoun	Type=demonstrative	*	DET	
 1	tistile	Pronoun	Type=demonstrative	*	DET	
 1	tolik	Pronoun	Type=demonstrative	*	DET	
 1	toliker	Pronoun	Type=demonstrative	*	DET	
 1	tolikšen	Pronoun	Type=demonstrative	*	DET	
 1	tolikšnji	Pronoun	Type=demonstrative	*	DET	
 1	toti	Pronoun	Type=demonstrative	*	DET	
 1	tvoj	Pronoun	Type=possessive	*	DET	
 1	un	Pronoun	Type=demonstrative	*	DET	
 1	vajin	Pronoun	Type=possessive	*	DET	
 1	vame	Pronoun	Type=personal	*	PRON	
 1	vase	Pronoun	Type=reflexive	*	PRON	
 1	vaš	Pronoun	Type=possessive	*	DET	
 1	ves	Pronoun	Type=general	*	DET	
 1	vsak	Pronoun	Type=general	*	DET	
 1	vsakateri	Pronoun	Type=general	*	DET	
 1	vsakdo	Pronoun	Type=general	*	PRON	
 1	vsakogaršen	Pronoun	Type=general	*	DET	
 1	vsakogaršnji	Pronoun	Type=general	*	DET	
 1	vsakršen	Pronoun	Type=general	*	DET	
 1	vsakteri	Pronoun	Type=general	*	DET	
 1	zame	Pronoun	Type=personal	*	PRON	
 1	zase	Pronoun	Type=reflexive	*	PRON	
 3	*	Numeral	Form=digit	*	NUM	
 3	*	Numeral	Form=roman	*	NUM	
 3	*	Numeral	Form=letter|Type=special	*	NUM	
 3	*	Numeral	Form=letter|Type=cardinal	*	NUM	
 2	*	Numeral	Form=letter|Type=ordinal	*	ADJ	
 1	drug	Numeral	Form=letter|Type=pronominal	*	ADJ	
 1	en	Numeral	Form=letter|Type=pronominal	*	NUM	
 1	*en	Numeral	Form=letter|Type=special	*	ADJ	#enojen, dvojen
 1	eden	Numeral	Form=letter|Type=pronominal	*	NUM	#Dodal E.T.
 3	*	Adposition	*	*	ADP	#MULTEXT-East name
 3	*	Preposition	*	*	ADP	#JOS name
 3	*	Conjunction	Type=coordinating	*	CCONJ	
 3	*	Conjunction	Type=subordinating	*	SCONJ	
 3	*	Particle	*	*	PART	
 3	*	Interjection	*	*	INTJ	
 3	*	Abbreviation	*	*	X	
 3	*	Residual	*	*	X	
 2	*	Residual	Type=web	*	SYM	
 2	*	Residual	Type=emo	*	SYM	
 2	*	Residual	Type=hashtag	*	SYM	#Better mapping?
 2	*	Residual	Type=at	*	SYM	#Better mapping?
 2	*	Residual	Type=foreign	*	X	#Better mapping?
 3	*	Punctuation	*	*	PUNCT	
 1	#	Punctuation	*	*	SYM	
 1	%	Punctuation	*	*	SYM	
 1	&	Punctuation	*	*	SYM	
 1	<	Punctuation	*	*	SYM	
 1	>	Punctuation	*	*	SYM	
 1	+	Punctuation	*	*	SYM	
 1	=	Punctuation	*	*	SYM	
 1	°	Punctuation	*	*	SYM	
 1	×	Punctuation	*	*	SYM	
 1	÷	Punctuation	*	*	SYM	
 1	$	Punctuation	*	*	SYM	
 1	@	Punctuation	*	*	SYM	
 1	µ	Punctuation	*	*	SYM	
 1	©	Punctuation	*	*	SYM	
 1	§	Punctuation	*	*	SYM	
 1	€	Punctuation	*	*	SYM
 1	£	Punctuation	*	*	SYM
@@ -1,6 +1,6 @@
 import unittest
-from conversion_utils.jos_msds_and_properties import Converter, Properties, MsdException
+from conversion_utils.jos_msds_and_properties import Converter, Properties, MsdException, Msd
 class JosPropertiesToMsdTestCase(unittest.TestCase):
@@ -52,6 +52,25 @@ class JosPropertiesToMsdTestCase(unittest.TestCase):
        self.assertEqual(msd.language, 'en')
        self.assertEqual(msd.code, 'Nc-d')
    def test_msd_to_jos(self):
        """Full Slovene msds convert to the expected UPOS tag and feature strings."""
        cases = [
            ('Ppnzei', 'slovenski', 'ADJ',
             'UposTag=ADJ|Case=Nom|Degree=Pos|Gender=Fem|Number=Sing',
             'Case=Nom|Degree=Pos|Gender=Fem|Number=Sing'),
            ('Sommr', 'dečko', 'NOUN',
             'UposTag=NOUN|Case=Gen|Gender=Masc|Number=Plur',
             'Case=Gen|Gender=Masc|Number=Plur'),
        ]
        for code, lemma, expected_pos, expected_full, expected_features in cases:
            ud = self.converter.msd_to_ud(Msd(code, 'sl'), lemma)
            self.assertEqual(ud.pos, expected_pos)
            self.assertEqual(ud.to_full_string(), expected_full)
            self.assertEqual(ud.to_features_string(), expected_features)
    def test_msd_to_jos_partial_msd(self):
        """Converting a partial msd to UD must raise MsdException."""
        # The Msd is built inside the context manager so that the guarded
        # region stays the same as in the earlier try/except version.
        with self.assertRaises(MsdException):
            self.converter.msd_to_ud(Msd('Soz', 'sl'), 'vlada')
    def test_bad_msd_with_require_valid(self):
        try:
            self.converter.properties_to_msd(Properties('noun', {'type':'common'}, {'number':'dual'}, 'en'), 'en', require_valid_flag=True)
@@ -4,7 +4,6 @@ This script was developed in the context of a specific task and may not generali
 """
 import argparse
 import codecs
 import lxml.etree as lxml
 from importlib_resources import files
@@ -13,7 +12,7 @@ from conversion_utils.jos_msds_and_properties import Converter, Msd
 def get_syn_map():
    dict_file_name = files('conversion_utils.resources').joinpath('dict.xml')
-    dict_file = codecs.open(dict_file_name, 'r')
+    dict_file = open(dict_file_name, 'r', encoding='utf-8')
    root = lxml.parse(dict_file).getroot()
    dict_file.close() 
    return {syn.get('en'):syn.get('sl') for syn in root.xpath('syns/syn')}
@@ -23,8 +22,8 @@ def translate(input_file_name, scope, output_file_name):
    syn_map = get_syn_map()
-    output_file = codecs.open(output_file_name, 'w')
+    output_file = open(output_file_name, 'w', encoding='utf-8')
-    input_file = codecs.open(input_file_name, 'r')
+    input_file = open(input_file_name, 'r', encoding='utf-8')
    converter = Converter()
@@ -6,7 +6,7 @@ with open(os.path.join(here, 'README.md'), encoding='utf-8') as f:
    long_description = f.read()
 setup(name='cjvt_conversion_utils',
-      version='0.3',
+      version='0.4',
      description='CJVT conversion utilities',
      long_description=long_description,
      long_description_content_type="text/markdown",
Author	SHA1	Message	Date
orglce	d7fd608037	Updated to version 0.4	2026-05-08 08:08:10 +02:00
orglce	bb3c673e29	Specifications are loaded on import	2026-04-15 08:23:06 +02:00
orglce	aef9a3698f	Added static list of partial msds inside jos_specifications.pickle	2026-04-08 15:19:40 +02:00
orglce	e62c096126	Small bug fix regarding UD features conversion to string	2026-04-01 12:54:09 +02:00
orglce	165f24c64c	Added conversion from msd to universal dependencies based on Jaka's implementation	2026-03-30 22:31:08 +02:00
orglce	4d86631283	No warning messages for partial msds	2026-03-30 10:03:49 +02:00
orglce	b711fae3b5	UTF-8 encoding fix	2025-11-28 16:55:04 +01:00