7 Commits

10 changed files with 2514 additions and 44 deletions
+3
View File
@@ -1,3 +1,6 @@
include conversion_utils/resources/jos_specifications.pickle
include conversion_utils/resources/dict.xml
include conversion_utils/resources/structure_conversions.csv
include conversion_utils/resources/jos-msd2features.tbl
include conversion_utils/resources/jos2ud-features.tbl
include conversion_utils/resources/jos2ud-pos.tbl
+3 -3
View File
@@ -312,7 +312,7 @@ def construct_tei_etrees(conllu_lines):
def convert_file(input_file_name, output_file_name):
    """Convert one input file to TEI XML and write the result to a file.

    Only the first tree returned by ``construct_tei_etrees`` is written.

    Parameters:
        input_file_name(str): path of the UTF-8 encoded input file
        output_file_name(str): path of the XML file to write
    """
    # The context manager closes the input even if the conversion fails.
    with open(input_file_name, 'r', encoding='utf-8') as input_file:
        root = construct_tei_etrees(input_file)[0]
    tree = etree.ElementTree(root)
    tree.write(output_file_name, encoding='UTF-8', pretty_print=True)
@@ -332,7 +332,7 @@ if __name__ == '__main__':
args = parser.parse_args()
if args.out:
f_out = open(args.out, 'w')
f_out = open(args.out, 'w', encoding='utf-8')
else:
f_out = sys.stdout
@@ -341,7 +341,7 @@ if __name__ == '__main__':
for arg in args.files:
filelist = glob(arg)
for f in filelist:
with open(f, 'r') as conllu_f:
with open(f, 'r', encoding='utf-8') as conllu_f:
tei_etrees = construct_tei_etrees(conllu_f)
for tei_etree in tei_etrees:
f_out.write(etree.tostring(tei_etree, pretty_print=True, encoding='utf-8').decode())
+172 -33
View File
@@ -1,12 +1,21 @@
import lxml.etree as lxml
import re
import pickle
import lxml.etree as lxml
from collections import defaultdict
from importlib_resources import files
from enum import IntEnum
from conversion_utils.utils import xpath_find, get_xml_id
JOS_SPECIFICATIONS_PICKLE_RESOURCE = 'jos_specifications.pickle'
RESOURCES_DIR = "conversion_utils.resources"
MSD_TO_FEATURES = "jos-msd2features.tbl"
JOS_TO_UD_FEATURES_RULES = "jos2ud-features.tbl"
JOS_TO_UPOS_RULES = "jos2ud-pos.tbl"
## Positions of lexeme-level features for each category
LEXEME_FEATURE_MAP = {'noun':{1,2},
'verb':{1,2},
@@ -53,6 +62,14 @@ LEVEL_EXCEPTIONS = {('pronoun', 2, 'čezme'), ('zaimek', 2, 'čezme'),
('pronoun', 8, 'se'), ('zaimek', 8, 'se'),
('pronoun', 8, 'ti'), ('zaimek', 8, 'ti')}
class MsdState(IntEnum):
    """Validation state of an Msd code; larger values compare as higher."""

    UNKNOWN = -1
    PARTIAL = 1
    FULL = 2
class MsdException(Exception):
    """Raised when an Msd is not acceptable for the requested operation."""
class Specifications:
"""JOS specifications with list of all word categories."""
@@ -216,12 +233,68 @@ class Properties:
and self.language == obj.language
class UD:
    """Universal Dependencies tag: a part of speech plus a feature mapping.

    Features are rendered as ``Name=Value`` pairs joined with ``|`` and
    ordered case-insensitively by feature name.
    """

    def __init__(self, pos, features_map):
        self.pos = pos
        self.features_map = features_map

    def to_features_string(self):
        """Return only the features, e.g. ``Case=Nom|Number=Sing``."""
        return self._features_string()

    def to_full_string(self):
        """Return ``UposTag=<pos>`` followed by the features, if there are any."""
        features = self._features_string()
        prefix = "UposTag=" + self.pos
        return prefix + "|" + features if features else prefix

    def _features_string(self):
        ordered_pairs = self._sort_features(self.features_map)
        return "|".join(f"{name}={value}" for name, value in ordered_pairs)

    def _sort_features(self, features_map):
        # Case-insensitive ordering by feature name.
        return sorted(features_map.items(), key=lambda item: item[0].lower())

    def __str__(self):
        return "pos={}, features_map={}".format(self.pos, self.features_map)
class Msd:
"""JOS msd."""
def __init__(self, code, language):
class State(IntEnum):
    """Validation state assigned to an Msd code; larger values compare as higher."""

    UNKNOWN = -1
    PARTIAL = 1
    FULL = 2
def __init__(self, code, language, expected_state=State.FULL, require_valid=False):
    """Create an Msd and determine the state of its code.

    Parameters:
        code(str): the msd code
        language(str): the language of the code
        expected_state(Msd.State): the state the code is expected to have
        require_valid(bool): raise instead of warn when the code does not have the expected state
    """
    self.code, self.language = code, language
    self.expected_state, self.require_valid = expected_state, require_valid
    # Validation reads the attributes set above, so it has to run last.
    self.state = self._validate_and_get_state()
def _validate_and_get_state(self):
    """Determine the state of the code and check it against the expected state.

    The code is looked up among the full and the partial codes of the default
    specifications for its language; a code found in neither is unknown.

    Returns:
        Msd.State: the highest state the code was found in

    Raises:
        MsdException: if the expected state is not among the found states and require_valid is set
    """
    states = set()
    if self.code in DEFAULT_SPECIFICATIONS.codes_map[self.language]:
        states.add(self.State.FULL)
    if self.code in DEFAULT_SPECIFICATIONS.partial_codes_map[self.language]:
        states.add(self.State.PARTIAL)
    if not states:
        states.add(self.State.UNKNOWN)

    # self.state is only assigned from this method's return value, so the
    # messages below must use a local instead of reading the attribute.
    state = max(states)

    if self.expected_state not in states:
        if self.require_valid:
            raise MsdException(f"Given msd '{self.code}' is '{state.name}', but expected state is '{self.expected_state.name}'.")
        if state == self.State.UNKNOWN:
            print(f"[WARN] The Msd '{self.code}' is unknown.")
        else:
            print(f"[WARN] The Msd '{self.code}' is unknown for expected state '{self.expected_state.name}'.")

    return state
def __str__(self):
return 'code={code}, language={language}'.format(code=self.code, language=self.language)
@@ -230,26 +303,12 @@ class Msd:
return isinstance(obj, Msd) and self.code == obj.code and self.language == obj.language
class CustomException(Exception):
    """Base class for module-specific exceptions such as MsdException."""
class MsdException(CustomException):
    """Raised when an Msd is not acceptable for the requested operation."""
class Converter:
"""Converter between Msd and Properties objects."""
def __init__(self, xml_file_name=None):
if (xml_file_name is None):
resource = files('conversion_utils.resources').joinpath(JOS_SPECIFICATIONS_PICKLE_RESOURCE)
if (resource.is_file()):
try:
with resource.open('rb') as pickle_file:
self.specifications = pickle.load(pickle_file)
except:
exit('Could not parse specifications pickle file installed.')
else:
exit('No pickle installed or xml provided.')
self.specifications = DEFAULT_SPECIFICATIONS
else:
parser = SpecificationsParser()
try:
@@ -257,20 +316,45 @@ class Converter:
except:
exit('Could not parse specifications xml file provided.')
def is_valid_msd(self, msd):
"""Verify if the Msd code is in the standard JOS set."""
return msd.code in self.specifications.codes_map[msd.language]
self.mte_to_ud_features = self._parse_msd_ud_conversion(MSD_TO_FEATURES)
self.mte_to_ud_features_rules = self._parse_ud_rules(JOS_TO_UD_FEATURES_RULES)
self.mte_to_upos_rules = self._parse_ud_rules(JOS_TO_UPOS_RULES)
def check_valid_msd(self, msd, require_valid_flag):
"""If the Msd code is not valid, raise an exception or give a warning."""
if (not self.is_valid_msd(msd)):
message = 'The msd {} is unknown'.format(msd.code)
if (require_valid_flag):
raise MsdException(message)
else:
print('[WARN] ' + message)
def _get_partial_msd(self, msd):
    """Return the code of the partial Msd that keeps only the lexeme-level features.

    Parameters:
        msd(Msd): the Msd to reduce
    """
    language = msd.language
    full_properties = self.msd_to_properties(msd, language)
    category = self.specifications.find_category_by_code(msd.code[0].lower(), language)
    # Same category and lexeme features, but no form features.
    lexeme_only = Properties(
        category=category.names.get(language),
        lexeme_feature_map=full_properties.lexeme_feature_map,
        form_feature_map={},
        language=language,
    )
    partial_msd = self.properties_to_msd(lexeme_only, language, expected_state=Msd.State.PARTIAL)
    return partial_msd.code
def msd_to_properties(self, msd, language, lemma=None, require_valid_flag=False, warn_level_flag=False):
def _parse_msd_ud_conversion(self, file_name):
    """Parse the file with direct conversions from English Msd codes to feature descriptions.

    Each non-blank line holds an English Msd code and its feature description
    separated by a tab. The description is registered under both the English
    code and the code translated to Slovene.

    Parameters:
        file_name(str): name of the conversion file in the resources package

    Returns:
        dict: msd code (English or Slovene) -> feature description
    """
    # A plain dict: defaultdict() without a factory never supplies defaults,
    # so lookups of missing codes raise KeyError exactly as before.
    conversion_map = {}
    with files(RESOURCES_DIR).joinpath(file_name).open("r", encoding="UTF-8") as conversion_file:
        for line in conversion_file:
            line = line.strip("\n")
            if not line:
                # Tolerate blank (e.g. trailing) lines instead of failing on the unpacking below.
                continue
            mte_msd_en, mte_features_en = line.split("\t")
            mte_sl = self.translate_msd(Msd(mte_msd_en, "en"), "sl").code
            conversion_map[mte_msd_en] = mte_features_en
            conversion_map[mte_sl] = mte_features_en
    return conversion_map
def _parse_ud_rules(self, file_name):
    """Parse a tab-separated rule file used for the conversion to Universal Dependencies.

    Only lines starting with a digit are treated as rules. The first column
    is the priority; the remaining columns are padded with empty strings so
    that every rule has at least six entries.

    Parameters:
        file_name(str): name of the rule file in the resources package

    Returns:
        defaultdict: priority (as read, a string) -> list of rules in file order
    """
    rules_by_priority = defaultdict(list)
    resource = files(RESOURCES_DIR).joinpath(file_name)
    with resource.open("r", encoding="UTF-8") as rules_file:
        for raw_line in rules_file:
            if not raw_line[0].isdigit():
                # Headers, comments and separators do not start with a digit.
                continue
            priority, *columns = raw_line.strip("\n").split("\t")
            columns.extend([""] * (6 - len(columns)))
            rules_by_priority[priority].append(columns)
    return rules_by_priority
def msd_to_properties(self, msd, language, lemma=None, warn_level_flag=False):
"""Convert Msd to Properties.
The language of the generated Properties is specified and can differ from the Msd language.
@@ -296,7 +380,6 @@ class Converter:
Properties: the result of the conversion of the Msd in the language requested
"""
self.check_valid_msd(msd, require_valid_flag)
category_char = msd.code[0].lower()
value_chars = msd.code[1:]
category = self.specifications.find_category_by_code(category_char, msd.language)
@@ -322,7 +405,7 @@ class Converter:
form_feature_map[feature_name] = feature_value
return Properties(category_name, lexeme_feature_map, form_feature_map, language)
def properties_to_msd(self, properties, language, require_valid_flag=False):
def properties_to_msd(self, properties, language, expected_state=Msd.State.FULL):
"""Convert Properties to Msd.
The language of the generated Msd is specified and can differ from the Properties language.
@@ -354,12 +437,68 @@ class Converter:
msd_code += '-'
i += 1
msd_code += position_map[position]
msd = Msd(msd_code, language)
self.check_valid_msd(msd, require_valid_flag)
msd = Msd(msd_code, language, expected_state=expected_state)
return msd
def msd_to_ud(self, msd, lemma):
    """Convert Msd to Universal Dependencies object.

    Partial Msds are currently not supported.

    The Msd code is first looked up in the direct conversion table, which
    yields a category followed by features. The PoS rules then select the
    UD part of speech and the feature rules rewrite the features.

    Parameters:
        msd(Msd): the Msd to convert
        lemma(str): the lemma of the word form with the MSD

    Returns:
        UD: the selected part of speech and the resulting feature map

    Raises:
        MsdException: if the state of the Msd is not FULL
    """
    if msd.state != Msd.State.FULL:
        raise MsdException(f"Msd must be full to be converted to UD.")

    # The table entry is whitespace separated: the first token is the
    # category, the rest are the features.
    # NOTE(review): a code missing from the table fails at this lookup - confirm this is intended.
    upos_category, *upos_features = self.mte_to_ud_features[msd.code].split()

    # Every matching rule overwrites the previous choice, so with the keys
    # visited in descending sorted order the rules under the smallest key
    # are applied last and win. An empty string is kept if no rule matches.
    final_upos = ""
    for priority in sorted(self.mte_to_upos_rules, reverse=True):
        for rule in self.mte_to_upos_rules[priority]:
            rule_lemma, rule_category, rule_mte_features, _, rule_pos_ud, _ = rule
            # Skip the rule unless the category, the lemma ("*" matches any
            # lemma, "*en" any lemma ending in "en") and all of the rule's
            # "|"-separated features match.
            if (rule_category != upos_category
                    or (rule_lemma not in ("*", "*en") and lemma != rule_lemma)
                    or (rule_lemma == "*en" and not lemma.endswith("en"))
                    or (rule_mte_features != "*" and not all(f in upos_features for f in rule_mte_features.split("|")))):
                continue
            final_upos = rule_pos_ud

    # Feature rules are visited in ascending sorted key order.
    for priority in sorted(self.mte_to_ud_features_rules):
        for rule in self.mte_to_ud_features_rules[priority]:
            rule_lemma, rule_category, rule_mte_features, rule_pos_ud, rule_ud_features, _ = rule
            # Skip the rule unless the lemma, the category and the selected
            # part of speech match; "*" matches anything.
            if (rule_lemma != "*" and lemma != rule_lemma
                    or (rule_category != "*" and rule_category != upos_category)
                    or (rule_pos_ud != "*" and rule_pos_ud != final_upos)):
                continue
            # Replace the feature named by the rule with the rule's output.
            upos_features = [rule_ud_features if f == rule_mte_features else f for f in upos_features]
            # A rule without a feature condition adds its output, unless the output is "-".
            if rule_mte_features == "*" and rule_ud_features != "-":
                upos_features.append(rule_ud_features)

    # Rule outputs may hold several "|"-separated features; empty and "-"
    # entries are dropped before splitting into name and value.
    ud_features = dict(f.split("=", 1) for f in "|".join(upos_features).split("|") if f not in {"", "-"})
    return UD(final_upos, ud_features)
def translate_msd(self, msd, language):
    """Translate an Msd to the given language by way of its Properties."""
    properties = self.msd_to_properties(msd, language)
    return self.properties_to_msd(properties, language)
def translate_properties(self, properties, language):
    """Translate Properties to the given language by way of their Msd."""
    msd = self.properties_to_msd(properties, language)
    return self.msd_to_properties(msd, language)
def _load_default_specifications():
    """Load the packaged specifications pickle into ``DEFAULT_SPECIFICATIONS``.

    Raises:
        SystemExit: if the pickle resource is missing or cannot be loaded
    """
    global DEFAULT_SPECIFICATIONS
    resource = files(RESOURCES_DIR).joinpath(JOS_SPECIFICATIONS_PICKLE_RESOURCE)
    if not resource.is_file():
        # SystemExit is raised directly: the exit() helper is injected by the
        # site module and is not meant to be relied on in programs.
        raise SystemExit("Default specifications not found.")
    try:
        with resource.open('rb') as pickle_file:
            # NOTE(review): unpickling can execute arbitrary code; this is
            # only acceptable because the resource ships with the package.
            DEFAULT_SPECIFICATIONS = pickle.load(pickle_file)
    except Exception as e:
        # Keep the original error attached as the cause for diagnosis.
        raise SystemExit('Could not parse specifications pickle file installed.') from e
_load_default_specifications()
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,128 @@
# Mapping from JOS features to UD features
# Kaja Dobrovoljc, Tomaž Erjavec, Simon Krek
# 2018-11-23
#
#Prio Lemma Category Feats PoS-UD ->Feature-UD #Comment
----------------------------------------------------------------------------------------------------
1 * Noun Type=common * -
1 * Noun Type=proper * -
1 * Verb Negative=no * Polarity=Pos
1 * Verb Negative=yes * Polarity=Neg
1 * Verb Type=auxiliary * -
1 * Verb Type=main * -
1 * Verb VForm=present * VerbForm=Fin|Mood=Ind|Tense=Pres
1 * Verb VForm=future * VerbForm=Fin|Mood=Ind|Tense=Fut
1 * Verb VForm=conditional * VerbForm=Fin|Mood=Cnd
1 * Verb VForm=imperative * VerbForm=Fin|Mood=Imp
1 * Verb VForm=infinitive * VerbForm=Inf
1 * Verb VForm=supine * VerbForm=Sup
1 * Verb VForm=participle * VerbForm=Part
1 * Adjective Type=general * -
1 * Adjective Type=possessive * Poss=Yes
1 * Adjective Type=participle * VerbForm=Part
2 * Adverb Type=participle * VerbForm=Conv
2 * Adverb Type=general * -
1 nekaj Adverb Type=general DET PronType=Ind
1 več Adverb Type=general DET PronType=Ind
1 veliko Adverb Type=general DET PronType=Ind
1 manj Adverb Type=general DET PronType=Ind
1 dovolj Adverb Type=general DET PronType=Ind
1 pol Adverb Type=general DET PronType=Ind
1 malo Adverb Type=general DET PronType=Ind
1 toliko Adverb Type=general DET PronType=Dem
1 največ Adverb Type=general DET PronType=Ind
1 mnogo Adverb Type=general DET PronType=Ind
1 preveč Adverb Type=general DET PronType=Ind
1 par Adverb Type=general DET PronType=Ind
1 koliko Adverb Type=general DET PronType=Int
1 dosti Adverb Type=general DET PronType=Ind
1 nešteto Adverb Type=general DET PronType=Ind
1 četrt Adverb Type=general DET PronType=Ind
1 ogromno Adverb Type=general DET PronType=Ind
1 čimveč Adverb Type=general DET PronType=Ind
1 obilo Adverb Type=general DET PronType=Ind
1 premnogo Adverb Type=general DET PronType=Ind
1 enormno Adverb Type=general DET PronType=Ind
1 majčkeno Adverb Type=general DET PronType=Ind
2 * Pronoun Type=reflexive * PronType=Prs|Reflex=Yes
2 * Pronoun Type=personal * PronType=Prs
2 * Pronoun Type=possessive * PronType=Prs|Poss=Yes
2 * Pronoun Type=interrogative * PronType=Int
2 * Pronoun Type=relative * PronType=Rel
2 * Pronoun Type=demonstrative * PronType=Dem
2 * Pronoun Type=general * PronType=Tot
2 * Pronoun Type=negative * PronType=Neg
2 * Pronoun Type=indefinite * PronType=Ind
1 * Pronoun Type=personal DET PronType=Prs
1 * Pronoun Type=possessive DET PronType=Prs|Poss=Yes
1 * Pronoun Owner_Gender=masculine * Gender[psor]=Masc #lg.spec.feature
1 * Pronoun Owner_Gender=feminine * Gender[psor]=Fem #lg.spec.feature
1 * Pronoun Owner_Gender=neuter * Gender[psor]=Neut #lg.spec.feature
1 * Pronoun Owner_Number=singular * Number[psor]=Sing #lg.spec.feature
1 * Pronoun Owner_Number=plural * Number[psor]=Plur #lg.spec.feature
1 * Pronoun Owner_Number=dual * Number[psor]=Dual #lg.spec.feature
1 * Pronoun Clitic=yes * Variant=Short #lg.spec.feature
1 * Pronoun Clitic=bound * Variant=Bound #lg.spec.feature
1 svoj Pronoun Type=reflexive * PronType=Prs|Reflex=Yes|Poss=Yes
2 * Numeral Type=pronominal * -
2 * Numeral Form=letter * -
2 * Numeral Type=cardinal NUM NumType=Card
1 * Numeral Form=letter NUM NumForm=Word #lg.spec.feature
1 * Numeral Form=digit NUM NumForm=Digit #lg.spec.feature
1 * Numeral Form=roman NUM NumForm=Roman #lg.spec.feature
1 * Numeral Type=ordinal * NumType=Ord
1 * Numeral Type=special ADJ NumType=Mult
1 * Numeral Type=special NUM NumType=Sets
1 en Numeral Type=pronominal * NumType=Card
1 eden Numeral Type=pronominal * NumType=Card
1 * Conjunction Type=subordinating * -
1 * Conjunction Type=coordinating * -
2 * Particle * * -
1 ne Particle * * Polarity=Neg
1 * Interjection * * -
1 * Abbreviation * * Abbr=Yes
2 * Residual * * -
1 * Residual Type=foreign * Foreign=Yes
1 * Residual Type=typo * -
1 * Residual Type=program * -
1 * Punctuation * * -
2 * * Degree=positive * Degree=Pos
2 * * Degree=comparative * Degree=Cmp
2 * * Degree=superlative * Degree=Sup
1 * * Degree=positive DET -
1 * * Degree=comparative DET -
1 * * Degree=superlative DET -
1 * * Animate=no * Animacy=Inan
1 * * Animate=yes * Animacy=Anim
1 * * Aspect=perfective * Aspect=Perf
1 * * Aspect=progressive * Aspect=Imp
1 * * Aspect=biaspectual * -
1 * * Case=nominative * Case=Nom
1 * * Case=genitive * Case=Gen
1 * * Case=dative * Case=Dat
1 * * Case=accusative * Case=Acc
1 * * Case=locative * Case=Loc
1 * * Case=instrumental * Case=Ins
1 * * Definiteness=no * Definite=Ind
1 * * Definiteness=yes * Definite=Def
1 * * Gender=masculine * Gender=Masc
1 * * Gender=feminine * Gender=Fem
1 * * Gender=neuter * Gender=Neut
1 * * Number=singular * Number=Sing
1 * * Number=plural * Number=Plur
1 * * Number=dual * Number=Dual
1 * * Person=first * Person=1
1 * * Person=second * Person=2
1 * * Person=third * Person=3
+282
View File
@@ -0,0 +1,282 @@
# Mapping from JOS PoS to UD 2.0 PoS
# Kaja Dobrovoljc, Tomaž Erjavec, Simon Krek
# 2019-02-04
#
#Prio Lemma Category Feats Deps ->PoS-UD #Comment
#-------------------------------------------------------------------------------------------------------
3 * Noun Type=common * NOUN
3 * Noun Type=proper * PROPN
3 * Verb * * VERB
2 * Verb Type=auxiliary * AUX #This is one can in fact also be VERB, but this has to be determined by some other means
3 * Adjective * * ADJ
3 * Adverb * * ADV
1 četrt Adverb * * DET
1 čimmanj Adverb * * DET
1 čimveč Adverb * * DET
1 dosti Adverb * * DET
1 dovolj Adverb * * DET
1 enako Adverb * * ADV
1 enormno Adverb * * DET
1 ful Adverb * * ADV
1 koliko Adverb * * DET
1 majčkeno Adverb * * DET
1 maksimalno Adverb * * ADV
1 malce Adverb * * ADV
1 malo Adverb * * DET
1 manj Adverb * * DET
1 minimalno Adverb * * ADV
1 mnogo Adverb * * DET
1 najmanj Adverb * * ADV
1 največ Adverb * * DET
1 nekaj Adverb * * DET
1 nekoliko Adverb * * ADV
1 nemalo Adverb * * ADV
1 nešteto Adverb * * DET
1 nič Adverb * * ADV
1 ničkoliko Adverb * * DET
1 obilo Adverb * * DET
1 ogromno Adverb * * DET
1 par Adverb * * DET
1 pol Adverb * * DET
1 polno Adverb * * ADV
1 precej Adverb * * ADV
1 premalo Adverb * * ADV
1 premnogo Adverb * * DET
1 preveč Adverb * * DET
1 toliko Adverb * * DET
1 veliko Adverb * * DET
1 več Adverb * * DET
1 večidel Adverb * * ADV
1 vse Adverb * * ADV
1 zadosti Adverb * * ADV
##All Pronouns should be explicitly defined
##But they are not, because of wrong lemmatisations in jos1M, e.g. for "ti", "te" etc.
3 * Pronoun * * PRON
##2 * Pronoun Type=demonstrative * DET
##2 * Pronoun Type=possessive * DET
1 bogsigavedikakšen Pronoun Type=indefinite * DET
1 bogvedikaj Pronoun Type=indefinite * PRON
1 bogvedikateri Pronoun Type=indefinite * DET
1 bogvekaj Pronoun Type=indefinite * PRON
1 bogvekakšen Pronoun Type=indefinite * DET
1 bogvekateri Pronoun Type=indefinite * DET
1 bogvekolik Pronoun Type=indefinite * DET
1 bogvekolikšen Pronoun Type=indefinite * DET
1 čezme Pronoun Type=personal * PRON
1 čezse Pronoun Type=reflexive * PRON
1 čigar Pronoun Type=relative * DET
1 čigarkoli Pronoun Type=relative * DET
1 čigarsižebodi Pronoun Type=relative * DET
1 čigav Pronoun Type=interrogative * DET
1 čigaver Pronoun Type=relative * DET
1 čigaverkoli Pronoun Type=relative * DET
1 čigavršen Pronoun Type=relative * DET
1 čigavršnji Pronoun Type=relative * DET
1 enak Pronoun Type=indefinite * DET
1 enaki Pronoun Type=indefinite * DET
1 enakšen Pronoun Type=indefinite * DET
1 isti Pronoun Type=indefinite * DET
1 jaz Pronoun Type=personal * PRON
1 jest Pronoun Type=personal * PRON
1 kaj Pronoun Type=interrogative * PRON
1 kak Pronoun Type=interrogative * DET
1 kakov Pronoun Type=interrogative * DET
1 kakošen Pronoun Type=interrogative * DET
1 kakršen Pronoun Type=relative * DET
1 kakršenkoli Pronoun Type=relative * DET
1 kakršensižebodi Pronoun Type=relative * DET
1 kakšen Pronoun Type=interrogative * DET
1 kar Pronoun Type=relative * PRON
1 karkoli Pronoun Type=relative * PRON
1 karsibodi Pronoun Type=relative * PRON
1 karsižebodi Pronoun Type=relative * PRON
1 kateri Pronoun Type=interrogative * DET
1 katerikoli Pronoun Type=relative * DET
1 katerisibodi Pronoun Type=relative * DET
1 kdo Pronoun Type=interrogative * PRON
1 kdor Pronoun Type=relative * PRON
1 kdorkoli Pronoun Type=relative * PRON
1 kdorsibodi Pronoun Type=relative * PRON
1 kdorsižebodi Pronoun Type=relative * PRON
1 kdovekaj Pronoun Type=indefinite * PRON
1 kdovekak Pronoun Type=indefinite * DET
1 kdovekakšen Pronoun Type=indefinite * DET
1 kdovekateri Pronoun Type=indefinite * DET
1 kdovekdo Pronoun Type=indefinite * PRON
1 kdovekolik Pronoun Type=indefinite * DET
1 koji Pronoun Type=interrogative * DET
1 kolik Pronoun Type=interrogative * DET
1 kolik Pronoun Type=indefinite * DET
1 koliker Pronoun Type=interrogative * DET
1 kolikršen Pronoun Type=relative * DET
1 kolikšen Pronoun Type=interrogative * DET
1 malokaj Pronoun Type=indefinite * PRON
1 malokak Pronoun Type=indefinite * DET
1 malokakšen Pronoun Type=indefinite * DET
1 malokateri Pronoun Type=indefinite * DET
1 malokdo Pronoun Type=indefinite * PRON
1 marsikaj Pronoun Type=indefinite * PRON
1 marsikak Pronoun Type=indefinite * DET
1 marsikakšen Pronoun Type=indefinite * DET
1 marsikateri Pronoun Type=indefinite * DET
1 marsikdo Pronoun Type=indefinite * PRON
1 marsičigav Pronoun Type=indefinite * DET
1 medme Pronoun Type=personal * PRON
1 medse Pronoun Type=reflexive * PRON
1 mnog Pronoun Type=indefinite * DET
1 mnogokaj Pronoun Type=indefinite * PRON
1 mnogokateri Pronoun Type=indefinite * DET
1 mnogokdo Pronoun Type=indefinite * PRON
1 moj Pronoun Type=possessive * DET
1 nadme Pronoun Type=personal * PRON
1 nadse Pronoun Type=reflexive * PRON
1 najin Pronoun Type=possessive * DET
1 name Pronoun Type=personal * PRON
1 nase Pronoun Type=reflexive * PRON
1 naš Pronoun Type=possessive * DET
1 negdo Pronoun Type=indefinite * PRON
1 nek Pronoun Type=indefinite * DET
1 nekaj Pronoun Type=indefinite * PRON
1 nekak Pronoun Type=indefinite * DET
1 nekakov Pronoun Type=indefinite * DET
1 nekakšen Pronoun Type=indefinite * DET
1 nekateri Pronoun Type=indefinite * DET
1 nekdo Pronoun Type=indefinite * PRON
1 neki Pronoun Type=indefinite * DET
1 nekolik Pronoun Type=indefinite * DET
1 nekolikšen Pronoun Type=indefinite * DET
1 nekolikšnji Pronoun Type=indefinite * DET
1 nekov Pronoun Type=indefinite * DET
1 nekšen Pronoun Type=indefinite * DET
1 nevemkakšen Pronoun Type=indefinite * DET
1 nihče Pronoun Type=negative * PRON
1 nikak Pronoun Type=negative * DET
1 nikakršen Pronoun Type=negative * DET
1 nikakšen Pronoun Type=negative * DET
1 nikdo Pronoun Type=negative * PRON
1 nikogaršen Pronoun Type=negative * DET
1 nikogaršnji Pronoun Type=negative * DET
1 nič Pronoun Type=negative * PRON
1 njegov Pronoun Type=possessive * DET
1 njen Pronoun Type=possessive * DET
1 njihen Pronoun Type=possessive * DET
1 njihnji Pronoun Type=possessive * DET
1 njihov Pronoun Type=possessive * DET
1 njun Pronoun Type=possessive * DET
1 nobeden Pronoun Type=negative * PRON
1 noben Pronoun Type=negative * DET
1 oba Pronoun Type=general * DET
1 obadva Pronoun Type=general * PRON
1 obme Pronoun Type=personal * PRON
1 oboj Pronoun Type=general * DET
1 obojen Pronoun Type=general * DET
1 obse Pronoun Type=reflexive * PRON
1 on Pronoun Type=personal * PRON
1 oni Pronoun Type=demonstrative * DET
1 onile Pronoun Type=demonstrative * PRON
1 podme Pronoun Type=personal * PRON
1 podse Pronoun Type=reflexive * PRON
1 pome Pronoun Type=personal * PRON
1 predme Pronoun Type=personal * PRON
1 predse Pronoun Type=reflexive * PRON
1 premarsikateri Pronoun Type=indefinite * DET
1 premnog Pronoun Type=indefinite * DET
1 prenekaj Pronoun Type=indefinite * PRON
1 prenekateri Pronoun Type=indefinite * DET
1 prenekdo Pronoun Type=indefinite * PRON
1 redkokateri Pronoun Type=indefinite * DET
1 redkokdo Pronoun Type=indefinite * PRON
1 se Pronoun Type=reflexive * PRON
1 skozme Pronoun Type=personal * PRON
1 skozse Pronoun Type=reflexive * PRON
1 svoj Pronoun Type=reflexive * DET
1 ta Pronoun Type=demonstrative * DET
1 tadva Pronoun Type=demonstrative * PRON
1 taisti Pronoun Type=demonstrative * DET
1 tak Pronoun Type=demonstrative * DET
1 takisti Pronoun Type=demonstrative * DET
1 takle Pronoun Type=demonstrative * DET
1 takov Pronoun Type=demonstrative * DET
1 takošen Pronoun Type=demonstrative * DET
1 takšen Pronoun Type=demonstrative * DET
1 takšenle Pronoun Type=demonstrative * DET
1 tale Pronoun Type=demonstrative * DET
1 talele Pronoun Type=demonstrative * DET
1 teu Pronoun Type=personal * PRON
1 ti Pronoun Type=personal * PRON
1 tisti Pronoun Type=demonstrative * DET
1 tistile Pronoun Type=demonstrative * DET
1 tolik Pronoun Type=demonstrative * DET
1 toliker Pronoun Type=demonstrative * DET
1 tolikšen Pronoun Type=demonstrative * DET
1 tolikšnji Pronoun Type=demonstrative * DET
1 toti Pronoun Type=demonstrative * DET
1 tvoj Pronoun Type=possessive * DET
1 un Pronoun Type=demonstrative * DET
1 vajin Pronoun Type=possessive * DET
1 vame Pronoun Type=personal * PRON
1 vase Pronoun Type=reflexive * PRON
1 vaš Pronoun Type=possessive * DET
1 ves Pronoun Type=general * DET
1 vsak Pronoun Type=general * DET
1 vsakateri Pronoun Type=general * DET
1 vsakdo Pronoun Type=general * PRON
1 vsakogaršen Pronoun Type=general * DET
1 vsakogaršnji Pronoun Type=general * DET
1 vsakršen Pronoun Type=general * DET
1 vsakteri Pronoun Type=general * DET
1 zame Pronoun Type=personal * PRON
1 zase Pronoun Type=reflexive * PRON
3 * Numeral Form=digit * NUM
3 * Numeral Form=roman * NUM
3 * Numeral Form=letter|Type=special * NUM
3 * Numeral Form=letter|Type=cardinal * NUM
2 * Numeral Form=letter|Type=ordinal * ADJ
1 drug Numeral Form=letter|Type=pronominal * ADJ
1 en Numeral Form=letter|Type=pronominal * NUM
1 *en Numeral Form=letter|Type=special * ADJ #enojen, dvojen
1 eden Numeral Form=letter|Type=pronominal * NUM #Dodal E.T.
3 * Adposition * * ADP #MULTEXT-East name
3 * Preposition * * ADP #JOS name
3 * Conjunction Type=coordinating * CCONJ
3 * Conjunction Type=subordinating * SCONJ
3 * Particle * * PART
3 * Interjection * * INTJ
3 * Abbreviation * * X
3 * Residual * * X
2 * Residual Type=web * SYM
2 * Residual Type=emo * SYM
2 * Residual Type=hashtag * SYM #Better mapping?
2 * Residual Type=at * SYM #Better mapping?
2 * Residual Type=foreign * X #Better mapping?
3 * Punctuation * * PUNCT
1 # Punctuation * * SYM
1 % Punctuation * * SYM
1 & Punctuation * * SYM
1 < Punctuation * * SYM
1 > Punctuation * * SYM
1 + Punctuation * * SYM
1 = Punctuation * * SYM
1 ° Punctuation * * SYM
1 × Punctuation * * SYM
1 ÷ Punctuation * * SYM
1 $ Punctuation * * SYM
1 @ Punctuation * * SYM
1 µ Punctuation * * SYM
1 © Punctuation * * SYM
1 § Punctuation * * SYM
1 € Punctuation * * SYM
1 £ Punctuation * * SYM
Binary file not shown.
@@ -1,6 +1,6 @@
import unittest
from conversion_utils.jos_msds_and_properties import Converter, Properties, MsdException
from conversion_utils.jos_msds_and_properties import Converter, Properties, MsdException, Msd
class JosPropertiesToMsdTestCase(unittest.TestCase):
@@ -52,6 +52,25 @@ class JosPropertiesToMsdTestCase(unittest.TestCase):
self.assertEqual(msd.language, 'en')
self.assertEqual(msd.code, 'Nc-d')
def test_msd_to_jos(self):
    """Full Slovene Msds convert to the expected UD part of speech and feature strings."""
    expectations = [
        ('Ppnzei', 'slovenski', 'ADJ', 'Case=Nom|Degree=Pos|Gender=Fem|Number=Sing'),
        ('Sommr', 'dečko', 'NOUN', 'Case=Gen|Gender=Masc|Number=Plur'),
    ]
    for code, lemma, pos, features in expectations:
        ud = self.converter.msd_to_ud(Msd(code, 'sl'), lemma)
        self.assertEqual(ud.pos, pos)
        self.assertEqual(ud.to_full_string(), 'UposTag=' + pos + '|' + features)
        self.assertEqual(ud.to_features_string(), features)
def test_msd_to_jos_partial_msd(self):
    """Converting a partial Msd to UD is rejected with MsdException."""
    with self.assertRaises(MsdException):
        self.converter.msd_to_ud(Msd('Soz', 'sl'), 'vlada')
def test_bad_msd_with_require_valid(self):
try:
self.converter.properties_to_msd(Properties('noun', {'type':'common'}, {'number':'dual'}, 'en'), 'en', require_valid_flag=True)
+3 -4
View File
@@ -4,7 +4,6 @@ This script was developed in the context of a specific task and may not generali
"""
import argparse
import codecs
import lxml.etree as lxml
from importlib_resources import files
@@ -13,7 +12,7 @@ from conversion_utils.jos_msds_and_properties import Converter, Msd
def get_syn_map():
    """Read the packaged ``dict.xml`` and map each ``syn`` element's 'en' attribute to its 'sl' attribute.

    Returns:
        dict: value of the 'en' attribute -> value of the 'sl' attribute
    """
    dict_file_name = files('conversion_utils.resources').joinpath('dict.xml')
    # The context manager closes the file even if parsing fails.
    with open(dict_file_name, 'r', encoding='utf-8') as dict_file:
        root = lxml.parse(dict_file).getroot()
    return {syn.get('en'): syn.get('sl') for syn in root.xpath('syns/syn')}
@@ -23,8 +22,8 @@ def translate(input_file_name, scope, output_file_name):
syn_map = get_syn_map()
output_file = codecs.open(output_file_name, 'w')
input_file = codecs.open(input_file_name, 'r')
output_file = open(output_file_name, 'w', encoding='utf-8')
input_file = open(input_file_name, 'r', encoding='utf-8')
converter = Converter()
+1 -1
View File
@@ -6,7 +6,7 @@ with open(os.path.join(here, 'README.md'), encoding='utf-8') as f:
long_description = f.read()
setup(name='cjvt_conversion_utils',
version='0.3',
version='0.4',
description='CJVT conversion utilities',
long_description=long_description,
long_description_content_type="text/markdown",