7 Commits

10 changed files with 2514 additions and 44 deletions
+3
View File
@@ -1,3 +1,6 @@
include conversion_utils/resources/jos_specifications.pickle include conversion_utils/resources/jos_specifications.pickle
include conversion_utils/resources/dict.xml include conversion_utils/resources/dict.xml
include conversion_utils/resources/structure_conversions.csv include conversion_utils/resources/structure_conversions.csv
include conversion_utils/resources/jos-msd2features.tbl
include conversion_utils/resources/jos2ud-features.tbl
include conversion_utils/resources/jos2ud-pos.tbl
+3 -3
View File
@@ -312,7 +312,7 @@ def construct_tei_etrees(conllu_lines):
def convert_file(input_file_name, output_file_name): def convert_file(input_file_name, output_file_name):
input_file = open(input_file_name, 'r') input_file = open(input_file_name, 'r', encoding='utf-8')
root = construct_tei_etrees(input_file)[0] root = construct_tei_etrees(input_file)[0]
tree = etree.ElementTree(root) tree = etree.ElementTree(root)
tree.write(output_file_name, encoding='UTF-8', pretty_print=True) tree.write(output_file_name, encoding='UTF-8', pretty_print=True)
@@ -332,7 +332,7 @@ if __name__ == '__main__':
args = parser.parse_args() args = parser.parse_args()
if args.out: if args.out:
f_out = open(args.out, 'w') f_out = open(args.out, 'w', encoding='utf-8')
else: else:
f_out = sys.stdout f_out = sys.stdout
@@ -341,7 +341,7 @@ if __name__ == '__main__':
for arg in args.files: for arg in args.files:
filelist = glob(arg) filelist = glob(arg)
for f in filelist: for f in filelist:
with open(f, 'r') as conllu_f: with open(f, 'r', encoding='utf-8') as conllu_f:
tei_etrees = construct_tei_etrees(conllu_f) tei_etrees = construct_tei_etrees(conllu_f)
for tei_etree in tei_etrees: for tei_etree in tei_etrees:
f_out.write(etree.tostring(tei_etree, pretty_print=True, encoding='utf-8').decode()) f_out.write(etree.tostring(tei_etree, pretty_print=True, encoding='utf-8').decode())
+174 -35
View File
@@ -1,12 +1,21 @@
import lxml.etree as lxml
import re import re
import pickle import pickle
import lxml.etree as lxml
from collections import defaultdict
from importlib_resources import files from importlib_resources import files
from enum import IntEnum
from conversion_utils.utils import xpath_find, get_xml_id from conversion_utils.utils import xpath_find, get_xml_id
JOS_SPECIFICATIONS_PICKLE_RESOURCE = 'jos_specifications.pickle' JOS_SPECIFICATIONS_PICKLE_RESOURCE = 'jos_specifications.pickle'
RESOURCES_DIR = "conversion_utils.resources"
MSD_TO_FEATURES = "jos-msd2features.tbl"
JOS_TO_UD_FEATURES_RULES = "jos2ud-features.tbl"
JOS_TO_UPOS_RULES = "jos2ud-pos.tbl"
## Positions of lexeme-level features for each category ## Positions of lexeme-level features for each category
LEXEME_FEATURE_MAP = {'noun':{1,2}, LEXEME_FEATURE_MAP = {'noun':{1,2},
'verb':{1,2}, 'verb':{1,2},
@@ -53,6 +62,14 @@ LEVEL_EXCEPTIONS = {('pronoun', 2, 'čezme'), ('zaimek', 2, 'čezme'),
('pronoun', 8, 'se'), ('zaimek', 8, 'se'), ('pronoun', 8, 'se'), ('zaimek', 8, 'se'),
('pronoun', 8, 'ti'), ('zaimek', 8, 'ti')} ('pronoun', 8, 'ti'), ('zaimek', 8, 'ti')}
class MsdState(IntEnum):
    """Validation state of an Msd code.

    Module-level counterpart of the nested ``Msd.State`` enum; it declares
    the same member names and values. Being an IntEnum, members compare by
    value, so FULL ranks above PARTIAL, which ranks above UNKNOWN.
    """

    UNKNOWN = -1
    PARTIAL = 1
    FULL = 2
class MsdException(Exception):
    """Raised for Msd errors, e.g. an Msd that is not in the required state."""
class Specifications: class Specifications:
"""JOS specifications with list of all word categories.""" """JOS specifications with list of all word categories."""
@@ -214,42 +231,84 @@ class Properties:
and self.lexeme_feature_map == obj.lexeme_feature_map\ and self.lexeme_feature_map == obj.lexeme_feature_map\
and self.form_feature_map == obj.form_feature_map\ and self.form_feature_map == obj.form_feature_map\
and self.language == obj.language and self.language == obj.language
class UD:
    """Universal Dependencies tag: a part-of-speech value plus a feature map.

    The features can be rendered in the pipe-separated ``Name=Value``
    notation, ordered case-insensitively by feature name.
    """

    def __init__(self, pos, features_map):
        self.pos = pos
        self.features_map = features_map

    def to_features_string(self):
        """Return only the features, e.g. ``Case=Nom|Number=Sing``."""
        return self._features_string()

    def to_full_string(self):
        """Return ``UposTag=<pos>``, followed by ``|`` and the features if any."""
        rendered = self._features_string()
        tag = "UposTag=" + self.pos
        if not rendered:
            return tag
        return tag + "|" + rendered

    def _features_string(self):
        # Join the sorted (name, value) pairs as Name=Value separated by '|'.
        ordered_pairs = self._sort_features(self.features_map)
        return "|".join("{}={}".format(name, value) for name, value in ordered_pairs)

    def _sort_features(self, features_map):
        # Case-insensitive ordering by feature name.
        pairs = list(features_map.items())
        pairs.sort(key=lambda pair: pair[0].lower())
        return pairs

    def __str__(self):
        return "pos={}, features_map={}".format(self.pos, self.features_map)
class Msd: class Msd:
"""JOS msd.""" """JOS msd."""
def __init__(self, code, language): class State(IntEnum):
UNKNOWN = -1
PARTIAL = 1
FULL = 2
def __init__(self, code, language, expected_state=State.FULL, require_valid=False):
self.code = code self.code = code
self.language = language self.language = language
self.expected_state = expected_state
self.require_valid = require_valid
self.state = self._validate_and_get_state()
def _validate_and_get_state(self):
    """Determine the state of this Msd and check it against the expected state.

    The code is looked up in the full and the partial code maps of the
    default specifications for ``self.language``. If the expected state is
    not among the states found, an exception is raised when
    ``self.require_valid`` is set; otherwise a warning is printed.

    Returns:
        Msd.State: the highest state found (FULL over PARTIAL), or UNKNOWN
        if the code is in neither map

    Raises:
        MsdException: if the expected state was not found and
        ``self.require_valid`` is true
    """
    found_states = set()
    if self.code in DEFAULT_SPECIFICATIONS.codes_map[self.language]:
        found_states.add(self.State.FULL)
    if self.code in DEFAULT_SPECIFICATIONS.partial_codes_map[self.language]:
        found_states.add(self.State.PARTIAL)
    if not found_states:
        found_states.add(self.State.UNKNOWN)

    # __init__ assigns self.state from this method's return value, so the
    # attribute does not exist yet; work with a local instead.
    state = max(found_states)

    if self.expected_state not in found_states:
        if self.require_valid:
            raise MsdException(f"Given msd '{self.code}' is '{state.name}', but expected state is '{self.expected_state.name}'.")
        if state == self.State.UNKNOWN:
            print(f"[WARN] The Msd '{self.code}' is unknown.")
        else:
            print(f"[WARN] The Msd '{self.code}' is unknown for expected state '{self.expected_state.name}'.")
    return state
def __str__(self): def __str__(self):
return 'code={code}, language={language}'.format(code=self.code, language=self.language) return 'code={code}, language={language}'.format(code=self.code, language=self.language)
def __eq__(self, obj): def __eq__(self, obj):
return isinstance(obj, Msd) and self.code == obj.code and self.language == obj.language return isinstance(obj, Msd) and self.code == obj.code and self.language == obj.language
class CustomException(Exception):
pass
class MsdException(CustomException):
pass
class Converter: class Converter:
"""Converter between Msd and Properties objects.""" """Converter between Msd and Properties objects."""
def __init__(self, xml_file_name=None): def __init__(self, xml_file_name=None):
if (xml_file_name is None): if (xml_file_name is None):
resource = files('conversion_utils.resources').joinpath(JOS_SPECIFICATIONS_PICKLE_RESOURCE) self.specifications = DEFAULT_SPECIFICATIONS
if (resource.is_file()):
try:
with resource.open('rb') as pickle_file:
self.specifications = pickle.load(pickle_file)
except:
exit('Could not parse specifications pickle file installed.')
else:
exit('No pickle installed or xml provided.')
else: else:
parser = SpecificationsParser() parser = SpecificationsParser()
try: try:
@@ -257,20 +316,45 @@ class Converter:
except: except:
exit('Could not parse specifications xml file provided.') exit('Could not parse specifications xml file provided.')
def is_valid_msd(self, msd): self.mte_to_ud_features = self._parse_msd_ud_conversion(MSD_TO_FEATURES)
"""Verify if the Msd code is in the standard JOS set.""" self.mte_to_ud_features_rules = self._parse_ud_rules(JOS_TO_UD_FEATURES_RULES)
return msd.code in self.specifications.codes_map[msd.language] self.mte_to_upos_rules = self._parse_ud_rules(JOS_TO_UPOS_RULES)
def check_valid_msd(self, msd, require_valid_flag): def _get_partial_msd(self, msd):
"""If the Msd code is not valid, raise an exception or give a warning.""" properties = self.msd_to_properties(msd, msd.language)
if (not self.is_valid_msd(msd)): category_char = msd.code[0].lower()
message = 'The msd {} is unknown'.format(msd.code) category = self.specifications.find_category_by_code(category_char, msd.language)
if (require_valid_flag): category_name = category.names.get(msd.language)
raise MsdException(message) properties = Properties(
else: category=category_name,
print('[WARN] ' + message) lexeme_feature_map=properties.lexeme_feature_map,
form_feature_map={},
language=msd.language
)
return self.properties_to_msd(properties, msd.language, expected_state=Msd.State.PARTIAL).code
def msd_to_properties(self, msd, language, lemma=None, require_valid_flag=False, warn_level_flag=False): def _parse_msd_ud_conversion(self, file_name):
"""Parse file with direct conversions from English Msd to Universal Dependencies."""
conversion_map = defaultdict()
with files(RESOURCES_DIR).joinpath(file_name).open("r", encoding="UTF-8") as conversion_file:
for line in conversion_file.readlines():
mte_msd_en, mte_features_en = line.strip("\n").split("\t")
mte_sl = self.translate_msd(Msd(mte_msd_en, "en"), "sl").code
conversion_map[mte_msd_en] = mte_features_en
conversion_map[mte_sl] = mte_features_en
return conversion_map
def _parse_ud_rules(self, file_name):
    """Parse a table of additional rules applied when converting an Msd to Universal Dependencies.

    Only lines whose first character is a digit are read. Each such line is
    split on tabs: the first field is the priority and the remaining fields
    form the rule, padded with empty strings to six fields.

    Parameters:
        file_name(str): name of the rule table inside the resources package

    Returns:
        defaultdict(list): priority (as read, a string) -> rules in file order
    """
    rules_by_priority = defaultdict(list)
    with files(RESOURCES_DIR).joinpath(file_name).open("r", encoding="UTF-8") as rules_file:
        for line in rules_file.readlines():
            if not line[0].isdigit():
                continue
            fields = line.strip("\n").split("\t")
            priority, rule = fields[0], fields[1:]
            # Trailing columns (e.g. the comment) may be missing; pad them.
            rule.extend([""] * (6 - len(rule)))
            rules_by_priority[priority].append(rule)
    return rules_by_priority
def msd_to_properties(self, msd, language, lemma=None, warn_level_flag=False):
"""Convert Msd to Properties. """Convert Msd to Properties.
The language of the generated Properties is specified and can differ from the Msd language. The language of the generated Properties is specified and can differ from the Msd language.
@@ -296,7 +380,6 @@ class Converter:
Properties: the result of the conversion of the Msd in the language requested Properties: the result of the conversion of the Msd in the language requested
""" """
self.check_valid_msd(msd, require_valid_flag)
category_char = msd.code[0].lower() category_char = msd.code[0].lower()
value_chars = msd.code[1:] value_chars = msd.code[1:]
category = self.specifications.find_category_by_code(category_char, msd.language) category = self.specifications.find_category_by_code(category_char, msd.language)
@@ -322,7 +405,7 @@ class Converter:
form_feature_map[feature_name] = feature_value form_feature_map[feature_name] = feature_value
return Properties(category_name, lexeme_feature_map, form_feature_map, language) return Properties(category_name, lexeme_feature_map, form_feature_map, language)
def properties_to_msd(self, properties, language, require_valid_flag=False): def properties_to_msd(self, properties, language, expected_state=Msd.State.FULL):
"""Convert Properties to Msd. """Convert Properties to Msd.
The language of the generated Msd is specified and can differ from the Properties language. The language of the generated Msd is specified and can differ from the Properties language.
@@ -354,12 +437,68 @@ class Converter:
msd_code += '-' msd_code += '-'
i += 1 i += 1
msd_code += position_map[position] msd_code += position_map[position]
msd = Msd(msd_code, language) msd = Msd(msd_code, language, expected_state=expected_state)
self.check_valid_msd(msd, require_valid_flag)
return msd return msd
def msd_to_ud(self, msd, lemma):
    """Convert Msd to Universal Dependencies object.

    Partial Msds are currently not supported.

    Parameters:
        msd(Msd): the Msd to convert
        lemma(str): the lemma of the word form with the MSD

    Returns:
        UD: the part of speech and feature map produced by the rule tables

    Raises:
        MsdException: if the Msd is not in the FULL state
    """
    if msd.state != Msd.State.FULL:
        raise MsdException("Msd must be full to be converted to UD.")

    # First whitespace-separated token is the category, the rest are features.
    category, *features = self.mte_to_ud_features[msd.code].split()

    # Part of speech: priorities are visited in descending order, so a
    # matching rule with a lower priority number overrides earlier matches.
    upos = ""
    for priority in sorted(self.mte_to_upos_rules, reverse=True):
        for pos_rule in self.mte_to_upos_rules[priority]:
            rule_lemma, rule_category, rule_features, _, rule_upos, _ = pos_rule
            if rule_category != category:
                continue
            if rule_lemma == "*en":
                # Suffix rule: applies to lemmas ending in "en".
                if not lemma.endswith("en"):
                    continue
            elif rule_lemma != "*" and lemma != rule_lemma:
                continue
            if rule_features != "*" and not all(f in features for f in rule_features.split("|")):
                continue
            upos = rule_upos

    # Features: priorities are visited in ascending order; once a feature has
    # been replaced it no longer matches later rules.
    for priority in sorted(self.mte_to_ud_features_rules):
        for feature_rule in self.mte_to_ud_features_rules[priority]:
            rule_lemma, rule_category, rule_features, rule_upos, rule_ud_features, _ = feature_rule
            lemma_matches = rule_lemma == "*" or lemma == rule_lemma
            category_matches = rule_category == "*" or rule_category == category
            upos_matches = rule_upos == "*" or rule_upos == upos
            if not (lemma_matches and category_matches and upos_matches):
                continue
            features = [rule_ud_features if f == rule_features else f for f in features]
            if rule_features == "*" and rule_ud_features != "-":
                features.append(rule_ud_features)

    # A replacement may bundle several features with '|'; flatten them and
    # drop empty entries and the "-" placeholder.
    flattened = [f for f in "|".join(features).split("|") if f not in {"", "-"}]
    ud_features = dict(f.split("=", 1) for f in flattened)
    return UD(upos, ud_features)
def translate_msd(self, msd, language): def translate_msd(self, msd, language):
return self.properties_to_msd(self.msd_to_properties(msd, language), language) return self.properties_to_msd(self.msd_to_properties(msd, language), language)
def translate_properties(self, properties, language): def translate_properties(self, properties, language):
return self.msd_to_properties(self.properties_to_msd(properties, language), language) return self.msd_to_properties(self.properties_to_msd(properties, language), language)
def _load_default_specifications():
    """Load the pickled specifications from the package resources.

    The result is stored in the module-level ``DEFAULT_SPECIFICATIONS``.

    Raises:
        SystemExit: if the pickle resource is missing or cannot be loaded
    """
    global DEFAULT_SPECIFICATIONS
    resource = files('conversion_utils.resources').joinpath(JOS_SPECIFICATIONS_PICKLE_RESOURCE)
    if not resource.is_file():
        # Raise SystemExit directly: the exit() builtin is injected by the
        # site module for interactive use and is not always available.
        raise SystemExit("Default specifications not found.")
    try:
        # NOTE: unpickling can execute arbitrary code; only the resource
        # located inside the package's own resources directory is read here.
        with resource.open('rb') as pickle_file:
            DEFAULT_SPECIFICATIONS = pickle.load(pickle_file)
    except Exception as e:
        raise SystemExit('Could not parse specifications pickle file installed.') from e
_load_default_specifications()
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,128 @@
# Mapping from JOS features to UD features
# Kaja Dobrovoljc, Tomaž Erjavec, Simon Krek
# 2018-11-23
#
#Prio Lemma Category Feats PoS-UD ->Feature-UD #Comment
----------------------------------------------------------------------------------------------------
1 * Noun Type=common * -
1 * Noun Type=proper * -
1 * Verb Negative=no * Polarity=Pos
1 * Verb Negative=yes * Polarity=Neg
1 * Verb Type=auxiliary * -
1 * Verb Type=main * -
1 * Verb VForm=present * VerbForm=Fin|Mood=Ind|Tense=Pres
1 * Verb VForm=future * VerbForm=Fin|Mood=Ind|Tense=Fut
1 * Verb VForm=conditional * VerbForm=Fin|Mood=Cnd
1 * Verb VForm=imperative * VerbForm=Fin|Mood=Imp
1 * Verb VForm=infinitive * VerbForm=Inf
1 * Verb VForm=supine * VerbForm=Sup
1 * Verb VForm=participle * VerbForm=Part
1 * Adjective Type=general * -
1 * Adjective Type=possessive * Poss=Yes
1 * Adjective Type=participle * VerbForm=Part
2 * Adverb Type=participle * VerbForm=Conv
2 * Adverb Type=general * -
1 nekaj Adverb Type=general DET PronType=Ind
1 več Adverb Type=general DET PronType=Ind
1 veliko Adverb Type=general DET PronType=Ind
1 manj Adverb Type=general DET PronType=Ind
1 dovolj Adverb Type=general DET PronType=Ind
1 pol Adverb Type=general DET PronType=Ind
1 malo Adverb Type=general DET PronType=Ind
1 toliko Adverb Type=general DET PronType=Dem
1 največ Adverb Type=general DET PronType=Ind
1 mnogo Adverb Type=general DET PronType=Ind
1 preveč Adverb Type=general DET PronType=Ind
1 par Adverb Type=general DET PronType=Ind
1 koliko Adverb Type=general DET PronType=Int
1 dosti Adverb Type=general DET PronType=Ind
1 nešteto Adverb Type=general DET PronType=Ind
1 četrt Adverb Type=general DET PronType=Ind
1 ogromno Adverb Type=general DET PronType=Ind
1 čimveč Adverb Type=general DET PronType=Ind
1 obilo Adverb Type=general DET PronType=Ind
1 premnogo Adverb Type=general DET PronType=Ind
1 enormno Adverb Type=general DET PronType=Ind
1 majčkeno Adverb Type=general DET PronType=Ind
2 * Pronoun Type=reflexive * PronType=Prs|Reflex=Yes
2 * Pronoun Type=personal * PronType=Prs
2 * Pronoun Type=possessive * PronType=Prs|Poss=Yes
2 * Pronoun Type=interrogative * PronType=Int
2 * Pronoun Type=relative * PronType=Rel
2 * Pronoun Type=demonstrative * PronType=Dem
2 * Pronoun Type=general * PronType=Tot
2 * Pronoun Type=negative * PronType=Neg
2 * Pronoun Type=indefinite * PronType=Ind
1 * Pronoun Type=personal DET PronType=Prs
1 * Pronoun Type=possessive DET PronType=Prs|Poss=Yes
1 * Pronoun Owner_Gender=masculine * Gender[psor]=Masc #lg.spec.feature
1 * Pronoun Owner_Gender=feminine * Gender[psor]=Fem #lg.spec.feature
1 * Pronoun Owner_Gender=neuter * Gender[psor]=Neut #lg.spec.feature
1 * Pronoun Owner_Number=singular * Number[psor]=Sing #lg.spec.feature
1 * Pronoun Owner_Number=plural * Number[psor]=Plur #lg.spec.feature
1 * Pronoun Owner_Number=dual * Number[psor]=Dual #lg.spec.feature
1 * Pronoun Clitic=yes * Variant=Short #lg.spec.feature
1 * Pronoun Clitic=bound * Variant=Bound #lg.spec.feature
1 svoj Pronoun Type=reflexive * PronType=Prs|Reflex=Yes|Poss=Yes
2 * Numeral Type=pronominal * -
2 * Numeral Form=letter * -
2 * Numeral Type=cardinal NUM NumType=Card
1 * Numeral Form=letter NUM NumForm=Word #lg.spec.feature
1 * Numeral Form=digit NUM NumForm=Digit #lg.spec.feature
1 * Numeral Form=roman NUM NumForm=Roman #lg.spec.feature
1 * Numeral Type=ordinal * NumType=Ord
1 * Numeral Type=special ADJ NumType=Mult
1 * Numeral Type=special NUM NumType=Sets
1 en Numeral Type=pronominal * NumType=Card
1 eden Numeral Type=pronominal * NumType=Card
1 * Conjunction Type=subordinating * -
1 * Conjunction Type=coordinating * -
2 * Particle * * -
1 ne Particle * * Polarity=Neg
1 * Interjection * * -
1 * Abbreviation * * Abbr=Yes
2 * Residual * * -
1 * Residual Type=foreign * Foreign=Yes
1 * Residual Type=typo * -
1 * Residual Type=program * -
1 * Punctuation * * -
2 * * Degree=positive * Degree=Pos
2 * * Degree=comparative * Degree=Cmp
2 * * Degree=superlative * Degree=Sup
1 * * Degree=positive DET -
1 * * Degree=comparative DET -
1 * * Degree=superlative DET -
1 * * Animate=no * Animacy=Inan
1 * * Animate=yes * Animacy=Anim
1 * * Aspect=perfective * Aspect=Perf
1 * * Aspect=progressive * Aspect=Imp
1 * * Aspect=biaspectual * -
1 * * Case=nominative * Case=Nom
1 * * Case=genitive * Case=Gen
1 * * Case=dative * Case=Dat
1 * * Case=accusative * Case=Acc
1 * * Case=locative * Case=Loc
1 * * Case=instrumental * Case=Ins
1 * * Definiteness=no * Definite=Ind
1 * * Definiteness=yes * Definite=Def
1 * * Gender=masculine * Gender=Masc
1 * * Gender=feminine * Gender=Fem
1 * * Gender=neuter * Gender=Neut
1 * * Number=singular * Number=Sing
1 * * Number=plural * Number=Plur
1 * * Number=dual * Number=Dual
1 * * Person=first * Person=1
1 * * Person=second * Person=2
1 * * Person=third * Person=3
+282
View File
@@ -0,0 +1,282 @@
# Mapping from JOS PoS to UD 2.0 PoS
# Kaja Dobrovoljc, Tomaž Erjavec, Simon Krek
# 2019-02-04
#
#Prio Lemma Category Feats Deps ->PoS-UD #Comment
#-------------------------------------------------------------------------------------------------------
3 * Noun Type=common * NOUN
3 * Noun Type=proper * PROPN
3 * Verb * * VERB
2 * Verb Type=auxiliary * AUX #This one can in fact also be VERB, but this has to be determined by some other means
3 * Adjective * * ADJ
3 * Adverb * * ADV
1 četrt Adverb * * DET
1 čimmanj Adverb * * DET
1 čimveč Adverb * * DET
1 dosti Adverb * * DET
1 dovolj Adverb * * DET
1 enako Adverb * * ADV
1 enormno Adverb * * DET
1 ful Adverb * * ADV
1 koliko Adverb * * DET
1 majčkeno Adverb * * DET
1 maksimalno Adverb * * ADV
1 malce Adverb * * ADV
1 malo Adverb * * DET
1 manj Adverb * * DET
1 minimalno Adverb * * ADV
1 mnogo Adverb * * DET
1 najmanj Adverb * * ADV
1 največ Adverb * * DET
1 nekaj Adverb * * DET
1 nekoliko Adverb * * ADV
1 nemalo Adverb * * ADV
1 nešteto Adverb * * DET
1 nič Adverb * * ADV
1 ničkoliko Adverb * * DET
1 obilo Adverb * * DET
1 ogromno Adverb * * DET
1 par Adverb * * DET
1 pol Adverb * * DET
1 polno Adverb * * ADV
1 precej Adverb * * ADV
1 premalo Adverb * * ADV
1 premnogo Adverb * * DET
1 preveč Adverb * * DET
1 toliko Adverb * * DET
1 veliko Adverb * * DET
1 več Adverb * * DET
1 večidel Adverb * * ADV
1 vse Adverb * * ADV
1 zadosti Adverb * * ADV
##All Pronouns should be explicitly defined
##But are not because of jos1M wrong lemmatisations for e.g. "ti", "te" etc.
3 * Pronoun * * PRON
##2 * Pronoun Type=demonstrative * DET
##2 * Pronoun Type=possessive * DET
1 bogsigavedikakšen Pronoun Type=indefinite * DET
1 bogvedikaj Pronoun Type=indefinite * PRON
1 bogvedikateri Pronoun Type=indefinite * DET
1 bogvekaj Pronoun Type=indefinite * PRON
1 bogvekakšen Pronoun Type=indefinite * DET
1 bogvekateri Pronoun Type=indefinite * DET
1 bogvekolik Pronoun Type=indefinite * DET
1 bogvekolikšen Pronoun Type=indefinite * DET
1 čezme Pronoun Type=personal * PRON
1 čezse Pronoun Type=reflexive * PRON
1 čigar Pronoun Type=relative * DET
1 čigarkoli Pronoun Type=relative * DET
1 čigarsižebodi Pronoun Type=relative * DET
1 čigav Pronoun Type=interrogative * DET
1 čigaver Pronoun Type=relative * DET
1 čigaverkoli Pronoun Type=relative * DET
1 čigavršen Pronoun Type=relative * DET
1 čigavršnji Pronoun Type=relative * DET
1 enak Pronoun Type=indefinite * DET
1 enaki Pronoun Type=indefinite * DET
1 enakšen Pronoun Type=indefinite * DET
1 isti Pronoun Type=indefinite * DET
1 jaz Pronoun Type=personal * PRON
1 jest Pronoun Type=personal * PRON
1 kaj Pronoun Type=interrogative * PRON
1 kak Pronoun Type=interrogative * DET
1 kakov Pronoun Type=interrogative * DET
1 kakošen Pronoun Type=interrogative * DET
1 kakršen Pronoun Type=relative * DET
1 kakršenkoli Pronoun Type=relative * DET
1 kakršensižebodi Pronoun Type=relative * DET
1 kakšen Pronoun Type=interrogative * DET
1 kar Pronoun Type=relative * PRON
1 karkoli Pronoun Type=relative * PRON
1 karsibodi Pronoun Type=relative * PRON
1 karsižebodi Pronoun Type=relative * PRON
1 kateri Pronoun Type=interrogative * DET
1 katerikoli Pronoun Type=relative * DET
1 katerisibodi Pronoun Type=relative * DET
1 kdo Pronoun Type=interrogative * PRON
1 kdor Pronoun Type=relative * PRON
1 kdorkoli Pronoun Type=relative * PRON
1 kdorsibodi Pronoun Type=relative * PRON
1 kdorsižebodi Pronoun Type=relative * PRON
1 kdovekaj Pronoun Type=indefinite * PRON
1 kdovekak Pronoun Type=indefinite * DET
1 kdovekakšen Pronoun Type=indefinite * DET
1 kdovekateri Pronoun Type=indefinite * DET
1 kdovekdo Pronoun Type=indefinite * PRON
1 kdovekolik Pronoun Type=indefinite * DET
1 koji Pronoun Type=interrogative * DET
1 kolik Pronoun Type=interrogative * DET
1 kolik Pronoun Type=indefinite * DET
1 koliker Pronoun Type=interrogative * DET
1 kolikršen Pronoun Type=relative * DET
1 kolikšen Pronoun Type=interrogative * DET
1 malokaj Pronoun Type=indefinite * PRON
1 malokak Pronoun Type=indefinite * DET
1 malokakšen Pronoun Type=indefinite * DET
1 malokateri Pronoun Type=indefinite * DET
1 malokdo Pronoun Type=indefinite * PRON
1 marsikaj Pronoun Type=indefinite * PRON
1 marsikak Pronoun Type=indefinite * DET
1 marsikakšen Pronoun Type=indefinite * DET
1 marsikateri Pronoun Type=indefinite * DET
1 marsikdo Pronoun Type=indefinite * PRON
1 marsičigav Pronoun Type=indefinite * DET
1 medme Pronoun Type=personal * PRON
1 medse Pronoun Type=reflexive * PRON
1 mnog Pronoun Type=indefinite * DET
1 mnogokaj Pronoun Type=indefinite * PRON
1 mnogokateri Pronoun Type=indefinite * DET
1 mnogokdo Pronoun Type=indefinite * PRON
1 moj Pronoun Type=possessive * DET
1 nadme Pronoun Type=personal * PRON
1 nadse Pronoun Type=reflexive * PRON
1 najin Pronoun Type=possessive * DET
1 name Pronoun Type=personal * PRON
1 nase Pronoun Type=reflexive * PRON
1 naš Pronoun Type=possessive * DET
1 negdo Pronoun Type=indefinite * PRON
1 nek Pronoun Type=indefinite * DET
1 nekaj Pronoun Type=indefinite * PRON
1 nekak Pronoun Type=indefinite * DET
1 nekakov Pronoun Type=indefinite * DET
1 nekakšen Pronoun Type=indefinite * DET
1 nekateri Pronoun Type=indefinite * DET
1 nekdo Pronoun Type=indefinite * PRON
1 neki Pronoun Type=indefinite * DET
1 nekolik Pronoun Type=indefinite * DET
1 nekolikšen Pronoun Type=indefinite * DET
1 nekolikšnji Pronoun Type=indefinite * DET
1 nekov Pronoun Type=indefinite * DET
1 nekšen Pronoun Type=indefinite * DET
1 nevemkakšen Pronoun Type=indefinite * DET
1 nihče Pronoun Type=negative * PRON
1 nikak Pronoun Type=negative * DET
1 nikakršen Pronoun Type=negative * DET
1 nikakšen Pronoun Type=negative * DET
1 nikdo Pronoun Type=negative * PRON
1 nikogaršen Pronoun Type=negative * DET
1 nikogaršnji Pronoun Type=negative * DET
1 nič Pronoun Type=negative * PRON
1 njegov Pronoun Type=possessive * DET
1 njen Pronoun Type=possessive * DET
1 njihen Pronoun Type=possessive * DET
1 njihnji Pronoun Type=possessive * DET
1 njihov Pronoun Type=possessive * DET
1 njun Pronoun Type=possessive * DET
1 nobeden Pronoun Type=negative * PRON
1 noben Pronoun Type=negative * DET
1 oba Pronoun Type=general * DET
1 obadva Pronoun Type=general * PRON
1 obme Pronoun Type=personal * PRON
1 oboj Pronoun Type=general * DET
1 obojen Pronoun Type=general * DET
1 obse Pronoun Type=reflexive * PRON
1 on Pronoun Type=personal * PRON
1 oni Pronoun Type=demonstrative * DET
1 onile Pronoun Type=demonstrative * PRON
1 podme Pronoun Type=personal * PRON
1 podse Pronoun Type=reflexive * PRON
1 pome Pronoun Type=personal * PRON
1 predme Pronoun Type=personal * PRON
1 predse Pronoun Type=reflexive * PRON
1 premarsikateri Pronoun Type=indefinite * DET
1 premnog Pronoun Type=indefinite * DET
1 prenekaj Pronoun Type=indefinite * PRON
1 prenekateri Pronoun Type=indefinite * DET
1 prenekdo Pronoun Type=indefinite * PRON
1 redkokateri Pronoun Type=indefinite * DET
1 redkokdo Pronoun Type=indefinite * PRON
1 se Pronoun Type=reflexive * PRON
1 skozme Pronoun Type=personal * PRON
1 skozse Pronoun Type=reflexive * PRON
1 svoj Pronoun Type=reflexive * DET
1 ta Pronoun Type=demonstrative * DET
1 tadva Pronoun Type=demonstrative * PRON
1 taisti Pronoun Type=demonstrative * DET
1 tak Pronoun Type=demonstrative * DET
1 takisti Pronoun Type=demonstrative * DET
1 takle Pronoun Type=demonstrative * DET
1 takov Pronoun Type=demonstrative * DET
1 takošen Pronoun Type=demonstrative * DET
1 takšen Pronoun Type=demonstrative * DET
1 takšenle Pronoun Type=demonstrative * DET
1 tale Pronoun Type=demonstrative * DET
1 talele Pronoun Type=demonstrative * DET
1 teu Pronoun Type=personal * PRON
1 ti Pronoun Type=personal * PRON
1 tisti Pronoun Type=demonstrative * DET
1 tistile Pronoun Type=demonstrative * DET
1 tolik Pronoun Type=demonstrative * DET
1 toliker Pronoun Type=demonstrative * DET
1 tolikšen Pronoun Type=demonstrative * DET
1 tolikšnji Pronoun Type=demonstrative * DET
1 toti Pronoun Type=demonstrative * DET
1 tvoj Pronoun Type=possessive * DET
1 un Pronoun Type=demonstrative * DET
1 vajin Pronoun Type=possessive * DET
1 vame Pronoun Type=personal * PRON
1 vase Pronoun Type=reflexive * PRON
1 vaš Pronoun Type=possessive * DET
1 ves Pronoun Type=general * DET
1 vsak Pronoun Type=general * DET
1 vsakateri Pronoun Type=general * DET
1 vsakdo Pronoun Type=general * PRON
1 vsakogaršen Pronoun Type=general * DET
1 vsakogaršnji Pronoun Type=general * DET
1 vsakršen Pronoun Type=general * DET
1 vsakteri Pronoun Type=general * DET
1 zame Pronoun Type=personal * PRON
1 zase Pronoun Type=reflexive * PRON
3 * Numeral Form=digit * NUM
3 * Numeral Form=roman * NUM
3 * Numeral Form=letter|Type=special * NUM
3 * Numeral Form=letter|Type=cardinal * NUM
2 * Numeral Form=letter|Type=ordinal * ADJ
1 drug Numeral Form=letter|Type=pronominal * ADJ
1 en Numeral Form=letter|Type=pronominal * NUM
1 *en Numeral Form=letter|Type=special * ADJ #enojen, dvojen
1 eden Numeral Form=letter|Type=pronominal * NUM #Dodal E.T.
3 * Adposition * * ADP #MULTEXT-East name
3 * Preposition * * ADP #JOS name
3 * Conjunction Type=coordinating * CCONJ
3 * Conjunction Type=subordinating * SCONJ
3 * Particle * * PART
3 * Interjection * * INTJ
3 * Abbreviation * * X
3 * Residual * * X
2 * Residual Type=web * SYM
2 * Residual Type=emo * SYM
2 * Residual Type=hashtag * SYM #Better mapping?
2 * Residual Type=at * SYM #Better mapping?
2 * Residual Type=foreign * X #Better mapping?
3 * Punctuation * * PUNCT
1 # Punctuation * * SYM
1 % Punctuation * * SYM
1 & Punctuation * * SYM
1 < Punctuation * * SYM
1 > Punctuation * * SYM
1 + Punctuation * * SYM
1 = Punctuation * * SYM
1 ° Punctuation * * SYM
1 × Punctuation * * SYM
1 ÷ Punctuation * * SYM
1 $ Punctuation * * SYM
1 @ Punctuation * * SYM
1 µ Punctuation * * SYM
1 © Punctuation * * SYM
1 § Punctuation * * SYM
1 € Punctuation * * SYM
1 £ Punctuation * * SYM
Binary file not shown.
@@ -1,6 +1,6 @@
import unittest import unittest
from conversion_utils.jos_msds_and_properties import Converter, Properties, MsdException from conversion_utils.jos_msds_and_properties import Converter, Properties, MsdException, Msd
class JosPropertiesToMsdTestCase(unittest.TestCase): class JosPropertiesToMsdTestCase(unittest.TestCase):
@@ -52,6 +52,25 @@ class JosPropertiesToMsdTestCase(unittest.TestCase):
self.assertEqual(msd.language, 'en') self.assertEqual(msd.language, 'en')
self.assertEqual(msd.code, 'Nc-d') self.assertEqual(msd.code, 'Nc-d')
def test_msd_to_jos(self):
    # Msd 'Ppnzei' with lemma 'slovenski' is expected to convert to ADJ.
    adjective_features = 'Case=Nom|Degree=Pos|Gender=Fem|Number=Sing'
    adjective = self.converter.msd_to_ud(Msd('Ppnzei', 'sl'), 'slovenski')
    self.assertEqual(adjective.pos, 'ADJ')
    self.assertEqual(adjective.to_full_string(), 'UposTag=ADJ|' + adjective_features)
    self.assertEqual(adjective.to_features_string(), adjective_features)

    # Msd 'Sommr' with lemma 'dečko' is expected to convert to NOUN.
    noun_features = 'Case=Gen|Gender=Masc|Number=Plur'
    noun = self.converter.msd_to_ud(Msd('Sommr', 'sl'), 'dečko')
    self.assertEqual(noun.pos, 'NOUN')
    self.assertEqual(noun.to_full_string(), 'UposTag=NOUN|' + noun_features)
    self.assertEqual(noun.to_features_string(), noun_features)
def test_msd_to_jos_partial_msd(self):
    # msd_to_ud raises MsdException for an Msd that is not in the FULL state;
    # 'Soz' is presumably a partial Msd (per the test name).
    with self.assertRaises(MsdException):
        self.converter.msd_to_ud(Msd('Soz', 'sl'), 'vlada')
def test_bad_msd_with_require_valid(self): def test_bad_msd_with_require_valid(self):
try: try:
self.converter.properties_to_msd(Properties('noun', {'type':'common'}, {'number':'dual'}, 'en'), 'en', require_valid_flag=True) self.converter.properties_to_msd(Properties('noun', {'type':'common'}, {'number':'dual'}, 'en'), 'en', require_valid_flag=True)
+3 -4
View File
@@ -4,7 +4,6 @@ This script was developed in the context of a specific task and may not generali
""" """
import argparse import argparse
import codecs
import lxml.etree as lxml import lxml.etree as lxml
from importlib_resources import files from importlib_resources import files
@@ -13,7 +12,7 @@ from conversion_utils.jos_msds_and_properties import Converter, Msd
def get_syn_map(): def get_syn_map():
dict_file_name = files('conversion_utils.resources').joinpath('dict.xml') dict_file_name = files('conversion_utils.resources').joinpath('dict.xml')
dict_file = codecs.open(dict_file_name, 'r') dict_file = open(dict_file_name, 'r', encoding='utf-8')
root = lxml.parse(dict_file).getroot() root = lxml.parse(dict_file).getroot()
dict_file.close() dict_file.close()
return {syn.get('en'):syn.get('sl') for syn in root.xpath('syns/syn')} return {syn.get('en'):syn.get('sl') for syn in root.xpath('syns/syn')}
@@ -23,8 +22,8 @@ def translate(input_file_name, scope, output_file_name):
syn_map = get_syn_map() syn_map = get_syn_map()
output_file = codecs.open(output_file_name, 'w') output_file = open(output_file_name, 'w', encoding='utf-8')
input_file = codecs.open(input_file_name, 'r') input_file = open(input_file_name, 'r', encoding='utf-8')
converter = Converter() converter = Converter()
+1 -1
View File
@@ -6,7 +6,7 @@ with open(os.path.join(here, 'README.md'), encoding='utf-8') as f:
long_description = f.read() long_description = f.read()
setup(name='cjvt_conversion_utils', setup(name='cjvt_conversion_utils',
version='0.3', version='0.4',
description='CJVT conversion utilities', description='CJVT conversion utilities',
long_description=long_description, long_description=long_description,
long_description_content_type="text/markdown", long_description_content_type="text/markdown",