You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
68 lines
2.3 KiB
68 lines
2.3 KiB
import pkg_resources
|
|
import pandas
|
|
import sys
|
|
|
|
# MSD tag mappings from Slovene (slo) to English (ang)
|
|
class Msdmap:
    """Map Slovene MSD tags to their English equivalents.

    Attributes:
        pos_slo_ang: list of part-of-speech rows, each a 4-tuple of
            (Slovene name, Slovene code, English name, English code).
        msd_table: pandas DataFrame loaded from the packaged
            ``msd-human-sl.tbl`` file with the columns ``id``, ``eng_msd``,
            ``eng_long``, ``slo_msd``, ``slo_long`` and ``slo_very_long``.
    """

    def __init__(self):
        """Set up the part-of-speech table and load the MSD table from disk."""
        # http://nl.ijs.si/ME/V4/msd/html/msd.categories-sl.html
        # Columns: Slovene name, Slovene code, English name, English code.
        self.pos_slo_ang = [
            ("samostalnik", "S", "Noun", "N"),
            ("glagol", "G", "Verb", "V"),
            ("pridevnik", "P", "Adjective", "A"),
            ("prislov", "R", "Adverb", "R"),
            ("zaimek", "Z", "Pronoun", "P"),
            ("števnik", "K", "Numeral", "M"),
            ("predlog", "D", "Adposition", "S"),
            ("veznik", "V", "Conjunction", "C"),
            ("členek", "L", "Particle", "Q"),
            ("medmet", "M", "Interjection", "I"),
            ("okrajšava", "O", "Abbreviation", "Y"),
            ("neuvrščeno", "N", "Residual", "X"),
        ]

        table_name = "msd-human-sl.tbl"
        table_path = pkg_resources.resource_filename(__name__, table_name)
        with open(table_path, "rb") as fp:
            # TODO: pandas is awfully slow --- might need optimization
            self.msd_table = pandas.read_csv(
                fp,
                # Must be a keyword: pandas >= 2.0 rejects a positional separator.
                sep="\t",
                names=["id", "eng_msd", "eng_long", "slo_msd", "slo_long", "slo_very_long"],
            )
        # The table keeps its default integer index; lookups filter on the
        # ``slo_msd`` column, so no index needs to be set here.

    def slo_msd_to_eng_long(self, slo_msd: str) -> str:
        """Return the ``eng_long`` value of the first row matching ``slo_msd``.

        Args:
            slo_msd: Slovene MSD tag to look up in the ``slo_msd`` column.

        Returns:
            The ``eng_long`` cell of the first matching row, or the string
            ``"No-matching-msd-found"`` when no row matches.
        """
        # Bind the value with ``@`` instead of formatting it into the
        # expression, so quotes or backslashes in the tag cannot break or
        # alter the query.
        matches = self.msd_table.query("slo_msd == @slo_msd")
        if matches.empty:
            return "No-matching-msd-found"
        return matches["eng_long"].values[0]

    def slo_msd_to_eng_pos(self, slo_msd: str) -> str:
        """Return the English part-of-speech code for a Slovene MSD tag.

        Args:
            slo_msd: Slovene MSD tag; its first character is the Slovene
                part-of-speech code.

        Returns:
            The English part-of-speech code (fourth column of ``pos_slo_ang``).

        Raises:
            ValueError: if the tag is empty or its first character is not a
                known Slovene part-of-speech code.
        """
        # first letter in slo_msd == slo_pos
        # Slicing (not indexing) turns an empty tag into "" so it is reported
        # through the same ValueError as any other unknown part of speech.
        return self.pos_slo_ang_map(1, slo_msd[:1])[3]

    def pos_slo_ang_map(self, col: int, query: str) -> tuple:
        """Return the first ``pos_slo_ang`` row whose column ``col`` equals ``query``.

        Args:
            col: index of the tuple column to compare (0-3).
            query: value to search for in that column.

        Returns:
            The matching 4-tuple from ``pos_slo_ang``.

        Raises:
            ValueError: if no row matches.
        """
        for pos in self.pos_slo_ang:
            if pos[col] == query:
                return pos
        raise ValueError("Wrong part of speech.")
|
|
|
|
|
|
if __name__ == "__main__":
    # Manual smoke test: print each sample tag, its long English description
    # and its English part-of-speech code, followed by a blank line.
    converter = Msdmap()
    for sample in ("Soser", "Ppnzmm", "Gp-d-mz"):
        print(sample)
        print(converter.slo_msd_to_eng_long(sample))
        print(converter.slo_msd_to_eng_pos(sample))
        print()
|