"""MSD (morphosyntactic description) tag mappings from Slovene to English."""

import os
import sys

import pandas

try:
    import pkg_resources
except ImportError:
    # pkg_resources ships with setuptools, which is not guaranteed to be
    # installed; the table is then located next to this module instead.
    pkg_resources = None


class Msdmap:
    """Translate Slovene MSD tags into their English counterparts.

    Attributes set by ``__init__``:
        pos_slo_ang: list of 4-tuples, one per part of speech:
            (Slovene name, Slovene code, English name, English code).
        msd_table: pandas DataFrame read from the MSD table file, with the
            columns id, eng_msd, eng_long, slo_msd, slo_long, slo_very_long.
    """

    # File name of the bundled table and the sentinel returned for unknown tags.
    TABLE_NAME = "msd-human-sl.tbl"
    NO_MATCH = "No-matching-msd-found"

    def __init__(self, table_path=None):
        """Load the part-of-speech list and the MSD table.

        Args:
            table_path: optional path to a tab-separated, header-less table
                with six columns. When omitted, the ``msd-human-sl.tbl``
                file shipped alongside this module is used.
        """
        # http://nl.ijs.si/ME/V4/msd/html/msd.categories-sl.html
        self.pos_slo_ang = [
            ("samostalnik", "S", "Noun", "N"),
            ("glagol", "G", "Verb", "V"),
            ("pridevnik", "P", "Adjective", "A"),
            ("prislov", "R", "Adverb", "R"),
            ("zaimek", "Z", "Pronoun", "P"),
            ("števnik", "K", "Numeral", "M"),
            ("predlog", "D", "Adposition", "S"),
            ("veznik", "V", "Conjunction", "C"),
            ("členek", "L", "Particle", "Q"),
            ("medmet", "M", "Interjection", "I"),
            ("okrajšava", "O", "Abbreviation", "Y"),
            ("neuvrščeno", "N", "Residual", "X"),
        ]

        if table_path is None:
            table_path = self._default_table_path()

        with open(table_path, "rb") as fp:
            # `sep` must be passed by keyword: recent pandas versions reject
            # positional arguments after the file handle.
            self.msd_table = pandas.read_csv(
                fp,
                sep="\t",
                names=[
                    "id",
                    "eng_msd",
                    "eng_long",
                    "slo_msd",
                    "slo_long",
                    "slo_very_long",
                ],
            )

        # Per-call DataFrame queries are slow, and the earlier
        # `set_index('slo_msd')` call discarded its result, so it never helped.
        # Build a plain dict once instead; `setdefault` keeps the first row
        # when a Slovene tag occurs more than once.
        self._eng_long_by_slo_msd = {}
        for slo_tag, eng_long in zip(
            self.msd_table["slo_msd"], self.msd_table["eng_long"]
        ):
            self._eng_long_by_slo_msd.setdefault(slo_tag, eng_long)

    @staticmethod
    def _default_table_path():
        """Return the path of the table file bundled with this module."""
        if pkg_resources is not None:
            return pkg_resources.resource_filename(__name__, Msdmap.TABLE_NAME)
        module_dir = os.path.dirname(os.path.abspath(__file__))
        return os.path.join(module_dir, Msdmap.TABLE_NAME)

    def slo_msd_to_eng_long(self, slo_msd):
        """Return the ``eng_long`` value of the first row matching *slo_msd*.

        Args:
            slo_msd: Slovene MSD tag (string) as found in the ``slo_msd``
                column of the table.

        Returns:
            The matching ``eng_long`` value, or the string
            ``"No-matching-msd-found"`` when the tag is not in the table.
        """
        # A direct dict lookup also avoids building a query expression from
        # the tag, which broke on tags containing quote characters.
        return self._eng_long_by_slo_msd.get(slo_msd, self.NO_MATCH)

    def slo_msd_to_eng_pos(self, slo_msd):
        """Return the English part-of-speech code for a Slovene MSD tag.

        Args:
            slo_msd: Slovene MSD tag; its first letter is the Slovene
                part-of-speech code.

        Returns:
            The one-letter English part-of-speech code.

        Raises:
            ValueError: if the tag is empty or its first letter is not a
                known Slovene part-of-speech code.
        """
        # Slicing (rather than indexing) makes an empty tag fail with the
        # same ValueError as any other unknown part of speech.
        return self.pos_slo_ang_map(1, slo_msd[:1])[3]

    def pos_slo_ang_map(self, col, query):
        """Return the first part-of-speech tuple whose column *col* is *query*.

        Args:
            col: index into the tuples of ``pos_slo_ang`` (0 = Slovene name,
                1 = Slovene code, 2 = English name, 3 = English code).
            query: value to look for in that column.

        Returns:
            The matching 4-tuple from ``pos_slo_ang``.

        Raises:
            ValueError: if no tuple has *query* in column *col*.
        """
        for pos in self.pos_slo_ang:
            if pos[col] == query:
                return pos
        raise ValueError("Wrong part of speech.")


if __name__ == "__main__":
    msdmap = Msdmap()
    test_msds = [
        "Soser",
        "Ppnzmm",
        "Gp-d-mz",
    ]
    for msd in test_msds:
        print(msd)
        print(msdmap.slo_msd_to_eng_long(msd))
        print(msdmap.slo_msd_to_eng_pos(msd))
        print()