luscenje_struktur/src/word.py

69 lines
1.8 KiB
Python

from collections import defaultdict
import logging
from msd_translate import MSD_TRANSLATE
class WordMsdOnly:
    """Placeholder word that carries nothing but an MSD tag.

    ``lemma`` and ``text`` are always ``None``; only ``msd`` holds a value.
    """

    def __init__(self, msd):
        """Store *msd* and leave the lemma and surface text unset."""
        self.lemma = None
        self.text = None
        self.msd = msd

    def most_frequent_text(self, _):
        """Return ``None``; the single argument is accepted and ignored."""
        return None
class Word:
    """A single token with its lemma, MSD tag, id, surface text and links.

    Attributes:
        lemma: Lemma passed to the constructor.
        msd: MSD tag, looked up in ``MSD_TRANSLATE`` when translation was
            requested, otherwise stored as given.
        id: Full id string (dot-separated segments).
        text: Surface text of the token.
        links: ``defaultdict(list)`` mapping a link label to the list of
            targets registered through :meth:`add_link`.
        int_id: Integer parsed from the last dot-separated segment of ``id``.
    """

    def __init__(self, lemma, msd, wid, text, do_msd_translate):
        """Build a word and derive ``int_id`` from the tail of *wid*.

        Args:
            lemma: Lemma of the token; must not be ``None``.
            msd: MSD tag; used as a key into ``MSD_TRANSLATE`` when
                *do_msd_translate* is true.
            wid: Dot-separated id string; must not be ``None``.
            text: Surface text of the token.
            do_msd_translate: Whether to translate *msd* via ``MSD_TRANSLATE``.

        Raises:
            KeyError: If translation is requested and *msd* is not a key of
                ``MSD_TRANSLATE``.
            AssertionError: If the id, lemma or (translated) msd is ``None``.
            ValueError: If the last segment of *wid* holds no integer.
        """
        self.lemma = lemma
        self.msd = MSD_TRANSLATE[msd] if do_msd_translate else msd
        self.id = wid
        self.text = text
        self.links = defaultdict(list)

        # Validate *before* the id is dereferenced below, so a missing id is
        # reported as the validation failure rather than an AttributeError.
        # Raised explicitly (not via ``assert``) so the check survives ``-O``;
        # the exception type is kept for backward compatibility.
        if wid is None or lemma is None or self.msd is None:
            raise AssertionError(
                "Word requires id, lemma and msd; got id={!r}, lemma={!r}, "
                "msd={!r}".format(wid, lemma, self.msd)
            )

        tail = wid.split('.')[-1]
        if not tail:
            # Previously an IndexError; other malformed ids already raise
            # ValueError from int(), so report this case the same way.
            raise ValueError(
                "word id {!r} has an empty final segment".format(wid)
            )
        # A single leading non-digit character is dropped before parsing.
        if tail[0] not in '0123456789':
            tail = tail[1:]
        self.int_id = int(tail)

    @staticmethod
    def from_xml(xml, do_msd_translate):
        """Create a :class:`Word` from an XML element.

        Reads the ``lemma`` and ``id`` attributes, the element text, and the
        MSD as returned by :meth:`get_msd`.
        """
        return Word(
            xml.get('lemma'),
            Word.get_msd(xml),
            xml.get('id'),
            xml.text,
            do_msd_translate,
        )

    @staticmethod
    def get_msd(comp):
        """Return the MSD stored on *comp*.

        The ``msd`` attribute wins when present. Otherwise the ``ana``
        attribute is used with its first four characters removed
        (presumably a prefix such as ``mte:`` -- confirm against the corpus
        format).

        Raises:
            NotImplementedError: If neither attribute is present; the
                attribute dict is logged first.
        """
        attrs = dict(comp.items())
        if 'msd' in attrs:
            return attrs['msd']
        if 'ana' in attrs:
            return attrs['ana'][4:]
        logging.error(attrs)
        raise NotImplementedError("MSD?")

    @staticmethod
    def pc_word(pc, do_msd_translate):
        """Create a :class:`Word` from an element that has no lemma/MSD.

        NOTE: *pc* is modified in place -- its ``lemma`` attribute is set to
        its text and its ``msd`` attribute to ``"N"`` (when translating) or
        ``"U"`` -- and is then handed to :meth:`from_xml`.
        """
        pc.set('lemma', pc.text)
        pc.set('msd', "N" if do_msd_translate else "U")
        return Word.from_xml(pc, do_msd_translate)

    def add_link(self, link, to):
        """Append *to* to the targets registered under label *link*."""
        self.links[link].append(to)

    def get_links(self, link):
        """Return the list of targets for *link*.

        A label containing ``|`` that has no entry yet is treated as a set of
        alternatives: the targets of every ``|``-separated part are
        concatenated and stored under the combined label. Because the
        combined list is stored on first request, links added to the parts
        afterwards are not reflected in it.

        The returned list is the internal one, not a copy.
        """
        if link not in self.links and "|" in link:
            for part in link.split('|'):
                self.links[link].extend(self.links[part])
        return self.links[link]

    def most_frequent_text(self, word_renderer):
        """Return ``word_renderer.render(lemma, msd)`` for this word."""
        return word_renderer.render(self.lemma, self.msd)