"""Word representations: full corpus words plus lightweight stand-ins."""

from collections import defaultdict
import logging

from luscenje_struktur.msd_translate import MSD_TRANSLATE


class WordCompressed:
    """Plain container holding a text, a collocation and a dependency tree."""

    def __init__(self, text, collocation, dependency_tree):
        self.text = text
        self.collocation = collocation
        self.dependency_tree = dependency_tree


class WordMsdOnly:
    """Word stand-in that carries only an msd; lemma and text are ``None``."""

    def __init__(self, msd):
        self.msd = msd
        self.lemma = None
        self.text = None

    def most_frequent_text(self, _):
        """Return ``None``: there is no lemma to render a text from."""
        return None


class WordDummy:
    """Word stand-in built directly from an msd, a lemma and a text."""

    def __init__(self, msd, lemma, text):
        self.msd = msd
        self.lemma = lemma
        self.text = text

    def most_frequent_text(self, word_renderer):
        """Return ``word_renderer.render(lemma, msd)`` for this word."""
        return word_renderer.render(self.lemma, self.msd)


class Word:
    """A single word with lemma, msd, id, surface text and outgoing links.

    Attributes set by the constructor:
        lemma, text: stored as given.
        msd: ``MSD_TRANSLATE[msd]`` when ``do_msd_translate`` is true,
            otherwise ``msd`` unchanged.
        id: the full id string (``wid``).
        int_id: integer parsed from the last dot-separated part of ``id``.
        idi: initialised to ``None``; not assigned anywhere in this class.
        glue: initialised to ``''``.
        previous_glue: ``previous_punctuation`` or ``''`` when it is ``None``.
        fake_word: flag passed by the caller (``True`` for fake roots).
        links: ``defaultdict(list)`` mapping a link label to target words.
    """

    def __init__(self, lemma, msd, wid, text, do_msd_translate,
                 fake_word=False, previous_punctuation=None):
        """Initialise the word and derive ``int_id`` from ``wid``.

        Raises:
            AssertionError: if the id, lemma or (translated) msd is ``None``.
            KeyError: if translation is requested and ``msd`` is not a key
                of ``MSD_TRANSLATE``.
            ValueError: if the last dot-separated part of ``wid`` is empty
                or is not a number (optionally preceded by one character).
        """
        self.lemma = lemma
        self.msd = MSD_TRANSLATE[msd] if do_msd_translate else msd
        self.id = wid
        self.idi = None
        self.text = text

        self.glue = ''
        self.previous_glue = (
            '' if previous_punctuation is None else previous_punctuation
        )
        self.fake_word = fake_word

        self.links = defaultdict(list)

        # Validate before the id is parsed below; otherwise a ``None`` id
        # would fail with an AttributeError on ``.split`` and this check
        # could never fire for it.
        assert None not in (self.id, self.lemma, self.msd)

        last_num = self.id.split('.')[-1]
        if not last_num:
            # Empty id or id ending in '.'; indexing ``last_num[0]`` would
            # raise a bare IndexError, so report the malformed id instead.
            raise ValueError(
                "Cannot derive a numeric index from word id {!r}".format(wid)
            )
        # A single leading non-digit character is dropped before parsing.
        if last_num[0] not in '0123456789':
            last_num = last_num[1:]
        self.int_id = int(last_num)

    @staticmethod
    def from_xml(xml, do_msd_translate):
        """Build a ``Word`` from an XML element.

        Reads the ``lemma`` and ``id`` attributes, the msd via ``get_msd``
        and the element's ``text``.
        """
        lemma = xml.get('lemma')
        msd = Word.get_msd(xml)
        wid = xml.get('id')
        text = xml.text
        return Word(lemma, msd, wid, text, do_msd_translate)

    @staticmethod
    def get_msd(comp):
        """Return the msd stored in the attributes of ``comp``.

        An ``ana`` attribute takes precedence and is returned without its
        first four characters (presumably a prefix such as ``mte:`` --
        TODO confirm); otherwise the ``msd`` attribute is returned as is.

        Raises:
            NotImplementedError: if neither attribute is present (the
                attributes are logged at error level first).
        """
        attributes = dict(comp.items())

        if 'ana' in attributes:
            return attributes['ana'][4:]
        if 'msd' in attributes:
            return attributes['msd']

        logging.error(attributes)
        raise NotImplementedError("MSD?")

    @staticmethod
    def pc_word(pc, do_msd_translate):
        """Build a ``Word`` from the element ``pc``.

        NOTE: mutates ``pc`` -- its ``lemma`` attribute is set to its text
        and its ``msd`` attribute to ``"N"`` (when translating) or ``"U"``.
        """
        pc.set('lemma', pc.text)
        pc.set('msd', "N" if do_msd_translate else "U")
        return Word.from_xml(pc, do_msd_translate)

    @staticmethod
    def fake_root_word(sentence_id):
        """Return a fake word with empty lemma/msd/text and id ``sentence_id``."""
        wid = sentence_id
        return Word('', '', wid, '', False, True)

    def add_link(self, link, to):
        """Append ``to`` to the list of targets stored under label ``link``."""
        self.links[link].append(to)

    def get_links(self, link):
        """Return the list of targets stored under ``link``.

        A label containing ``|`` that has no entry yet is treated as a set
        of alternatives: the targets of every alternative are concatenated
        and stored under the combined label.

        NOTE: the combined list is cached, so targets added to one of the
        alternatives after the first lookup are not reflected in it. Since
        ``links`` is a ``defaultdict``, looking up an unknown label also
        creates an empty entry for it.
        """
        if link not in self.links and "|" in link:
            for alternative in link.split('|'):
                self.links[link].extend(self.links[alternative])

        return self.links[link]

    def most_frequent_text(self, word_renderer):
        """Return ``word_renderer.render(lemma, msd)`` for this word."""
        return word_renderer.render(self.lemma, self.msd)