You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

95 lines
2.6 KiB

from collections import defaultdict
import logging
from luscenje_struktur.msd_translate import MSD_TRANSLATE
class WordCompressed:
    """Plain holder for a word's text, its collocation and its dependency tree."""

    def __init__(self, text, collocation, dependency_tree):
        """Store the three arguments unchanged as same-named attributes."""
        self.text, self.collocation, self.dependency_tree = (
            text,
            collocation,
            dependency_tree,
        )
class WordMsdOnly:
    """Word-like object that carries only an MSD tag.

    ``lemma`` and ``text`` are always ``None`` and ``most_frequent_text``
    always returns ``None``.
    """

    def __init__(self, msd):
        """Keep ``msd``; leave ``lemma`` and ``text`` unset (``None``)."""
        self.msd = msd
        self.lemma = self.text = None

    def most_frequent_text(self, _):
        """Return ``None``; the argument is ignored."""
        return None
class WordDummy:
    """Word-like object built directly from an MSD tag, a lemma and a text."""

    def __init__(self, msd, lemma, text):
        """Store the three arguments unchanged as same-named attributes."""
        self.msd, self.lemma, self.text = msd, lemma, text

    def most_frequent_text(self, word_renderer):
        """Return ``word_renderer.render(lemma, msd)`` for this word."""
        rendered = word_renderer.render(self.lemma, self.msd)
        return rendered
class Word:
    """A single token with its annotations and its labelled outgoing links.

    Attributes set by the constructor:
        lemma, text: stored as given.
        msd: ``MSD_TRANSLATE[msd]`` when ``do_msd_translate`` is true,
            otherwise ``msd`` as given.
        id: the full identifier (``wid``) as given.
        idi: initialised to ``None``; not assigned anywhere else in this class.
        glue: initialised to ``''``.
        previous_glue: ``previous_punctuation``, or ``''`` when that is ``None``.
        fake_word: flag passed by the caller (``True`` for ``fake_root_word``).
        links: ``defaultdict(list)`` mapping a link label to the targets
            registered through ``add_link``.
        int_id: integer parsed from the last dot-separated segment of ``id``.
    """

    def __init__(self, lemma, msd, wid, text, do_msd_translate, fake_word=False, previous_punctuation=None):
        """Build a word and derive ``int_id`` from ``wid``.

        Raises:
            AssertionError: if ``wid``, ``lemma`` or the resulting ``msd``
                is ``None``.
            ValueError: if the last dot-separated segment of ``wid`` is empty
                or is not an integer after an optional one-character prefix.
            KeyError: presumably, when ``do_msd_translate`` is true and ``msd``
                is missing from ``MSD_TRANSLATE`` (assuming it is a plain
                mapping -- verify).
        """
        self.lemma = lemma
        self.msd = MSD_TRANSLATE[msd] if do_msd_translate else msd
        self.id = wid
        self.idi = None
        self.text = text
        self.glue = ''
        self.previous_glue = '' if previous_punctuation is None else previous_punctuation
        self.fake_word = fake_word
        self.links = defaultdict(list)

        # Validate before touching ``self.id``: otherwise a missing id fails
        # with an unrelated AttributeError on ``.split``. An explicit raise
        # (instead of ``assert``) keeps the check alive under ``python -O``
        # while preserving the exception type callers already see.
        if None in (self.id, self.lemma, self.msd):
            raise AssertionError(
                "Word requires id, lemma and msd (got id=%r, lemma=%r, msd=%r)"
                % (self.id, self.lemma, self.msd)
            )

        last_num = self.id.split('.')[-1]
        if not last_num:
            # e.g. '' or 'doc.1.' -- previously an IndexError on last_num[0].
            raise ValueError("word id %r has an empty final segment" % (self.id,))
        # A single leading non-digit character (e.g. the 't' in 't12') is
        # dropped before the integer conversion.
        if last_num[0] not in '0123456789':
            last_num = last_num[1:]
        self.int_id = int(last_num)

    @staticmethod
    def from_xml(xml, do_msd_translate):
        """Create a Word from an XML element.

        Reads the ``lemma`` and ``id`` attributes, the element text, and the
        MSD as resolved by ``get_msd``.
        """
        lemma = xml.get('lemma')
        msd = Word.get_msd(xml)
        wid = xml.get('id')
        text = xml.text
        return Word(lemma, msd, wid, text, do_msd_translate)

    @staticmethod
    def get_msd(comp):
        """Return the MSD string stored on element ``comp``.

        The ``ana`` attribute takes precedence over ``msd``.

        Raises:
            NotImplementedError: if the element has neither attribute (its
                attributes are logged at error level first).
        """
        d = dict(comp.items())
        if 'ana' in d:
            # Drop the first four characters of the value -- presumably a
            # prefix such as 'mte:'; TODO confirm against the corpus format.
            return d['ana'][4:]
        elif 'msd' in d:
            return d['msd']
        else:
            logging.error(d)
            raise NotImplementedError("MSD?")

    @staticmethod
    def pc_word(pc, do_msd_translate):
        """Create a Word from a punctuation element.

        Side effect: sets ``lemma`` (to the element text) and ``msd`` ("N" when
        translating, "U" otherwise) on ``pc`` itself before delegating to
        ``from_xml``. Because ``get_msd`` prefers ``ana``, an existing ``ana``
        attribute on ``pc`` still wins over the ``msd`` set here.
        """
        pc.set('lemma', pc.text)
        pc.set('msd', "N" if do_msd_translate else "U")
        return Word.from_xml(pc, do_msd_translate)

    @staticmethod
    def fake_root_word(sentence_id):
        """Return a placeholder Word (empty lemma, msd and text, ``fake_word=True``)
        whose id is ``sentence_id``."""
        wid = sentence_id
        return Word('', '', wid, '', False, fake_word=True)

    def add_link(self, link, to):
        """Append ``to`` to the list of targets stored under label ``link``."""
        self.links[link].append(to)

    def get_links(self, link):
        """Return the list of targets stored under label ``link``.

        For a label that is not stored yet and contains ``'|'``, the label is
        split on ``'|'`` and the targets of every part are concatenated, in
        order, into a new list stored under the combined label; later calls
        return that stored list. Because ``links`` is a ``defaultdict``,
        looking up an unknown label (or part) creates an empty entry for it.

        NOTE(review): the stored combined list is not refreshed by ``add_link``
        calls made after it was built -- confirm that all links are added
        before the first combined lookup.
        """
        if link not in self.links and "|" in link:
            for part in link.split('|'):
                self.links[link].extend(self.links[part])
        return self.links[link]

    def most_frequent_text(self, word_renderer):
        """Return ``word_renderer.render(lemma, msd)`` for this word."""
        return word_renderer.render(self.lemma, self.msd)