Added s/z, k/h + fixed bug 90 + connecting with sloleks on lemma_fallback
This commit is contained in:
parent
ec113f9cd2
commit
777791ad1e
1
.gitignore
vendored
1
.gitignore
vendored
|
@ -9,3 +9,4 @@ __pycache__
|
||||||
|
|
||||||
prev
|
prev
|
||||||
old
|
old
|
||||||
|
data
|
||||||
|
|
0
src/__init__.py
Normal file
0
src/__init__.py
Normal file
|
@ -1,3 +1,129 @@
|
||||||
|
# Code tables that occur unchanged under several categories.  They are used
# only as templates: every use below takes its own copy via dict(), so the
# nested dictionaries of CODES_TRANSLATION never share state.
_GENDER_CODES = {'m': 'masculine', 'f': 'feminine', 'n': 'neuter'}
_NUMBER_CODES = {"s": "singular", "d": "dual", "p": "plural"}
_CASE_CODES = {
    "n": "nominative",
    "g": "genitive",
    "d": "dative",
    "a": "accusative",
    "l": "locative",
    "i": "instrumental",
}

# Translation of one-letter MSD codes into English feature values.
#
# Layout: category letter -> integer key -> one-letter code -> feature value.
# Only nouns ("N"), verbs ("V") and adjectives ("A") are covered.
CODES_TRANSLATION = {
    "N": {
        2: dict(_GENDER_CODES),
        3: dict(_NUMBER_CODES),
        4: dict(_CASE_CODES),
    },
    "V": {
        1: {"m": "main", "a": "auxiliary"},
        3: {
            "n": "infinitive",
            "u": "supine",
            "p": "participle",
            "r": "present",
            "f": "future",
            "c": "conditional",
            "m": "imperative",
        },
        4: {"1": "first", "2": "second", "3": "third"},
        5: dict(_NUMBER_CODES),
        6: dict(_GENDER_CODES),
        # NOTE(review): the integer keys look like positions inside the MSD
        # string; confirm that 8 (and not 7) is the intended key for the
        # yes/no table below.
        8: {"n": "no", "y": "yes"},
    },
    "A": {
        1: {"g": "general", "s": "possessive"},
        2: {"p": "positive", "c": "comparative", "s": "superlative"},
        3: dict(_GENDER_CODES),
        4: dict(_NUMBER_CODES),
        5: dict(_CASE_CODES),
    },
    # Categories and values that are not translated (kept for reference):
    #   "R": "Adverb", "P": "Pronoun", "M": "Numeral", "S": "Preposition",
    #   "C": "Conjunction", "Q": "Particle", "I": "Interjection",
    #   "Y": "Abbreviation", "X": "Residual",
    #
    #   "e": "perfective", "p": "progressive", "b": "biaspectual",
    #
    #   "p": "personal", "d": "demonstrative", "r": "relative",
    #   "x": "reflexive", "q": "interrogative", "i": "indefinite",
    #   "z": "negative", "b": "bound", "d": "digit", "r": "roman",
    #   "l": "letter", "c": "cardinal", "o": "ordinal", "p": "pronominal",
    #   "s": "special", "c": "coordinating", "s": "subordinating",
    #   "f": "foreign", "t": "typo", "p": "program", "w": "web",
    #   "e": "emo", "h": "hashtag", "a": "at"
}
|
||||||
|
|
||||||
CODES = {
|
CODES = {
|
||||||
"Noun": "N",
|
"Noun": "N",
|
||||||
"Verb": "V",
|
"Verb": "V",
|
||||||
|
|
|
@ -124,9 +124,9 @@ def file_sentence_generator(et, skip_id_check, do_msd_translate, pc_tag):
|
||||||
dest = l.get('dep')
|
dest = l.get('dep')
|
||||||
else:
|
else:
|
||||||
ana = l.get('ana')
|
ana = l.get('ana')
|
||||||
if ana[:4] != 'syn:': # dont bother...
|
if ana[:8] != 'jos-syn:': # dont bother...
|
||||||
continue
|
continue
|
||||||
ana = ana[4:]
|
ana = ana[8:]
|
||||||
lfrom, dest = l.get('target').replace('#', '').split()
|
lfrom, dest = l.get('target').replace('#', '').split()
|
||||||
|
|
||||||
if lfrom in words:
|
if lfrom in words:
|
||||||
|
|
|
@ -91,7 +91,14 @@ class MatchStore:
|
||||||
(structure.id,)):
|
(structure.id,)):
|
||||||
yield StructureMatch.from_db(self.db, cid[0], structure)
|
yield StructureMatch.from_db(self.db, cid[0], structure)
|
||||||
|
|
||||||
def set_representations(self, word_renderer, structures):
|
def add_inserts(self, inserts):
|
||||||
|
for match in inserts:
|
||||||
|
for component_id, text in match.representations.items():
|
||||||
|
self.db.execute("""
|
||||||
|
INSERT INTO Representations (colocation_id, component_id, text)
|
||||||
|
VALUES (?,?,?)""", (match.match_id, component_id, text))
|
||||||
|
|
||||||
|
def set_representations(self, word_renderer, structures, sloleks_db=None):
|
||||||
step_name = 'representation'
|
step_name = 'representation'
|
||||||
if self.db.is_step_done(step_name):
|
if self.db.is_step_done(step_name):
|
||||||
print("Representation step already done, skipping")
|
print("Representation step already done, skipping")
|
||||||
|
@ -105,17 +112,14 @@ class MatchStore:
|
||||||
for cid, sid in progress(self.db.execute("SELECT colocation_id, structure_id FROM Colocations"), "representations", total=num_representations):
|
for cid, sid in progress(self.db.execute("SELECT colocation_id, structure_id FROM Colocations"), "representations", total=num_representations):
|
||||||
structure = structures_dict[sid]
|
structure = structures_dict[sid]
|
||||||
match = StructureMatch.from_db(self.db, cid, structure)
|
match = StructureMatch.from_db(self.db, cid, structure)
|
||||||
RepresentationAssigner.set_representations(match, word_renderer)
|
RepresentationAssigner.set_representations(match, word_renderer, sloleks_db=sloleks_db)
|
||||||
|
|
||||||
inserts.append(match)
|
inserts.append(match)
|
||||||
if len(inserts) > num_inserts:
|
if len(inserts) > num_inserts:
|
||||||
for match in inserts:
|
self.add_inserts(inserts)
|
||||||
for component_id, text in match.representations.items():
|
|
||||||
self.db.execute("""
|
|
||||||
INSERT INTO Representations (colocation_id, component_id, text)
|
|
||||||
VALUES (?,?,?)""", (match.match_id, component_id, text))
|
|
||||||
inserts = []
|
inserts = []
|
||||||
|
|
||||||
|
self.add_inserts(inserts)
|
||||||
self.db.step_is_done(step_name)
|
self.db.step_is_done(step_name)
|
||||||
|
|
||||||
def has_colocation_id_enough_frequency(self, colocation_id):
|
def has_colocation_id_enough_frequency(self, colocation_id):
|
||||||
|
|
38
src/postprocessor.py
Normal file
38
src/postprocessor.py
Normal file
|
@ -0,0 +1,38 @@
|
||||||
|
|
||||||
|
class Postprocessor:
    """Correct the one-letter prepositions s/z and k/h inside a match.

    Which variant of the preposition is written depends on the first letter
    of the word that follows it.  ``process`` applies that rule to the words
    of a collocation key and to the matching word objects.
    """

    # First letters that select 's' (instead of 'z') and 'h' (instead of 'k').
    _S_INITIALS = frozenset('cčfhkpsšt')
    _H_INITIALS = frozenset('gk')

    def __init__(self, fix_one_letter_words=True):
        """Store whether ``process`` should rewrite one-letter prepositions."""
        self.fix_one_letter_words = fix_one_letter_words

    @staticmethod
    def fix_sz(next_word):
        """Return 's' or 'z' depending on the first letter of ``next_word``.

        The comparison is case-insensitive, so a capitalised follower is
        handled like its lowercase form.  An empty or missing follower
        yields 'z'.
        """
        if next_word and next_word[0].lower() in Postprocessor._S_INITIALS:
            return 's'
        return 'z'

    @staticmethod
    def fix_kh(next_word):
        """Return 'h' or 'k' depending on the first letter of ``next_word``.

        The comparison is case-insensitive.  An empty or missing follower
        yields 'k'.
        """
        if next_word and next_word[0].lower() in Postprocessor._H_INITIALS:
            return 'h'
        return 'k'

    def process(self, match, collocation_id):
        """Fix prepositions in ``collocation_id`` and in ``match``.

        Args:
            match: Mapping from component id to an object with a writable
                ``text`` attribute.
            collocation_id: List whose first item is the structure id and
                whose remaining items are mutable ``[component_id, word]``
                pairs.

        Returns:
            ``(match, collocation_id)`` where the returned key has every
            pair converted to a tuple.

        The pairs of the *passed-in* list are updated in place as well, so
        a caller holding a reference to the original list sees the
        corrected words.
        """
        if self.fix_one_letter_words:
            # Index 0 is the structure id.  The last pair has no following
            # word, so it is never rewritten.
            for pos in range(1, len(collocation_id) - 1):
                col_id, word = collocation_id[pos]
                next_word = collocation_id[pos + 1][1]
                if not next_word:
                    # Nothing to decide on; leave the preposition untouched.
                    continue

                if word in ('s', 'z'):
                    correct_letter = self.fix_sz(next_word)
                elif word in ('k', 'h'):
                    correct_letter = self.fix_kh(next_word)
                else:
                    continue

                collocation_id[pos][1] = correct_letter
                match[col_id].text = correct_letter

        # Slicing (instead of indexing [0]) keeps an empty key from raising.
        collocation_id = collocation_id[:1] + [tuple(line) for line in collocation_id[1:]]
        return match, collocation_id
|
|
@ -4,6 +4,9 @@ from collections import Counter
|
||||||
from codes_tagset import TAGSET, CODES
|
from codes_tagset import TAGSET, CODES
|
||||||
from word import WordMsdOnly
|
from word import WordMsdOnly
|
||||||
|
|
||||||
|
from src.word import WordDummy
|
||||||
|
|
||||||
|
|
||||||
class ComponentRepresentation:
|
class ComponentRepresentation:
|
||||||
def __init__(self, data, word_renderer):
|
def __init__(self, data, word_renderer):
|
||||||
self.data = data
|
self.data = data
|
||||||
|
@ -19,23 +22,23 @@ class ComponentRepresentation:
|
||||||
def add_word(self, word):
|
def add_word(self, word):
|
||||||
self.words.append(word)
|
self.words.append(word)
|
||||||
|
|
||||||
def render(self):
|
def render(self, sloleks_db=None):
|
||||||
if self.rendition_text is None:
|
if self.rendition_text is None:
|
||||||
self.rendition_text = self._render()
|
self.rendition_text = self._render(sloleks_db=sloleks_db)
|
||||||
|
|
||||||
def _render(self):
|
def _render(self, sloleks_db=None):
|
||||||
raise NotImplementedError("Not implemented for class: {}".format(type(self)))
|
raise NotImplementedError("Not implemented for class: {}".format(type(self)))
|
||||||
|
|
||||||
class LemmaCR(ComponentRepresentation):
|
class LemmaCR(ComponentRepresentation):
|
||||||
def _render(self):
|
def _render(self, sloleks_db=None):
|
||||||
return self.words[0].lemma if len(self.words) > 0 else None
|
return self.words[0].lemma if len(self.words) > 0 else None
|
||||||
|
|
||||||
class LexisCR(ComponentRepresentation):
|
class LexisCR(ComponentRepresentation):
|
||||||
def _render(self):
|
def _render(self, sloleks_db=None):
|
||||||
return self.data['lexis']
|
return self.data['lexis']
|
||||||
|
|
||||||
class WordFormAllCR(ComponentRepresentation):
|
class WordFormAllCR(ComponentRepresentation):
|
||||||
def _render(self):
|
def _render(self, sloleks_db=None):
|
||||||
if len(self.words) == 0:
|
if len(self.words) == 0:
|
||||||
return None
|
return None
|
||||||
else:
|
else:
|
||||||
|
@ -43,7 +46,7 @@ class WordFormAllCR(ComponentRepresentation):
|
||||||
return "/".join(set(forms))
|
return "/".join(set(forms))
|
||||||
|
|
||||||
class WordFormAnyCR(ComponentRepresentation):
|
class WordFormAnyCR(ComponentRepresentation):
|
||||||
def _render(self):
|
def _render(self, sloleks_db=None):
|
||||||
text_forms = {}
|
text_forms = {}
|
||||||
msd_lemma_txt_triplets = Counter([(w.msd, w.lemma, w.text) for w in self.words])
|
msd_lemma_txt_triplets = Counter([(w.msd, w.lemma, w.text) for w in self.words])
|
||||||
for (msd, lemma, text), _n in reversed(msd_lemma_txt_triplets.most_common()):
|
for (msd, lemma, text), _n in reversed(msd_lemma_txt_triplets.most_common()):
|
||||||
|
@ -60,6 +63,23 @@ class WordFormAnyCR(ComponentRepresentation):
|
||||||
# check if agreements match
|
# check if agreements match
|
||||||
agreements_matched = [agr.match(word_msd) for agr in self.agreement]
|
agreements_matched = [agr.match(word_msd) for agr in self.agreement]
|
||||||
|
|
||||||
|
# in case all agreements do not match try to get data from sloleks and change properly
|
||||||
|
if not all(agreements_matched):
|
||||||
|
if sloleks_db is None:
|
||||||
|
raise Exception('sloleks_db not properly setup!')
|
||||||
|
for agr in self.agreement:
|
||||||
|
if not agr.match(word_msd):
|
||||||
|
msd, lemma, text = sloleks_db.get_word_form(agr.lemma, agr.msd(), agr.data, align_msd=word_msd)
|
||||||
|
if msd is not None:
|
||||||
|
agr.msds[0] = msd
|
||||||
|
agr.words.append(WordDummy(msd, lemma, text))
|
||||||
|
# agr.words[0].msd = msd
|
||||||
|
# agr.words[0].text = text
|
||||||
|
agreements_matched = [agr.match(word_msd) for agr in self.agreement]
|
||||||
|
else:
|
||||||
|
break
|
||||||
|
|
||||||
|
|
||||||
# if we are at the last "backup word", then confirm matches
|
# if we are at the last "backup word", then confirm matches
|
||||||
# that worked for this one and return
|
# that worked for this one and return
|
||||||
if word_lemma is None:
|
if word_lemma is None:
|
||||||
|
@ -109,9 +129,15 @@ class WordFormMsdCR(WordFormAnyCR):
|
||||||
if self.check_msd(word.msd):
|
if self.check_msd(word.msd):
|
||||||
super().add_word(word)
|
super().add_word(word)
|
||||||
|
|
||||||
def _render(self):
|
def _render(self, sloleks_db=None):
|
||||||
|
if len(self.words) == 0:
|
||||||
|
if sloleks_db is None:
|
||||||
|
raise Exception('sloleks_db not properly setup!')
|
||||||
|
msd, lemma, text = sloleks_db.get_word_form(self.lemma, self.msd(), self.data)
|
||||||
|
if msd is not None:
|
||||||
|
self.words.append(WordDummy(msd, lemma, text))
|
||||||
self.words.append(WordMsdOnly(self._common_msd()))
|
self.words.append(WordMsdOnly(self._common_msd()))
|
||||||
return super()._render()
|
return super()._render(sloleks_db)
|
||||||
|
|
||||||
def _common_msd(self):
|
def _common_msd(self):
|
||||||
msds = sorted(self.msds, key=len)
|
msds = sorted(self.msds, key=len)
|
||||||
|
@ -182,5 +208,5 @@ class WordFormAgreementCR(WordFormMsdCR):
|
||||||
|
|
||||||
return True
|
return True
|
||||||
|
|
||||||
def render(self):
|
def render(self, sloleks_db=None):
|
||||||
pass
|
pass
|
||||||
|
|
|
@ -39,7 +39,7 @@ class RepresentationAssigner:
|
||||||
return self.representation_factory(self.more, word_renderer)
|
return self.representation_factory(self.more, word_renderer)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def set_representations(match, word_renderer):
|
def set_representations(match, word_renderer, sloleks_db=None):
|
||||||
representations = {}
|
representations = {}
|
||||||
for c in match.structure.components:
|
for c in match.structure.components:
|
||||||
representations[c.idx] = []
|
representations[c.idx] = []
|
||||||
|
@ -70,7 +70,7 @@ class RepresentationAssigner:
|
||||||
|
|
||||||
for cid, reps in representations.items():
|
for cid, reps in representations.items():
|
||||||
for rep in reps:
|
for rep in reps:
|
||||||
rep.render()
|
rep.render(sloleks_db=sloleks_db)
|
||||||
|
|
||||||
for cid, reps in representations.items():
|
for cid, reps in representations.items():
|
||||||
reps = [rep.rendition_text for rep in reps]
|
reps = [rep.rendition_text for rep in reps]
|
||||||
|
|
191
src/sloleks_db.py
Normal file
191
src/sloleks_db.py
Normal file
|
@ -0,0 +1,191 @@
|
||||||
|
from collections import defaultdict
|
||||||
|
from ast import literal_eval
|
||||||
|
|
||||||
|
from sqlalchemy.ext.declarative import declarative_base
|
||||||
|
from sqlalchemy.orm import Session, aliased
|
||||||
|
from sqlalchemy import create_engine
|
||||||
|
from sqlalchemy import func
|
||||||
|
|
||||||
|
from match import StructureMatch
|
||||||
|
from representation_assigner import RepresentationAssigner
|
||||||
|
from progress_bar import progress
|
||||||
|
|
||||||
|
# Lexeme = None
|
||||||
|
# LexemeFeature = None
|
||||||
|
# SyntacticStructure = None
|
||||||
|
# StructureComponent = None
|
||||||
|
# Feature = None
|
||||||
|
# LexicalUnitLexeme = None
|
||||||
|
# LexicalUnit = None
|
||||||
|
# LexicalUnitType = None
|
||||||
|
# Category = None
|
||||||
|
# Sense = None
|
||||||
|
# Measure = None
|
||||||
|
# LexicalUnitMeasure = None
|
||||||
|
# Corpus = None
|
||||||
|
# Definition = None
|
||||||
|
# WordForm = None
|
||||||
|
# WordFormFeature = None
|
||||||
|
# FormRepresentation = None
|
||||||
|
from src.codes_tagset import TAGSET, CODES, CODES_TRANSLATION
|
||||||
|
|
||||||
|
|
||||||
|
class SloleksDatabase:
    """Looks up inflected word forms in a Sloleks PostgreSQL database."""

    def __init__(self, db):
        """Connect to the database and map its reflected ``jedro_*`` tables.

        Args:
            db: Credentials in the form ``user:password:database:host``.

        The mapped classes are published as module-level globals because
        the query code in this class refers to them by bare name.
        """
        global Lexeme, LexemeFeature, SyntacticStructure, StructureComponent, Feature, LexicalUnitLexeme, LexicalUnit, LexicalUnitType, Category, Sense, Measure, LexicalUnitMeasure, Corpus, Definition, WordForm, WordFormFeature, FormRepresentation
        [db_user, db_password, db_database, db_host] = db.split(':')

        engine = create_engine('postgresql://' + db_user + ':' + db_password + '@' + db_host + '/' + db_database,
                               pool_recycle=14400)
        Base = declarative_base()
        Base.metadata.reflect(engine)

        class Lexeme(Base):
            __table__ = Base.metadata.tables['jedro_lexeme']

        class LexemeFeature(Base):
            __table__ = Base.metadata.tables['jedro_lexeme_feature']

        class SyntacticStructure(Base):
            __table__ = Base.metadata.tables['jedro_syntacticstructure']

        class StructureComponent(Base):
            __table__ = Base.metadata.tables['jedro_structurecomponent']

        class Feature(Base):
            __table__ = Base.metadata.tables['jedro_feature']

        class LexicalUnitLexeme(Base):
            __table__ = Base.metadata.tables['jedro_lexicalunit_lexeme']

        class LexicalUnit(Base):
            __table__ = Base.metadata.tables['jedro_lexicalunit']

        class LexicalUnitType(Base):
            __table__ = Base.metadata.tables['jedro_lexicalunittype']

        class Category(Base):
            __table__ = Base.metadata.tables['jedro_category']

        class Sense(Base):
            __table__ = Base.metadata.tables['jedro_sense']

        class Measure(Base):
            __table__ = Base.metadata.tables['jedro_measure']

        class LexicalUnitMeasure(Base):
            __table__ = Base.metadata.tables['jedro_lexicalunitmeasure']

        class Corpus(Base):
            __table__ = Base.metadata.tables['jedro_corpus']

        class Definition(Base):
            __table__ = Base.metadata.tables['jedro_definition']

        class WordForm(Base):
            __table__ = Base.metadata.tables['jedro_wordform']

        class WordFormFeature(Base):
            __table__ = Base.metadata.tables['jedro_wordform_feature']

        class FormRepresentation(Base):
            __table__ = Base.metadata.tables['jedro_formrepresentation']

        self.session = Session(engine)

    def close(self):
        """Close the database session."""
        self.session.close()

    def decypher_msd(self, msd):
        """Translate an MSD into the feature values used to filter word forms.

        Args:
            msd: MSD as a string or a list of single characters.

        Returns:
            ``[number, case]`` for nouns, ``[vform, number, 'third']`` for
            verbs, ``[gender, number, case]`` for adjectives.  An empty list
            is returned for any other category, and also when the MSD is
            too short or holds a code that CODES_TRANSLATION does not know;
            the caller treats an empty list as "no form available".
        """
        if not msd:
            return []

        t = msd[0]
        try:
            if t == 'N':
                number = CODES_TRANSLATION[t][3][msd[3]]
                case = CODES_TRANSLATION[t][4][msd[4]]
                return [number, case]
            if t == 'V':
                vform = CODES_TRANSLATION[t][3][msd[3]]
                number = CODES_TRANSLATION[t][5][msd[5]]
                # The person is not read from the MSD; it is fixed to third.
                person = 'third'
                return [vform, number, person]
            if t == 'A':
                gender = CODES_TRANSLATION[t][3][msd[3]]
                number = CODES_TRANSLATION[t][4][msd[4]]
                case = CODES_TRANSLATION[t][5][msd[5]]
                return [gender, number, case]
        except (IndexError, KeyError):
            # Short MSD or an untranslatable code: nothing to look up.
            return []

        return []

    def get_word_form(self, lemma, msd, data, align_msd=False):
        """Find a form of ``lemma`` whose features match the adjusted MSD.

        Args:
            lemma: Lemma to search for.
            msd: Starting MSD of the wanted form.
            data: Component definition.  ``data['msd']`` (attribute -> value
                name) overrides MSD positions when ``align_msd`` is not
                given; ``data['agreement']`` lists the attributes copied
                from ``align_msd`` when it is given.
            align_msd: MSD of the word the form has to agree with, or a
                falsy value when no agreement is requested.

        Returns:
            ``(msd, lemma, text)`` for the first matching form, or
            ``(None, None, None)`` when the MSD cannot be built or no form
            is found.
        """
        msd = list(msd)
        if not msd:
            return None, None, None

        if not align_msd and 'msd' in data:
            t = msd[0]
            for key, value in data['msd'].items():
                v = TAGSET[t].index(key.lower())
                # The attribute lives at msd[v + 1]; give up on MSDs that
                # are too short to carry it.
                if v + 1 >= len(msd):
                    return None, None, None
                msd[v + 1] = CODES[value]

        elif align_msd and 'agreement' in data:
            align_msd = list(align_msd)
            t_align_msd = align_msd[0]
            t = msd[0]

            for att in data['agreement']:
                v_align_msd = TAGSET[t_align_msd].index(att.lower())
                v = TAGSET[t].index(att.lower())
                # Fix for short MSDs (e.g. verbs): both the target slot and
                # the slot it is copied from must exist.
                if v + 1 >= len(msd) or v_align_msd + 1 >= len(align_msd):
                    return None, None, None
                msd[v + 1] = align_msd[v_align_msd + 1]

        decypher_msd = self.decypher_msd(msd)

        if not decypher_msd:
            return None, None, None

        # One alias of the feature table per required feature value, so
        # that every value can be filtered on independently.
        wfs = [aliased(WordFormFeature) for _ in decypher_msd]

        query = self.session.query(FormRepresentation.form) \
            .join(WordForm, WordForm.id == FormRepresentation.word_form_id) \
            .join(Lexeme, Lexeme.id == WordForm.lexeme_id)

        for wf in wfs:
            query = query.join(wf, wf.word_form_id == WordForm.id)

        query = query.filter(Lexeme.lemma == lemma)

        for wf, msd_el in zip(wfs, decypher_msd):
            query = query.filter(wf.value == msd_el)

        # Only one form is used, so do not fetch the whole result set.
        row = query.first()
        if row is not None:
            return ''.join(msd), lemma, row[0]
        return None, None, None
|
|
@ -14,7 +14,7 @@ class SyntacticStructure:
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def from_xml(xml):
|
def from_xml(xml):
|
||||||
st = SyntacticStructure()
|
st = SyntacticStructure()
|
||||||
st.id = xml.get('id')
|
st.id = xml.get('id_nsss')
|
||||||
st.lbs = xml.get('LBS')
|
st.lbs = xml.get('LBS')
|
||||||
|
|
||||||
assert len(list(xml)) == 1
|
assert len(list(xml)) == 1
|
||||||
|
|
25
src/wani.py
25
src/wani.py
|
@ -11,6 +11,7 @@ import concurrent.futures
|
||||||
import tempfile
|
import tempfile
|
||||||
|
|
||||||
from progress_bar import progress
|
from progress_bar import progress
|
||||||
|
from sloleks_db import SloleksDatabase
|
||||||
from word import Word
|
from word import Word
|
||||||
from syntactic_structure import build_structures
|
from syntactic_structure import build_structures
|
||||||
from match_store import MatchStore
|
from match_store import MatchStore
|
||||||
|
@ -20,16 +21,20 @@ from loader import load_files
|
||||||
from database import Database
|
from database import Database
|
||||||
from time_info import TimeInfo
|
from time_info import TimeInfo
|
||||||
|
|
||||||
|
from src.postprocessor import Postprocessor
|
||||||
|
|
||||||
def match_file(words, structures):
|
|
||||||
|
def match_file(words, structures, postprocessor):
|
||||||
matches = {s: [] for s in structures}
|
matches = {s: [] for s in structures}
|
||||||
|
|
||||||
for s in progress(structures, "matching"):
|
for s in progress(structures, "matching"):
|
||||||
for w in words:
|
for w in words:
|
||||||
mhere = s.match(w)
|
mhere = s.match(w)
|
||||||
for match in mhere:
|
for match in mhere:
|
||||||
colocation_id = [(idx, w.lemma) for idx, w in match.items()]
|
# colocation_id = [(idx, w.lemma) for idx, w in match.items()]
|
||||||
|
colocation_id = [[idx, w.lemma] for idx, w in match.items()]
|
||||||
colocation_id = [s.id] + list(sorted(colocation_id, key=lambda x: x[0]))
|
colocation_id = [s.id] + list(sorted(colocation_id, key=lambda x: x[0]))
|
||||||
|
match, collocation_id = postprocessor.process(match, colocation_id)
|
||||||
colocation_id = tuple(colocation_id)
|
colocation_id = tuple(colocation_id)
|
||||||
|
|
||||||
matches[s].append((match, colocation_id))
|
matches[s].append((match, colocation_id))
|
||||||
|
@ -38,6 +43,7 @@ def match_file(words, structures):
|
||||||
|
|
||||||
|
|
||||||
def main(args):
|
def main(args):
|
||||||
|
sloleks_db = SloleksDatabase(args.sloleks_db)
|
||||||
structures, lemma_msds, max_num_components = build_structures(args)
|
structures, lemma_msds, max_num_components = build_structures(args)
|
||||||
timeinfo = TimeInfo(len(args.input))
|
timeinfo = TimeInfo(len(args.input))
|
||||||
|
|
||||||
|
@ -51,7 +57,11 @@ def main(args):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
matches = match_file(words, structures)
|
postprocessor = Postprocessor()
|
||||||
|
matches = match_file(words, structures, postprocessor)
|
||||||
|
|
||||||
|
# matches = .process()
|
||||||
|
# TODO Add postprocessing here or inside previous function!
|
||||||
match_store.add_matches(matches)
|
match_store.add_matches(matches)
|
||||||
word_stats.add_words(words)
|
word_stats.add_words(words)
|
||||||
database.commit()
|
database.commit()
|
||||||
|
@ -74,7 +84,7 @@ def main(args):
|
||||||
|
|
||||||
# figure out representations!
|
# figure out representations!
|
||||||
if args.out or args.out_no_stat:
|
if args.out or args.out_no_stat:
|
||||||
match_store.set_representations(word_stats, structures)
|
match_store.set_representations(word_stats, structures, sloleks_db=sloleks_db)
|
||||||
|
|
||||||
Writer.make_output_writer(args, max_num_components, match_store, word_stats).write_out(
|
Writer.make_output_writer(args, max_num_components, match_store, word_stats).write_out(
|
||||||
structures, match_store)
|
structures, match_store)
|
||||||
|
@ -85,6 +95,10 @@ def main(args):
|
||||||
Writer.make_stats_writer(args, max_num_components, match_store, word_stats).write_out(
|
Writer.make_stats_writer(args, max_num_components, match_store, word_stats).write_out(
|
||||||
structures, match_store)
|
structures, match_store)
|
||||||
|
|
||||||
|
# sloleks_db.get_word_form(lemma, gender, number, case)
|
||||||
|
sloleks_db.close()
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
parser = argparse.ArgumentParser(
|
parser = argparse.ArgumentParser(
|
||||||
description='Extract structures from a parsed corpus.')
|
description='Extract structures from a parsed corpus.')
|
||||||
|
@ -92,6 +106,7 @@ if __name__ == '__main__':
|
||||||
help='Structures definitions in xml file')
|
help='Structures definitions in xml file')
|
||||||
parser.add_argument('input',
|
parser.add_argument('input',
|
||||||
help='input file in (gz or xml currently). If none, then just database is loaded', nargs='*')
|
help='input file in (gz or xml currently). If none, then just database is loaded', nargs='*')
|
||||||
|
parser.add_argument('--sloleks_db', type=str, help='Sloleks database credentials')
|
||||||
parser.add_argument('--out',
|
parser.add_argument('--out',
|
||||||
help='Classic output file')
|
help='Classic output file')
|
||||||
parser.add_argument('--out-no-stat',
|
parser.add_argument('--out-no-stat',
|
||||||
|
@ -100,7 +115,7 @@ if __name__ == '__main__':
|
||||||
help='Additional output file, writes more data')
|
help='Additional output file, writes more data')
|
||||||
parser.add_argument('--stats',
|
parser.add_argument('--stats',
|
||||||
help='Output file for statistics')
|
help='Output file for statistics')
|
||||||
|
#
|
||||||
parser.add_argument('--no-msd-translate',
|
parser.add_argument('--no-msd-translate',
|
||||||
help='MSDs are translated from slovene to english by default',
|
help='MSDs are translated from slovene to english by default',
|
||||||
action='store_true')
|
action='store_true')
|
||||||
|
|
16
src/word.py
16
src/word.py
|
@ -14,6 +14,16 @@ class WordMsdOnly:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
class WordDummy:
    """Minimal word-like object holding an MSD, a lemma and a surface text."""

    def __init__(self, msd, lemma, text):
        """Store the three values unchanged as attributes of the same name."""
        self.msd, self.lemma, self.text = msd, lemma, text

    def most_frequent_text(self, word_renderer):
        """Return what ``word_renderer.render`` gives for this lemma and MSD."""
        rendered = word_renderer.render(self.lemma, self.msd)
        return rendered
|
||||||
|
|
||||||
|
|
||||||
class Word:
|
class Word:
|
||||||
def __init__(self, lemma, msd, wid, text, do_msd_translate):
|
def __init__(self, lemma, msd, wid, text, do_msd_translate):
|
||||||
self.lemma = lemma
|
self.lemma = lemma
|
||||||
|
@ -41,10 +51,10 @@ class Word:
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def get_msd(comp):
|
def get_msd(comp):
|
||||||
d = dict(comp.items())
|
d = dict(comp.items())
|
||||||
if 'msd' in d:
|
if 'ana' in d:
|
||||||
return d['msd']
|
|
||||||
elif 'ana' in d:
|
|
||||||
return d['ana'][4:]
|
return d['ana'][4:]
|
||||||
|
elif 'msd' in d:
|
||||||
|
return d['msd']
|
||||||
else:
|
else:
|
||||||
logging.error(d)
|
logging.error(d)
|
||||||
raise NotImplementedError("MSD?")
|
raise NotImplementedError("MSD?")
|
||||||
|
|
Loading…
Reference in New Issue
Block a user