Added s/z and k/h corrections, fixed bug 90, and connected to Sloleks for lemma_fallback

Luka 2020-07-08 19:23:56 +02:00
parent ec113f9cd2
commit 777791ad1e
12 changed files with 443 additions and 32 deletions

.gitignore (vendored)

@@ -9,3 +9,4 @@ __pycache__
prev
old
data

src/__init__.py (new, empty file)

src/codes_tagset.py

@@ -1,3 +1,129 @@
CODES_TRANSLATION = {
"N": {
2: {
'm': 'masculine',
'f': 'feminine',
'n': 'neuter',
},
3: {
"s": "singular",
"d": "dual",
"p": "plural",
},
4: {
"n": "nominative",
"g": "genitive",
"d": "dative",
"a": "accusative",
"l": "locative",
"i": "instrumental",
},
},
"V": {
1: {
"m": "main",
"a": "auxiliary",
},
3: {
"n": "infinitive",
"u": "supine",
"p": "participle",
"r": "present",
"f": "future",
"c": "conditional",
"m": "imperative",
},
4: {
"1": "first",
"2": "second",
"3": "third",
},
5: {
"s": "singular",
"d": "dual",
"p": "plural",
},
6: {
'm': 'masculine',
'f': 'feminine',
'n': 'neuter',
},
8: {
"n": "no",
"y": "yes",
},
},
"A": {
1: {
"g": "general",
"s": "possessive",
},
2: {
"p": "positive",
"c": "comparative",
"s": "superlative",
},
3: {
'm': 'masculine',
'f': 'feminine',
'n': 'neuter',
},
4: {
"s": "singular",
"d": "dual",
"p": "plural",
},
5: {
"n": "nominative",
"g": "genitive",
"d": "dative",
"a": "accusative",
"l": "locative",
"i": "instrumental",
},
}
# "R": "Adverb",
# "P": "Pronoun",
# "M": "Numeral",
# "S": "Preposition",
# "C": "Conjunction",
# "Q": "Particle",
# "I": "Interjection",
# "Y": "Abbreviation",
# "X": "Residual",
#
#
# "e": "perfective",
# "p": "progressive",
# "b": "biaspectual",
#
#
# "p": "personal",
# "d": "demonstrative",
# "r": "relative",
# "x": "reflexive",
# "q": "interrogative",
# "i": "indefinite",
# "z": "negative",
# "b": "bound",
# "d": "digit",
# "r": "roman",
# "l": "letter",
# "c": "cardinal",
# "o": "ordinal",
# "p": "pronominal",
# "s": "special",
# "c": "coordinating",
# "s": "subordinating",
# "f": "foreign",
# "t": "typo",
# "p": "program",
# "w": "web",
# "e": "emo",
# "h": "hashtag",
# "a: "at""
}
CODES = {
"Noun": "N",
"Verb": "V",

src/loader.py

@@ -124,9 +124,9 @@ def file_sentence_generator(et, skip_id_check, do_msd_translate, pc_tag):
dest = l.get('dep')
else:
ana = l.get('ana')
-            if ana[:4] != 'syn:':  # don't bother...
+            if ana[:8] != 'jos-syn:':  # don't bother...
                 continue
-            ana = ana[4:]
+            ana = ana[8:]
lfrom, dest = l.get('target').replace('#', '').split()
if lfrom in words:

src/match_store.py

@@ -91,7 +91,14 @@ class MatchStore:
(structure.id,)):
yield StructureMatch.from_db(self.db, cid[0], structure)
-    def set_representations(self, word_renderer, structures):
+    def add_inserts(self, inserts):
+        for match in inserts:
+            for component_id, text in match.representations.items():
+                self.db.execute("""
+                    INSERT INTO Representations (colocation_id, component_id, text)
+                    VALUES (?,?,?)""", (match.match_id, component_id, text))
+
+    def set_representations(self, word_renderer, structures, sloleks_db=None):
step_name = 'representation'
if self.db.is_step_done(step_name):
print("Representation step already done, skipping")
@@ -105,17 +112,14 @@ class MatchStore:
for cid, sid in progress(self.db.execute("SELECT colocation_id, structure_id FROM Colocations"), "representations", total=num_representations):
structure = structures_dict[sid]
match = StructureMatch.from_db(self.db, cid, structure)
-            RepresentationAssigner.set_representations(match, word_renderer)
+            RepresentationAssigner.set_representations(match, word_renderer, sloleks_db=sloleks_db)
inserts.append(match)
if len(inserts) > num_inserts:
-                for match in inserts:
-                    for component_id, text in match.representations.items():
-                        self.db.execute("""
-                            INSERT INTO Representations (colocation_id, component_id, text)
-                            VALUES (?,?,?)""", (match.match_id, component_id, text))
+                self.add_inserts(inserts)
                 inserts = []
+        self.add_inserts(inserts)
self.db.step_is_done(step_name)
def has_colocation_id_enough_frequency(self, colocation_id):
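
Note: besides deduplicating code, the extracted add_inserts method also appears to fix a subtle loss: the old loop only flushed full batches, so a trailing partial batch was never written. The flush pattern, reduced to a runnable sketch (the batch size and row source are stand-ins):

    def add_inserts(batch):
        # stand-in for MatchStore.add_inserts, which issues one INSERT per row
        print('flushing', len(batch), 'rows')

    inserts, num_inserts = [], 1000
    for row in range(2500):  # stand-in for the colocation loop
        inserts.append(row)
        if len(inserts) > num_inserts:
            add_inserts(inserts)
            inserts = []
    add_inserts(inserts)  # final flush for the trailing partial batch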

src/postprocessor.py (new file)

@@ -0,0 +1,38 @@
class Postprocessor:
    # fixes single-letter prepositions in extracted collocations: the s/z and
    # k/h variants are chosen from the first letter of the following word
    def __init__(self, fix_one_letter_words=True):
        self.fix_one_letter_words = fix_one_letter_words

    @staticmethod
    def fix_sz(next_word):
        # 's' before voiceless consonants, 'z' everywhere else
        if next_word[0] in ['c', 'č', 'f', 'h', 'k', 'p', 's', 'š', 't']:
            return 's'
        return 'z'

    @staticmethod
    def fix_kh(next_word):
        # 'h' before g and k, 'k' everywhere else
        if next_word[0] in ['g', 'k']:
            return 'h'
        return 'k'

    def process(self, match, collocation_id):
        if len(collocation_id) > 2:
            for idx, (col_id, word) in enumerate(collocation_id[1:-1]):
                if word in ['s', 'z']:
                    correct_letter = self.fix_sz(collocation_id[idx + 2][1])
                    collocation_id[idx + 1][1] = correct_letter
                    match[col_id].text = correct_letter
                elif word in ['k', 'h']:
                    correct_letter = self.fix_kh(collocation_id[idx + 2][1])
                    collocation_id[idx + 1][1] = correct_letter
                    match[col_id].text = correct_letter
        collocation_id = [collocation_id[0]] + [tuple(line) for line in collocation_id[1:]]
        return match, collocation_id
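
Note: a quick sanity check of the rule, as a runnable sketch (the structure id 'S1' and the component objects are stand-ins for what the pipeline passes in):

    class FakeComponent:
        def __init__(self, text):
            self.text = text

    postprocessor = Postprocessor()
    match = {'1': FakeComponent('z'), '2': FakeComponent('kolo')}
    collocation_id = ['S1', ['1', 'z'], ['2', 'kolo']]
    match, collocation_id = postprocessor.process(match, collocation_id)
    print(collocation_id)   # ['S1', ('1', 's'), ('2', 'kolo')] -- 'z' corrected to 's' before 'k'
    print(match['1'].text)  # 's'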

src/representation.py

@@ -4,6 +4,9 @@ from collections import Counter
from codes_tagset import TAGSET, CODES
from word import WordMsdOnly
+from src.word import WordDummy
class ComponentRepresentation:
def __init__(self, data, word_renderer):
self.data = data
@@ -19,23 +22,23 @@ class ComponentRepresentation:
def add_word(self, word):
self.words.append(word)
-    def render(self):
+    def render(self, sloleks_db=None):
         if self.rendition_text is None:
-            self.rendition_text = self._render()
+            self.rendition_text = self._render(sloleks_db=sloleks_db)

-    def _render(self):
+    def _render(self, sloleks_db=None):
raise NotImplementedError("Not implemented for class: {}".format(type(self)))
class LemmaCR(ComponentRepresentation):
-    def _render(self):
+    def _render(self, sloleks_db=None):
return self.words[0].lemma if len(self.words) > 0 else None
class LexisCR(ComponentRepresentation):
-    def _render(self):
+    def _render(self, sloleks_db=None):
return self.data['lexis']
class WordFormAllCR(ComponentRepresentation):
-    def _render(self):
+    def _render(self, sloleks_db=None):
if len(self.words) == 0:
return None
else:
@@ -43,7 +46,7 @@ class WordFormAllCR(ComponentRepresentation):
return "/".join(set(forms))
class WordFormAnyCR(ComponentRepresentation):
-    def _render(self):
+    def _render(self, sloleks_db=None):
text_forms = {}
msd_lemma_txt_triplets = Counter([(w.msd, w.lemma, w.text) for w in self.words])
for (msd, lemma, text), _n in reversed(msd_lemma_txt_triplets.most_common()):
@@ -60,6 +63,23 @@ class WordFormAnyCR(ComponentRepresentation):
# check if agreements match
agreements_matched = [agr.match(word_msd) for agr in self.agreement]
+            # if not all agreements match, try to get the right form from Sloleks
+            if not all(agreements_matched):
+                if sloleks_db is None:
+                    raise Exception('sloleks_db not properly set up!')
+                for agr in self.agreement:
+                    if not agr.match(word_msd):
+                        msd, lemma, text = sloleks_db.get_word_form(agr.lemma, agr.msd(), agr.data, align_msd=word_msd)
+                        if msd is not None:
+                            agr.msds[0] = msd
+                            agr.words.append(WordDummy(msd, lemma, text))
+                            agreements_matched = [agr.match(word_msd) for agr in self.agreement]
+                        else:
+                            break
# if we are at the last "backup word", then confirm matches
# that worked for this one and return
if word_lemma is None:
@@ -109,9 +129,15 @@ class WordFormMsdCR(WordFormAnyCR):
if self.check_msd(word.msd):
super().add_word(word)
-    def _render(self):
+    def _render(self, sloleks_db=None):
         if len(self.words) == 0:
-            self.words.append(WordMsdOnly(self._common_msd()))
-        return super()._render()
+            if sloleks_db is None:
+                raise Exception('sloleks_db not properly set up!')
+            msd, lemma, text = sloleks_db.get_word_form(self.lemma, self.msd(), self.data)
+            if msd is not None:
+                self.words.append(WordDummy(msd, lemma, text))
+            else:
+                self.words.append(WordMsdOnly(self._common_msd()))
+        return super()._render(sloleks_db)
def _common_msd(self):
msds = sorted(self.msds, key=len)
@@ -182,5 +208,5 @@ class WordFormAgreementCR(WordFormMsdCR):
return True
-    def render(self):
+    def render(self, sloleks_db=None):
         pass
pass

src/representation_assigner.py

@@ -39,7 +39,7 @@ class RepresentationAssigner:
return self.representation_factory(self.more, word_renderer)
@staticmethod
-    def set_representations(match, word_renderer):
+    def set_representations(match, word_renderer, sloleks_db=None):
representations = {}
for c in match.structure.components:
representations[c.idx] = []
@@ -70,7 +70,7 @@ class RepresentationAssigner:
for cid, reps in representations.items():
for rep in reps:
-                rep.render()
+                rep.render(sloleks_db=sloleks_db)
for cid, reps in representations.items():
reps = [rep.rendition_text for rep in reps]

src/sloleks_db.py (new file)

@@ -0,0 +1,191 @@
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import Session, aliased
from sqlalchemy import create_engine

from src.codes_tagset import TAGSET, CODES, CODES_TRANSLATION
class SloleksDatabase:
    def __init__(self, db):
        # the reflected ORM classes below are bound to module-level names so
        # that the query methods outside __init__ can refer to them
        global Lexeme, LexemeFeature, SyntacticStructure, StructureComponent, Feature, LexicalUnitLexeme, LexicalUnit, LexicalUnitType, Category, Sense, Measure, LexicalUnitMeasure, Corpus, Definition, WordForm, WordFormFeature, FormRepresentation
[db_user, db_password, db_database, db_host] = db.split(':')
engine = create_engine('postgresql://' + db_user + ':' + db_password + '@' + db_host + '/' + db_database,
pool_recycle=14400)
Base = declarative_base()
Base.metadata.reflect(engine)
class Lexeme(Base):
__table__ = Base.metadata.tables['jedro_lexeme']
class LexemeFeature(Base):
__table__ = Base.metadata.tables['jedro_lexeme_feature']
class SyntacticStructure(Base):
__table__ = Base.metadata.tables['jedro_syntacticstructure']
class StructureComponent(Base):
__table__ = Base.metadata.tables['jedro_structurecomponent']
class Feature(Base):
__table__ = Base.metadata.tables['jedro_feature']
class LexicalUnitLexeme(Base):
__table__ = Base.metadata.tables['jedro_lexicalunit_lexeme']
class LexicalUnit(Base):
__table__ = Base.metadata.tables['jedro_lexicalunit']
class LexicalUnitType(Base):
__table__ = Base.metadata.tables['jedro_lexicalunittype']
class Category(Base):
__table__ = Base.metadata.tables['jedro_category']
class Sense(Base):
__table__ = Base.metadata.tables['jedro_sense']
class Measure(Base):
__table__ = Base.metadata.tables['jedro_measure']
class LexicalUnitMeasure(Base):
__table__ = Base.metadata.tables['jedro_lexicalunitmeasure']
class Corpus(Base):
__table__ = Base.metadata.tables['jedro_corpus']
class Definition(Base):
__table__ = Base.metadata.tables['jedro_definition']
class WordForm(Base):
__table__ = Base.metadata.tables['jedro_wordform']
class WordFormFeature(Base):
__table__ = Base.metadata.tables['jedro_wordform_feature']
class FormRepresentation(Base):
__table__ = Base.metadata.tables['jedro_formrepresentation']
self.session = Session(engine)
def close(self):
self.session.close()
def decypher_msd(self, msd):
t = msd[0]
decypher = []
        if t == 'N':
            # noun gender (msd[2]) is not needed for the lookup
            number = CODES_TRANSLATION[t][3][msd[3]]
            case = CODES_TRANSLATION[t][4][msd[4]]
            decypher = [number, case]
        elif t == 'V':
            # verb gender (msd[6]) is not needed for the lookup
            vform = CODES_TRANSLATION[t][3][msd[3]]
            number = CODES_TRANSLATION[t][5][msd[5]]
            person = 'third'  # person is assumed to be third
            decypher = [vform, number, person]
elif t == 'A':
gender = CODES_TRANSLATION[t][3][msd[3]]
number = CODES_TRANSLATION[t][4][msd[4]]
case = CODES_TRANSLATION[t][5][msd[5]]
decypher = [gender, number, case]
return decypher
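        # examples (sketch; MSD strings illustrative):
        #   decypher_msd('Ncmsn')  -> ['singular', 'nominative']
        #   decypher_msd('Agpmsn') -> ['masculine', 'singular', 'nominative']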
    def get_word_form(self, lemma, msd, data, align_msd=False):
        # modify the msd as required by the structure definition
        msd = list(msd)
        if not align_msd and 'msd' in data:
            for key, value in data['msd'].items():
                t = msd[0]
                v = TAGSET[t].index(key.lower())
                msd[v + 1] = CODES[value]
        elif 'agreement' in data:
            align_msd = list(align_msd)
            t_align_msd = align_msd[0]
            t = msd[0]
            for att in data['agreement']:
                v_align_msd = TAGSET[t_align_msd].index(att.lower())
                v = TAGSET[t].index(att.lower())
                # fix for verbs with short msds: the attribute position may
                # fall outside the msd string
                if v + 1 >= len(msd):
                    return None, None, None
                msd[v + 1] = align_msd[v_align_msd + 1]

        decypher_msd = self.decypher_msd(msd)
        if not decypher_msd:
            return None, None, None

        # one aliased WordFormFeature join per deciphered feature
        wfs = [aliased(WordFormFeature) for _ in decypher_msd]
        query_preposition = self.session.query(FormRepresentation.form) \
            .join(WordForm, WordForm.id == FormRepresentation.word_form_id) \
            .join(Lexeme, Lexeme.id == WordForm.lexeme_id)
        for wf in wfs:
            query_preposition = query_preposition.join(wf, wf.word_form_id == WordForm.id)

        query_preposition = query_preposition.filter(Lexeme.lemma == lemma)
        for wf, msd_el in zip(wfs, decypher_msd):
            query_preposition = query_preposition.filter(wf.value == msd_el)

        pattern_translation_hws = query_preposition.all()
        if len(pattern_translation_hws) > 0:
            return ''.join(msd), lemma, pattern_translation_hws[0][0]
        return None, None, None
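
Note: a sketch of the intended call pattern (the credentials, lemma, and returned form below are illustrative):

    db = SloleksDatabase('user:password:database:host')
    msd, lemma, text = db.get_word_form('miza', 'Ncfsn', {'msd': {'case': 'genitive'}})
    # hypothetical hit: ('Ncfsg', 'miza', 'mize'); on a miss: (None, None, None)
    db.close()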

src/syntactic_structure.py

@@ -14,7 +14,7 @@ class SyntacticStructure:
@staticmethod
def from_xml(xml):
st = SyntacticStructure()
-        st.id = xml.get('id')
+        st.id = xml.get('id_nsss')
st.lbs = xml.get('LBS')
assert len(list(xml)) == 1

wani.py

@@ -11,6 +11,7 @@ import concurrent.futures
import tempfile
from progress_bar import progress
+from sloleks_db import SloleksDatabase
from word import Word
from syntactic_structure import build_structures
from match_store import MatchStore
@@ -20,16 +21,20 @@ from loader import load_files
from database import Database
from time_info import TimeInfo
+from src.postprocessor import Postprocessor
-def match_file(words, structures):
+def match_file(words, structures, postprocessor):
     matches = {s: [] for s in structures}
     for s in progress(structures, "matching"):
         for w in words:
             mhere = s.match(w)
             for match in mhere:
-                colocation_id = [(idx, w.lemma) for idx, w in match.items()]
+                colocation_id = [[idx, w.lemma] for idx, w in match.items()]
                 colocation_id = [s.id] + list(sorted(colocation_id, key=lambda x: x[0]))
+                match, colocation_id = postprocessor.process(match, colocation_id)
                 colocation_id = tuple(colocation_id)
                 matches[s].append((match, colocation_id))
@@ -38,6 +43,7 @@ def match_file(words, structures):
def main(args):
+    sloleks_db = SloleksDatabase(args.sloleks_db)
structures, lemma_msds, max_num_components = build_structures(args)
timeinfo = TimeInfo(len(args.input))
@@ -51,7 +57,11 @@ def main(args):
continue
start_time = time.time()
-        matches = match_file(words, structures)
+        postprocessor = Postprocessor()
+        matches = match_file(words, structures, postprocessor)
         match_store.add_matches(matches)
word_stats.add_words(words)
database.commit()
@@ -74,7 +84,7 @@ def main(args):
# figure out representations!
if args.out or args.out_no_stat:
-        match_store.set_representations(word_stats, structures)
+        match_store.set_representations(word_stats, structures, sloleks_db=sloleks_db)
Writer.make_output_writer(args, max_num_components, match_store, word_stats).write_out(
structures, match_store)
@@ -85,6 +95,10 @@ def main(args):
Writer.make_stats_writer(args, max_num_components, match_store, word_stats).write_out(
structures, match_store)
+    sloleks_db.close()
if __name__ == '__main__':
parser = argparse.ArgumentParser(
description='Extract structures from a parsed corpus.')
@@ -92,6 +106,7 @@ if __name__ == '__main__':
help='Structures definitions in xml file')
parser.add_argument('input',
help='input file in (gz or xml currently). If none, then just database is loaded', nargs='*')
+    parser.add_argument('--sloleks_db', type=str, help='Sloleks database credentials')
parser.add_argument('--out',
help='Classic output file')
parser.add_argument('--out-no-stat',
@@ -100,7 +115,7 @@ if __name__ == '__main__':
help='Additional output file, writes more data')
parser.add_argument('--stats',
help='Output file for statistics')
#
parser.add_argument('--no-msd-translate',
help='MSDs are translated from slovene to english by default',
action='store_true')

src/word.py

@@ -14,6 +14,16 @@ class WordMsdOnly:
return None
+class WordDummy:
+    def __init__(self, msd, lemma, text):
+        self.msd = msd
+        self.lemma = lemma
+        self.text = text
+
+    def most_frequent_text(self, word_renderer):
+        return word_renderer.render(self.lemma, self.msd)
class Word:
def __init__(self, lemma, msd, wid, text, do_msd_translate):
self.lemma = lemma
@@ -41,10 +51,10 @@ class Word:
@staticmethod
def get_msd(comp):
d = dict(comp.items())
-        if 'msd' in d:
-            return d['msd']
-        elif 'ana' in d:
-            return d['ana'][4:]
+        if 'ana' in d:
+            return d['ana'][4:]
+        elif 'msd' in d:
+            return d['msd']
else:
logging.error(d)
raise NotImplementedError("MSD?")
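
Note: the reordering in get_msd makes the 'ana' attribute win when an element carries both. A sketch of the effect (attribute values illustrative):

    d = {'ana': 'mte:Ncmsn', 'msd': 'stale-value'}
    # before this commit: returned d['msd']; now: returns d['ana'][4:] -> 'Ncmsn'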