Improved representations speed + Fixed bug in representations

This commit is contained in:
Luka 2020-07-22 11:16:28 +02:00
parent 4c84873ff5
commit f330a37764
6 changed files with 137 additions and 80 deletions

2
run.sh
View File

@ -1 +1 @@
pypy3 src/wani.py data/Kolokacije_strukture_JOS-32-representation_3D_08_1.xml data/input --out data/output --sloleks_db 'superbaza:A)2U&+3Vfd$Fg]Gb:kolokacije:127.0.0.1' --collocation_sentence_map_dest data/collocation_sentence_mapper --db /mnt/tmp/mysql-wani --multiple-output pypy3 src/wani.py data/Kolokacije_strukture_JOS-32-representation_3D_08_1.xml data/input --out data/output --sloleks_db '<PUT DB CREDENTIALS HERE!>' --collocation_sentence_map_dest data/collocation_sentence_mapper --db /mnt/tmp/mysql-wani --multiple-output --load-sloleks

View File

@ -1,16 +1,41 @@
# Word-form feature values that matter when looking up word forms.
# SloleksDatabase.init_load_sloleks drops every feature value that is not
# listed here, so this set has to stay in sync with the attributes produced
# by SloleksDatabase.decypher_msd.
POSSIBLE_WORD_FORM_FEATURE_VALUES = {
    # number
    "singular", "dual", "plural",
    # case
    "nominative", "genitive", "dative",
    "accusative", "locative", "instrumental",
    # verb form
    "infinitive", "supine", "participle",
    "present", "future", "conditional", "imperative",
    # gender
    "masculine", "feminine", "neuter",
}
CODES_TRANSLATION = { CODES_TRANSLATION = {
"N": { "N": {
2: { 2: {
'-': 'masculine',
'm': 'masculine', 'm': 'masculine',
'f': 'feminine', 'f': 'feminine',
'n': 'neuter', 'n': 'neuter',
}, },
3: { 3: {
"-": "singular",
"s": "singular", "s": "singular",
"d": "dual", "d": "dual",
"p": "plural", "p": "plural",
}, },
4: { 4: {
"-": "nominative",
"n": "nominative", "n": "nominative",
"g": "genitive", "g": "genitive",
"d": "dative", "d": "dative",
@ -21,10 +46,12 @@ CODES_TRANSLATION = {
}, },
"V": { "V": {
1: { 1: {
"-": "main",
"m": "main", "m": "main",
"a": "auxiliary", "a": "auxiliary",
}, },
3: { 3: {
"-": "infinitive",
"n": "infinitive", "n": "infinitive",
"u": "supine", "u": "supine",
"p": "participle", "p": "participle",
@ -34,46 +61,55 @@ CODES_TRANSLATION = {
"m": "imperative", "m": "imperative",
}, },
4: { 4: {
"-": "first",
"1": "first", "1": "first",
"2": "second", "2": "second",
"3": "third", "3": "third",
}, },
5: { 5: {
"-": "singular",
"s": "singular", "s": "singular",
"d": "dual", "d": "dual",
"p": "plural", "p": "plural",
}, },
6: { 6: {
'-': 'masculine',
'm': 'masculine', 'm': 'masculine',
'f': 'feminine', 'f': 'feminine',
'n': 'neuter', 'n': 'neuter',
}, },
8: { 8: {
"-": "no",
"n": "no", "n": "no",
"y": "yes", "y": "yes",
}, },
}, },
"A": { "A": {
1: { 1: {
"-": "general",
"g": "general", "g": "general",
"s": "possessive", "s": "possessive",
}, },
2: { 2: {
"-": "positive",
"p": "positive", "p": "positive",
"c": "comparative", "c": "comparative",
"s": "superlative", "s": "superlative",
}, },
3: { 3: {
'-': 'masculine',
'm': 'masculine', 'm': 'masculine',
'f': 'feminine', 'f': 'feminine',
'n': 'neuter', 'n': 'neuter',
}, },
4: { 4: {
"-": "singular",
"s": "singular", "s": "singular",
"d": "dual", "d": "dual",
"p": "plural", "p": "plural",
}, },
5: { 5: {
"-": "nominative",
"n": "nominative", "n": "nominative",
"g": "genitive", "g": "genitive",
"d": "dative", "d": "dative",
@ -82,46 +118,6 @@ CODES_TRANSLATION = {
"i": "instrumental", "i": "instrumental",
}, },
} }
# "R": "Adverb",
# "P": "Pronoun",
# "M": "Numeral",
# "S": "Preposition",
# "C": "Conjunction",
# "Q": "Particle",
# "I": "Interjection",
# "Y": "Abbreviation",
# "X": "Residual",
#
#
# "e": "perfective",
# "p": "progressive",
# "b": "biaspectual",
#
#
# "p": "personal",
# "d": "demonstrative",
# "r": "relative",
# "x": "reflexive",
# "q": "interrogative",
# "i": "indefinite",
# "z": "negative",
# "b": "bound",
# "d": "digit",
# "r": "roman",
# "l": "letter",
# "c": "cardinal",
# "o": "ordinal",
# "p": "pronominal",
# "s": "special",
# "c": "coordinating",
# "s": "subordinating",
# "f": "foreign",
# "t": "typo",
# "p": "program",
# "w": "web",
# "e": "emo",
# "h": "hashtag",
# "a: "at""
} }
CODES = { CODES = {

View File

@ -1,5 +1,7 @@
import gc
from collections import defaultdict from collections import defaultdict
from ast import literal_eval from ast import literal_eval
from time import time
from match import StructureMatch from match import StructureMatch
from representation_assigner import RepresentationAssigner from representation_assigner import RepresentationAssigner
@ -110,6 +112,7 @@ class MatchStore:
structures_dict = {s.id: s for s in structures} structures_dict = {s.id: s for s in structures}
num_representations = int(self.db.execute("SELECT Count(*) FROM Colocations").fetchone()[0]) num_representations = int(self.db.execute("SELECT Count(*) FROM Colocations").fetchone()[0])
start_time = time()
for cid, sid in progress(self.db.execute("SELECT colocation_id, structure_id FROM Colocations"), "representations", total=num_representations): for cid, sid in progress(self.db.execute("SELECT colocation_id, structure_id FROM Colocations"), "representations", total=num_representations):
structure = structures_dict[sid] structure = structures_dict[sid]
match = StructureMatch.from_db(self.db, cid, structure) match = StructureMatch.from_db(self.db, cid, structure)
@ -119,7 +122,9 @@ class MatchStore:
if len(inserts) > num_inserts: if len(inserts) > num_inserts:
self.add_inserts(inserts) self.add_inserts(inserts)
inserts = [] inserts = []
if time() - start_time > 5:
start_time = time()
gc.collect()
self.add_inserts(inserts) self.add_inserts(inserts)
self.db.step_is_done(step_name) self.db.step_is_done(step_name)

View File

@ -45,7 +45,7 @@ class LexisCR(ComponentRepresentation):
class WordFormAllCR(ComponentRepresentation): class WordFormAllCR(ComponentRepresentation):
def _render(self, sloleks_db=None): def _render(self, sloleks_db=None):
if len(self.words) == 0: if len(self.words) == 0:
return None return None, None
else: else:
forms = [w.text.lower() for w in self.words] forms = [w.text.lower() for w in self.words]
msds = [w.msd for w in self.words] msds = [w.msd for w in self.words]
@ -74,15 +74,17 @@ class WordFormAnyCR(ComponentRepresentation):
if not all(agreements_matched): if not all(agreements_matched):
if sloleks_db is None: if sloleks_db is None:
raise Exception('sloleks_db not properly setup!') raise Exception('sloleks_db not properly setup!')
for agr in self.agreement: for i, agr in enumerate(self.agreement):
if not agr.match(word_msd): if not agr.match(word_msd):
msd, lemma, text = sloleks_db.get_word_form(agr.lemma, agr.msd(), agr.data, align_msd=word_msd) msd, lemma, text = sloleks_db.get_word_form(agr.lemma, agr.msd(), agr.data, align_msd=word_msd)
if msd is not None: if msd is not None:
agr.msds[0] = msd agr.msds[0] = msd
agr.words.append(WordDummy(msd, lemma, text)) agr.words.append(WordDummy(msd, lemma, text))
# agr.words[0].msd = msd # when we find element in sloleks automatically add it (no need for second checks, since msd
# agr.words[0].text = text # is tailored to pass tests by default)
agreements_matched = [agr.match(word_msd) for agr in self.agreement] agr.rendition_candidate = text
agr.rendition_msd_candidate = msd
agreements_matched[i] = True
else: else:
break break
@ -101,6 +103,7 @@ class WordFormAnyCR(ComponentRepresentation):
agr.confirm_match() agr.confirm_match()
return text_forms[(word_msd, word_lemma)], word_msd return text_forms[(word_msd, word_lemma)], word_msd
return None, None
class WordFormMsdCR(WordFormAnyCR): class WordFormMsdCR(WordFormAnyCR):
@ -153,7 +156,6 @@ class WordFormMsdCR(WordFormAnyCR):
common_msd = ["-" if not all(msds[j][idx] == msds[0][idx] for j in range(1, len(self.msds))) common_msd = ["-" if not all(msds[j][idx] == msds[0][idx] for j in range(1, len(self.msds)))
else msds[0][idx] for idx in range(len(msds[0]))] else msds[0][idx] for idx in range(len(msds[0]))]
common_msd = "".join(common_msd) common_msd = "".join(common_msd)
iommon_msd = "".join(common_msd)
return self.word_renderer.common_lemma_msd(self.lemma, common_msd) return self.word_renderer.common_lemma_msd(self.lemma, common_msd)

View File

@ -1,3 +1,5 @@
import gc
from psycopg2cffi import compat from psycopg2cffi import compat
compat.register() compat.register()
@ -5,11 +7,11 @@ from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import Session, aliased from sqlalchemy.orm import Session, aliased
from sqlalchemy import create_engine from sqlalchemy import create_engine
from codes_tagset import TAGSET, CODES, CODES_TRANSLATION from codes_tagset import TAGSET, CODES, CODES_TRANSLATION, POSSIBLE_WORD_FORM_FEATURE_VALUES
class SloleksDatabase: class SloleksDatabase:
def __init__(self, db): def __init__(self, db, load_sloleks):
global Lexeme, LexemeFeature, SyntacticStructure, StructureComponent, Feature, LexicalUnitLexeme, LexicalUnit, LexicalUnitType, Category, Sense, Measure, LexicalUnitMeasure, Corpus, Definition, WordForm, WordFormFeature, FormRepresentation global Lexeme, LexemeFeature, SyntacticStructure, StructureComponent, Feature, LexicalUnitLexeme, LexicalUnit, LexicalUnitType, Category, Sense, Measure, LexicalUnitMeasure, Corpus, Definition, WordForm, WordFormFeature, FormRepresentation
[db_user, db_password, db_database, db_host] = db.split(':') [db_user, db_password, db_database, db_host] = db.split(':')
@ -71,12 +73,65 @@ class SloleksDatabase:
self.session = Session(engine) self.session = Session(engine)
self.load_sloleks = load_sloleks
if self.load_sloleks:
self.init_load_sloleks()
def init_load_sloleks(self):
    """Preload the word-form data into memory as ``self.connected_lemmas``.

    Four tables are read in full through ``self.session`` (word-form
    features, form representations, word forms and lexemes) and joined in
    Python. The result maps each lemma to a list of
    ``(feature_values, form)`` tuples, where ``feature_values`` is the set
    of that word form's feature values restricted to
    ``POSSIBLE_WORD_FORM_FEATURE_VALUES`` and ``form`` is its textual
    representation. Word forms without any retained feature value or
    without a representation are left out.
    """
    # Pull everything up front; the queries run in the same order as before.
    feature_rows = self.session.query(WordFormFeature.word_form_id, WordFormFeature.value).all()
    representation_rows = self.session.query(FormRepresentation.word_form_id, FormRepresentation.form).all()
    word_form_rows = self.session.query(WordForm.id, WordForm.lexeme_id).all()
    lexeme_rows = self.session.query(Lexeme.id, Lexeme.lemma).all()

    # lemma -> ids of all lexemes sharing that lemma
    lexeme_ids_by_lemma = {}
    for row in lexeme_rows:
        lexeme_ids_by_lemma.setdefault(row.lemma, []).append(row.id)

    # word form id -> set of relevant feature values
    features_by_form = {}
    for row in feature_rows:
        if row.value in POSSIBLE_WORD_FORM_FEATURE_VALUES:
            features_by_form.setdefault(row.word_form_id, set()).add(row.value)

    # word form id -> text (a later row for the same id overwrites an earlier one)
    text_by_form = {row.word_form_id: row.form for row in representation_rows}

    # lexeme id -> ids of its word forms
    form_ids_by_lexeme = {}
    for row in word_form_rows:
        form_ids_by_lexeme.setdefault(row.lexeme_id, []).append(row.id)

    # Join the lookups: lemma -> [(feature values, text), ...]
    connected = self.connected_lemmas = {}
    for lemma, lexeme_ids in lexeme_ids_by_lemma.items():
        for lexeme_id in lexeme_ids:
            for form_id in form_ids_by_lexeme.get(lexeme_id, ()):
                if form_id not in features_by_form or form_id not in text_by_form:
                    continue
                connected.setdefault(lemma, []).append(
                    (features_by_form[form_id], text_by_form[form_id]))

    # The intermediate lookups are no longer needed; drop them before the
    # explicit collection so their memory can be reclaimed right away.
    del lexeme_ids_by_lemma, features_by_form, text_by_form, form_ids_by_lexeme
    gc.collect()
def close(self): def close(self):
self.session.close() self.session.close()
def decypher_msd(self, msd): def decypher_msd(self, msd):
t = msd[0] t = msd[0]
decypher = [] decypher = []
# IF ADDING OR CHANGING ATTRIBUTES HERE ALSO FIX POSSIBLE_WORD_FORM_FEATURE_VALUES
if t == 'N': if t == 'N':
# gender = CODES_TRANSLATION[t][2][msd[2]] # gender = CODES_TRANSLATION[t][2][msd[2]]
number = CODES_TRANSLATION[t][3][msd[3]] number = CODES_TRANSLATION[t][3][msd[3]]
@ -118,7 +173,6 @@ class SloleksDatabase:
# fix for verbs with short msds # fix for verbs with short msds
if v + 1 >= len(msd): if v + 1 >= len(msd):
msd = msd + ['-' for _ in range(v - len(msd) + 2)] msd = msd + ['-' for _ in range(v - len(msd) + 2)]
# return None, None, None
msd[v + 1] = align_msd[v_align_msd + 1] msd[v + 1] = align_msd[v_align_msd + 1]
@ -127,20 +181,31 @@ class SloleksDatabase:
if not decypher_msd: if not decypher_msd:
return None, None, None return None, None, None
wfs = [aliased(WordFormFeature) for _ in decypher_msd] if self.load_sloleks and lemma in self.connected_lemmas:
query_preposition = self.session.query(FormRepresentation.form) \ for (word_form_features, form_representations) in self.connected_lemmas[lemma]:
.join(WordForm, WordForm.id == FormRepresentation.word_form_id) \ fits = True
.join(Lexeme, Lexeme.id == WordForm.lexeme_id) for d_m in decypher_msd:
if d_m not in word_form_features:
fits = False
break
if fits:
break
return ''.join(msd), lemma, form_representations
else:
wfs = [aliased(WordFormFeature) for _ in decypher_msd]
query_preposition = self.session.query(FormRepresentation.form) \
.join(WordForm, WordForm.id == FormRepresentation.word_form_id) \
.join(Lexeme, Lexeme.id == WordForm.lexeme_id)
for wf in wfs: for wf in wfs:
query_preposition = query_preposition.join(wf, wf.word_form_id == WordForm.id) query_preposition = query_preposition.join(wf, wf.word_form_id == WordForm.id)
query_preposition = query_preposition.filter(Lexeme.lemma == lemma) query_preposition = query_preposition.filter(Lexeme.lemma == lemma)
for wf, msd_el in zip(wfs, decypher_msd): for wf, msd_el in zip(wfs, decypher_msd):
query_preposition = query_preposition.filter(wf.value == msd_el) query_preposition = query_preposition.filter(wf.value == msd_el)
pattern_translation_hws = query_preposition.all() pattern_translation_hws = query_preposition.limit(1).all()
if len(pattern_translation_hws) > 0: if len(pattern_translation_hws) > 0:
return ''.join(msd), lemma, pattern_translation_hws[0][0] return ''.join(msd), lemma, pattern_translation_hws[0][0]
return None, None, None return None, None, None

View File

@ -31,32 +31,17 @@ def match_file(words, structures, postprocessor):
for w in words: for w in words:
mhere = s.match(w) mhere = s.match(w)
for match in mhere: for match in mhere:
# colocation_id = [(idx, w.lemma) for idx, w in match.items()]
colocation_id = [[idx, w.lemma] for idx, w in match.items()] colocation_id = [[idx, w.lemma] for idx, w in match.items()]
colocation_id = [s.id] + list(sorted(colocation_id, key=lambda x: x[0])) colocation_id = [s.id] + list(sorted(colocation_id, key=lambda x: x[0]))
match, collocation_id = postprocessor.process(match, colocation_id) match, collocation_id = postprocessor.process(match, colocation_id)
colocation_id = tuple(colocation_id) colocation_id = tuple(colocation_id)
matches[s].append((match, colocation_id)) matches[s].append((match, colocation_id))
# for key, val in matches.items():
# if key.id == '15':
# for el in val:
# if el[0]['1'].lemma == 'biti' and el[0]['2'].lemma == 'po' and el[0]['3'].lemma == 'mnenje':
# word_id = '.'.join(words[0].id.split('.')[:-1])
# print(f"ID: {'.'.join(words[0].id.split('.')[:-1])}")
# print(' '.join([w.text for w in words if '.'.join(w.id.split('.')[:-1]) == word_id]))
# if s.id == '15':
# if match['1'].lemma == 'biti' and match['2'].lemma == 'po' and match['3'].lemma == 'mnenje':
# word_id = '.'.join(match['1'].id.split('.')[:-1])
# print(f"ID: {word_id}")
# print(' '.join([w.text for w in words if '.'.join(w.id.split('.')[:-1]) == word_id]))
return matches return matches
def main(args): def main(args):
sloleks_db = SloleksDatabase(args.sloleks_db)
structures, lemma_msds, max_num_components = build_structures(args) structures, lemma_msds, max_num_components = build_structures(args)
timeinfo = TimeInfo(len(args.input)) timeinfo = TimeInfo(len(args.input))
@ -95,7 +80,9 @@ def main(args):
# figure out representations! # figure out representations!
if args.out or args.out_no_stat: if args.out or args.out_no_stat:
sloleks_db = SloleksDatabase(args.sloleks_db, args.load_sloleks)
match_store.set_representations(word_stats, structures, sloleks_db=sloleks_db) match_store.set_representations(word_stats, structures, sloleks_db=sloleks_db)
sloleks_db.close()
Writer.make_output_writer(args, max_num_components, match_store, word_stats).write_out( Writer.make_output_writer(args, max_num_components, match_store, word_stats).write_out(
structures, match_store) structures, match_store)
@ -106,8 +93,6 @@ def main(args):
Writer.make_stats_writer(args, max_num_components, match_store, word_stats).write_out( Writer.make_stats_writer(args, max_num_components, match_store, word_stats).write_out(
structures, match_store) structures, match_store)
# sloleks_db.get_word_form(lemma, gender, number, case)
sloleks_db.close()
if __name__ == '__main__': if __name__ == '__main__':
@ -144,6 +129,10 @@ if __name__ == '__main__':
help='Generate one output for each syntactic structure', help='Generate one output for each syntactic structure',
action='store_true') action='store_true')
parser.add_argument('--load-sloleks',
help='Tells weather sloleks is loaded into memory at the beginning of processing or not.',
action='store_true')
parser.add_argument('--sort-by', parser.add_argument('--sort-by',
help="Sort by a this column (index)", type=int, default=-1) help="Sort by a this column (index)", type=int, default=-1)
parser.add_argument('--sort-reversed', parser.add_argument('--sort-reversed',