Improved representations speed + Fixed bug in representations

This commit is contained in:
Luka 2020-07-22 11:16:28 +02:00
parent 4c84873ff5
commit f330a37764
6 changed files with 137 additions and 80 deletions

2
run.sh
View File

@ -1 +1 @@
pypy3 src/wani.py data/Kolokacije_strukture_JOS-32-representation_3D_08_1.xml data/input --out data/output --sloleks_db 'superbaza:A)2U&+3Vfd$Fg]Gb:kolokacije:127.0.0.1' --collocation_sentence_map_dest data/collocation_sentence_mapper --db /mnt/tmp/mysql-wani --multiple-output
pypy3 src/wani.py data/Kolokacije_strukture_JOS-32-representation_3D_08_1.xml data/input --out data/output --sloleks_db '<PUT DB CREDENTIALS HERE!>' --collocation_sentence_map_dest data/collocation_sentence_mapper --db /mnt/tmp/mysql-wani --multiple-output --load-sloleks

View File

@ -1,16 +1,41 @@
# Word-form feature values that are kept when Sloleks is preloaded into memory.
# Values are grouped by grammatical category purely for readability; the set
# itself is unordered. Keep it in sync with the attributes produced by
# SloleksDatabase.decypher_msd.
POSSIBLE_WORD_FORM_FEATURE_VALUES = {
    # number
    "singular", "dual", "plural",
    # case
    "nominative", "genitive", "dative",
    "accusative", "locative", "instrumental",
    # verb form
    "infinitive", "supine", "participle",
    "present", "future", "conditional", "imperative",
    # gender
    "masculine", "feminine", "neuter",
}
CODES_TRANSLATION = {
"N": {
2: {
'-': 'masculine',
'm': 'masculine',
'f': 'feminine',
'n': 'neuter',
},
3: {
"-": "singular",
"s": "singular",
"d": "dual",
"p": "plural",
},
4: {
"-": "nominative",
"n": "nominative",
"g": "genitive",
"d": "dative",
@ -21,10 +46,12 @@ CODES_TRANSLATION = {
},
"V": {
1: {
"-": "main",
"m": "main",
"a": "auxiliary",
},
3: {
"-": "infinitive",
"n": "infinitive",
"u": "supine",
"p": "participle",
@ -34,46 +61,55 @@ CODES_TRANSLATION = {
"m": "imperative",
},
4: {
"-": "first",
"1": "first",
"2": "second",
"3": "third",
},
5: {
"-": "singular",
"s": "singular",
"d": "dual",
"p": "plural",
},
6: {
'-': 'masculine',
'm': 'masculine',
'f': 'feminine',
'n': 'neuter',
},
8: {
"-": "no",
"n": "no",
"y": "yes",
},
},
"A": {
1: {
"-": "general",
"g": "general",
"s": "possessive",
},
2: {
"-": "positive",
"p": "positive",
"c": "comparative",
"s": "superlative",
},
3: {
'-': 'masculine',
'm': 'masculine',
'f': 'feminine',
'n': 'neuter',
},
4: {
"-": "singular",
"s": "singular",
"d": "dual",
"p": "plural",
},
5: {
"-": "nominative",
"n": "nominative",
"g": "genitive",
"d": "dative",
@ -82,46 +118,6 @@ CODES_TRANSLATION = {
"i": "instrumental",
},
}
# "R": "Adverb",
# "P": "Pronoun",
# "M": "Numeral",
# "S": "Preposition",
# "C": "Conjunction",
# "Q": "Particle",
# "I": "Interjection",
# "Y": "Abbreviation",
# "X": "Residual",
#
#
# "e": "perfective",
# "p": "progressive",
# "b": "biaspectual",
#
#
# "p": "personal",
# "d": "demonstrative",
# "r": "relative",
# "x": "reflexive",
# "q": "interrogative",
# "i": "indefinite",
# "z": "negative",
# "b": "bound",
# "d": "digit",
# "r": "roman",
# "l": "letter",
# "c": "cardinal",
# "o": "ordinal",
# "p": "pronominal",
# "s": "special",
# "c": "coordinating",
# "s": "subordinating",
# "f": "foreign",
# "t": "typo",
# "p": "program",
# "w": "web",
# "e": "emo",
# "h": "hashtag",
# "a: "at""
}
CODES = {

View File

@ -1,5 +1,7 @@
import gc
from collections import defaultdict
from ast import literal_eval
from time import time
from match import StructureMatch
from representation_assigner import RepresentationAssigner
@ -110,6 +112,7 @@ class MatchStore:
structures_dict = {s.id: s for s in structures}
num_representations = int(self.db.execute("SELECT Count(*) FROM Colocations").fetchone()[0])
start_time = time()
for cid, sid in progress(self.db.execute("SELECT colocation_id, structure_id FROM Colocations"), "representations", total=num_representations):
structure = structures_dict[sid]
match = StructureMatch.from_db(self.db, cid, structure)
@ -119,7 +122,9 @@ class MatchStore:
if len(inserts) > num_inserts:
self.add_inserts(inserts)
inserts = []
if time() - start_time > 5:
start_time = time()
gc.collect()
self.add_inserts(inserts)
self.db.step_is_done(step_name)

View File

@ -45,7 +45,7 @@ class LexisCR(ComponentRepresentation):
class WordFormAllCR(ComponentRepresentation):
def _render(self, sloleks_db=None):
if len(self.words) == 0:
return None
return None, None
else:
forms = [w.text.lower() for w in self.words]
msds = [w.msd for w in self.words]
@ -74,15 +74,17 @@ class WordFormAnyCR(ComponentRepresentation):
if not all(agreements_matched):
if sloleks_db is None:
raise Exception('sloleks_db not properly setup!')
for agr in self.agreement:
for i, agr in enumerate(self.agreement):
if not agr.match(word_msd):
msd, lemma, text = sloleks_db.get_word_form(agr.lemma, agr.msd(), agr.data, align_msd=word_msd)
if msd is not None:
agr.msds[0] = msd
agr.words.append(WordDummy(msd, lemma, text))
# agr.words[0].msd = msd
# agr.words[0].text = text
agreements_matched = [agr.match(word_msd) for agr in self.agreement]
# when we find element in sloleks automatically add it (no need for second checks, since msd
# is tailored to pass tests by default)
agr.rendition_candidate = text
agr.rendition_msd_candidate = msd
agreements_matched[i] = True
else:
break
@ -101,6 +103,7 @@ class WordFormAnyCR(ComponentRepresentation):
agr.confirm_match()
return text_forms[(word_msd, word_lemma)], word_msd
return None, None
class WordFormMsdCR(WordFormAnyCR):
@ -153,7 +156,6 @@ class WordFormMsdCR(WordFormAnyCR):
common_msd = ["-" if not all(msds[j][idx] == msds[0][idx] for j in range(1, len(self.msds)))
else msds[0][idx] for idx in range(len(msds[0]))]
common_msd = "".join(common_msd)
iommon_msd = "".join(common_msd)
return self.word_renderer.common_lemma_msd(self.lemma, common_msd)

View File

@ -1,3 +1,5 @@
import gc
from psycopg2cffi import compat
compat.register()
@ -5,11 +7,11 @@ from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import Session, aliased
from sqlalchemy import create_engine
from codes_tagset import TAGSET, CODES, CODES_TRANSLATION
from codes_tagset import TAGSET, CODES, CODES_TRANSLATION, POSSIBLE_WORD_FORM_FEATURE_VALUES
class SloleksDatabase:
def __init__(self, db):
def __init__(self, db, load_sloleks):
global Lexeme, LexemeFeature, SyntacticStructure, StructureComponent, Feature, LexicalUnitLexeme, LexicalUnit, LexicalUnitType, Category, Sense, Measure, LexicalUnitMeasure, Corpus, Definition, WordForm, WordFormFeature, FormRepresentation
[db_user, db_password, db_database, db_host] = db.split(':')
@ -71,12 +73,65 @@ class SloleksDatabase:
self.session = Session(engine)
self.load_sloleks = load_sloleks
if self.load_sloleks:
self.init_load_sloleks()
def init_load_sloleks(self):
    """Preload Sloleks word forms into an in-memory lookup keyed by lemma.

    Runs four queries through ``self.session`` (word form features, form
    representations, word forms, lexemes) and joins the results in Python
    into ``self.connected_lemmas``: a dict mapping each lemma to a list of
    ``(feature_values, form)`` tuples, where ``feature_values`` is the set
    of feature values of one word form and ``form`` is its representation.

    Feature values outside POSSIBLE_WORD_FORM_FEATURE_VALUES are dropped.
    Word forms lacking either a kept feature or a representation are not
    included. The intermediate lookup tables are deleted at the end and a
    garbage collection pass is requested to release their memory.
    """
    session = self.session
    # Query order mirrors the original: features, representations,
    # word forms, lexemes.
    feature_rows = session.query(WordFormFeature.word_form_id, WordFormFeature.value).all()
    representation_rows = session.query(FormRepresentation.word_form_id, FormRepresentation.form).all()
    word_form_rows = session.query(WordForm.id, WordForm.lexeme_id).all()
    lexeme_rows = session.query(Lexeme.id, Lexeme.lemma).all()

    # lemma -> [lexeme id, ...]
    self.lemmas = {}
    for row in lexeme_rows:
        self.lemmas.setdefault(row.lemma, []).append(row.id)

    # word form id -> {feature value, ...} (only whitelisted values)
    self.word_form_features = {}
    for row in feature_rows:
        if row.value in POSSIBLE_WORD_FORM_FEATURE_VALUES:
            self.word_form_features.setdefault(row.word_form_id, set()).add(row.value)

    # word form id -> form (a later row for the same id overwrites an earlier one)
    self.form_representations = {}
    for row in representation_rows:
        self.form_representations[row.word_form_id] = row.form

    # lexeme id -> [word form id, ...]
    self.word_forms = {}
    for row in word_form_rows:
        self.word_forms.setdefault(row.lexeme_id, []).append(row.id)

    # lemma -> [(feature value set, form), ...]
    self.connected_lemmas = {}
    for lemma, lexeme_ids in self.lemmas.items():
        for lexeme_id in lexeme_ids:
            for word_form_id in self.word_forms.get(lexeme_id, ()):
                if word_form_id not in self.word_form_features:
                    continue
                if word_form_id not in self.form_representations:
                    continue
                self.connected_lemmas.setdefault(lemma, []).append(
                    (self.word_form_features[word_form_id], self.form_representations[word_form_id]))

    # Only connected_lemmas is needed from here on; drop the helper tables.
    del self.lemmas, self.word_form_features, self.form_representations, self.word_forms
    gc.collect()
def close(self):
    """Close the session held in ``self.session``."""
    session = self.session
    session.close()
def decypher_msd(self, msd):
t = msd[0]
decypher = []
# IF ADDING OR CHANGING ATTRIBUTES HERE ALSO FIX POSSIBLE_WORD_FORM_FEATURE_VALUES
if t == 'N':
# gender = CODES_TRANSLATION[t][2][msd[2]]
number = CODES_TRANSLATION[t][3][msd[3]]
@ -118,7 +173,6 @@ class SloleksDatabase:
# fix for verbs with short msds
if v + 1 >= len(msd):
msd = msd + ['-' for _ in range(v - len(msd) + 2)]
# return None, None, None
msd[v + 1] = align_msd[v_align_msd + 1]
@ -127,20 +181,31 @@ class SloleksDatabase:
if not decypher_msd:
return None, None, None
wfs = [aliased(WordFormFeature) for _ in decypher_msd]
query_preposition = self.session.query(FormRepresentation.form) \
.join(WordForm, WordForm.id == FormRepresentation.word_form_id) \
.join(Lexeme, Lexeme.id == WordForm.lexeme_id)
if self.load_sloleks and lemma in self.connected_lemmas:
for (word_form_features, form_representations) in self.connected_lemmas[lemma]:
fits = True
for d_m in decypher_msd:
if d_m not in word_form_features:
fits = False
break
if fits:
break
return ''.join(msd), lemma, form_representations
else:
wfs = [aliased(WordFormFeature) for _ in decypher_msd]
query_preposition = self.session.query(FormRepresentation.form) \
.join(WordForm, WordForm.id == FormRepresentation.word_form_id) \
.join(Lexeme, Lexeme.id == WordForm.lexeme_id)
for wf in wfs:
query_preposition = query_preposition.join(wf, wf.word_form_id == WordForm.id)
for wf in wfs:
query_preposition = query_preposition.join(wf, wf.word_form_id == WordForm.id)
query_preposition = query_preposition.filter(Lexeme.lemma == lemma)
query_preposition = query_preposition.filter(Lexeme.lemma == lemma)
for wf, msd_el in zip(wfs, decypher_msd):
query_preposition = query_preposition.filter(wf.value == msd_el)
for wf, msd_el in zip(wfs, decypher_msd):
query_preposition = query_preposition.filter(wf.value == msd_el)
pattern_translation_hws = query_preposition.all()
if len(pattern_translation_hws) > 0:
return ''.join(msd), lemma, pattern_translation_hws[0][0]
pattern_translation_hws = query_preposition.limit(1).all()
if len(pattern_translation_hws) > 0:
return ''.join(msd), lemma, pattern_translation_hws[0][0]
return None, None, None

View File

@ -31,32 +31,17 @@ def match_file(words, structures, postprocessor):
for w in words:
mhere = s.match(w)
for match in mhere:
# colocation_id = [(idx, w.lemma) for idx, w in match.items()]
colocation_id = [[idx, w.lemma] for idx, w in match.items()]
colocation_id = [s.id] + list(sorted(colocation_id, key=lambda x: x[0]))
match, collocation_id = postprocessor.process(match, colocation_id)
colocation_id = tuple(colocation_id)
matches[s].append((match, colocation_id))
# for key, val in matches.items():
# if key.id == '15':
# for el in val:
# if el[0]['1'].lemma == 'biti' and el[0]['2'].lemma == 'po' and el[0]['3'].lemma == 'mnenje':
# word_id = '.'.join(words[0].id.split('.')[:-1])
# print(f"ID: {'.'.join(words[0].id.split('.')[:-1])}")
# print(' '.join([w.text for w in words if '.'.join(w.id.split('.')[:-1]) == word_id]))
# if s.id == '15':
# if match['1'].lemma == 'biti' and match['2'].lemma == 'po' and match['3'].lemma == 'mnenje':
# word_id = '.'.join(match['1'].id.split('.')[:-1])
# print(f"ID: {word_id}")
# print(' '.join([w.text for w in words if '.'.join(w.id.split('.')[:-1]) == word_id]))
return matches
def main(args):
sloleks_db = SloleksDatabase(args.sloleks_db)
structures, lemma_msds, max_num_components = build_structures(args)
timeinfo = TimeInfo(len(args.input))
@ -95,7 +80,9 @@ def main(args):
# figure out representations!
if args.out or args.out_no_stat:
sloleks_db = SloleksDatabase(args.sloleks_db, args.load_sloleks)
match_store.set_representations(word_stats, structures, sloleks_db=sloleks_db)
sloleks_db.close()
Writer.make_output_writer(args, max_num_components, match_store, word_stats).write_out(
structures, match_store)
@ -106,8 +93,6 @@ def main(args):
Writer.make_stats_writer(args, max_num_components, match_store, word_stats).write_out(
structures, match_store)
# sloleks_db.get_word_form(lemma, gender, number, case)
sloleks_db.close()
if __name__ == '__main__':
@ -144,6 +129,10 @@ if __name__ == '__main__':
help='Generate one output for each syntactic structure',
action='store_true')
parser.add_argument('--load-sloleks',
help='Tells weather sloleks is loaded into memory at the beginning of processing or not.',
action='store_true')
parser.add_argument('--sort-by',
help="Sort by a this column (index)", type=int, default=-1)
parser.add_argument('--sort-reversed',