Improved representations speed + Fixed bug in representations
This commit is contained in:
parent
4c84873ff5
commit
f330a37764
2
run.sh
2
run.sh
|
@ -1 +1 @@
|
|||
pypy3 src/wani.py data/Kolokacije_strukture_JOS-32-representation_3D_08_1.xml data/input --out data/output --sloleks_db 'superbaza:A)2U&+3Vfd$Fg]Gb:kolokacije:127.0.0.1' --collocation_sentence_map_dest data/collocation_sentence_mapper --db /mnt/tmp/mysql-wani --multiple-output
|
||||
pypy3 src/wani.py data/Kolokacije_strukture_JOS-32-representation_3D_08_1.xml data/input --out data/output --sloleks_db '<PUT DB CREDENTIALS HERE!>' --collocation_sentence_map_dest data/collocation_sentence_mapper --db /mnt/tmp/mysql-wani --multiple-output --load-sloleks
|
|
@ -1,16 +1,41 @@
|
|||
# Word-form feature values that are kept when Sloleks is preloaded into memory;
# any WordFormFeature value outside this set is skipped by the loader.
# Keep this in sync with the attribute values produced by
# SloleksDatabase.decypher_msd (see the reminder comment there).
POSSIBLE_WORD_FORM_FEATURE_VALUES = {
    # number
    "singular",
    "dual",
    "plural",
    # case
    "nominative",
    "genitive",
    "dative",
    "accusative",
    "locative",
    "instrumental",
    # verb form
    "infinitive",
    "supine",
    "participle",
    "present",
    "future",
    "conditional",
    "imperative",
    # gender
    "masculine",
    "feminine",
    "neuter",
}
|
||||
|
||||
CODES_TRANSLATION = {
|
||||
"N": {
|
||||
2: {
|
||||
'-': 'masculine',
|
||||
'm': 'masculine',
|
||||
'f': 'feminine',
|
||||
'n': 'neuter',
|
||||
},
|
||||
3: {
|
||||
"-": "singular",
|
||||
"s": "singular",
|
||||
"d": "dual",
|
||||
"p": "plural",
|
||||
},
|
||||
4: {
|
||||
"-": "nominative",
|
||||
"n": "nominative",
|
||||
"g": "genitive",
|
||||
"d": "dative",
|
||||
|
@ -21,10 +46,12 @@ CODES_TRANSLATION = {
|
|||
},
|
||||
"V": {
|
||||
1: {
|
||||
"-": "main",
|
||||
"m": "main",
|
||||
"a": "auxiliary",
|
||||
},
|
||||
3: {
|
||||
"-": "infinitive",
|
||||
"n": "infinitive",
|
||||
"u": "supine",
|
||||
"p": "participle",
|
||||
|
@ -34,46 +61,55 @@ CODES_TRANSLATION = {
|
|||
"m": "imperative",
|
||||
},
|
||||
4: {
|
||||
"-": "first",
|
||||
"1": "first",
|
||||
"2": "second",
|
||||
"3": "third",
|
||||
},
|
||||
5: {
|
||||
"-": "singular",
|
||||
"s": "singular",
|
||||
"d": "dual",
|
||||
"p": "plural",
|
||||
},
|
||||
6: {
|
||||
'-': 'masculine',
|
||||
'm': 'masculine',
|
||||
'f': 'feminine',
|
||||
'n': 'neuter',
|
||||
},
|
||||
8: {
|
||||
"-": "no",
|
||||
"n": "no",
|
||||
"y": "yes",
|
||||
},
|
||||
},
|
||||
"A": {
|
||||
1: {
|
||||
"-": "general",
|
||||
"g": "general",
|
||||
"s": "possessive",
|
||||
},
|
||||
2: {
|
||||
"-": "positive",
|
||||
"p": "positive",
|
||||
"c": "comparative",
|
||||
"s": "superlative",
|
||||
},
|
||||
3: {
|
||||
'-': 'masculine',
|
||||
'm': 'masculine',
|
||||
'f': 'feminine',
|
||||
'n': 'neuter',
|
||||
},
|
||||
4: {
|
||||
"-": "singular",
|
||||
"s": "singular",
|
||||
"d": "dual",
|
||||
"p": "plural",
|
||||
},
|
||||
5: {
|
||||
"-": "nominative",
|
||||
"n": "nominative",
|
||||
"g": "genitive",
|
||||
"d": "dative",
|
||||
|
@ -82,46 +118,6 @@ CODES_TRANSLATION = {
|
|||
"i": "instrumental",
|
||||
},
|
||||
}
|
||||
# "R": "Adverb",
|
||||
# "P": "Pronoun",
|
||||
# "M": "Numeral",
|
||||
# "S": "Preposition",
|
||||
# "C": "Conjunction",
|
||||
# "Q": "Particle",
|
||||
# "I": "Interjection",
|
||||
# "Y": "Abbreviation",
|
||||
# "X": "Residual",
|
||||
#
|
||||
#
|
||||
# "e": "perfective",
|
||||
# "p": "progressive",
|
||||
# "b": "biaspectual",
|
||||
#
|
||||
#
|
||||
# "p": "personal",
|
||||
# "d": "demonstrative",
|
||||
# "r": "relative",
|
||||
# "x": "reflexive",
|
||||
# "q": "interrogative",
|
||||
# "i": "indefinite",
|
||||
# "z": "negative",
|
||||
# "b": "bound",
|
||||
# "d": "digit",
|
||||
# "r": "roman",
|
||||
# "l": "letter",
|
||||
# "c": "cardinal",
|
||||
# "o": "ordinal",
|
||||
# "p": "pronominal",
|
||||
# "s": "special",
|
||||
# "c": "coordinating",
|
||||
# "s": "subordinating",
|
||||
# "f": "foreign",
|
||||
# "t": "typo",
|
||||
# "p": "program",
|
||||
# "w": "web",
|
||||
# "e": "emo",
|
||||
# "h": "hashtag",
|
||||
# "a: "at""
|
||||
}
|
||||
|
||||
CODES = {
|
||||
|
|
|
@ -1,5 +1,7 @@
|
|||
import gc
|
||||
from collections import defaultdict
|
||||
from ast import literal_eval
|
||||
from time import time
|
||||
|
||||
from match import StructureMatch
|
||||
from representation_assigner import RepresentationAssigner
|
||||
|
@ -110,6 +112,7 @@ class MatchStore:
|
|||
|
||||
structures_dict = {s.id: s for s in structures}
|
||||
num_representations = int(self.db.execute("SELECT Count(*) FROM Colocations").fetchone()[0])
|
||||
start_time = time()
|
||||
for cid, sid in progress(self.db.execute("SELECT colocation_id, structure_id FROM Colocations"), "representations", total=num_representations):
|
||||
structure = structures_dict[sid]
|
||||
match = StructureMatch.from_db(self.db, cid, structure)
|
||||
|
@ -119,7 +122,9 @@ class MatchStore:
|
|||
if len(inserts) > num_inserts:
|
||||
self.add_inserts(inserts)
|
||||
inserts = []
|
||||
|
||||
if time() - start_time > 5:
|
||||
start_time = time()
|
||||
gc.collect()
|
||||
self.add_inserts(inserts)
|
||||
self.db.step_is_done(step_name)
|
||||
|
||||
|
|
|
@ -45,7 +45,7 @@ class LexisCR(ComponentRepresentation):
|
|||
class WordFormAllCR(ComponentRepresentation):
|
||||
def _render(self, sloleks_db=None):
|
||||
if len(self.words) == 0:
|
||||
return None
|
||||
return None, None
|
||||
else:
|
||||
forms = [w.text.lower() for w in self.words]
|
||||
msds = [w.msd for w in self.words]
|
||||
|
@ -74,15 +74,17 @@ class WordFormAnyCR(ComponentRepresentation):
|
|||
if not all(agreements_matched):
|
||||
if sloleks_db is None:
|
||||
raise Exception('sloleks_db not properly setup!')
|
||||
for agr in self.agreement:
|
||||
for i, agr in enumerate(self.agreement):
|
||||
if not agr.match(word_msd):
|
||||
msd, lemma, text = sloleks_db.get_word_form(agr.lemma, agr.msd(), agr.data, align_msd=word_msd)
|
||||
if msd is not None:
|
||||
agr.msds[0] = msd
|
||||
agr.words.append(WordDummy(msd, lemma, text))
|
||||
# agr.words[0].msd = msd
|
||||
# agr.words[0].text = text
|
||||
agreements_matched = [agr.match(word_msd) for agr in self.agreement]
|
||||
# when we find an element in sloleks, automatically add it (no need for a second check, since the msd
|
||||
# is tailored to pass tests by default)
|
||||
agr.rendition_candidate = text
|
||||
agr.rendition_msd_candidate = msd
|
||||
agreements_matched[i] = True
|
||||
else:
|
||||
break
|
||||
|
||||
|
@ -101,6 +103,7 @@ class WordFormAnyCR(ComponentRepresentation):
|
|||
agr.confirm_match()
|
||||
|
||||
return text_forms[(word_msd, word_lemma)], word_msd
|
||||
return None, None
|
||||
|
||||
|
||||
class WordFormMsdCR(WordFormAnyCR):
|
||||
|
@ -153,7 +156,6 @@ class WordFormMsdCR(WordFormAnyCR):
|
|||
common_msd = ["-" if not all(msds[j][idx] == msds[0][idx] for j in range(1, len(self.msds)))
|
||||
else msds[0][idx] for idx in range(len(msds[0]))]
|
||||
common_msd = "".join(common_msd)
|
||||
iommon_msd = "".join(common_msd)
|
||||
return self.word_renderer.common_lemma_msd(self.lemma, common_msd)
|
||||
|
||||
|
||||
|
|
|
@ -1,3 +1,5 @@
|
|||
import gc
|
||||
|
||||
from psycopg2cffi import compat
|
||||
compat.register()
|
||||
|
||||
|
@ -5,11 +7,11 @@ from sqlalchemy.ext.declarative import declarative_base
|
|||
from sqlalchemy.orm import Session, aliased
|
||||
from sqlalchemy import create_engine
|
||||
|
||||
from codes_tagset import TAGSET, CODES, CODES_TRANSLATION
|
||||
from codes_tagset import TAGSET, CODES, CODES_TRANSLATION, POSSIBLE_WORD_FORM_FEATURE_VALUES
|
||||
|
||||
|
||||
class SloleksDatabase:
|
||||
def __init__(self, db):
|
||||
def __init__(self, db, load_sloleks):
|
||||
global Lexeme, LexemeFeature, SyntacticStructure, StructureComponent, Feature, LexicalUnitLexeme, LexicalUnit, LexicalUnitType, Category, Sense, Measure, LexicalUnitMeasure, Corpus, Definition, WordForm, WordFormFeature, FormRepresentation
|
||||
[db_user, db_password, db_database, db_host] = db.split(':')
|
||||
|
||||
|
@ -71,12 +73,65 @@ class SloleksDatabase:
|
|||
|
||||
self.session = Session(engine)
|
||||
|
||||
self.load_sloleks = load_sloleks
|
||||
if self.load_sloleks:
|
||||
self.init_load_sloleks()
|
||||
|
||||
def init_load_sloleks(self):
    """Preload Sloleks word forms into ``self.connected_lemmas``.

    Runs four queries through ``self.session`` (lexemes, word forms, word-form
    features and form representations) and joins their rows in memory.

    After the call ``self.connected_lemmas`` maps each lemma to a list of
    ``(feature_values, form)`` pairs, one per word form, where
    ``feature_values`` is the set of that word form's feature values that
    appear in ``POSSIBLE_WORD_FORM_FEATURE_VALUES`` and ``form`` is its form
    representation. Word forms with no retained feature value or without a
    form representation are left out, and lemmas with no usable word form
    get no entry at all.
    """
    session = self.session

    # lemma -> ids of every lexeme carrying that lemma
    lexeme_ids_by_lemma = {}
    for lexeme in session.query(Lexeme.id, Lexeme.lemma).all():
        lexeme_ids_by_lemma.setdefault(lexeme.lemma, []).append(lexeme.id)

    # lexeme id -> ids of its word forms
    word_form_ids_by_lexeme = {}
    for word_form in session.query(WordForm.id, WordForm.lexeme_id).all():
        word_form_ids_by_lexeme.setdefault(word_form.lexeme_id, []).append(word_form.id)

    # word form id -> set of retained feature values
    features_by_word_form = {}
    for feature in session.query(WordFormFeature.word_form_id, WordFormFeature.value).all():
        if feature.value not in POSSIBLE_WORD_FORM_FEATURE_VALUES:
            continue
        features_by_word_form.setdefault(feature.word_form_id, set()).add(feature.value)

    # word form id -> form; if a word form has several rows, the last one wins
    form_by_word_form = {
        representation.word_form_id: representation.form
        for representation in session.query(FormRepresentation.word_form_id,
                                            FormRepresentation.form).all()
    }

    connected_lemmas = {}
    for lemma, lexeme_ids in lexeme_ids_by_lemma.items():
        for lexeme_id in lexeme_ids:
            for word_form_id in word_form_ids_by_lexeme.get(lexeme_id, ()):
                if word_form_id not in features_by_word_form or word_form_id not in form_by_word_form:
                    continue
                connected_lemmas.setdefault(lemma, []).append(
                    (features_by_word_form[word_form_id], form_by_word_form[word_form_id]))

    self.connected_lemmas = connected_lemmas

    # The intermediate indexes are only needed to build connected_lemmas;
    # drop them explicitly so the collection below can actually reclaim them.
    del lexeme_ids_by_lemma, word_form_ids_by_lexeme, features_by_word_form, form_by_word_form
    gc.collect()
|
||||
|
||||
|
||||
def close(self):
    """Close the SQLAlchemy session held in ``self.session``."""
    self.session.close()
|
||||
|
||||
def decypher_msd(self, msd):
|
||||
t = msd[0]
|
||||
decypher = []
|
||||
# IF ADDING OR CHANGING ATTRIBUTES HERE ALSO FIX POSSIBLE_WORD_FORM_FEATURE_VALUES
|
||||
if t == 'N':
|
||||
# gender = CODES_TRANSLATION[t][2][msd[2]]
|
||||
number = CODES_TRANSLATION[t][3][msd[3]]
|
||||
|
@ -118,7 +173,6 @@ class SloleksDatabase:
|
|||
# fix for verbs with short msds
|
||||
if v + 1 >= len(msd):
|
||||
msd = msd + ['-' for _ in range(v - len(msd) + 2)]
|
||||
# return None, None, None
|
||||
|
||||
msd[v + 1] = align_msd[v_align_msd + 1]
|
||||
|
||||
|
@ -127,6 +181,17 @@ class SloleksDatabase:
|
|||
if not decypher_msd:
|
||||
return None, None, None
|
||||
|
||||
if self.load_sloleks and lemma in self.connected_lemmas:
|
||||
for (word_form_features, form_representations) in self.connected_lemmas[lemma]:
|
||||
fits = True
|
||||
for d_m in decypher_msd:
|
||||
if d_m not in word_form_features:
|
||||
fits = False
|
||||
break
|
||||
if fits:
|
||||
break
|
||||
return ''.join(msd), lemma, form_representations
|
||||
else:
|
||||
wfs = [aliased(WordFormFeature) for _ in decypher_msd]
|
||||
query_preposition = self.session.query(FormRepresentation.form) \
|
||||
.join(WordForm, WordForm.id == FormRepresentation.word_form_id) \
|
||||
|
@ -140,7 +205,7 @@ class SloleksDatabase:
|
|||
for wf, msd_el in zip(wfs, decypher_msd):
|
||||
query_preposition = query_preposition.filter(wf.value == msd_el)
|
||||
|
||||
pattern_translation_hws = query_preposition.all()
|
||||
pattern_translation_hws = query_preposition.limit(1).all()
|
||||
if len(pattern_translation_hws) > 0:
|
||||
return ''.join(msd), lemma, pattern_translation_hws[0][0]
|
||||
return None, None, None
|
||||
|
|
23
src/wani.py
23
src/wani.py
|
@ -31,32 +31,17 @@ def match_file(words, structures, postprocessor):
|
|||
for w in words:
|
||||
mhere = s.match(w)
|
||||
for match in mhere:
|
||||
# colocation_id = [(idx, w.lemma) for idx, w in match.items()]
|
||||
colocation_id = [[idx, w.lemma] for idx, w in match.items()]
|
||||
colocation_id = [s.id] + list(sorted(colocation_id, key=lambda x: x[0]))
|
||||
match, collocation_id = postprocessor.process(match, colocation_id)
|
||||
colocation_id = tuple(colocation_id)
|
||||
|
||||
matches[s].append((match, colocation_id))
|
||||
# for key, val in matches.items():
|
||||
# if key.id == '15':
|
||||
# for el in val:
|
||||
# if el[0]['1'].lemma == 'biti' and el[0]['2'].lemma == 'po' and el[0]['3'].lemma == 'mnenje':
|
||||
# word_id = '.'.join(words[0].id.split('.')[:-1])
|
||||
# print(f"ID: {'.'.join(words[0].id.split('.')[:-1])}")
|
||||
# print(' '.join([w.text for w in words if '.'.join(w.id.split('.')[:-1]) == word_id]))
|
||||
|
||||
# if s.id == '15':
|
||||
# if match['1'].lemma == 'biti' and match['2'].lemma == 'po' and match['3'].lemma == 'mnenje':
|
||||
# word_id = '.'.join(match['1'].id.split('.')[:-1])
|
||||
# print(f"ID: {word_id}")
|
||||
# print(' '.join([w.text for w in words if '.'.join(w.id.split('.')[:-1]) == word_id]))
|
||||
|
||||
return matches
|
||||
|
||||
|
||||
def main(args):
|
||||
sloleks_db = SloleksDatabase(args.sloleks_db)
|
||||
structures, lemma_msds, max_num_components = build_structures(args)
|
||||
timeinfo = TimeInfo(len(args.input))
|
||||
|
||||
|
@ -95,7 +80,9 @@ def main(args):
|
|||
|
||||
# figure out representations!
|
||||
if args.out or args.out_no_stat:
|
||||
sloleks_db = SloleksDatabase(args.sloleks_db, args.load_sloleks)
|
||||
match_store.set_representations(word_stats, structures, sloleks_db=sloleks_db)
|
||||
sloleks_db.close()
|
||||
|
||||
Writer.make_output_writer(args, max_num_components, match_store, word_stats).write_out(
|
||||
structures, match_store)
|
||||
|
@ -106,8 +93,6 @@ def main(args):
|
|||
Writer.make_stats_writer(args, max_num_components, match_store, word_stats).write_out(
|
||||
structures, match_store)
|
||||
|
||||
# sloleks_db.get_word_form(lemma, gender, number, case)
|
||||
sloleks_db.close()
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
@ -144,6 +129,10 @@ if __name__ == '__main__':
|
|||
help='Generate one output for each syntactic structure',
|
||||
action='store_true')
|
||||
|
||||
# When set, SloleksDatabase is constructed with load_sloleks=True.
parser.add_argument('--load-sloleks',
                    help='Tells whether sloleks is loaded into memory at the beginning of processing or not.',
                    action='store_true')

parser.add_argument('--sort-by',
                    help="Sort by this column (index)", type=int, default=-1)
|
||||
parser.add_argument('--sort-reversed',
|
||||
|
|
Loading…
Reference in New Issue
Block a user