diff --git a/run.sh b/run.sh index 6593b89..7137826 100755 --- a/run.sh +++ b/run.sh @@ -1 +1 @@ -pypy3 src/wani.py data/Kolokacije_strukture_JOS-32-representation_3D_08_1.xml data/input --out data/output --sloleks_db 'superbaza:A)2U&+3Vfd$Fg]Gb:kolokacije:127.0.0.1' --collocation_sentence_map_dest data/collocation_sentence_mapper --db /mnt/tmp/mysql-wani --multiple-output +pypy3 src/wani.py data/Kolokacije_strukture_JOS-32-representation_3D_08_1.xml data/input --out data/output --sloleks_db '' --collocation_sentence_map_dest data/collocation_sentence_mapper --db /mnt/tmp/mysql-wani --multiple-output --load-sloleks \ No newline at end of file diff --git a/src/codes_tagset.py b/src/codes_tagset.py index be2ffcf..75d0cdc 100644 --- a/src/codes_tagset.py +++ b/src/codes_tagset.py @@ -1,16 +1,41 @@ +POSSIBLE_WORD_FORM_FEATURE_VALUES = { + "singular", + "dual", + "plural", + "nominative", + "genitive", + "dative", + "accusative", + "locative", + "instrumental", + "infinitive", + "supine", + "participle", + "present", + "future", + "conditional", + "imperative", + 'masculine', + 'feminine', + 'neuter', +} + CODES_TRANSLATION = { "N": { 2: { + '-': 'masculine', 'm': 'masculine', 'f': 'feminine', 'n': 'neuter', }, 3: { + "-": "singular", "s": "singular", "d": "dual", "p": "plural", }, 4: { + "-": "nominative", "n": "nominative", "g": "genitive", "d": "dative", @@ -21,10 +46,12 @@ CODES_TRANSLATION = { }, "V": { 1: { + "-": "main", "m": "main", "a": "auxiliary", }, 3: { + "-": "infinitive", "n": "infinitive", "u": "supine", "p": "participle", @@ -34,46 +61,55 @@ CODES_TRANSLATION = { "m": "imperative", }, 4: { + "-": "first", "1": "first", "2": "second", "3": "third", }, 5: { + "-": "singular", "s": "singular", "d": "dual", "p": "plural", }, 6: { + '-': 'masculine', 'm': 'masculine', 'f': 'feminine', 'n': 'neuter', }, 8: { + "-": "no", "n": "no", "y": "yes", }, }, "A": { 1: { + "-": "general", "g": "general", "s": "possessive", }, 2: { + "-": "positive", "p": "positive", 
"c": "comparative", "s": "superlative", }, 3: { + '-': 'masculine', 'm': 'masculine', 'f': 'feminine', 'n': 'neuter', }, 4: { + "-": "singular", "s": "singular", "d": "dual", "p": "plural", }, 5: { + "-": "nominative", "n": "nominative", "g": "genitive", "d": "dative", @@ -82,46 +118,6 @@ CODES_TRANSLATION = { "i": "instrumental", }, } - # "R": "Adverb", - # "P": "Pronoun", - # "M": "Numeral", - # "S": "Preposition", - # "C": "Conjunction", - # "Q": "Particle", - # "I": "Interjection", - # "Y": "Abbreviation", - # "X": "Residual", - # - # - # "e": "perfective", - # "p": "progressive", - # "b": "biaspectual", - # - # - # "p": "personal", - # "d": "demonstrative", - # "r": "relative", - # "x": "reflexive", - # "q": "interrogative", - # "i": "indefinite", - # "z": "negative", - # "b": "bound", - # "d": "digit", - # "r": "roman", - # "l": "letter", - # "c": "cardinal", - # "o": "ordinal", - # "p": "pronominal", - # "s": "special", - # "c": "coordinating", - # "s": "subordinating", - # "f": "foreign", - # "t": "typo", - # "p": "program", - # "w": "web", - # "e": "emo", - # "h": "hashtag", - # "a: "at"" } CODES = { diff --git a/src/match_store.py b/src/match_store.py index 6025200..ed5bcb0 100644 --- a/src/match_store.py +++ b/src/match_store.py @@ -1,5 +1,7 @@ +import gc from collections import defaultdict from ast import literal_eval +from time import time from match import StructureMatch from representation_assigner import RepresentationAssigner @@ -110,6 +112,7 @@ class MatchStore: structures_dict = {s.id: s for s in structures} num_representations = int(self.db.execute("SELECT Count(*) FROM Colocations").fetchone()[0]) + start_time = time() for cid, sid in progress(self.db.execute("SELECT colocation_id, structure_id FROM Colocations"), "representations", total=num_representations): structure = structures_dict[sid] match = StructureMatch.from_db(self.db, cid, structure) @@ -119,7 +122,9 @@ class MatchStore: if len(inserts) > num_inserts: self.add_inserts(inserts) 
inserts = [] - + if time() - start_time > 5: + start_time = time() + gc.collect() self.add_inserts(inserts) self.db.step_is_done(step_name) diff --git a/src/representation.py b/src/representation.py index 2a7842a..f205d62 100644 --- a/src/representation.py +++ b/src/representation.py @@ -45,7 +45,7 @@ class LexisCR(ComponentRepresentation): class WordFormAllCR(ComponentRepresentation): def _render(self, sloleks_db=None): if len(self.words) == 0: - return None + return None, None else: forms = [w.text.lower() for w in self.words] msds = [w.msd for w in self.words] @@ -74,15 +74,17 @@ class WordFormAnyCR(ComponentRepresentation): if not all(agreements_matched): if sloleks_db is None: raise Exception('sloleks_db not properly setup!') - for agr in self.agreement: + for i, agr in enumerate(self.agreement): if not agr.match(word_msd): msd, lemma, text = sloleks_db.get_word_form(agr.lemma, agr.msd(), agr.data, align_msd=word_msd) if msd is not None: agr.msds[0] = msd agr.words.append(WordDummy(msd, lemma, text)) - # agr.words[0].msd = msd - # agr.words[0].text = text - agreements_matched = [agr.match(word_msd) for agr in self.agreement] + # when we find element in sloleks automatically add it (no need for second checks, since msd + # is tailored to pass tests by default) + agr.rendition_candidate = text + agr.rendition_msd_candidate = msd + agreements_matched[i] = True else: break @@ -101,6 +103,7 @@ class WordFormAnyCR(ComponentRepresentation): agr.confirm_match() return text_forms[(word_msd, word_lemma)], word_msd + return None, None class WordFormMsdCR(WordFormAnyCR): @@ -153,7 +156,6 @@ class WordFormMsdCR(WordFormAnyCR): common_msd = ["-" if not all(msds[j][idx] == msds[0][idx] for j in range(1, len(self.msds))) else msds[0][idx] for idx in range(len(msds[0]))] common_msd = "".join(common_msd) - iommon_msd = "".join(common_msd) return self.word_renderer.common_lemma_msd(self.lemma, common_msd) diff --git a/src/sloleks_db.py b/src/sloleks_db.py index e20b537..ba3d3b6 
100644 --- a/src/sloleks_db.py +++ b/src/sloleks_db.py @@ -1,3 +1,5 @@ +import gc + from psycopg2cffi import compat compat.register() @@ -5,11 +7,11 @@ from sqlalchemy.ext.declarative import declarative_base from sqlalchemy.orm import Session, aliased from sqlalchemy import create_engine -from codes_tagset import TAGSET, CODES, CODES_TRANSLATION +from codes_tagset import TAGSET, CODES, CODES_TRANSLATION, POSSIBLE_WORD_FORM_FEATURE_VALUES class SloleksDatabase: - def __init__(self, db): + def __init__(self, db, load_sloleks): global Lexeme, LexemeFeature, SyntacticStructure, StructureComponent, Feature, LexicalUnitLexeme, LexicalUnit, LexicalUnitType, Category, Sense, Measure, LexicalUnitMeasure, Corpus, Definition, WordForm, WordFormFeature, FormRepresentation [db_user, db_password, db_database, db_host] = db.split(':') @@ -71,12 +73,65 @@ class SloleksDatabase: self.session = Session(engine) + self.load_sloleks = load_sloleks + if self.load_sloleks: + self.init_load_sloleks() + + def init_load_sloleks(self): + query_word_form_features = self.session.query(WordFormFeature.word_form_id, WordFormFeature.value) + word_form_features = query_word_form_features.all() + query_form_representations = self.session.query(FormRepresentation.word_form_id, FormRepresentation.form) + form_representations = query_form_representations.all() + query_word_forms = self.session.query(WordForm.id, WordForm.lexeme_id) + word_forms = query_word_forms.all() + query_lexemes = self.session.query(Lexeme.id, Lexeme.lemma) + lexemes = query_lexemes.all() + + self.lemmas = {} + for lexeme in lexemes: + if lexeme.lemma not in self.lemmas: + self.lemmas[lexeme.lemma] = [] + self.lemmas[lexeme.lemma].append(lexeme.id) + + self.word_form_features = {} + for word_form_feature in word_form_features: + if word_form_feature.value not in POSSIBLE_WORD_FORM_FEATURE_VALUES: + continue + if word_form_feature.word_form_id not in self.word_form_features: + 
self.word_form_features[word_form_feature.word_form_id] = set() + self.word_form_features[word_form_feature.word_form_id].add(word_form_feature.value) + + self.form_representations = {form_representation.word_form_id: form_representation.form for form_representation + in form_representations} + + self.word_forms = {} + for word_form in word_forms: + if word_form.lexeme_id not in self.word_forms: + self.word_forms[word_form.lexeme_id] = [] + self.word_forms[word_form.lexeme_id].append(word_form.id) + + + self.connected_lemmas = {} + for lemma, lemma_ids in self.lemmas.items(): + for lemma_id in lemma_ids: + if lemma_id in self.word_forms: + for word_form_id in self.word_forms[lemma_id]: + if word_form_id in self.word_form_features and word_form_id in self.form_representations: + if lemma not in self.connected_lemmas: + self.connected_lemmas[lemma] = [] + self.connected_lemmas[lemma].append((self.word_form_features[word_form_id], self.form_representations[word_form_id])) + + del self.lemmas, self.word_form_features, self.form_representations, self.word_forms + gc.collect() + + def close(self): self.session.close() def decypher_msd(self, msd): t = msd[0] decypher = [] + # IF ADDING OR CHANGING ATTRIBUTES HERE ALSO FIX POSSIBLE_WORD_FORM_FEATURE_VALUES if t == 'N': # gender = CODES_TRANSLATION[t][2][msd[2]] number = CODES_TRANSLATION[t][3][msd[3]] @@ -118,7 +173,6 @@ class SloleksDatabase: # fix for verbs with short msds if v + 1 >= len(msd): msd = msd + ['-' for _ in range(v - len(msd) + 2)] - # return None, None, None msd[v + 1] = align_msd[v_align_msd + 1] @@ -127,20 +181,31 @@ class SloleksDatabase: if not decypher_msd: return None, None, None - wfs = [aliased(WordFormFeature) for _ in decypher_msd] - query_preposition = self.session.query(FormRepresentation.form) \ - .join(WordForm, WordForm.id == FormRepresentation.word_form_id) \ - .join(Lexeme, Lexeme.id == WordForm.lexeme_id) + if self.load_sloleks and lemma in self.connected_lemmas: + for (word_form_features, 
form_representations) in self.connected_lemmas[lemma]: + fits = True + for d_m in decypher_msd: + if d_m not in word_form_features: + fits = False + break + if fits: + break + return ''.join(msd), lemma, form_representations + else: + wfs = [aliased(WordFormFeature) for _ in decypher_msd] + query_preposition = self.session.query(FormRepresentation.form) \ + .join(WordForm, WordForm.id == FormRepresentation.word_form_id) \ + .join(Lexeme, Lexeme.id == WordForm.lexeme_id) - for wf in wfs: - query_preposition = query_preposition.join(wf, wf.word_form_id == WordForm.id) + for wf in wfs: + query_preposition = query_preposition.join(wf, wf.word_form_id == WordForm.id) - query_preposition = query_preposition.filter(Lexeme.lemma == lemma) + query_preposition = query_preposition.filter(Lexeme.lemma == lemma) - for wf, msd_el in zip(wfs, decypher_msd): - query_preposition = query_preposition.filter(wf.value == msd_el) + for wf, msd_el in zip(wfs, decypher_msd): + query_preposition = query_preposition.filter(wf.value == msd_el) - pattern_translation_hws = query_preposition.all() - if len(pattern_translation_hws) > 0: - return ''.join(msd), lemma, pattern_translation_hws[0][0] + pattern_translation_hws = query_preposition.limit(1).all() + if len(pattern_translation_hws) > 0: + return ''.join(msd), lemma, pattern_translation_hws[0][0] return None, None, None diff --git a/src/wani.py b/src/wani.py index f10b5ef..b7f5b15 100644 --- a/src/wani.py +++ b/src/wani.py @@ -31,32 +31,17 @@ def match_file(words, structures, postprocessor): for w in words: mhere = s.match(w) for match in mhere: - # colocation_id = [(idx, w.lemma) for idx, w in match.items()] colocation_id = [[idx, w.lemma] for idx, w in match.items()] colocation_id = [s.id] + list(sorted(colocation_id, key=lambda x: x[0])) match, collocation_id = postprocessor.process(match, colocation_id) colocation_id = tuple(colocation_id) matches[s].append((match, colocation_id)) - # for key, val in matches.items(): - # if key.id == 
'15': - # for el in val: - # if el[0]['1'].lemma == 'biti' and el[0]['2'].lemma == 'po' and el[0]['3'].lemma == 'mnenje': - # word_id = '.'.join(words[0].id.split('.')[:-1]) - # print(f"ID: {'.'.join(words[0].id.split('.')[:-1])}") - # print(' '.join([w.text for w in words if '.'.join(w.id.split('.')[:-1]) == word_id])) - - # if s.id == '15': - # if match['1'].lemma == 'biti' and match['2'].lemma == 'po' and match['3'].lemma == 'mnenje': - # word_id = '.'.join(match['1'].id.split('.')[:-1]) - # print(f"ID: {word_id}") - # print(' '.join([w.text for w in words if '.'.join(w.id.split('.')[:-1]) == word_id])) return matches def main(args): - sloleks_db = SloleksDatabase(args.sloleks_db) structures, lemma_msds, max_num_components = build_structures(args) timeinfo = TimeInfo(len(args.input)) @@ -95,7 +80,9 @@ def main(args): # figure out representations! if args.out or args.out_no_stat: + sloleks_db = SloleksDatabase(args.sloleks_db, args.load_sloleks) match_store.set_representations(word_stats, structures, sloleks_db=sloleks_db) + sloleks_db.close() Writer.make_output_writer(args, max_num_components, match_store, word_stats).write_out( structures, match_store) @@ -106,8 +93,6 @@ def main(args): Writer.make_stats_writer(args, max_num_components, match_store, word_stats).write_out( structures, match_store) - # sloleks_db.get_word_form(lemma, gender, number, case) - sloleks_db.close() if __name__ == '__main__': @@ -144,6 +129,10 @@ if __name__ == '__main__': help='Generate one output for each syntactic structure', action='store_true') + parser.add_argument('--load-sloleks', + help='Tells whether sloleks is loaded into memory at the beginning of processing or not.', + action='store_true') + parser.add_argument('--sort-by', help="Sort by a this column (index)", type=int, default=-1) parser.add_argument('--sort-reversed',