diff --git a/.gitignore b/.gitignore index 20d04bf..3a0f11c 100644 --- a/.gitignore +++ b/.gitignore @@ -9,3 +9,4 @@ __pycache__ prev old +data diff --git a/src/__init__.py b/src/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/codes_tagset.py b/src/codes_tagset.py index c407fe6..be2ffcf 100644 --- a/src/codes_tagset.py +++ b/src/codes_tagset.py @@ -1,3 +1,129 @@ +CODES_TRANSLATION = { + "N": { + 2: { + 'm': 'masculine', + 'f': 'feminine', + 'n': 'neuter', + }, + 3: { + "s": "singular", + "d": "dual", + "p": "plural", + }, + 4: { + "n": "nominative", + "g": "genitive", + "d": "dative", + "a": "accusative", + "l": "locative", + "i": "instrumental", + }, + }, + "V": { + 1: { + "m": "main", + "a": "auxiliary", + }, + 3: { + "n": "infinitive", + "u": "supine", + "p": "participle", + "r": "present", + "f": "future", + "c": "conditional", + "m": "imperative", + }, + 4: { + "1": "first", + "2": "second", + "3": "third", + }, + 5: { + "s": "singular", + "d": "dual", + "p": "plural", + }, + 6: { + 'm': 'masculine', + 'f': 'feminine', + 'n': 'neuter', + }, + 8: { + "n": "no", + "y": "yes", + }, + }, + "A": { + 1: { + "g": "general", + "s": "possessive", + }, + 2: { + "p": "positive", + "c": "comparative", + "s": "superlative", + }, + 3: { + 'm': 'masculine', + 'f': 'feminine', + 'n': 'neuter', + }, + 4: { + "s": "singular", + "d": "dual", + "p": "plural", + }, + 5: { + "n": "nominative", + "g": "genitive", + "d": "dative", + "a": "accusative", + "l": "locative", + "i": "instrumental", + }, + } + # "R": "Adverb", + # "P": "Pronoun", + # "M": "Numeral", + # "S": "Preposition", + # "C": "Conjunction", + # "Q": "Particle", + # "I": "Interjection", + # "Y": "Abbreviation", + # "X": "Residual", + # + # + # "e": "perfective", + # "p": "progressive", + # "b": "biaspectual", + # + # + # "p": "personal", + # "d": "demonstrative", + # "r": "relative", + # "x": "reflexive", + # "q": "interrogative", + # "i": "indefinite", + # "z": "negative", + # "b": "bound", + 
# "d": "digit", + # "r": "roman", + # "l": "letter", + # "c": "cardinal", + # "o": "ordinal", + # "p": "pronominal", + # "s": "special", + # "c": "coordinating", + # "s": "subordinating", + # "f": "foreign", + # "t": "typo", + # "p": "program", + # "w": "web", + # "e": "emo", + # "h": "hashtag", + # "a": "at" +} + CODES = { "Noun": "N", "Verb": "V", diff --git a/src/loader.py b/src/loader.py index 1876352..2ff41c5 100644 --- a/src/loader.py +++ b/src/loader.py @@ -124,9 +124,9 @@ def file_sentence_generator(et, skip_id_check, do_msd_translate, pc_tag): dest = l.get('dep') else: ana = l.get('ana') - if ana[:4] != 'syn:': # dont bother... + if ana[:8] != 'jos-syn:': # don't bother... continue - ana = ana[4:] + ana = ana[8:] lfrom, dest = l.get('target').replace('#', '').split() if lfrom in words: diff --git a/src/match_store.py b/src/match_store.py index 378278f..ff200ce 100644 --- a/src/match_store.py +++ b/src/match_store.py @@ -91,7 +91,14 @@ class MatchStore: (structure.id,)): yield StructureMatch.from_db(self.db, cid[0], structure) - def set_representations(self, word_renderer, structures): + def add_inserts(self, inserts): + for match in inserts: + for component_id, text in match.representations.items(): + self.db.execute(""" + INSERT INTO Representations (colocation_id, component_id, text) + VALUES (?,?,?)""", (match.match_id, component_id, text)) + + def set_representations(self, word_renderer, structures, sloleks_db=None): step_name = 'representation' if self.db.is_step_done(step_name): print("Representation step already done, skipping") @@ -105,17 +112,14 @@ class MatchStore: for cid, sid in progress(self.db.execute("SELECT colocation_id, structure_id FROM Colocations"), "representations", total=num_representations): structure = structures_dict[sid] match = StructureMatch.from_db(self.db, cid, structure) - RepresentationAssigner.set_representations(match, word_renderer) + RepresentationAssigner.set_representations(match, word_renderer, sloleks_db=sloleks_db) 
inserts.append(match) if len(inserts) > num_inserts: - for match in inserts: - for component_id, text in match.representations.items(): - self.db.execute(""" - INSERT INTO Representations (colocation_id, component_id, text) - VALUES (?,?,?)""", (match.match_id, component_id, text)) + self.add_inserts(inserts) inserts = [] + self.add_inserts(inserts) self.db.step_is_done(step_name) def has_colocation_id_enough_frequency(self, colocation_id): diff --git a/src/postprocessor.py b/src/postprocessor.py new file mode 100644 index 0000000..375cb3f --- /dev/null +++ b/src/postprocessor.py @@ -0,0 +1,38 @@ + +class Postprocessor: + def __init__(self, fix_one_letter_words=True): + self.fix_one_letter_words = fix_one_letter_words + + @staticmethod + def fix_sz(next_word): + if next_word[:1].lower() in ['c', 'č', 'f', 'h', 'k', 'p', 's', 'š', 't']: + return 's' + return 'z' + + @staticmethod + def fix_kh(next_word): + if next_word[:1].lower() in ['g', 'k']: + return 'h' + return 'k' + + def process(self, match, collocation_id): + # self.matches = matches + # if self.fix_one_letter_words: + # for syn_structure_key, syn_structure_value in self.matches.items(): + # for match, collocation_id in syn_structure_value: + if len(collocation_id) > 2: + # a = collocation_id[1:-1] + # b = enumerate(collocation_id[1:-1]) + # for a, c in b: + # print('here') + for idx, (col_id, word) in enumerate(collocation_id[1:-1]): + if word in ['s', 'z']: + correct_letter = self.fix_sz(collocation_id[idx + 2][1]) + collocation_id[idx + 1][1] = correct_letter + match[col_id].text = correct_letter + elif word in ['k', 'h']: + correct_letter = self.fix_kh(collocation_id[idx + 2][1]) + collocation_id[idx + 1][1] = correct_letter + match[col_id].text = correct_letter + collocation_id = [collocation_id[0]] + [tuple(line) for line in collocation_id[1:]] + return match, collocation_id diff --git a/src/representation.py b/src/representation.py index 8c21aa2..70d0e66 100644 --- a/src/representation.py +++ b/src/representation.py 
@@ -4,6 +4,9 @@ from collections import Counter from codes_tagset import TAGSET, CODES from word import WordMsdOnly +from src.word import WordDummy + + class ComponentRepresentation: def __init__(self, data, word_renderer): self.data = data @@ -19,23 +22,23 @@ class ComponentRepresentation: def add_word(self, word): self.words.append(word) - def render(self): + def render(self, sloleks_db=None): if self.rendition_text is None: - self.rendition_text = self._render() + self.rendition_text = self._render(sloleks_db=sloleks_db) - def _render(self): + def _render(self, sloleks_db=None): raise NotImplementedError("Not implemented for class: {}".format(type(self))) class LemmaCR(ComponentRepresentation): - def _render(self): + def _render(self, sloleks_db=None): return self.words[0].lemma if len(self.words) > 0 else None class LexisCR(ComponentRepresentation): - def _render(self): + def _render(self, sloleks_db=None): return self.data['lexis'] class WordFormAllCR(ComponentRepresentation): - def _render(self): + def _render(self, sloleks_db=None): if len(self.words) == 0: return None else: @@ -43,7 +46,7 @@ class WordFormAllCR(ComponentRepresentation): return "/".join(set(forms)) class WordFormAnyCR(ComponentRepresentation): - def _render(self): + def _render(self, sloleks_db=None): text_forms = {} msd_lemma_txt_triplets = Counter([(w.msd, w.lemma, w.text) for w in self.words]) for (msd, lemma, text), _n in reversed(msd_lemma_txt_triplets.most_common()): @@ -59,7 +62,24 @@ class WordFormAnyCR(ComponentRepresentation): for word_msd, word_lemma in sorted_words: # check if agreements match agreements_matched = [agr.match(word_msd) for agr in self.agreement] - + + # in case all agreements do not match try to get data from sloleks and change properly + if not all(agreements_matched): + if sloleks_db is None: + raise Exception('sloleks_db not properly setup!') + for agr in self.agreement: + if not agr.match(word_msd): + msd, lemma, text = sloleks_db.get_word_form(agr.lemma, 
agr.msd(), agr.data, align_msd=word_msd) + if msd is not None: + agr.msds[0] = msd + agr.words.append(WordDummy(msd, lemma, text)) + # agr.words[0].msd = msd + # agr.words[0].text = text + agreements_matched = [agr.match(word_msd) for agr in self.agreement] + else: + break + + # if we are at the last "backup word", then confirm matches # that worked for this one and return if word_lemma is None: @@ -109,9 +129,15 @@ class WordFormMsdCR(WordFormAnyCR): if self.check_msd(word.msd): super().add_word(word) - def _render(self): + def _render(self, sloleks_db=None): + if len(self.words) == 0: + if sloleks_db is None: + raise Exception('sloleks_db not properly setup!') + msd, lemma, text = sloleks_db.get_word_form(self.lemma, self.msd(), self.data) + if msd is not None: + self.words.append(WordDummy(msd, lemma, text)) self.words.append(WordMsdOnly(self._common_msd())) - return super()._render() + return super()._render(sloleks_db) def _common_msd(self): msds = sorted(self.msds, key=len) @@ -182,5 +208,5 @@ class WordFormAgreementCR(WordFormMsdCR): return True - def render(self): + def render(self, sloleks_db=None): pass diff --git a/src/representation_assigner.py b/src/representation_assigner.py index ac4708d..3c8ca52 100644 --- a/src/representation_assigner.py +++ b/src/representation_assigner.py @@ -39,7 +39,7 @@ class RepresentationAssigner: return self.representation_factory(self.more, word_renderer) @staticmethod - def set_representations(match, word_renderer): + def set_representations(match, word_renderer, sloleks_db=None): representations = {} for c in match.structure.components: representations[c.idx] = [] @@ -70,7 +70,7 @@ class RepresentationAssigner: for cid, reps in representations.items(): for rep in reps: - rep.render() + rep.render(sloleks_db=sloleks_db) for cid, reps in representations.items(): reps = [rep.rendition_text for rep in reps] diff --git a/src/sloleks_db.py b/src/sloleks_db.py new file mode 100644 index 0000000..953048d --- /dev/null +++ 
b/src/sloleks_db.py @@ -0,0 +1,191 @@ +from collections import defaultdict +from ast import literal_eval + +from sqlalchemy.ext.declarative import declarative_base +from sqlalchemy.orm import Session, aliased +from sqlalchemy import create_engine +from sqlalchemy import func + +from match import StructureMatch +from representation_assigner import RepresentationAssigner +from progress_bar import progress + +# Lexeme = None +# LexemeFeature = None +# SyntacticStructure = None +# StructureComponent = None +# Feature = None +# LexicalUnitLexeme = None +# LexicalUnit = None +# LexicalUnitType = None +# Category = None +# Sense = None +# Measure = None +# LexicalUnitMeasure = None +# Corpus = None +# Definition = None +# WordForm = None +# WordFormFeature = None +# FormRepresentation = None +from src.codes_tagset import TAGSET, CODES, CODES_TRANSLATION + + +class SloleksDatabase: + def __init__(self, db): + # self.db = db + # self.dispersions = {} + # self.min_freq = args.min_freq + + # self.db.init("""CREATE TABLE Colocations ( + # colocation_id INTEGER PRIMARY KEY, + # structure_id varchar(8), + # key varchar(256)) + # """) + global Lexeme, LexemeFeature, SyntacticStructure, StructureComponent, Feature, LexicalUnitLexeme, LexicalUnit, LexicalUnitType, Category, Sense, Measure, LexicalUnitMeasure, Corpus, Definition, WordForm, WordFormFeature, FormRepresentation + [db_user, db_password, db_database, db_host] = db.split(':') + + engine = create_engine('postgresql://' + db_user + ':' + db_password + '@' + db_host + '/' + db_database, + pool_recycle=14400) + Base = declarative_base() + Base.metadata.reflect(engine) + + class Lexeme(Base): + __table__ = Base.metadata.tables['jedro_lexeme'] + + class LexemeFeature(Base): + __table__ = Base.metadata.tables['jedro_lexeme_feature'] + + class SyntacticStructure(Base): + __table__ = Base.metadata.tables['jedro_syntacticstructure'] + + class StructureComponent(Base): + __table__ = Base.metadata.tables['jedro_structurecomponent'] + 
+ class Feature(Base): + __table__ = Base.metadata.tables['jedro_feature'] + + class LexicalUnitLexeme(Base): + __table__ = Base.metadata.tables['jedro_lexicalunit_lexeme'] + + class LexicalUnit(Base): + __table__ = Base.metadata.tables['jedro_lexicalunit'] + + class LexicalUnitType(Base): + __table__ = Base.metadata.tables['jedro_lexicalunittype'] + + class Category(Base): + __table__ = Base.metadata.tables['jedro_category'] + + class Sense(Base): + __table__ = Base.metadata.tables['jedro_sense'] + + class Measure(Base): + __table__ = Base.metadata.tables['jedro_measure'] + + class LexicalUnitMeasure(Base): + __table__ = Base.metadata.tables['jedro_lexicalunitmeasure'] + + class Corpus(Base): + __table__ = Base.metadata.tables['jedro_corpus'] + + class Definition(Base): + __table__ = Base.metadata.tables['jedro_definition'] + + class WordForm(Base): + __table__ = Base.metadata.tables['jedro_wordform'] + + class WordFormFeature(Base): + __table__ = Base.metadata.tables['jedro_wordform_feature'] + + class FormRepresentation(Base): + __table__ = Base.metadata.tables['jedro_formrepresentation'] + + self.session = Session(engine) + + def close(self): + self.session.close() + + def decypher_msd(self, msd): + t = msd[0] + decypher = [] + if t == 'N': + # gender = CODES_TRANSLATION[t][2][msd[2]] + number = CODES_TRANSLATION[t][3][msd[3]] + case = CODES_TRANSLATION[t][4][msd[4]] + decypher = [number, case] + elif t == 'V': + # gender = CODES_TRANSLATION[t][6][msd[6]] + vform = CODES_TRANSLATION[t][3][msd[3]] + number = CODES_TRANSLATION[t][5][msd[5]] + person = 'third' + decypher = [vform, number, person] + elif t == 'A': + gender = CODES_TRANSLATION[t][3][msd[3]] + number = CODES_TRANSLATION[t][4][msd[4]] + case = CODES_TRANSLATION[t][5][msd[5]] + decypher = [gender, number, case] + + return decypher + + def get_word_form(self, lemma, msd, data, align_msd=False): + # modify msd as required + msd = list(msd) + + if not align_msd and 'msd' in data: + for key, value in 
data['msd'].items(): + t = msd[0] + v = TAGSET[t].index(key.lower()) + msd[v + 1] = CODES[value] + + elif 'agreement' in data: + align_msd = list(align_msd) + t_align_msd = align_msd[0] + t = msd[0] + + for att in data['agreement']: + v_align_msd = TAGSET[t_align_msd].index(att.lower()) + v = TAGSET[t].index(att.lower()) + # fix for verbs with short msds + if v + 1 >= len(msd) or v_align_msd + 1 >= len(align_msd): + return None, None, None + # if v >= len(msd) and t == 'V' and att == 'number': + # if len(msd) == 4: + # msd += ['3'] + # if len(msd) == 5: + # msd += ['_'] + # try: + msd[v + 1] = align_msd[v_align_msd + 1] + # except: + # print('here') + + # msd = list(msd) + decypher_msd = self.decypher_msd(msd) + + if not decypher_msd: + return None, None, None + + wfs = [aliased(WordFormFeature) for _ in decypher_msd] + # wf1 = aliased(WordFormFeature) + # wf2 = aliased(WordFormFeature) + # wf3 = aliased(WordFormFeature) + query_preposition = self.session.query(FormRepresentation.form) \ + .join(WordForm, WordForm.id == FormRepresentation.word_form_id) \ + .join(Lexeme, Lexeme.id == WordForm.lexeme_id) + + for wf in wfs: + query_preposition = query_preposition.join(wf, wf.word_form_id == WordForm.id) + # .join(wf1, wf1.word_form_id == WordForm.id) \ + # .join(wf2, wf2.word_form_id == WordForm.id) \ + # .join(wf3, wf3.word_form_id == WordForm.id) \ + + query_preposition = query_preposition.filter(Lexeme.lemma == lemma) + + for wf, msd_el in zip(wfs, decypher_msd): + query_preposition = query_preposition.filter(wf.value == msd_el) + + pattern_translation_hws = query_preposition.all() + if len(pattern_translation_hws) > 0: + return ''.join(msd), lemma, pattern_translation_hws[0][0] + # pattern_translation_hws = [el[0] for el in query_preposition.all()] + return None, None, None + # return pattern_translation_hws diff --git a/src/syntactic_structure.py b/src/syntactic_structure.py index 6cf4d28..222fe35 100644 --- a/src/syntactic_structure.py +++ b/src/syntactic_structure.py @@ -14,7 +14,7 @@ class 
SyntacticStructure: @staticmethod def from_xml(xml): st = SyntacticStructure() - st.id = xml.get('id') + st.id = xml.get('id_nsss') st.lbs = xml.get('LBS') assert len(list(xml)) == 1 diff --git a/src/wani.py b/src/wani.py index d285c62..9eeb9cf 100644 --- a/src/wani.py +++ b/src/wani.py @@ -11,6 +11,7 @@ import concurrent.futures import tempfile from progress_bar import progress +from sloleks_db import SloleksDatabase from word import Word from syntactic_structure import build_structures from match_store import MatchStore @@ -20,16 +21,20 @@ from loader import load_files from database import Database from time_info import TimeInfo +from src.postprocessor import Postprocessor -def match_file(words, structures): + +def match_file(words, structures, postprocessor): matches = {s: [] for s in structures} for s in progress(structures, "matching"): for w in words: mhere = s.match(w) for match in mhere: - colocation_id = [(idx, w.lemma) for idx, w in match.items()] + # colocation_id = [(idx, w.lemma) for idx, w in match.items()] + colocation_id = [[idx, w.lemma] for idx, w in match.items()] colocation_id = [s.id] + list(sorted(colocation_id, key=lambda x: x[0])) + match, colocation_id = postprocessor.process(match, colocation_id) colocation_id = tuple(colocation_id) matches[s].append((match, colocation_id)) @@ -38,6 +43,7 @@ def match_file(words, structures): def main(args): + sloleks_db = SloleksDatabase(args.sloleks_db) structures, lemma_msds, max_num_components = build_structures(args) timeinfo = TimeInfo(len(args.input)) @@ -51,7 +57,11 @@ def main(args): continue start_time = time.time() - matches = match_file(words, structures) + postprocessor = Postprocessor() + matches = match_file(words, structures, postprocessor) + + # matches = .process() + # TODO Add postprocessing here or inside previous function! match_store.add_matches(matches) word_stats.add_words(words) database.commit() @@ -74,7 +84,7 @@ def main(args): # figure out representations! 
if args.out or args.out_no_stat: - match_store.set_representations(word_stats, structures) + match_store.set_representations(word_stats, structures, sloleks_db=sloleks_db) Writer.make_output_writer(args, max_num_components, match_store, word_stats).write_out( structures, match_store) @@ -85,6 +95,10 @@ def main(args): Writer.make_stats_writer(args, max_num_components, match_store, word_stats).write_out( structures, match_store) + # sloleks_db.get_word_form(lemma, gender, number, case) + sloleks_db.close() + + if __name__ == '__main__': parser = argparse.ArgumentParser( description='Extract structures from a parsed corpus.') @@ -92,6 +106,7 @@ if __name__ == '__main__': help='Structures definitions in xml file') parser.add_argument('input', help='input file in (gz or xml currently). If none, then just database is loaded', nargs='*') + parser.add_argument('--sloleks_db', type=str, help='Sloleks database credentials') parser.add_argument('--out', help='Classic output file') parser.add_argument('--out-no-stat', @@ -100,7 +115,7 @@ if __name__ == '__main__': help='Additional output file, writes more data') parser.add_argument('--stats', help='Output file for statistics') - +# parser.add_argument('--no-msd-translate', help='MSDs are translated from slovene to english by default', action='store_true') diff --git a/src/word.py b/src/word.py index bbf3889..f30522c 100644 --- a/src/word.py +++ b/src/word.py @@ -14,6 +14,16 @@ class WordMsdOnly: return None +class WordDummy: + def __init__(self, msd, lemma, text): + self.msd = msd + self.lemma = lemma + self.text = text + + def most_frequent_text(self, word_renderer): + return word_renderer.render(self.lemma, self.msd) + + class Word: def __init__(self, lemma, msd, wid, text, do_msd_translate): self.lemma = lemma @@ -29,7 +39,7 @@ class Word: self.int_id = int(last_num) assert None not in (self.id, self.lemma, self.msd) - + @staticmethod def from_xml(xml, do_msd_translate): lemma = xml.get('lemma') @@ -41,10 +51,10 @@ class 
Word: @staticmethod def get_msd(comp): d = dict(comp.items()) - if 'msd' in d: - return d['msd'] - elif 'ana' in d: + if 'ana' in d: return d['ana'][4:] + elif 'msd' in d: + return d['msd'] else: logging.error(d) raise NotImplementedError("MSD?")