Read Sloleks word forms from FormEncoding.text instead of FormRepresentation.form,
only build 'collocation' structures, and delete the stray luscenje_struktur/writerpy.

Review note: form_encodings_dict is keyed by FormEncoding.form_representation_id,
which refers to FormRepresentation.id (see the join used in the non-preloaded
query below). init_load_sloleks therefore fetches FormRepresentation.id as well
and indexes the dict with it rather than with word_form_id.

Line breaks and indentation were restored from a whitespace-collapsed copy of
this patch; confirm context whitespace against the target tree before applying.

diff --git a/luscenje_struktur/sloleks_db.py b/luscenje_struktur/sloleks_db.py
index 626d483..c3daee7 100644
--- a/luscenje_struktur/sloleks_db.py
+++ b/luscenje_struktur/sloleks_db.py
@@ -12,7 +12,7 @@ class SloleksDatabase:
         from sqlalchemy.orm import Session
         from sqlalchemy import create_engine
 
-        global Lexeme, LexemeFeature, SyntacticStructure, StructureComponent, Feature, LexicalUnitLexeme, LexicalUnit, LexicalUnitType, Category, Sense, Measure, LexicalUnitMeasure, Corpus, Definition, WordForm, WordFormFeature, FormRepresentation
+        global Lexeme, LexemeFeature, SyntacticStructure, StructureComponent, Feature, LexicalUnitLexeme, LexicalUnit, LexicalUnitType, Category, Sense, Measure, LexicalUnitMeasure, Corpus, Definition, WordForm, WordFormFeature, FormRepresentation, FormEncoding
         [db_user, db_password, db_database, db_host] = db.split(':')
 
         engine = create_engine('postgresql://' + db_user + ':' + db_password + '@' + db_host + '/' + db_database,
@@ -71,17 +71,25 @@ class SloleksDatabase:
         class FormRepresentation(Base):
             __table__ = Base.metadata.tables['jedro_formrepresentation']
 
+        class FormEncoding(Base):
+            __table__ = Base.metadata.tables['jedro_formencoding']
+
         self.session = Session(engine)
 
         self.load_sloleks = load_sloleks
         if self.load_sloleks:
             self.init_load_sloleks()
 
+    # def init_load_sloleks2(self):
+
+
     def init_load_sloleks(self):
         query_word_form_features = self.session.query(WordFormFeature.word_form_id, WordFormFeature.value)
         word_form_features = query_word_form_features.all()
-        query_form_representations = self.session.query(FormRepresentation.word_form_id, FormRepresentation.form)
+        query_form_representations = self.session.query(FormRepresentation.id, FormRepresentation.word_form_id)
         form_representations = query_form_representations.all()
+        query_form_encoding = self.session.query(FormEncoding.form_representation_id, FormEncoding.text)
+        form_encodings = query_form_encoding.all()
         query_word_forms = self.session.query(WordForm.id, WordForm.lexeme_id)
         word_forms = query_word_forms.all()
         query_lexemes = self.session.query(Lexeme.id, Lexeme.lemma)
@@ -101,7 +109,10 @@ class SloleksDatabase:
                 self.word_form_features[word_form_feature.word_form_id] = set()
             self.word_form_features[word_form_feature.word_form_id].add(word_form_feature.value)
 
-        self.form_representations = {form_representation.word_form_id: form_representation.form for form_representation
+        form_encodings_dict = {form_encoding.form_representation_id: form_encoding.text for form_encoding
+                               in form_encodings}
+
+        self.form_representations = {form_representation.word_form_id: form_encodings_dict[form_representation.id] for form_representation
                                      in form_representations}
 
         self.word_forms = {}
@@ -194,9 +205,14 @@ class SloleksDatabase:
             return ''.join(msd), lemma, form_representations
         else:
             wfs = [aliased(WordFormFeature) for _ in decypher_msd]
-            query_preposition = self.session.query(FormRepresentation.form) \
+            # self.session.query(FormEncoding.form_representation_id, FormEncoding.text)
+            query_preposition = self.session.query(FormEncoding.text) \
+                .join(FormRepresentation, FormRepresentation.id == FormEncoding.form_representation_id) \
                 .join(WordForm, WordForm.id == FormRepresentation.word_form_id) \
                 .join(Lexeme, Lexeme.id == WordForm.lexeme_id)
+            # query_preposition = self.session.query(FormRepresentation.form) \
+            #     .join(WordForm, WordForm.id == FormRepresentation.word_form_id) \
+            #     .join(Lexeme, Lexeme.id == WordForm.lexeme_id)
 
             for wf in wfs:
                 query_preposition = query_preposition.join(wf, wf.word_form_id == WordForm.id)
diff --git a/luscenje_struktur/syntactic_structure.py b/luscenje_struktur/syntactic_structure.py
index 6378637..931e3b1 100644
--- a/luscenje_struktur/syntactic_structure.py
+++ b/luscenje_struktur/syntactic_structure.py
@@ -137,7 +137,7 @@ def build_structures(args):
 
     structures = []
     for structure in et.iter('syntactic_structure'):
-        if structure.attrib['type'] == 'single':
+        if structure.attrib['type'] != 'collocation':
             continue
         to_append = SyntacticStructure.from_xml(structure, no_stats)
         if to_append is None:
diff --git a/luscenje_struktur/writerpy b/luscenje_struktur/writerpy
deleted file mode 100644
index 54c3fd5..0000000
--- a/luscenje_struktur/writerpy
+++ /dev/null
@@ -1,133 +0,0 @@
-class Writer:
-    @staticmethod
-    def other_params(args):
-        return (args.multiple_output, int(args.sort_by), args.sort_reversed)
-
-    @staticmethod
-    def make_output_writer(args, colocation_ids, word_renderer):
-        params = Writer.other_params(args)
-        return Writer(args.out, OutFormatter(colocation_ids, word_renderer), params)
-
-    @staticmethod
-    def make_output_no_stat_writer(args, colocation_ids, word_renderer):
-        params = Writer.other_params(args)
-        return Writer(args.out_no_stat, OutNoStatFormatter(colocation_ids, word_renderer), params)
-
-    @staticmethod
-    def make_all_writer(args, colocation_ids, word_renderer):
-        return Writer(args.all, AllFormatter(colocation_ids, word_renderer), None)
-
-    @staticmethod
-    def make_stats_writer(args, colocation_ids, word_renderer):
-        params = Writer.other_params(args)
-        return Writer(args.stats, StatsFormatter(colocation_ids, word_renderer), params)
-
-    def __init__(self, file_out, formatter, params):
-        if params is None:
-            self.multiple_output = False
-            self.sort_by = -1
-            self.sort_order = None
-        else:
-            self.multiple_output = params[0]
-            self.sort_by = params[1]
-            self.sort_order = params[2]
-
-        self.output_file = file_out
-        self.formatter = formatter
-
-    def header(self):
-        repeating_cols = self.formatter.header_repeat()
-        cols = ["C{}_{}".format(i + 1, thd) for i in range(MAX_NUM_COMPONENTS)
-                for thd in repeating_cols]
-
-        cols = ["Structure_ID"] + cols + ["Colocation_ID"]
-        cols += self.formatter.header_right()
-        return cols
-
-    def sorted_rows(self, rows):
-        if self.sort_by < 0 or len(rows) < 2:
-            return rows
-
-        if len(rows[0]) <= self.sort_by:
-            logging.warning("Cannot sort by column #{}: Not enough columns!".format(len(rows[0])))
-            return rows
-
-        try:
-            int(rows[0][self.sort_by])
-            def key(row):
-                return int(row[self.sort_by])
-        except ValueError:
-            def key(row):
-                return row[self.sort_by].lower()
-
-        return sorted(rows, key=key, reverse=self.sort_order)
-
-    def write_header(self, file_handler):
-        file_handler.write(", ".join(self.header()) + "\n")
-
-    def write_out_worker(self, file_handler, structure, colocation_ids):
-        rows = []
-        components = structure.components
-
-        for match in colocation_ids.get_matches_for(structure):
-            self.formatter.new_match(match)
-
-            for words in match.matches:
-                to_write = []
-
-                for idx, _comp in enumerate(components):
-                    idx = str(idx + 1)
-                    if idx not in words:
-                        to_write.extend([""] * self.formatter.length())
-                    else:
-                        to_write.extend(self.formatter.content_repeat(words, match.representations, idx, structure.id))
-
-                # make them equal size
-                to_write.extend([""] * (MAX_NUM_COMPONENTS * self.formatter.length() - len(to_write)))
-
-                # structure_id and colocation_id
-                to_write = [structure.id] + to_write + [match.match_id]
-
-                # header_right
-                to_write.extend(self.formatter.content_right(len(match)))
-                rows.append(to_write)
-
-                if self.formatter.group():
-                    break
-
-        if rows != []:
-            rows = self.sorted_rows(rows)
-            file_handler.write("\n".join([", ".join(row) for row in rows]) + "\n")
-            file_handler.flush()
-
-    def write_out(self, structures, colocation_ids):
-        if self.output_file is None:
-            return
-
-        def fp_close(fp_):
-            if fp_ != sys.stdout:
-                fp_.close()
-
-        def fp_open(snum=None):
-            if snum is None:
-                return open(self.output_file, "w")
-            else:
-                return open("{}.{}".format(self.output_file, snum), "w")
-
-        if not self.multiple_output:
-            fp = fp_open()
-            self.write_header(fp)
-
-        for s in structures:
-            if self.multiple_output:
-                fp = fp_open(s.id)
-                self.write_header(fp)
-
-            self.formatter.set_structure(s)
-            self.write_out_worker(fp, s, colocation_ids)
-
-            if self.multiple_output:
-                fp_close(fp)
-
-        if not self.multiple_output:
-            fp_close(fp)