Db and syntactic_structures fixes

This commit is contained in:
Luka 2022-10-23 21:50:08 +02:00
parent 598ab102b3
commit 7c735e33f7
3 changed files with 21 additions and 138 deletions

View File

@ -12,7 +12,7 @@ class SloleksDatabase:
from sqlalchemy.orm import Session from sqlalchemy.orm import Session
from sqlalchemy import create_engine from sqlalchemy import create_engine
global Lexeme, LexemeFeature, SyntacticStructure, StructureComponent, Feature, LexicalUnitLexeme, LexicalUnit, LexicalUnitType, Category, Sense, Measure, LexicalUnitMeasure, Corpus, Definition, WordForm, WordFormFeature, FormRepresentation global Lexeme, LexemeFeature, SyntacticStructure, StructureComponent, Feature, LexicalUnitLexeme, LexicalUnit, LexicalUnitType, Category, Sense, Measure, LexicalUnitMeasure, Corpus, Definition, WordForm, WordFormFeature, FormRepresentation, FormEncoding
[db_user, db_password, db_database, db_host] = db.split(':') [db_user, db_password, db_database, db_host] = db.split(':')
engine = create_engine('postgresql://' + db_user + ':' + db_password + '@' + db_host + '/' + db_database, engine = create_engine('postgresql://' + db_user + ':' + db_password + '@' + db_host + '/' + db_database,
@ -71,17 +71,25 @@ class SloleksDatabase:
class FormRepresentation(Base): class FormRepresentation(Base):
__table__ = Base.metadata.tables['jedro_formrepresentation'] __table__ = Base.metadata.tables['jedro_formrepresentation']
class FormEncoding(Base):
__table__ = Base.metadata.tables['jedro_formencoding']
self.session = Session(engine) self.session = Session(engine)
self.load_sloleks = load_sloleks self.load_sloleks = load_sloleks
if self.load_sloleks: if self.load_sloleks:
self.init_load_sloleks() self.init_load_sloleks()
# def init_load_sloleks2(self):
def init_load_sloleks(self): def init_load_sloleks(self):
query_word_form_features = self.session.query(WordFormFeature.word_form_id, WordFormFeature.value) query_word_form_features = self.session.query(WordFormFeature.word_form_id, WordFormFeature.value)
word_form_features = query_word_form_features.all() word_form_features = query_word_form_features.all()
query_form_representations = self.session.query(FormRepresentation.word_form_id, FormRepresentation.form) query_form_representations = self.session.query(FormRepresentation.word_form_id)
form_representations = query_form_representations.all() form_representations = query_form_representations.all()
query_form_encoding = self.session.query(FormEncoding.form_representation_id, FormEncoding.text)
form_encodings = query_form_encoding.all()
query_word_forms = self.session.query(WordForm.id, WordForm.lexeme_id) query_word_forms = self.session.query(WordForm.id, WordForm.lexeme_id)
word_forms = query_word_forms.all() word_forms = query_word_forms.all()
query_lexemes = self.session.query(Lexeme.id, Lexeme.lemma) query_lexemes = self.session.query(Lexeme.id, Lexeme.lemma)
@ -101,7 +109,10 @@ class SloleksDatabase:
self.word_form_features[word_form_feature.word_form_id] = set() self.word_form_features[word_form_feature.word_form_id] = set()
self.word_form_features[word_form_feature.word_form_id].add(word_form_feature.value) self.word_form_features[word_form_feature.word_form_id].add(word_form_feature.value)
self.form_representations = {form_representation.word_form_id: form_representation.form for form_representation form_encodings_dict = {form_encoding.form_representation_id: form_encoding.text for form_encoding
in form_encodings}
self.form_representations = {form_representation.word_form_id: form_encodings_dict[form_representation.word_form_id] for form_representation
in form_representations} in form_representations}
self.word_forms = {} self.word_forms = {}
@ -194,9 +205,14 @@ class SloleksDatabase:
return ''.join(msd), lemma, form_representations return ''.join(msd), lemma, form_representations
else: else:
wfs = [aliased(WordFormFeature) for _ in decypher_msd] wfs = [aliased(WordFormFeature) for _ in decypher_msd]
query_preposition = self.session.query(FormRepresentation.form) \ # self.session.query(FormEncoding.form_representation_id, FormEncoding.text)
query_preposition = self.session.query(FormEncoding.text) \
.join(FormRepresentation, FormRepresentation.id == FormEncoding.form_representation_id) \
.join(WordForm, WordForm.id == FormRepresentation.word_form_id) \ .join(WordForm, WordForm.id == FormRepresentation.word_form_id) \
.join(Lexeme, Lexeme.id == WordForm.lexeme_id) .join(Lexeme, Lexeme.id == WordForm.lexeme_id)
# query_preposition = self.session.query(FormRepresentation.form) \
# .join(WordForm, WordForm.id == FormRepresentation.word_form_id) \
# .join(Lexeme, Lexeme.id == WordForm.lexeme_id)
for wf in wfs: for wf in wfs:
query_preposition = query_preposition.join(wf, wf.word_form_id == WordForm.id) query_preposition = query_preposition.join(wf, wf.word_form_id == WordForm.id)

View File

@ -137,7 +137,7 @@ def build_structures(args):
structures = [] structures = []
for structure in et.iter('syntactic_structure'): for structure in et.iter('syntactic_structure'):
if structure.attrib['type'] == 'single': if structure.attrib['type'] != 'collocation':
continue continue
to_append = SyntacticStructure.from_xml(structure, no_stats) to_append = SyntacticStructure.from_xml(structure, no_stats)
if to_append is None: if to_append is None:

View File

@ -1,133 +0,0 @@
class Writer:
@staticmethod
def other_params(args):
return (args.multiple_output, int(args.sort_by), args.sort_reversed)
@staticmethod
def make_output_writer(args, colocation_ids, word_renderer):
params = Writer.other_params(args)
return Writer(args.out, OutFormatter(colocation_ids, word_renderer), params)
@staticmethod
def make_output_no_stat_writer(args, colocation_ids, word_renderer):
params = Writer.other_params(args)
return Writer(args.out_no_stat, OutNoStatFormatter(colocation_ids, word_renderer), params)
@staticmethod
def make_all_writer(args, colocation_ids, word_renderer):
return Writer(args.all, AllFormatter(colocation_ids, word_renderer), None)
@staticmethod
def make_stats_writer(args, colocation_ids, word_renderer):
params = Writer.other_params(args)
return Writer(args.stats, StatsFormatter(colocation_ids, word_renderer), params)
def __init__(self, file_out, formatter, params):
if params is None:
self.multiple_output = False
self.sort_by = -1
self.sort_order = None
else:
self.multiple_output = params[0]
self.sort_by = params[1]
self.sort_order = params[2]
self.output_file = file_out
self.formatter = formatter
def header(self):
repeating_cols = self.formatter.header_repeat()
cols = ["C{}_{}".format(i + 1, thd) for i in range(MAX_NUM_COMPONENTS)
for thd in repeating_cols]
cols = ["Structure_ID"] + cols + ["Colocation_ID"]
cols += self.formatter.header_right()
return cols
def sorted_rows(self, rows):
if self.sort_by < 0 or len(rows) < 2:
return rows
if len(rows[0]) <= self.sort_by:
logging.warning("Cannot sort by column #{}: Not enough columns!".format(len(rows[0])))
return rows
try:
int(rows[0][self.sort_by])
def key(row):
return int(row[self.sort_by])
except ValueError:
def key(row):
return row[self.sort_by].lower()
return sorted(rows, key=key, reverse=self.sort_order)
def write_header(self, file_handler):
file_handler.write(", ".join(self.header()) + "\n")
def write_out_worker(self, file_handler, structure, colocation_ids):
rows = []
components = structure.components
for match in colocation_ids.get_matches_for(structure):
self.formatter.new_match(match)
for words in match.matches:
to_write = []
for idx, _comp in enumerate(components):
idx = str(idx + 1)
if idx not in words:
to_write.extend([""] * self.formatter.length())
else:
to_write.extend(self.formatter.content_repeat(words, match.representations, idx, structure.id))
# make them equal size
to_write.extend([""] * (MAX_NUM_COMPONENTS * self.formatter.length() - len(to_write)))
# structure_id and colocation_id
to_write = [structure.id] + to_write + [match.match_id]
# header_right
to_write.extend(self.formatter.content_right(len(match)))
rows.append(to_write)
if self.formatter.group():
break
if rows != []:
rows = self.sorted_rows(rows)
file_handler.write("\n".join([", ".join(row) for row in rows]) + "\n")
file_handler.flush()
def write_out(self, structures, colocation_ids):
if self.output_file is None:
return
def fp_close(fp_):
if fp_ != sys.stdout:
fp_.close()
def fp_open(snum=None):
if snum is None:
return open(self.output_file, "w")
else:
return open("{}.{}".format(self.output_file, snum), "w")
if not self.multiple_output:
fp = fp_open()
self.write_header(fp)
for s in structures:
if self.multiple_output:
fp = fp_open(s.id)
self.write_header(fp)
self.formatter.set_structure(s)
self.write_out_worker(fp, s, colocation_ids)
if self.multiple_output:
fp_close(fp)
if not self.multiple_output:
fp_close(fp)