Db and syntactic_structures fixes
This commit is contained in:
parent
598ab102b3
commit
7c735e33f7
|
@ -12,7 +12,7 @@ class SloleksDatabase:
|
||||||
from sqlalchemy.orm import Session
|
from sqlalchemy.orm import Session
|
||||||
from sqlalchemy import create_engine
|
from sqlalchemy import create_engine
|
||||||
|
|
||||||
global Lexeme, LexemeFeature, SyntacticStructure, StructureComponent, Feature, LexicalUnitLexeme, LexicalUnit, LexicalUnitType, Category, Sense, Measure, LexicalUnitMeasure, Corpus, Definition, WordForm, WordFormFeature, FormRepresentation
|
global Lexeme, LexemeFeature, SyntacticStructure, StructureComponent, Feature, LexicalUnitLexeme, LexicalUnit, LexicalUnitType, Category, Sense, Measure, LexicalUnitMeasure, Corpus, Definition, WordForm, WordFormFeature, FormRepresentation, FormEncoding
|
||||||
[db_user, db_password, db_database, db_host] = db.split(':')
|
[db_user, db_password, db_database, db_host] = db.split(':')
|
||||||
|
|
||||||
engine = create_engine('postgresql://' + db_user + ':' + db_password + '@' + db_host + '/' + db_database,
|
engine = create_engine('postgresql://' + db_user + ':' + db_password + '@' + db_host + '/' + db_database,
|
||||||
|
@ -71,17 +71,25 @@ class SloleksDatabase:
|
||||||
class FormRepresentation(Base):
|
class FormRepresentation(Base):
|
||||||
__table__ = Base.metadata.tables['jedro_formrepresentation']
|
__table__ = Base.metadata.tables['jedro_formrepresentation']
|
||||||
|
|
||||||
|
class FormEncoding(Base):
|
||||||
|
__table__ = Base.metadata.tables['jedro_formencoding']
|
||||||
|
|
||||||
self.session = Session(engine)
|
self.session = Session(engine)
|
||||||
|
|
||||||
self.load_sloleks = load_sloleks
|
self.load_sloleks = load_sloleks
|
||||||
if self.load_sloleks:
|
if self.load_sloleks:
|
||||||
self.init_load_sloleks()
|
self.init_load_sloleks()
|
||||||
|
|
||||||
|
# def init_load_sloleks2(self):
|
||||||
|
|
||||||
|
|
||||||
def init_load_sloleks(self):
|
def init_load_sloleks(self):
|
||||||
query_word_form_features = self.session.query(WordFormFeature.word_form_id, WordFormFeature.value)
|
query_word_form_features = self.session.query(WordFormFeature.word_form_id, WordFormFeature.value)
|
||||||
word_form_features = query_word_form_features.all()
|
word_form_features = query_word_form_features.all()
|
||||||
query_form_representations = self.session.query(FormRepresentation.word_form_id, FormRepresentation.form)
|
query_form_representations = self.session.query(FormRepresentation.word_form_id)
|
||||||
form_representations = query_form_representations.all()
|
form_representations = query_form_representations.all()
|
||||||
|
query_form_encoding = self.session.query(FormEncoding.form_representation_id, FormEncoding.text)
|
||||||
|
form_encodings = query_form_encoding.all()
|
||||||
query_word_forms = self.session.query(WordForm.id, WordForm.lexeme_id)
|
query_word_forms = self.session.query(WordForm.id, WordForm.lexeme_id)
|
||||||
word_forms = query_word_forms.all()
|
word_forms = query_word_forms.all()
|
||||||
query_lexemes = self.session.query(Lexeme.id, Lexeme.lemma)
|
query_lexemes = self.session.query(Lexeme.id, Lexeme.lemma)
|
||||||
|
@ -101,7 +109,10 @@ class SloleksDatabase:
|
||||||
self.word_form_features[word_form_feature.word_form_id] = set()
|
self.word_form_features[word_form_feature.word_form_id] = set()
|
||||||
self.word_form_features[word_form_feature.word_form_id].add(word_form_feature.value)
|
self.word_form_features[word_form_feature.word_form_id].add(word_form_feature.value)
|
||||||
|
|
||||||
self.form_representations = {form_representation.word_form_id: form_representation.form for form_representation
|
form_encodings_dict = {form_encoding.form_representation_id: form_encoding.text for form_encoding
|
||||||
|
in form_encodings}
|
||||||
|
|
||||||
|
self.form_representations = {form_representation.word_form_id: form_encodings_dict[form_representation.word_form_id] for form_representation
|
||||||
in form_representations}
|
in form_representations}
|
||||||
|
|
||||||
self.word_forms = {}
|
self.word_forms = {}
|
||||||
|
@ -194,9 +205,14 @@ class SloleksDatabase:
|
||||||
return ''.join(msd), lemma, form_representations
|
return ''.join(msd), lemma, form_representations
|
||||||
else:
|
else:
|
||||||
wfs = [aliased(WordFormFeature) for _ in decypher_msd]
|
wfs = [aliased(WordFormFeature) for _ in decypher_msd]
|
||||||
query_preposition = self.session.query(FormRepresentation.form) \
|
# self.session.query(FormEncoding.form_representation_id, FormEncoding.text)
|
||||||
|
query_preposition = self.session.query(FormEncoding.text) \
|
||||||
|
.join(FormRepresentation, FormRepresentation.id == FormEncoding.form_representation_id) \
|
||||||
.join(WordForm, WordForm.id == FormRepresentation.word_form_id) \
|
.join(WordForm, WordForm.id == FormRepresentation.word_form_id) \
|
||||||
.join(Lexeme, Lexeme.id == WordForm.lexeme_id)
|
.join(Lexeme, Lexeme.id == WordForm.lexeme_id)
|
||||||
|
# query_preposition = self.session.query(FormRepresentation.form) \
|
||||||
|
# .join(WordForm, WordForm.id == FormRepresentation.word_form_id) \
|
||||||
|
# .join(Lexeme, Lexeme.id == WordForm.lexeme_id)
|
||||||
|
|
||||||
for wf in wfs:
|
for wf in wfs:
|
||||||
query_preposition = query_preposition.join(wf, wf.word_form_id == WordForm.id)
|
query_preposition = query_preposition.join(wf, wf.word_form_id == WordForm.id)
|
||||||
|
|
|
@ -137,7 +137,7 @@ def build_structures(args):
|
||||||
|
|
||||||
structures = []
|
structures = []
|
||||||
for structure in et.iter('syntactic_structure'):
|
for structure in et.iter('syntactic_structure'):
|
||||||
if structure.attrib['type'] == 'single':
|
if structure.attrib['type'] != 'collocation':
|
||||||
continue
|
continue
|
||||||
to_append = SyntacticStructure.from_xml(structure, no_stats)
|
to_append = SyntacticStructure.from_xml(structure, no_stats)
|
||||||
if to_append is None:
|
if to_append is None:
|
||||||
|
|
|
@ -1,133 +0,0 @@
|
||||||
class Writer:
    """Write collocation matches of syntactic structures as ", "-separated rows.

    Cell content is delegated to a formatter object; this class assembles
    the header and the rows, optionally sorts the rows, and writes them
    either to a single file or to one file per structure.
    """

    @staticmethod
    def other_params(args):
        """Return the ``(multiple_output, sort_by, sort_reversed)`` tuple read from *args*."""
        return (args.multiple_output, int(args.sort_by), args.sort_reversed)

    @staticmethod
    def make_output_writer(args, colocation_ids, word_renderer):
        """Build a writer targeting ``args.out`` that uses ``OutFormatter``."""
        params = Writer.other_params(args)
        return Writer(args.out, OutFormatter(colocation_ids, word_renderer), params)

    @staticmethod
    def make_output_no_stat_writer(args, colocation_ids, word_renderer):
        """Build a writer targeting ``args.out_no_stat`` that uses ``OutNoStatFormatter``."""
        params = Writer.other_params(args)
        return Writer(args.out_no_stat, OutNoStatFormatter(colocation_ids, word_renderer), params)

    @staticmethod
    def make_all_writer(args, colocation_ids, word_renderer):
        """Build a writer targeting ``args.all`` that uses ``AllFormatter``.

        No params are passed, so this writer never sorts and always writes
        a single output file.
        """
        return Writer(args.all, AllFormatter(colocation_ids, word_renderer), None)

    @staticmethod
    def make_stats_writer(args, colocation_ids, word_renderer):
        """Build a writer targeting ``args.stats`` that uses ``StatsFormatter``."""
        params = Writer.other_params(args)
        return Writer(args.stats, StatsFormatter(colocation_ids, word_renderer), params)

    def __init__(self, file_out, formatter, params):
        """Store the output path, the formatter and the sorting/output options.

        :param file_out: output path; ``None`` turns ``write_out`` into a no-op.
        :param formatter: object providing the header and cell content.
        :param params: ``(multiple_output, sort_by, sort_reversed)`` as returned
            by ``other_params``, or ``None`` for a single unsorted output.
        """
        if params is None:
            self.multiple_output = False
            self.sort_by = -1  # negative column index disables sorting
            self.sort_order = None
        else:
            self.multiple_output, self.sort_by, self.sort_order = params[0], params[1], params[2]

        self.output_file = file_out
        self.formatter = formatter

    def header(self):
        """Return the list of column names for the output header."""
        repeating_cols = self.formatter.header_repeat()
        # One group of the formatter's repeating columns per component slot.
        cols = ["C{}_{}".format(i + 1, thd) for i in range(MAX_NUM_COMPONENTS)
                for thd in repeating_cols]

        cols = ["Structure_ID"] + cols + ["Colocation_ID"]
        cols += self.formatter.header_right()
        return cols

    def sorted_rows(self, rows):
        """Return *rows* ordered by the configured sort column.

        The rows are returned unchanged when sorting is disabled
        (``sort_by < 0``), when there are fewer than two rows, or when the
        first row does not have the requested column (a warning is logged).
        The column is compared numerically only when every row holds an
        integer there; otherwise rows are compared as lower-cased strings.
        """
        column = self.sort_by
        if column < 0 or len(rows) < 2:
            return rows

        if len(rows[0]) <= column:
            # Report the column that was asked for, not the row width.
            logging.warning("Cannot sort by column #{}: Not enough columns!".format(column))
            return rows

        try:
            # Probe every row: deciding from the first row alone would let a
            # later non-numeric cell raise ValueError in the middle of sorted().
            for row in rows:
                int(row[column])
        except ValueError:
            def key(row):
                return row[column].lower()
        else:
            def key(row):
                return int(row[column])

        # bool(): sorted() rejects reverse=None.
        return sorted(rows, key=key, reverse=bool(self.sort_order))

    def write_header(self, file_handler):
        """Write the header line to *file_handler*."""
        file_handler.write(", ".join(self.header()) + "\n")

    def write_out_worker(self, file_handler, structure, colocation_ids):
        """Write all rows for one *structure* to *file_handler*.

        One row is produced per entry of ``match.matches``, or only the
        first entry of each match when ``formatter.group()`` is truthy.
        Nothing is written when the structure has no rows.
        """
        rows = []
        components = structure.components
        # Width of a full row's component section, used for padding.
        padded_width = MAX_NUM_COMPONENTS * self.formatter.length()

        for match in colocation_ids.get_matches_for(structure):
            self.formatter.new_match(match)

            for words in match.matches:
                to_write = []

                for position, _comp in enumerate(components):
                    idx = str(position + 1)  # components are keyed by 1-based string index
                    if idx not in words:
                        to_write.extend([""] * self.formatter.length())
                    else:
                        to_write.extend(
                            self.formatter.content_repeat(words, match.representations, idx, structure.id))

                # make them equal size
                to_write.extend([""] * (padded_width - len(to_write)))

                # structure_id and colocation_id
                to_write = [structure.id] + to_write + [match.match_id]

                # header_right
                to_write.extend(self.formatter.content_right(len(match)))
                rows.append(to_write)

                if self.formatter.group():
                    break

        if rows:
            rows = self.sorted_rows(rows)
            file_handler.write("\n".join(", ".join(row) for row in rows) + "\n")
            file_handler.flush()

    def write_out(self, structures, colocation_ids):
        """Write every structure's rows to the configured output file(s).

        Does nothing when no output file is configured. With
        ``multiple_output`` each structure goes to ``<output_file>.<id>``
        with its own header; otherwise all structures share ``output_file``
        and a single header. Files are closed even if writing raises.
        """
        if self.output_file is None:
            return

        if self.multiple_output:
            for s in structures:
                with open("{}.{}".format(self.output_file, s.id), "w") as fp:
                    self.write_header(fp)
                    self.formatter.set_structure(s)
                    self.write_out_worker(fp, s, colocation_ids)
        else:
            with open(self.output_file, "w") as fp:
                self.write_header(fp)
                for s in structures:
                    self.formatter.set_structure(s)
                    self.write_out_worker(fp, s, colocation_ids)
|
|
Loading…
Reference in New Issue
Block a user