From d67976c3d925355022f633128d9f9234a8c4164b Mon Sep 17 00:00:00 2001 From: Luka Date: Thu, 15 Apr 2021 14:16:34 +0200 Subject: [PATCH] Modified prints + sqlalchemy and psycopg2cffi made optional --- issue992/extract.py | 5 +++-- luscenje_struktur/formatter.py | 5 +++-- luscenje_struktur/loader.py | 4 ++-- luscenje_struktur/match_store.py | 5 +++-- luscenje_struktur/progress_bar.py | 5 +++-- luscenje_struktur/sloleks_db.py | 15 ++++++++------- luscenje_struktur/time_info.py | 3 ++- luscenje_struktur/word_stats.py | 4 ++-- 8 files changed, 26 insertions(+), 20 deletions(-) diff --git a/issue992/extract.py b/issue992/extract.py index 8e159c8..c91d164 100644 --- a/issue992/extract.py +++ b/issue992/extract.py @@ -2,6 +2,7 @@ import argparse import os import sys import tqdm +import logging good_lemmas = ["absurd", "absurdnost", "akuten", "akutno", "alkohol", "alkoholen", "aluminijast", "ananas", "aplikacija", "aplikativen", "aranžma", "arbiter", "armada", "avtomatičen", "avtomatiziran", "babica", "bajen", "bajka", "bakren", "bambusov", "barvan", "barvanje", "baseballski", "bazar", "bazičen", "belina", "bezgov", "bičati", "bife", "bilka", "biomasa", "biotop", "birma", "bivol", "blago", "blaženost", "bliskavica", "bobnič", "bolha", "bolnišnica", "bor", "borov", "borovničev", "brati", "briljant", "briti", "brusiti", "bučanje", "cikličen", "civilizacija", "dopust", "drama", "drezati", "duda", "dvorezen", "embalaža", "faks", "farsa", "glasno", "informiranje", "interier", "intima", "intimno", "investirati", "ironično", "istovetiti", "izvožen", "jagoda", "jeklar", "jezik", "karbon", "kitara", "kodrast", "molče", "mučiti", "novinarski", "obala", "občevati", "okrasiti", "pajčevina", "panoga", "prevajanje", "prevajati", "previti", "prihraniti", "priloga", "prisluškovati", "sopara"] @@ -15,14 +16,14 @@ def main(args): for fidx, filename in enumerate(filepaths): with open(filename, 'r') as fp: - print("loading next...", end="", flush=True) + logging.info("loading next...") line = 
fp.readline() lemma_rows = [idx for idx, cell in enumerate(line.split(",")) if "_Lemma" in cell] file_lines = fp.read().split("\n") for lidx, good_lemma in enumerate(good_lemmas): spaces = " " * 20 if lidx == 0 else "" - print("\r{}.{} / {}.{}{}".format(fidx, lidx, N2, N1, spaces), end="", flush=True) + logging.info("\r{}.{} / {}.{}{}".format(fidx, lidx, N2, N1, spaces)) for line in file_lines: if good_lemma not in line: diff --git a/luscenje_struktur/formatter.py b/luscenje_struktur/formatter.py index 2ea8316..93a236d 100644 --- a/luscenje_struktur/formatter.py +++ b/luscenje_struktur/formatter.py @@ -1,5 +1,6 @@ from math import log2 import re +import logging from luscenje_struktur.component import ComponentType @@ -165,9 +166,9 @@ class StatsFormatter(Formatter): new_key = (sidx, idx, '') if new_key in self.colocation_ids.dispersions: key = new_key - print('Dispersions fixed.') + logging.info('Dispersions fixed.') else: - print('Dispersions not fixed.') + logging.info('Dispersions not fixed.') if key in self.colocation_ids.dispersions: distribution = self.colocation_ids.dispersions[key] else: diff --git a/luscenje_struktur/loader.py b/luscenje_struktur/loader.py index e77ff88..ddbf25b 100644 --- a/luscenje_struktur/loader.py +++ b/luscenje_struktur/loader.py @@ -29,13 +29,13 @@ def load_files(args, database, w_collection=None, input_corpus=None): database.init("CREATE TABLE Files ( filename varchar(2048) )") for idx, fname in enumerate(filenames): - print("FILE ", fname, "{}/{}".format(idx, len(filenames))) + logging.info("FILE %s %s/%s", fname, idx, len(filenames)) extension = pathlib.Path(fname).suffix # check if file with the same name already loaded... 
loaded = database.execute("SELECT * FROM Files WHERE filename=?", (fname,)).fetchone() if loaded is not None: - print("ALREADY LOADED") + logging.info("ALREADY LOADED") continue if extension == ".xml": diff --git a/luscenje_struktur/match_store.py b/luscenje_struktur/match_store.py index ec24a64..bcbadb5 100644 --- a/luscenje_struktur/match_store.py +++ b/luscenje_struktur/match_store.py @@ -2,6 +2,7 @@ import gc from collections import defaultdict from ast import literal_eval from time import time +import logging from luscenje_struktur.match import StructureMatch from luscenje_struktur.representation_assigner import RepresentationAssigner @@ -104,7 +105,7 @@ class MatchStore: def set_representations(self, word_renderer, structures, sloleks_db=None): step_name = 'representation' if self.db.is_step_done(step_name): - print("Representation step already done, skipping") + logging.info("Representation step already done, skipping") return num_inserts = 1000 @@ -148,7 +149,7 @@ class MatchStore: dispersions[(str(structure_id), component_id, lemma)] += 1 self.dispersions = dict(dispersions) - print("Storing dispersions...") + logging.info("Storing dispersions...") self.store_dispersions() self.db.step_is_done(step_name) diff --git a/luscenje_struktur/progress_bar.py b/luscenje_struktur/progress_bar.py index 4b4bcf6..6a16905 100644 --- a/luscenje_struktur/progress_bar.py +++ b/luscenje_struktur/progress_bar.py @@ -1,4 +1,5 @@ import time +import logging try: from tqdm import tqdm @@ -21,10 +22,10 @@ class Progress: for n, el in enumerate(iterable): now = time.time() if now - last_report > REPORT_ON: - print("\r{}: {}/{}".format(description, n, total), end="") + logging.info("\r{}: {}/{}".format(description, n, total)) last_report = now yield el - print(" -> {}".format(time.time() - start_time)) + logging.info(" -> {}".format(time.time() - start_time)) else: yield from tqdm(iterable, desc=description, total=total) diff --git a/luscenje_struktur/sloleks_db.py 
b/luscenje_struktur/sloleks_db.py index 1e5f8c7..626d483 100644 --- a/luscenje_struktur/sloleks_db.py +++ b/luscenje_struktur/sloleks_db.py @@ -1,17 +1,17 @@ import gc -from psycopg2cffi import compat -compat.register() - -from sqlalchemy.ext.declarative import declarative_base -from sqlalchemy.orm import Session, aliased -from sqlalchemy import create_engine - from luscenje_struktur.codes_tagset import TAGSET, CODES, CODES_TRANSLATION, POSSIBLE_WORD_FORM_FEATURE_VALUES class SloleksDatabase: def __init__(self, db, load_sloleks): + from psycopg2cffi import compat + compat.register() + + from sqlalchemy.ext.declarative import declarative_base + from sqlalchemy.orm import Session + from sqlalchemy import create_engine + global Lexeme, LexemeFeature, SyntacticStructure, StructureComponent, Feature, LexicalUnitLexeme, LexicalUnit, LexicalUnitType, Category, Sense, Measure, LexicalUnitMeasure, Corpus, Definition, WordForm, WordFormFeature, FormRepresentation [db_user, db_password, db_database, db_host] = db.split(':') @@ -153,6 +153,7 @@ class SloleksDatabase: def get_word_form(self, lemma, msd, data, align_msd=False): # modify msd as required + from sqlalchemy.orm import aliased msd = list(msd) if 'msd' in data: for key, value in data['msd'].items(): diff --git a/luscenje_struktur/time_info.py b/luscenje_struktur/time_info.py index 25bfa0d..34bd9f6 100644 --- a/luscenje_struktur/time_info.py +++ b/luscenje_struktur/time_info.py @@ -1,4 +1,5 @@ from datetime import timedelta, datetime +import logging class TimeInfo: def __init__(self, to_go): @@ -14,5 +15,5 @@ class TimeInfo: seconds = sum(self.times) / len(self.times) td = timedelta(seconds = int(seconds * self.to_go)) ft = datetime.now() + td - print("Going to finish in {}".format(ft.strftime("%d/%m @ %H:%M"))) + logging.info("Going to finish in {}".format(ft.strftime("%d/%m @ %H:%M"))) diff --git a/luscenje_struktur/word_stats.py b/luscenje_struktur/word_stats.py index 64472e5..a26e48d 100644 --- 
a/luscenje_struktur/word_stats.py +++ b/luscenje_struktur/word_stats.py @@ -1,7 +1,7 @@ from collections import defaultdict, Counter from luscenje_struktur.progress_bar import progress - +import logging class WordStats: def __init__(self, lemma_features, db): @@ -46,7 +46,7 @@ class WordStats: def generate_renders(self): step_name = 'generate_renders' if self.db.is_step_done(step_name): - print("Skipping GenerateRenders, already complete") + logging.info("Skipping GenerateRenders, already complete") return lemmas = [lemma for (lemma, ) in self.db.execute("SELECT DISTINCT lemma FROM UniqWords")]