Replace print calls with logging; make sqlalchemy and psycopg2cffi optional imports

Luka 2021-04-15 14:16:34 +02:00
parent 39692e839f
commit d67976c3d9
8 changed files with 26 additions and 20 deletions
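Note on the logging switch: the new logging.info() calls go to the root logger, which only emits WARNING and above by default, so the messages stay invisible unless the entry-point script configures logging. A minimal sketch of such a configuration (the format string and level are assumptions, not part of this commit):

import logging

# Configure the root logger once, in the entry-point script, so the
# logging.info() calls introduced by this commit are actually emitted.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(message)s",
)
logging.info("logging configured")  # now visible on stderr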

View File

@@ -2,6 +2,7 @@ import argparse
 import os
 import sys
 import tqdm
+import logging
 good_lemmas = ["absurd", "absurdnost", "akuten", "akutno", "alkohol", "alkoholen", "aluminijast", "ananas", "aplikacija", "aplikativen", "aranžma", "arbiter", "armada", "avtomatičen", "avtomatiziran", "babica", "bajen", "bajka", "bakren", "bambusov", "barvan", "barvanje", "baseballski", "bazar", "bazičen", "belina", "bezgov", "bičati", "bife", "bilka", "biomasa", "biotop", "birma", "bivol", "blago", "blaženost", "bliskavica", "bobnič", "bolha", "bolnišnica", "bor", "borov", "borovničev", "brati", "briljant", "briti", "brusiti", "bučanje", "cikličen", "civilizacija", "dopust", "drama", "drezati", "duda", "dvorezen", "embalaža", "faks", "farsa", "glasno", "informiranje", "interier", "intima", "intimno", "investirati", "ironično", "istovetiti", "izvožen", "jagoda", "jeklar", "jezik", "karbon", "kitara", "kodrast", "molče", "mučiti", "novinarski", "obala", "občevati", "okrasiti", "pajčevina", "panoga", "prevajanje", "prevajati", "previti", "prihraniti", "priloga", "prisluškovati", "sopara"]
@@ -15,14 +16,14 @@ def main(args):
     for fidx, filename in enumerate(filepaths):
         with open(filename, 'r') as fp:
-            print("loading next...", end="", flush=True)
+            logging.info("loading next...")
             line = fp.readline()
             lemma_rows = [idx for idx, cell in enumerate(line.split(",")) if "_Lemma" in cell]
             file_lines = fp.read().split("\n")
             for lidx, good_lemma in enumerate(good_lemmas):
                 spaces = " " * 20 if lidx == 0 else ""
-                print("\r{}.{} / {}.{}{}".format(fidx, lidx, N2, N1, spaces), end="", flush=True)
+                logging.info("\r{}.{} / {}.{}{}".format(fidx, lidx, N2, N1, spaces))
                 for line in file_lines:
                     if good_lemma not in line:

View File

@@ -1,5 +1,6 @@
 from math import log2
 import re
+import logging
 from luscenje_struktur.component import ComponentType
@@ -165,9 +166,9 @@ class StatsFormatter(Formatter):
            new_key = (sidx, idx, '')
            if new_key in self.colocation_ids.dispersions:
                key = new_key
-                print('Dispersions fixed.')
+                logging.info('Dispersions fixed.')
            else:
-                print('Dispersions not fixed.')
+                logging.info('Dispersions not fixed.')
        if key in self.colocation_ids.dispersions:
            distribution = self.colocation_ids.dispersions[key]
        else:

View File

@@ -29,13 +29,13 @@ def load_files(args, database, w_collection=None, input_corpus=None):
         database.init("CREATE TABLE Files ( filename varchar(2048) )")
         for idx, fname in enumerate(filenames):
-            print("FILE ", fname, "{}/{}".format(idx, len(filenames)))
+            logging.info("FILE " + fname + " {}/{}".format(idx, len(filenames)))
             extension = pathlib.Path(fname).suffix
             # check if file with the same name already loaded...
             loaded = database.execute("SELECT * FROM Files WHERE filename=?", (fname,)).fetchone()
             if loaded is not None:
-                print("ALREADY LOADED")
+                logging.info("ALREADY LOADED")
                 continue
             if extension == ".xml":
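The load_files() hunk keeps the existing skip-already-loaded logic: every processed filename is recorded in a Files table and a parameterized SELECT decides whether to skip it on a rerun. A rough, self-contained sketch of that pattern, with sqlite3 standing in for the project's database wrapper (the wrapper API and the sample filenames are assumptions):

import logging
import sqlite3

logging.basicConfig(level=logging.INFO)

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE Files ( filename varchar(2048) )")

filenames = ["a.xml", "b.xml", "a.xml"]  # hypothetical input list with a duplicate
for idx, fname in enumerate(filenames):
    logging.info("FILE %s %d/%d", fname, idx, len(filenames))
    # skip files already registered by an earlier (possibly interrupted) run
    if conn.execute("SELECT * FROM Files WHERE filename=?", (fname,)).fetchone() is not None:
        logging.info("ALREADY LOADED")
        continue
    conn.execute("INSERT INTO Files VALUES (?)", (fname,))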

View File

@@ -2,6 +2,7 @@ import gc
 from collections import defaultdict
 from ast import literal_eval
 from time import time
+import logging
 from luscenje_struktur.match import StructureMatch
 from luscenje_struktur.representation_assigner import RepresentationAssigner
@@ -104,7 +105,7 @@ class MatchStore:
     def set_representations(self, word_renderer, structures, sloleks_db=None):
         step_name = 'representation'
         if self.db.is_step_done(step_name):
-            print("Representation step already done, skipping")
+            logging.info("Representation step already done, skipping")
             return
         num_inserts = 1000
@@ -148,7 +149,7 @@ class MatchStore:
                 dispersions[(str(structure_id), component_id, lemma)] += 1
         self.dispersions = dict(dispersions)
-        print("Storing dispersions...")
+        logging.info("Storing dispersions...")
         self.store_dispersions()
         self.db.step_is_done(step_name)
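set_representations() and the dispersion counting both rely on the store's step checkpoints (is_step_done / step_is_done), so a completed stage is skipped on a rerun. A small sketch of that checkpoint idea, with a plain set standing in for the database-backed bookkeeping (the helper names below are illustrative, not the project's API):

import logging

logging.basicConfig(level=logging.INFO)

_done_steps = set()  # stands in for the table of completed steps

def is_step_done(step_name):
    return step_name in _done_steps

def step_is_done(step_name):
    _done_steps.add(step_name)

def run_step(step_name, work):
    # skip work that a previous run already finished and recorded
    if is_step_done(step_name):
        logging.info("%s step already done, skipping", step_name)
        return
    work()
    step_is_done(step_name)

run_step("representation", lambda: logging.info("Storing dispersions..."))
run_step("representation", lambda: None)  # second call is skipped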

View File

@ -1,4 +1,5 @@
import time import time
import logging
try: try:
from tqdm import tqdm from tqdm import tqdm
@@ -21,10 +22,10 @@ class Progress:
            for n, el in enumerate(iterable):
                now = time.time()
                if now - last_report > REPORT_ON:
-                    print("\r{}: {}/{}".format(description, n, total), end="")
+                    logging.info("{}: {}/{}".format(description, n, total))
                    last_report = now
                yield el
-            print(" -> {}".format(time.time() - start_time))
+            logging.info(" -> {}".format(time.time() - start_time))
        else:
            yield from tqdm(iterable, desc=description, total=total)
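The dropped end="" and "\r" were only meaningful for rewriting a single terminal line with print(); logging.info() emits one record per call and does not accept end= or flush= keyword arguments, so the time-based throttle (REPORT_ON) is what keeps the log volume reasonable. A sketch of that throttled progress reporting (the REPORT_ON value and the demo iterable are chosen for illustration):

import logging
import time

logging.basicConfig(level=logging.INFO)

REPORT_ON = 0.5  # seconds between progress records; value assumed for the demo

def progress(iterable, description, total):
    start_time = time.time()
    last_report = start_time
    for n, el in enumerate(iterable):
        now = time.time()
        if now - last_report > REPORT_ON:
            # at most one log record per REPORT_ON seconds
            logging.info("%s: %d/%d", description, n, total)
            last_report = now
        yield el
    logging.info("%s -> %.2fs", description, time.time() - start_time)

for _ in progress(range(2_000_000), "demo", 2_000_000):
    pass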

View File

@@ -1,17 +1,17 @@
 import gc
-from psycopg2cffi import compat
-compat.register()
-from sqlalchemy.ext.declarative import declarative_base
-from sqlalchemy.orm import Session, aliased
-from sqlalchemy import create_engine
 from luscenje_struktur.codes_tagset import TAGSET, CODES, CODES_TRANSLATION, POSSIBLE_WORD_FORM_FEATURE_VALUES
 class SloleksDatabase:
     def __init__(self, db, load_sloleks):
+        from psycopg2cffi import compat
+        compat.register()
+        from sqlalchemy.ext.declarative import declarative_base
+        from sqlalchemy.orm import Session
+        from sqlalchemy import create_engine
         global Lexeme, LexemeFeature, SyntacticStructure, StructureComponent, Feature, LexicalUnitLexeme, LexicalUnit, LexicalUnitType, Category, Sense, Measure, LexicalUnitMeasure, Corpus, Definition, WordForm, WordFormFeature, FormRepresentation
         [db_user, db_password, db_database, db_host] = db.split(':')
@@ -153,6 +153,7 @@ class SloleksDatabase:
     def get_word_form(self, lemma, msd, data, align_msd=False):
         # modify msd as required
+        from sqlalchemy.orm import aliased
         msd = list(msd)
         if 'msd' in data:
             for key, value in data['msd'].items():
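Moving the psycopg2cffi and sqlalchemy imports from module level into SloleksDatabase.__init__ (and aliased into get_word_form) is what makes those packages optional: the rest of luscenje_struktur imports fine without them, and an ImportError can only surface when a Sloleks connection is actually requested. A hedged sketch of the same deferral with an explicit error message (the RuntimeError wrapper and message text are assumptions, not part of the commit):

class SloleksDatabase:
    def __init__(self, db, load_sloleks):
        # Deferred imports: the heavy database dependencies are pulled in only
        # when a SloleksDatabase is constructed, so psycopg2cffi and sqlalchemy
        # stay optional for every other use of the package.
        try:
            from psycopg2cffi import compat
            from sqlalchemy import create_engine
            from sqlalchemy.orm import Session
        except ImportError as exc:
            raise RuntimeError(
                "SloleksDatabase requires the optional packages "
                "psycopg2cffi and sqlalchemy: {}".format(exc)
            )
        compat.register()
        db_user, db_password, db_database, db_host = db.split(':')
        # ... engine and session construction continue as in the original class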

View File

@ -1,4 +1,5 @@
from datetime import timedelta, datetime from datetime import timedelta, datetime
import logging
class TimeInfo: class TimeInfo:
def __init__(self, to_go): def __init__(self, to_go):
@@ -14,5 +15,5 @@ class TimeInfo:
         seconds = sum(self.times) / len(self.times)
         td = timedelta(seconds = int(seconds * self.to_go))
         ft = datetime.now() + td
-        print("Going to finish in {}".format(ft.strftime("%d/%m @ %H:%M")))
+        logging.info("Going to finish in {}".format(ft.strftime("%d/%m @ %H:%M")))

View File

@@ -1,7 +1,7 @@
 from collections import defaultdict, Counter
 from luscenje_struktur.progress_bar import progress
+import logging
 class WordStats:
     def __init__(self, lemma_features, db):
@@ -46,7 +46,7 @@ class WordStats:
     def generate_renders(self):
         step_name = 'generate_renders'
         if self.db.is_step_done(step_name):
-            print("Skipping GenerateRenders, already complete")
+            logging.info("Skipping GenerateRenders, already complete")
             return
         lemmas = [lemma for (lemma, ) in self.db.execute("SELECT DISTINCT lemma FROM UniqWords")]