Modified prints + sqlalchemy and psycopg2cffi made optional
This commit is contained in:
parent
39692e839f
commit
d67976c3d9
|
@ -2,6 +2,7 @@ import argparse
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
import tqdm
|
import tqdm
|
||||||
|
import logging
|
||||||
|
|
||||||
good_lemmas = ["absurd", "absurdnost", "akuten", "akutno", "alkohol", "alkoholen", "aluminijast", "ananas", "aplikacija", "aplikativen", "aranžma", "arbiter", "armada", "avtomatičen", "avtomatiziran", "babica", "bajen", "bajka", "bakren", "bambusov", "barvan", "barvanje", "baseballski", "bazar", "bazičen", "belina", "bezgov", "bičati", "bife", "bilka", "biomasa", "biotop", "birma", "bivol", "blago", "blaženost", "bliskavica", "bobnič", "bolha", "bolnišnica", "bor", "borov", "borovničev", "brati", "briljant", "briti", "brusiti", "bučanje", "cikličen", "civilizacija", "dopust", "drama", "drezati", "duda", "dvorezen", "embalaža", "faks", "farsa", "glasno", "informiranje", "interier", "intima", "intimno", "investirati", "ironično", "istovetiti", "izvožen", "jagoda", "jeklar", "jezik", "karbon", "kitara", "kodrast", "molče", "mučiti", "novinarski", "obala", "občevati", "okrasiti", "pajčevina", "panoga", "prevajanje", "prevajati", "previti", "prihraniti", "priloga", "prisluškovati", "sopara"]
|
good_lemmas = ["absurd", "absurdnost", "akuten", "akutno", "alkohol", "alkoholen", "aluminijast", "ananas", "aplikacija", "aplikativen", "aranžma", "arbiter", "armada", "avtomatičen", "avtomatiziran", "babica", "bajen", "bajka", "bakren", "bambusov", "barvan", "barvanje", "baseballski", "bazar", "bazičen", "belina", "bezgov", "bičati", "bife", "bilka", "biomasa", "biotop", "birma", "bivol", "blago", "blaženost", "bliskavica", "bobnič", "bolha", "bolnišnica", "bor", "borov", "borovničev", "brati", "briljant", "briti", "brusiti", "bučanje", "cikličen", "civilizacija", "dopust", "drama", "drezati", "duda", "dvorezen", "embalaža", "faks", "farsa", "glasno", "informiranje", "interier", "intima", "intimno", "investirati", "ironično", "istovetiti", "izvožen", "jagoda", "jeklar", "jezik", "karbon", "kitara", "kodrast", "molče", "mučiti", "novinarski", "obala", "občevati", "okrasiti", "pajčevina", "panoga", "prevajanje", "prevajati", "previti", "prihraniti", "priloga", "prisluškovati", "sopara"]
|
||||||
|
|
||||||
|
@ -15,14 +16,14 @@ def main(args):
|
||||||
|
|
||||||
for fidx, filename in enumerate(filepaths):
|
for fidx, filename in enumerate(filepaths):
|
||||||
with open(filename, 'r') as fp:
|
with open(filename, 'r') as fp:
|
||||||
print("loading next...", end="", flush=True)
|
logging.info("loading next...")
|
||||||
line = fp.readline()
|
line = fp.readline()
|
||||||
lemma_rows = [idx for idx, cell in enumerate(line.split(",")) if "_Lemma" in cell]
|
lemma_rows = [idx for idx, cell in enumerate(line.split(",")) if "_Lemma" in cell]
|
||||||
file_lines = fp.read().split("\n")
|
file_lines = fp.read().split("\n")
|
||||||
|
|
||||||
for lidx, good_lemma in enumerate(good_lemmas):
|
for lidx, good_lemma in enumerate(good_lemmas):
|
||||||
spaces = " " * 20 if lidx == 0 else ""
|
spaces = " " * 20 if lidx == 0 else ""
|
||||||
print("\r{}.{} / {}.{}{}".format(fidx, lidx, N2, N1, spaces), end="", flush=True)
|
logging.info("\r{}.{} / {}.{}{}".format(fidx, lidx, N2, N1, spaces))
|
||||||
|
|
||||||
for line in file_lines:
|
for line in file_lines:
|
||||||
if good_lemma not in line:
|
if good_lemma not in line:
|
||||||
|
|
|
@ -1,5 +1,6 @@
|
||||||
from math import log2
|
from math import log2
|
||||||
import re
|
import re
|
||||||
|
import logging
|
||||||
|
|
||||||
from luscenje_struktur.component import ComponentType
|
from luscenje_struktur.component import ComponentType
|
||||||
|
|
||||||
|
@ -165,9 +166,9 @@ class StatsFormatter(Formatter):
|
||||||
new_key = (sidx, idx, '')
|
new_key = (sidx, idx, '')
|
||||||
if new_key in self.colocation_ids.dispersions:
|
if new_key in self.colocation_ids.dispersions:
|
||||||
key = new_key
|
key = new_key
|
||||||
print('Dispersions fixed.')
|
logging.info('Dispersions fixed.')
|
||||||
else:
|
else:
|
||||||
print('Dispersions not fixed.')
|
logging.info('Dispersions not fixed.')
|
||||||
if key in self.colocation_ids.dispersions:
|
if key in self.colocation_ids.dispersions:
|
||||||
distribution = self.colocation_ids.dispersions[key]
|
distribution = self.colocation_ids.dispersions[key]
|
||||||
else:
|
else:
|
||||||
|
|
|
@ -29,13 +29,13 @@ def load_files(args, database, w_collection=None, input_corpus=None):
|
||||||
database.init("CREATE TABLE Files ( filename varchar(2048) )")
|
database.init("CREATE TABLE Files ( filename varchar(2048) )")
|
||||||
|
|
||||||
for idx, fname in enumerate(filenames):
|
for idx, fname in enumerate(filenames):
|
||||||
print("FILE ", fname, "{}/{}".format(idx, len(filenames)))
|
# Format the message in one piece: plain `+` concatenation dropped the space between the file name and the "idx/total" counter that print() used to insert.
logging.info("FILE {} {}/{}".format(fname, idx, len(filenames)))
|
||||||
extension = pathlib.Path(fname).suffix
|
extension = pathlib.Path(fname).suffix
|
||||||
|
|
||||||
# check if file with the same name already loaded...
|
# check if file with the same name already loaded...
|
||||||
loaded = database.execute("SELECT * FROM Files WHERE filename=?", (fname,)).fetchone()
|
loaded = database.execute("SELECT * FROM Files WHERE filename=?", (fname,)).fetchone()
|
||||||
if loaded is not None:
|
if loaded is not None:
|
||||||
print("ALREADY LOADED")
|
logging.info("ALREADY LOADED")
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if extension == ".xml":
|
if extension == ".xml":
|
||||||
|
|
|
@ -2,6 +2,7 @@ import gc
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
from ast import literal_eval
|
from ast import literal_eval
|
||||||
from time import time
|
from time import time
|
||||||
|
import logging
|
||||||
|
|
||||||
from luscenje_struktur.match import StructureMatch
|
from luscenje_struktur.match import StructureMatch
|
||||||
from luscenje_struktur.representation_assigner import RepresentationAssigner
|
from luscenje_struktur.representation_assigner import RepresentationAssigner
|
||||||
|
@ -104,7 +105,7 @@ class MatchStore:
|
||||||
def set_representations(self, word_renderer, structures, sloleks_db=None):
|
def set_representations(self, word_renderer, structures, sloleks_db=None):
|
||||||
step_name = 'representation'
|
step_name = 'representation'
|
||||||
if self.db.is_step_done(step_name):
|
if self.db.is_step_done(step_name):
|
||||||
print("Representation step already done, skipping")
|
logging.info("Representation step already done, skipping")
|
||||||
return
|
return
|
||||||
|
|
||||||
num_inserts = 1000
|
num_inserts = 1000
|
||||||
|
@ -148,7 +149,7 @@ class MatchStore:
|
||||||
dispersions[(str(structure_id), component_id, lemma)] += 1
|
dispersions[(str(structure_id), component_id, lemma)] += 1
|
||||||
|
|
||||||
self.dispersions = dict(dispersions)
|
self.dispersions = dict(dispersions)
|
||||||
print("Storing dispersions...")
|
logging.info("Storing dispersions...")
|
||||||
self.store_dispersions()
|
self.store_dispersions()
|
||||||
|
|
||||||
self.db.step_is_done(step_name)
|
self.db.step_is_done(step_name)
|
||||||
|
|
|
@ -1,4 +1,5 @@
|
||||||
import time
|
import time
|
||||||
|
import logging
|
||||||
|
|
||||||
try:
|
try:
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
@ -21,10 +22,10 @@ class Progress:
|
||||||
for n, el in enumerate(iterable):
|
for n, el in enumerate(iterable):
|
||||||
now = time.time()
|
now = time.time()
|
||||||
if now - last_report > REPORT_ON:
|
if now - last_report > REPORT_ON:
|
||||||
print("\r{}: {}/{}".format(description, n, total), end="")
|
# logging.info() has no print-style `end` keyword; leaving it in raises TypeError as soon as INFO-level logging is enabled.
logging.info("\r{}: {}/{}".format(description, n, total))
|
||||||
last_report = now
|
last_report = now
|
||||||
yield el
|
yield el
|
||||||
print(" -> {}".format(time.time() - start_time))
|
logging.info(" -> {}".format(time.time() - start_time))
|
||||||
else:
|
else:
|
||||||
yield from tqdm(iterable, desc=description, total=total)
|
yield from tqdm(iterable, desc=description, total=total)
|
||||||
|
|
||||||
|
|
|
@ -1,17 +1,17 @@
|
||||||
import gc
|
import gc
|
||||||
|
|
||||||
from psycopg2cffi import compat
|
|
||||||
compat.register()
|
|
||||||
|
|
||||||
from sqlalchemy.ext.declarative import declarative_base
|
|
||||||
from sqlalchemy.orm import Session, aliased
|
|
||||||
from sqlalchemy import create_engine
|
|
||||||
|
|
||||||
from luscenje_struktur.codes_tagset import TAGSET, CODES, CODES_TRANSLATION, POSSIBLE_WORD_FORM_FEATURE_VALUES
|
from luscenje_struktur.codes_tagset import TAGSET, CODES, CODES_TRANSLATION, POSSIBLE_WORD_FORM_FEATURE_VALUES
|
||||||
|
|
||||||
|
|
||||||
class SloleksDatabase:
|
class SloleksDatabase:
|
||||||
def __init__(self, db, load_sloleks):
|
def __init__(self, db, load_sloleks):
|
||||||
|
from psycopg2cffi import compat
|
||||||
|
compat.register()
|
||||||
|
|
||||||
|
from sqlalchemy.ext.declarative import declarative_base
|
||||||
|
from sqlalchemy.orm import Session
|
||||||
|
from sqlalchemy import create_engine
|
||||||
|
|
||||||
global Lexeme, LexemeFeature, SyntacticStructure, StructureComponent, Feature, LexicalUnitLexeme, LexicalUnit, LexicalUnitType, Category, Sense, Measure, LexicalUnitMeasure, Corpus, Definition, WordForm, WordFormFeature, FormRepresentation
|
global Lexeme, LexemeFeature, SyntacticStructure, StructureComponent, Feature, LexicalUnitLexeme, LexicalUnit, LexicalUnitType, Category, Sense, Measure, LexicalUnitMeasure, Corpus, Definition, WordForm, WordFormFeature, FormRepresentation
|
||||||
[db_user, db_password, db_database, db_host] = db.split(':')
|
[db_user, db_password, db_database, db_host] = db.split(':')
|
||||||
|
|
||||||
|
@ -153,6 +153,7 @@ class SloleksDatabase:
|
||||||
|
|
||||||
def get_word_form(self, lemma, msd, data, align_msd=False):
|
def get_word_form(self, lemma, msd, data, align_msd=False):
|
||||||
# modify msd as required
|
# modify msd as required
|
||||||
|
from sqlalchemy.orm import aliased
|
||||||
msd = list(msd)
|
msd = list(msd)
|
||||||
if 'msd' in data:
|
if 'msd' in data:
|
||||||
for key, value in data['msd'].items():
|
for key, value in data['msd'].items():
|
||||||
|
|
|
@ -1,4 +1,5 @@
|
||||||
from datetime import timedelta, datetime
|
from datetime import timedelta, datetime
|
||||||
|
import logging
|
||||||
|
|
||||||
class TimeInfo:
|
class TimeInfo:
|
||||||
def __init__(self, to_go):
|
def __init__(self, to_go):
|
||||||
|
@ -14,5 +15,5 @@ class TimeInfo:
|
||||||
seconds = sum(self.times) / len(self.times)
|
seconds = sum(self.times) / len(self.times)
|
||||||
td = timedelta(seconds = int(seconds * self.to_go))
|
td = timedelta(seconds = int(seconds * self.to_go))
|
||||||
ft = datetime.now() + td
|
ft = datetime.now() + td
|
||||||
print("Going to finish in {}".format(ft.strftime("%d/%m @ %H:%M")))
|
logging.info("Going to finish in {}".format(ft.strftime("%d/%m @ %H:%M")))
|
||||||
|
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
from collections import defaultdict, Counter
|
from collections import defaultdict, Counter
|
||||||
|
|
||||||
from luscenje_struktur.progress_bar import progress
|
from luscenje_struktur.progress_bar import progress
|
||||||
|
import logging
|
||||||
|
|
||||||
class WordStats:
|
class WordStats:
|
||||||
def __init__(self, lemma_features, db):
|
def __init__(self, lemma_features, db):
|
||||||
|
@ -46,7 +46,7 @@ class WordStats:
|
||||||
def generate_renders(self):
|
def generate_renders(self):
|
||||||
step_name = 'generate_renders'
|
step_name = 'generate_renders'
|
||||||
if self.db.is_step_done(step_name):
|
if self.db.is_step_done(step_name):
|
||||||
print("Skipping GenerateRenders, already complete")
|
logging.info("Skipping GenerateRenders, already complete")
|
||||||
return
|
return
|
||||||
|
|
||||||
lemmas = [lemma for (lemma, ) in self.db.execute("SELECT DISTINCT lemma FROM UniqWords")]
|
lemmas = [lemma for (lemma, ) in self.db.execute("SELECT DISTINCT lemma FROM UniqWords")]
|
||||||
|
|
Loading…
Reference in New Issue
Block a user