STARK-web/app.py

466 lines
19 KiB
Python
Raw Normal View History

2023-10-04 15:24:40 +00:00
import configparser
import csv
import hashlib
2024-11-17 18:48:06 +00:00
import json
2023-10-04 15:24:40 +00:00
import os
import random
import re
2024-11-17 18:48:06 +00:00
import shutil
import string
import time
2024-11-17 18:48:06 +00:00
from pathlib import Path
2023-10-04 15:24:40 +00:00
import requests
from flask import Flask, render_template, request, send_file, redirect, url_for
2024-02-19 14:33:19 +00:00
from flask_babel import Babel, gettext
2023-10-04 15:24:40 +00:00
from werkzeug.utils import secure_filename
from stark import run
# Value stored under app.config['UPLOAD_FOLDER'] when the app is created.
UPLOAD_FOLDER = 'uploads'
# File extensions accepted by allowed_file().
ALLOWED_EXTENSIONS = {'conllu'}
# Entries under media/ older than this many days are purged on download.
DAYS_BEFORE_DELETION = 7

# Interface languages selectable through the ``lang`` query parameter.
LANGUAGES = ['en', 'sl']
DEFAULT_LANGUAGE = 'en'
# One (initially empty) dictionary per language, handed to the index template.
_translations = {language: {} for language in LANGUAGES}
def get_locale():
    """Pick the interface language for the current request.

    Returns the ``lang`` query parameter when it is one of ``LANGUAGES``
    and ``DEFAULT_LANGUAGE`` otherwise (including when it is absent).
    """
    requested = request.args.get('lang')
    return requested if requested in LANGUAGES else DEFAULT_LANGUAGE
2023-10-04 15:24:40 +00:00
def ilog(n, base):
    """
    Find the integer log of n with respect to the base.

    The result is the number of times n can be floor-divided by base
    before it drops below base, so any n smaller than base (zero and
    negative values included) yields 0.

    Raises ValueError when base is not greater than 1: the division loop
    would never finish for such bases (or divide by zero for base 0).

    >>> import math
    >>> for base in range(2, 16 + 1):
    ...     for n in range(1, 1000):
    ...         assert ilog(n, base) == int(math.log(n, base) + 1e-10), '%s %s' % (n, base)
    """
    if base <= 1:
        raise ValueError('base must be greater than 1')
    count = 0
    while n >= base:
        count += 1
        n //= base
    return count
def sci_notation(n, prec=2):
    """
    Represent n in scientific notation, with the specified precision.

    Values strictly between -100000 and 100000 are returned unchanged as
    str(n). Larger magnitudes are rendered as '<mantissa>e<sign><exponent>'
    with prec digits after the decimal point; negative values are formatted
    from their absolute value and prefixed with '-'.

    >>> sci_notation(1234 * 10**1000)
    '1.23e+1003'
    >>> sci_notation(10**1000 // 2, prec=1)
    '5.0e+999'
    >>> sci_notation(999999)
    '1.00e+6'
    """
    if -100000 < n < 100000:
        return str(n)
    if n < 0:
        return "-" + sci_notation(-n, prec=prec)
    base = 10
    exponent = ilog(n, base)
    mantissa = n / base**exponent
    digits = '{0:.{1}f}'.format(mantissa, prec)
    if float(digits) >= base:
        # Rounding to prec digits carried the mantissa up to 10 (e.g.
        # 9.99999 -> '10.00'); shift one decimal place so it stays below 10.
        exponent += 1
        digits = '{0:.{1}f}'.format(mantissa / base, prec)
    return '{0}e{1:+d}'.format(digits, exponent)
2023-12-06 15:35:15 +00:00
def create_app():
2024-02-21 10:10:31 +00:00
app = Flask(__name__, static_url_path='/stark/static')
2024-02-19 14:33:19 +00:00
babel = Babel(app, locale_selector=get_locale, default_translation_directories='translations')
babel.list_translations = ['en', 'sl']
2023-12-06 15:35:15 +00:00
app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
2023-10-04 15:24:40 +00:00
2023-12-06 15:35:15 +00:00
def create_default_configs():
    """Build a fresh dictionary holding the default STARK settings.

    A new dictionary (with new list objects) is returned on every call.
    ``other_input_path`` is only added when ``compare`` is set, which the
    defaults below never do.
    """
    configs = {
        # mandatory parameters
        'input_path': 'data/sl_ssj-ud_v2.4.conllu',
        'output': 'results/out_official.tsv',
        'tree_size': '2-4',
        'node_type': 'upos',
        # mandatory parameters with default value
        'internal_saves': './internal_saves',
        'cpu_cores': 1,
        'complete_tree_type': True,
        'dependency_type': True,
        'node_order': True,
        'association_measures': False,
        'label_whitelist': [],
        'root_whitelist': [],
        'query': None,
        'compare': None,
        'frequency_threshold': 0,
        'lines_threshold': None,
        'continuation_processing': False,
        'nodes_number': True,
        'print_root': True,
    }
    compare_path = configs['compare']
    if compare_path is not None:
        configs['other_input_path'] = compare_path
    return configs
2023-10-04 15:24:40 +00:00
2023-12-06 15:35:15 +00:00
def allowed_file(filename):
    """Tell whether filename ends in one of the accepted extensions.

    The text after the last '.' is compared case-insensitively against
    ALLOWED_EXTENSIONS; a name without any dot is rejected.
    """
    _stem, dot, extension = filename.rpartition('.')
    if not dot:
        return False
    return extension.lower() in ALLOWED_EXTENSIONS
2024-02-21 10:10:31 +00:00
@app.route('/stark/about', methods=['GET'])
def about():
    """Render the ``about.html`` template."""
    template_name = 'about.html'
    return render_template(template_name)
2024-11-17 18:48:06 +00:00
@app.route('/stark/visualization/<result_id>/<sentence_id>/<subtree_ids>', methods=['GET'])
def visualization(result_id, sentence_id, subtree_ids):
    """Return the annodoc text of one sentence with highlighted nodes.

    ``subtree_ids`` is a '+'-separated list; one ``# visual-style`` line is
    emitted per id, followed by the content of
    ``media/<result_id>/annodoc/<sentence_id>`` and a blank line.

    Returns ``{'annodoc': <text>}``. When a URL segment is not a plain file
    name, or the sentence file does not exist, an empty annodoc is returned
    with status 404.
    """
    # Both segments come straight from the URL and are joined into a file
    # path, so refuse anything that could step out of the result directory.
    for segment in (result_id, sentence_id):
        if segment in ('.', '..') or os.path.basename(segment) != segment:
            return {'annodoc': ''}, 404

    sentence_path = os.path.join('media', result_id, 'annodoc', sentence_id)
    try:
        with open(sentence_path, 'r') as rf:
            sentence = rf.read()
    except (FileNotFoundError, NotADirectoryError):
        return {'annodoc': ''}, 404

    style_lines = [
        f'# visual-style {subtree_id} bgColor:lightgreen\n'
        for subtree_id in subtree_ids.split('+')
    ]
    return {'annodoc': ''.join(style_lines) + sentence + '\n\n'}
@app.route('/stark/result/<result_id>/<subtree_hash>', methods=['GET'])
def examples(result_id, subtree_hash):
    """Render the example page for one subtree of a stored result.

    Reads ``media/<result_id>/result.tsv``, keeps the rows whose ``Annodoc``
    JSON carries the requested ``subtree_hash`` and combines them with the
    example ids and positions listed in
    ``media/<result_id>/annodoc_detailed/<subtree_hash>``.

    Responds with 404 when a URL segment is not a plain file name, when one
    of the two files is missing, or when the result file is empty.
    """
    from flask import abort  # not part of the module-level flask import

    # Both segments come straight from the URL and are joined into file
    # paths, so refuse anything that could step out of the result directory.
    for segment in (result_id, subtree_hash):
        if segment in ('.', '..') or os.path.basename(segment) != segment:
            abort(404)
    result_dir = os.path.join('media', result_id)

    try:
        with open(os.path.join(result_dir, 'result.tsv'), 'r') as rf:
            content = list(csv.reader(rf, delimiter='\t'))
        with open(os.path.join(result_dir, 'annodoc_detailed', subtree_hash), 'r') as rf:
            detailed_rows = list(csv.reader(rf, delimiter='\t'))
    except (FileNotFoundError, NotADirectoryError):
        abort(404)
    if not content:
        abort(404)

    head = content[0]
    # Result-file column name -> translated label shown in the table; the
    # dictionary order is the order in which columns are displayed.
    column_labels = {
        'Tree': gettext('Tree'),
        'Absolute frequency': gettext('Frequency'),
        'Absolute frequency in second treebank': gettext('Frequency in B'),
        'Order': gettext('Order'),
        'Number of nodes': gettext('Number of nodes'),
        'Head node': gettext('Head node'),
        'Grew-match URL': gettext('Grew-match URL'),
        'Ratio': gettext('Ratio'),
        '%DIFF': gettext('%DIFF'),
        'OR': gettext('OR'),
        'BIC': gettext('BIC'),
        'MI': gettext('MI'),
        'logDice': gettext('logDice'),
        't-score': gettext('t-score')
    }
    if 'Absolute frequency in second treebank' in head:
        column_labels['Absolute frequency'] = gettext('Frequency in A')
    else:
        del column_labels['Absolute frequency in second treebank']

    # find example details
    annodoc_index = head.index('Annodoc')
    content_dict = {h: [] for h in head}
    for row in content[1:]:
        if json.loads(row[annodoc_index])['subtree_hash'] != subtree_hash:
            continue
        # zip() ignores surplus cells instead of failing on rows that are
        # longer than the header.
        for column, value in zip(head, row):
            content_dict[column].append(value)

    head = [(k, v) for k, v in column_labels.items() if k in head]
    displayed_content_dict = {}
    for column, _label in head:
        values = content_dict[column]
        if column in ('%DIFF', 'OR'):
            # NOTE(review): eval() runs on text read from the result file;
            # if these cells are always plain numeric literals,
            # ast.literal_eval would be the safe equivalent - confirm.
            values = [sci_notation(eval(n)) for n in values]
        displayed_content_dict[column] = values

    # add visualization parts to dict
    visualization_dict = {'example_id': [], 'example_positions': []}
    for vis in detailed_rows:
        visualization_dict['example_id'].append(vis[0])
        # NOTE(review): same eval() concern as above; the second cell is
        # presumably a literal sequence of positions - confirm.
        visualization_dict['example_positions'].append('+'.join(map(str, eval(vis[1]))))
    return render_template('examples.html', head=head, content=displayed_content_dict,
                           visualization=visualization_dict)
2024-11-17 18:48:06 +00:00
2024-02-21 10:10:31 +00:00
@app.route('/stark/result/<result_id>', methods=['GET', 'POST'])
def result(result_id):
    """Show a stored result table (GET) or download it (POST).

    POST first purges entries under ``media/`` that are older than
    ``DAYS_BEFORE_DELETION`` days and then sends
    ``media/<result_id>/result.tsv`` as the attachment ``results.tsv``.

    GET reads the same file, optionally sorts it by the column named in the
    ``order_by`` query parameter (a translated label followed by one extra
    character, direction given by ``order_type``) and renders
    ``result.html``.

    Responds with 404 when ``result_id`` is not a plain directory name or,
    for GET, when the result file is missing or empty.
    """
    from flask import abort  # not part of the module-level flask import

    # result_id comes straight from the URL and is joined into a file path,
    # so refuse anything that could step out of media/.
    if result_id in ('.', '..') or os.path.basename(result_id) != result_id:
        abort(404)
    result_file = os.path.join('media', result_id, 'result.tsv')

    if request.method == 'POST':
        now = time.time()
        max_age_seconds = DAYS_BEFORE_DELETION * 86400
        for filename in os.listdir('media'):
            path = os.path.join('media', filename)
            is_dir = os.path.isdir(path)
            stamp_path = os.path.join(path, 'result.tsv') if is_dir else path
            try:
                modified = os.path.getmtime(stamp_path)
            except OSError:
                # A result directory without result.tsv must not abort the
                # download; age it by its own timestamp instead.
                try:
                    modified = os.path.getmtime(path)
                except OSError:
                    # Entry vanished in the meantime - nothing to purge.
                    continue
            if now - modified > max_age_seconds:
                if is_dir:
                    shutil.rmtree(path, ignore_errors=True)
                else:
                    try:
                        os.remove(path)
                    except FileNotFoundError:
                        # Already removed by a concurrent purge.
                        pass
        return send_file(result_file, as_attachment=True, download_name='results.tsv')

    try:
        with open(result_file, 'r') as rf:
            content = list(csv.reader(rf, delimiter='\t'))
    except (FileNotFoundError, NotADirectoryError):
        abort(404)
    if not content:
        abort(404)

    head = content[0]
    content_dict = {h: [] for h in head}
    # Result-file column name -> translated label shown in the table; the
    # dictionary order is the order in which columns are displayed.
    table_columns2displayed_table_columns = {
        'Tree': gettext('Tree'),
        'Absolute frequency': gettext('Frequency'),
        'Absolute frequency in second treebank': gettext('Frequency in B'),
        'Order': gettext('Order'),
        'Number of nodes': gettext('Number of nodes'),
        'Head node': gettext('Head node'),
        'Grew-match URL': gettext('Grew-match URL'),
        'Ratio': gettext('Ratio'),
        '%DIFF': gettext('%DIFF'),
        'OR': gettext('OR'),
        'BIC': gettext('BIC'),
        'MI': gettext('MI'),
        'logDice': gettext('logDice'),
        't-score': gettext('t-score')
    }
    if 'Absolute frequency in second treebank' in head:
        table_columns2displayed_table_columns['Absolute frequency'] = gettext('Frequency in A')
    else:
        del table_columns2displayed_table_columns['Absolute frequency in second treebank']
    displayed_table_columns2table_columns = {v: k for k, v in table_columns2displayed_table_columns.items()}

    order_by_display = request.args.get('order_by')
    order_type = request.args.get('order_type')
    order_by = None
    if order_by_display:
        # The last character of the parameter is dropped before the lookup;
        # an unknown label simply disables sorting instead of failing.
        order_by = displayed_table_columns2table_columns.get(order_by_display[:-1])

    if order_by is not None and order_by in head:
        sort_id = head.index(order_by)
        decimal_pattern = re.compile(r'^-?\d+(?:\.\d+)$')

        def sort_key(row):
            cell = row[sort_id] if sort_id < len(row) else ''
            # NOTE(review): as in the original expression, the 'Ratio'
            # exclusion only applies to the decimal pattern, not to
            # isnumeric() cells - confirm that this is intended.
            numeric = cell.isnumeric() or (
                decimal_pattern.match(cell) is not None and order_by != 'Ratio')
            if numeric:
                try:
                    # Negated so that the default direction lists the
                    # largest number first.
                    return (0, -float(cell), '')
                except ValueError:
                    # isnumeric() also accepts characters float() rejects.
                    pass
            # The leading rank keeps numbers and text comparable when a
            # column mixes both.
            return (1, 0.0, cell)

        ordered_content = sorted(content[1:], key=sort_key, reverse=(order_type == 'asc'))
    else:
        ordered_content = content[1:]

    for row in ordered_content:
        # zip() ignores surplus cells instead of failing on rows that are
        # longer than the header.
        for column, value in zip(head, row):
            content_dict[column].append(value)

    head = [(k, v) for k, v in table_columns2displayed_table_columns.items() if k in head]
    displayed_content_dict = {}
    for column, _label in head:
        values = content_dict[column]
        if column in ('%DIFF', 'OR'):
            # NOTE(review): eval() runs on text read from the result file;
            # if these cells are always plain numeric literals,
            # ast.literal_eval would be the safe equivalent - confirm.
            values = [sci_notation(eval(n)) for n in values]
        displayed_content_dict[column] = values

    # add visualization parts to dict
    annodoc_data = [json.loads(el) for el in content_dict['Annodoc']]
    displayed_content_dict['example_id'] = [el['id'] for el in annodoc_data]
    displayed_content_dict['example_positions'] = [
        '+'.join([str(p) for p in el['positions']]) for el in annodoc_data]
    displayed_content_dict['subtree_hash'] = [el['subtree_hash'] for el in annodoc_data]
    return render_template('result.html', head=head, content=displayed_content_dict)
2024-02-21 10:10:31 +00:00
@app.route('/stark/', methods=['GET', 'POST'])
def index():
    """Show the query form (GET) or run STARK with the submitted form (POST).

    On POST the form is turned into a STARK configuration: the treebank (and
    the optional comparison treebank) is taken from an uploaded file or
    downloaded from a URL into ``media/``, the remaining options are read
    from the form, and ``run(configs)`` writes its output into a new
    ``media/<random id>/`` directory. Validation problems, a failing run or
    a result without data rows re-render ``index.html`` with a
    ``validation`` dictionary; otherwise the client is redirected to the
    result page.
    """
    translations = _translations[get_locale()]
    if request.method != 'POST':
        return render_template('index.html', translations=translations)

    form = request.form
    validation = {}
    configs = {
        'greedy_counter': 'yes',
        'input_path': '',
        'tree_size': '1-1000000',
        'display_size': '1-1000000',
        'complete_tree_type': True,
        'ignored_labels': []
    }
    # Seconds to wait for a remote treebank; without a timeout requests.get()
    # may block the worker indefinitely.
    download_timeout = 30

    def store_upload(field):
        """Save the uploaded file of the given form field under media/."""
        storage = request.files[field]
        target = os.path.join('media', secure_filename(storage.filename))
        storage.save(target)
        return target

    def store_remote(url):
        """Download url into media/ and return the local path."""
        # Sanitise the name exactly like an uploaded file name.
        name = secure_filename(url.split('/')[-1])
        if not name:
            raise ValueError('URL does not end in a usable file name')
        # NOTE(review): the URL is user supplied and fetched server side;
        # consider restricting the allowed hosts/schemes.
        response = requests.get(url, timeout=download_timeout)
        # An HTTP error page must not be stored as if it were a treebank.
        response.raise_for_status()
        target = os.path.join('media', name)
        with open(target, 'wb') as wf:
            wf.write(response.content)
        return target

    # mandatory parameters
    # handling input
    input_url = form.get('input_url')
    if request.files.get('file'):
        configs['input_path'] = store_upload('file')
        if input_url:
            message = gettext('Please insert either input url or file, not both of them.')
            validation['file'] = message
            validation['input_url'] = message
        # TODO OPTIONALLY ADD conllu FILE CHECK
    elif input_url:
        try:
            configs['input_path'] = store_remote(input_url)
        except Exception:
            # Any failure to fetch or store the file is reported to the user
            # as a bad URL rather than as a server error.
            validation['input_url'] = gettext('Incorrect URL!')
    else:
        message = gettext('Please insert either input url or provide a file.')
        validation['file'] = message
        validation['input_url'] = message

    if form.get('display_size'):
        configs['display_size'] = form['display_size']

    def validate_node_type(node_type):
        """Record a validation error unless node_type is a non-empty list of known options."""
        node_type_options = {'upos', 'form', 'lemma', 'xpos', 'feats', 'deprel'}
        if len(node_type) == 0:
            validation['node_type'] = gettext('Please select at least one node type.')
            return False
        for el in node_type:
            if el not in node_type_options:
                validation['node_type'] = gettext('Node option') + f' {el} ' + gettext('is not supported. Please enter valid options.')
                return False
        return True

    node_type = [kind for kind in ('upos', 'form', 'lemma') if f'node_type_{kind}' in form]
    if 'node_type_none' in form:
        configs['node_type'] = None
    elif validate_node_type(node_type):
        configs['node_type'] = '+'.join(node_type)

    # mandatory parameters with default value
    configs['internal_saves'] = None

    # TODO depends on computer
    configs['cpu_cores'] = 12

    configs['dependency_type'] = form.get('labeled_trees') == 'on'
    configs['node_order'] = form.get('fixed_order') == 'on'
    configs['association_measures'] = form.get('association_measures') == 'on'

    configs['label_whitelist'] = []
    configs['root_whitelist'] = []
    if form.get('root_restriction'):
        configs['root_whitelist'] = form['root_restriction'].split('|')

    if form.get('ignored_labels'):
        configs['ignored_labels'] = form['ignored_labels'].split('|')

    if form.get('query'):
        # A query replaces both size settings with '0'.
        configs['query'] = form['query']
        configs['tree_size'] = '0'
        configs['display_size'] = '0'
    else:
        configs['query'] = None

    # handling the optional comparison treebank
    compare_url = form.get('compare_url')
    if request.files.get('compare_file'):
        configs['compare'] = store_upload('compare_file')
        if compare_url:
            message = gettext('Please insert either compare url or file, not both of them.')
            validation['compare_file'] = message
            validation['compare_url'] = message
    elif compare_url:
        try:
            configs['compare'] = store_remote(compare_url)
        except Exception:
            configs['compare'] = None
            validation['compare_url'] = gettext('Incorrect URL!')
    else:
        configs['compare'] = None

    configs['sentence_count_file'] = None
    configs['detailed_results_file'] = None

    configs['frequency_threshold'] = 0
    raw_threshold = form.get('frequency_threshold')
    if raw_threshold:
        try:
            configs['frequency_threshold'] = int(raw_threshold)
        except ValueError:
            validation['frequency_threshold'] = gettext('Please insert an Integer.')
    configs['lines_threshold'] = None
    configs['node_info'] = None

    configs['continuation_processing'] = False

    configs['label_subtypes'] = True
    configs['nodes_number'] = True
    configs['print_root'] = True

    if configs['compare'] is not None:
        configs['other_input_path'] = configs['compare']

    configs['grew_match'] = True
    configs['depsearch'] = False
    configs['example'] = True

    # Stop before touching the disk so that a rejected form does not leave
    # an empty result directory behind.
    if len(validation) > 0:
        return render_template('index.html', validation=validation, translations=translations)

    # The id is the only thing a client needs to open a result, so it is
    # drawn from the operating system's secure random source.
    name = ''.join(random.SystemRandom().choices(string.ascii_uppercase + string.digits, k=60))
    output_path = Path(os.path.join('media', name))
    if output_path.exists():
        shutil.rmtree(output_path, ignore_errors=True)
    output_path.mkdir()
    configs['output'] = os.path.join(output_path, 'result.tsv')
    configs['annodoc_example_dir'] = os.path.join(output_path, 'annodoc')
    configs['annodoc_detailed_dir'] = os.path.join(output_path, 'annodoc_detailed')
    configs['detailed_results_file'] = os.path.join(output_path, 'detailed_results_file.tsv')

    try:
        run(configs)
    except Exception:
        app.logger.exception('STARK run failed')
        # Nothing usable was produced; do not keep the partial directory.
        shutil.rmtree(output_path, ignore_errors=True)
        validation['general'] = gettext('Processing failed! Please recheck your settings, e.g. input format or head node description.')
        return render_template('index.html', validation=validation, translations=translations)

    # check if there are no results
    with open(os.path.join('media', name, 'result.tsv'), 'r') as rf:
        content = list(csv.reader(rf, delimiter='\t'))
    if len(content) <= 1:
        # Only a header row (or nothing at all) means there is nothing to show.
        validation['results'] = False
        return render_template('index.html', validation=validation, translations=translations)

    order_by = gettext('Frequency ') if not configs['compare'] else gettext('Frequency in A ')
    return redirect(url_for('result', result_id=name, order_by=order_by, order_type='desc', lang=gettext('code')))
2023-10-04 15:24:40 +00:00
2023-12-06 15:35:15 +00:00
return app
2023-10-04 15:24:40 +00:00
if __name__ == '__main__':
    # Script entry point: build the application and serve it with Flask's
    # built-in server in debug mode.
    create_app().run(debug=True)