407 lines
16 KiB
Python
Executable File
407 lines
16 KiB
Python
Executable File
import configparser
import csv
import json
import math
import os
import random
import re
import secrets
import shutil
import string
import time
from pathlib import Path

import requests
from flask import Flask, abort, redirect, render_template, request, send_file, url_for
from flask_babel import Babel, gettext
from werkzeug.utils import secure_filename

from stark import run
|
|
|
|
# Value stored in app.config['UPLOAD_FOLDER'] by create_app().
UPLOAD_FOLDER = 'uploads'
# File extensions accepted by allowed_file().
ALLOWED_EXTENSIONS = {'conllu'}
# Age (in days) after which the result view removes entries from ./media.
DAYS_BEFORE_DELETION = 7

# Interface languages: the fallback, followed by the full list of choices.
DEFAULT_LANGUAGE = 'en'
LANGUAGES = ['en', 'sl']

# One (initially empty) translation dictionary per interface language.
_translations = {language: {} for language in LANGUAGES}
|
|
|
|
|
|
def get_locale():
    """Pick the interface language for the current request.

    Returns the ``lang`` query parameter when it names one of ``LANGUAGES``;
    in every other case (missing or unknown value) returns
    ``DEFAULT_LANGUAGE``.
    """
    requested = request.args.get('lang')
    return requested if requested in LANGUAGES else DEFAULT_LANGUAGE
|
|
|
|
|
|
def ilog(n, base):
    """
    Find the integer log of n with respect to the base.

    The result is the number of times n can be floor-divided by base before
    it drops below base. Any n smaller than base (including zero and
    negative values) therefore yields 0.

    Raises ValueError when base is not greater than 1: with such a base the
    division loop would never make progress.

    >>> import math
    >>> for base in range(2, 16 + 1):
    ...     for n in range(1, 1000):
    ...         assert ilog(n, base) == int(math.log(n, base) + 1e-10), '%s %s' % (n, base)
    """
    if base <= 1:
        raise ValueError('base must be greater than 1, got %r' % (base,))
    count = 0
    while n >= base:
        count += 1
        n //= base
    return count


def sci_notation(n, prec=2):
    """
    Represent n in scientific notation, with the specified precision.

    Values strictly between -100000 and 100000 are returned unchanged as
    str(n). Larger magnitudes are rendered as '<mantissa>e<sign><exponent>'
    with prec digits after the decimal point. Non-finite floats (inf, nan)
    have no mantissa/exponent form and are returned as str(n).

    >>> sci_notation(1234 * 10**1000, prec=3)
    '1.234e+1003'
    >>> sci_notation(10**1000 // 2, prec=1)
    '5.0e+999'
    >>> sci_notation(999999)
    '1.00e+6'
    """
    if isinstance(n, float) and not math.isfinite(n):
        return str(n)

    if -100000 < n < 100000:
        return str(n)

    if n < 0:
        return "-" + sci_notation(-n, prec=prec)

    base = 10
    exponent = ilog(n, base)
    mantissa = n / base**exponent
    digits = '{0:.{1}f}'.format(mantissa, prec)
    if float(digits) >= base:
        # Rounding carried into a new leading digit (e.g. 9.99999 -> 10.00);
        # renormalise so the mantissa stays below the base.
        exponent += 1
        digits = '{0:.{1}f}'.format(mantissa / base, prec)
    return '{0}e{1:+d}'.format(digits, exponent)
|
|
|
|
|
|
def create_app():
    """Build the STARK web application.

    Creates the Flask instance, attaches Flask-Babel with ``get_locale`` as
    the locale selector, stores ``UPLOAD_FOLDER`` in the app configuration
    and makes sure the ``media`` working directory exists. The nested
    functions that follow register the routes; the configured application is
    returned at the end of the factory.
    """
    app = Flask(__name__, static_url_path='/stark/static')

    babel = Babel(app, locale_selector=get_locale, default_translation_directories='translations')
    # NOTE(review): this rebinds the ``list_translations`` attribute of the
    # Babel instance to a plain list of language codes - confirm nothing calls
    # it as a method. The codes are taken from LANGUAGES so the two cannot
    # drift apart.
    babel.list_translations = list(LANGUAGES)
    app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER

    # Every route reads from or writes to ./media; create it up front so the
    # first request does not fail on a missing directory.
    os.makedirs('media', exist_ok=True)
|
|
|
|
def create_default_configs():
    """Return a freshly built dictionary of default STARK settings.

    A new dictionary (with new list objects) is created on every call. The
    ``other_input_path`` key is only added when ``compare`` is set, which the
    defaults never do.
    """
    configs = {
        # mandatory parameters
        'input_path': 'data/sl_ssj-ud_v2.4.conllu',
        'output': 'results/out_official.tsv',
        'tree_size': '2-4',
        'node_type': 'upos',
        # mandatory parameters with default value
        'internal_saves': './internal_saves',
        'cpu_cores': 1,
        'complete_tree_type': True,
        'dependency_type': True,
        'node_order': True,
        'association_measures': False,
        'label_whitelist': [],
        'root_whitelist': [],
        'query': None,
        'compare': None,
        'frequency_threshold': 0,
        'lines_threshold': None,
        'continuation_processing': False,
        'nodes_number': True,
        'print_root': True,
    }

    comparison_path = configs['compare']
    if comparison_path is not None:
        configs['other_input_path'] = comparison_path
    return configs
|
|
|
|
|
|
def allowed_file(filename):
    """Tell whether ``filename`` carries one of the accepted extensions.

    The text after the last dot is compared case-insensitively with
    ``ALLOWED_EXTENSIONS``; a name without any dot is rejected.
    """
    _, dot, extension = filename.rpartition('.')
    return bool(dot) and extension.lower() in ALLOWED_EXTENSIONS
|
|
|
|
@app.route('/stark/about', methods=['GET'])
def about():
    """Serve the "about" page rendered from ``about.html``."""
    page = render_template('about.html')
    return page
|
|
|
|
@app.route('/stark/visualization/<result_id>/<sentence_id>/<subtree_ids>', methods=['GET'])
def visualization(result_id, sentence_id, subtree_ids):
    """Return the annodoc source for one example sentence.

    Parameters (all taken from the URL):
        result_id: name of the result directory inside ``media``.
        sentence_id: name of the file inside that directory's ``annodoc``
            folder.
        subtree_ids: ``+``-separated ids; one ``# visual-style`` line with a
            light green background is emitted for each of them.

    Returns a dictionary ``{'annodoc': <text>}`` made of the style lines
    followed by the file content and two newlines.

    Responds with 404 when the requested file does not exist or would lie
    outside the ``media`` directory.
    """
    media_root = os.path.realpath('media')
    annodoc_path = os.path.realpath(os.path.join(media_root, result_id, 'annodoc', sentence_id))
    # Both ids come straight from the URL: refuse anything that resolves to a
    # location outside ./media, and answer 404 rather than crashing on a
    # missing file.
    if not annodoc_path.startswith(media_root + os.sep) or not os.path.isfile(annodoc_path):
        abort(404)

    style_lines = [
        f'# visual-style {subtree_id} bgColor:lightgreen\n'
        for subtree_id in subtree_ids.split('+')
    ]
    with open(annodoc_path, 'r') as rf:
        sentence_source = rf.read()
    return {'annodoc': ''.join(style_lines) + sentence_source + '\n\n'}
|
|
|
|
def _purge_expired_media():
    """Remove entries of ``media`` older than ``DAYS_BEFORE_DELETION`` days.

    A result directory is dated by its ``result.tsv``; when that file is
    missing the directory's own modification time is used instead, so a
    directory without results no longer aborts the whole clean-up.
    """
    max_age_seconds = DAYS_BEFORE_DELETION * 86400
    now = time.time()
    for entry in os.listdir('media'):
        path = os.path.join('media', entry)
        is_directory = os.path.isdir(path)
        stamp_path = os.path.join(path, 'result.tsv') if is_directory else path
        if not os.path.exists(stamp_path):
            stamp_path = path
        try:
            modified = os.path.getmtime(stamp_path)
        except OSError:
            # Entry vanished in the meantime (e.g. removed by another request).
            continue
        if now - modified <= max_age_seconds:
            continue
        if is_directory:
            shutil.rmtree(path, ignore_errors=True)
        else:
            try:
                os.remove(path)
            except OSError:
                pass  # best-effort clean-up, same spirit as ignore_errors above


def _parse_number(cell):
    """Return ``cell`` as int or finite float, or None if it is not a number."""
    try:
        return int(cell)
    except ValueError:
        pass
    try:
        value = float(cell)
    except ValueError:
        return None
    return value if math.isfinite(value) else None


@app.route('/stark/result/<result_id>', methods=['GET', 'POST'])
def result(result_id):
    """Show (GET) or download (POST) the result table of one run.

    POST first removes expired entries from ``media`` and then sends
    ``media/<result_id>/result.tsv`` as an attachment named ``results.tsv``.

    GET reads the same file, optionally sorts the rows according to the
    ``order_by`` / ``order_type`` query parameters and renders
    ``result.html`` with
        head: list of (file column name, displayed column name) pairs,
        content: dictionary mapping file column names to lists of cells,
            plus ``example_id`` and ``example_positions`` taken from the
            JSON stored in the ``Annodoc`` column.

    Responds with 404 when the result file is missing, empty, or would lie
    outside the ``media`` directory.
    """
    media_root = os.path.realpath('media')
    result_path = os.path.realpath(os.path.join(media_root, result_id, 'result.tsv'))
    # result_id comes from the URL: never leave ./media.
    if not result_path.startswith(media_root + os.sep):
        abort(404)

    if request.method == 'POST':
        _purge_expired_media()
        if not os.path.isfile(result_path):
            abort(404)
        # An absolute path keeps send_file consistent with the other file
        # operations here, which resolve 'media' against the working directory.
        return send_file(result_path, as_attachment=True, download_name='results.tsv')

    if not os.path.isfile(result_path):
        abort(404)
    with open(result_path, 'r') as rf:
        content = list(csv.reader(rf, delimiter='\t'))
    if not content:
        abort(404)
    header, rows = content[0], content[1:]

    table_columns2displayed_table_columns = {
        'Tree': gettext('Tree'),
        'Absolute frequency': gettext('Frequency'),
        'Absolute frequency in second treebank': gettext('Frequency in B'),
        'Order': gettext('Order'),
        'Number of nodes': gettext('Number of nodes'),
        'Head node': gettext('Head node'),
        'Grew-match URL': gettext('Grew-match URL'),
        'Ratio': gettext('Ratio'),
        '%DIFF': gettext('%DIFF'),
        'OR': gettext('OR'),
        'BIC': gettext('BIC'),
        'MI': gettext('MI'),
        'logDice': gettext('logDice'),
        't-score': gettext('t-score'),
    }
    if 'Absolute frequency in second treebank' in header:
        table_columns2displayed_table_columns['Absolute frequency'] = gettext('Frequency in A')
    else:
        del table_columns2displayed_table_columns['Absolute frequency in second treebank']
    displayed_table_columns2table_columns = {
        displayed: column for column, displayed in table_columns2displayed_table_columns.items()
    }

    order_by_display = request.args.get('order_by')
    order_by = None
    if order_by_display is not None:
        # The last character of the parameter is not part of the label
        # (index() redirects here with e.g. 'Frequency ' plus a trailing
        # space). Unknown labels simply disable sorting instead of raising.
        order_by = displayed_table_columns2table_columns.get(order_by_display[:-1])
    order_type = request.args.get('order_type')

    if order_by is not None and order_by in header:
        sort_id = header.index(order_by)
        descending = order_type != 'asc'
        # NOTE(review): the original expression excluded 'Ratio' from numeric
        # sorting; that exclusion is kept - confirm it is still wanted.
        numeric_column = order_by != 'Ratio'
        # Keep non-numeric cells after the numbers in both directions.
        text_rank = -1 if descending else 1

        def sort_key(row):
            """Type-safe key: numbers and text never get compared directly."""
            cell = row[sort_id] if sort_id < len(row) else ''
            number = _parse_number(cell) if numeric_column else None
            if number is None:
                return (text_rank, 0, cell)
            return (0, number, '')

        ordered_content = sorted(rows, key=sort_key, reverse=descending)
    else:
        ordered_content = rows

    content_dict = {column: [] for column in header}
    for row in ordered_content:
        # zip() ignores surplus cells of malformed rows.
        for column, value in zip(header, row):
            content_dict[column].append(value)

    head = [(k, v) for k, v in table_columns2displayed_table_columns.items() if k in header]

    def format_measure(cell):
        """Render a %DIFF / OR cell; non-numeric text is shown unchanged."""
        number = _parse_number(cell)
        return cell if number is None else sci_notation(number)

    displayed_content_dict = {}
    for f_h, _displayed in head:
        if f_h == '%DIFF' or f_h == 'OR':
            # Parsed with int()/float() rather than eval(): the cells
            # originate from processed user uploads.
            displayed_content_dict[f_h] = [format_measure(cell) for cell in content_dict[f_h]]
        else:
            displayed_content_dict[f_h] = content_dict[f_h]

    # add visualization parts to dict
    annodoc_data = [json.loads(el) for el in content_dict.get('Annodoc', [])]
    displayed_content_dict['example_id'] = [el['id'] for el in annodoc_data]
    displayed_content_dict['example_positions'] = [
        '+'.join([str(p) for p in el['positions']]) for el in annodoc_data
    ]

    return render_template('result.html', head=head, content=displayed_content_dict)
|
|
|
|
def _download_to_media(url):
    """Download ``url`` into ``media`` and return the path of the stored file.

    Raises requests.RequestException for network/HTTP errors, OSError when
    the file cannot be written and ValueError when the URL does not end in a
    usable file name.
    """
    # NOTE(review): the URL is user-supplied and fetched server-side;
    # consider restricting allowed schemes/hosts (SSRF).
    filename = secure_filename(url.split('/')[-1])
    if not filename:
        raise ValueError('URL does not end in a usable file name')
    response = requests.get(url, timeout=60)
    # Do not store an HTML error page as if it were a corpus.
    response.raise_for_status()
    os.makedirs('media', exist_ok=True)
    target = os.path.join('media', filename)
    with open(target, 'wb') as wf:
        wf.write(response.content)
    return target


def _resolve_corpus(file_field, url_field, validation, conflict_message, missing_message=None):
    """Store the corpus given by upload or URL and return its path.

    ``file_field`` / ``url_field`` name the form fields and double as keys
    for messages written to ``validation``. Returns None when nothing was
    stored: both sources given (``conflict_message`` recorded), download
    failed ('Incorrect URL!' recorded) or nothing supplied
    (``missing_message`` recorded when it is not None).
    """
    upload = request.files.get(file_field)
    url = request.form.get(url_field)

    if upload:
        if url:
            validation[file_field] = conflict_message
            validation[url_field] = conflict_message
            return None
        os.makedirs('media', exist_ok=True)
        # secure_filename() may strip a name down to nothing; fall back to a
        # fixed name instead of trying to save onto the directory itself.
        target = os.path.join('media', secure_filename(upload.filename) or 'upload.conllu')
        upload.save(target)
        return target

    if url:
        try:
            return _download_to_media(url)
        except (requests.RequestException, OSError, ValueError):
            validation[url_field] = gettext('Incorrect URL!')
            return None

    if missing_message is not None:
        validation[file_field] = missing_message
        validation[url_field] = missing_message
    return None


@app.route('/stark/', methods=['GET', 'POST'])
def index():
    """Render the query form (GET) or run STARK on the submitted form (POST).

    On POST the form is translated into a ``configs`` dictionary for
    ``run``. Validation problems, a failing run or an empty result re-render
    ``index.html`` with a ``validation`` dictionary; otherwise the client is
    redirected to the ``result`` view of the newly created result id, sorted
    by frequency in descending order.
    """
    translations = _translations[get_locale()]
    if request.method != 'POST':
        return render_template('index.html', translations=translations)

    form = request.form
    validation = {}
    configs = {
        'greedy_counter': 'yes',
        'input_path': '',
        'tree_size': '1-1000000',
        'display_size': '1-1000000',
        'complete_tree_type': True,
        'ignored_labels': []
    }

    # mandatory parameters
    # TODO OPTIONALLY ADD conllu FILE CHECK
    input_path = _resolve_corpus(
        'file', 'input_url', validation,
        gettext('Please insert either input url or file, not both of them.'),
        gettext('Please insert either input url or provide a file.'),
    )
    if input_path is not None:
        configs['input_path'] = input_path

    if form.get('display_size'):
        configs['display_size'] = form['display_size']

    node_type = [kind for kind in ('upos', 'form', 'lemma') if f'node_type_{kind}' in form]
    if 'node_type_none' in form:
        configs['node_type'] = None
    elif node_type:
        configs['node_type'] = '+'.join(node_type)
    else:
        validation['node_type'] = gettext('Please select at least one node type.')

    # mandatory parameters with default value
    configs['internal_saves'] = None
    # Never request more workers than the previous fixed value of 12, nor
    # more than the machine reports.
    configs['cpu_cores'] = min(12, os.cpu_count() or 1)

    configs['dependency_type'] = form.get('labeled_trees') == 'on'
    configs['node_order'] = form.get('fixed_order') == 'on'
    configs['association_measures'] = form.get('association_measures') == 'on'

    configs['label_whitelist'] = []
    configs['root_whitelist'] = []
    if form.get('root_restriction'):
        configs['root_whitelist'] = form['root_restriction'].split('|')

    if form.get('ignored_labels'):
        configs['ignored_labels'] = form['ignored_labels'].split('|')

    if form.get('query'):
        configs['query'] = form['query']
        configs['tree_size'] = '0'
        configs['display_size'] = '0'
    else:
        configs['query'] = None

    # optional second treebank
    configs['compare'] = _resolve_corpus(
        'compare_file', 'compare_url', validation,
        gettext('Please insert either compare url or file, not both of them.'),
    )

    configs['sentence_count_file'] = None
    configs['detailed_results_file'] = None

    configs['frequency_threshold'] = 0
    raw_threshold = form.get('frequency_threshold')
    if raw_threshold:
        try:
            configs['frequency_threshold'] = int(raw_threshold)
        except ValueError:
            validation['frequency_threshold'] = gettext('Please insert an Integer.')

    configs['lines_threshold'] = None
    configs['node_info'] = None
    configs['continuation_processing'] = False
    configs['label_subtypes'] = True
    configs['nodes_number'] = True
    configs['print_root'] = True

    if configs['compare'] is not None:
        configs['other_input_path'] = configs['compare']

    configs['grew_match'] = True
    configs['depsearch'] = False
    configs['example'] = True

    # Stop before touching the file system: a result directory created for a
    # rejected form would only be left behind empty.
    if validation:
        return render_template('index.html', validation=validation, translations=translations)

    # The id is the only thing protecting a result from other visitors, so it
    # is drawn from the secrets module rather than the random module.
    alphabet = string.ascii_uppercase + string.digits
    name = ''.join(secrets.choice(alphabet) for _ in range(60))

    output_path = Path('media') / name
    if output_path.exists():
        shutil.rmtree(output_path, ignore_errors=True)
    output_path.mkdir(parents=True)

    configs['output'] = os.path.join(output_path, 'result.tsv')
    configs['annodoc_example_dir'] = os.path.join(output_path, 'annodoc')

    try:
        run(configs)
    except Exception:
        # Request boundary: report a generic message to the user, but keep
        # the traceback in the log and drop the unusable result directory.
        app.logger.exception('STARK run failed for result id %s', name)
        shutil.rmtree(output_path, ignore_errors=True)
        validation['general'] = gettext('Processing failed! Please recheck your settings, e.g. input format or head node description.')
        return render_template('index.html', validation=validation, translations=translations)

    # check if there are no results
    try:
        with open(configs['output'], 'r') as rf:
            content = list(csv.reader(rf, delimiter='\t'))
    except OSError:
        content = []
    if len(content) <= 1:
        validation['results'] = False
        return render_template('index.html', validation=validation, translations=translations)

    order_by = gettext('Frequency ') if not configs['compare'] else gettext('Frequency in A ')
    # Pass on the language get_locale() resolved for this request.
    return redirect(url_for('result', result_id=name, order_by=order_by, order_type='desc', lang=get_locale()))
|
|
|
|
return app
|
|
|
|
|
|
if __name__ == '__main__':
    # Development entry point. Debug mode stays on by default, as before, but
    # can be switched off with FLASK_DEBUG=0 so the interactive debugger is
    # not exposed when the script is started on a reachable host.
    app = create_app()
    debug_enabled = os.environ.get('FLASK_DEBUG', '1').strip().lower() not in ('0', 'false', 'no', 'off')
    app.run(debug=debug_enabled)
|