You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

343 lines
14 KiB

import configparser
import csv
import os
import random
import re
import secrets
import string
import time

import requests
from flask import Flask, abort, render_template, request, send_file, redirect, url_for
from flask_babel import Babel, gettext
from werkzeug.utils import secure_filename

from stark import run
# Folder name stored under the Flask ``UPLOAD_FOLDER`` configuration key.
UPLOAD_FOLDER = 'uploads'
# File extensions recognised by ``allowed_file``.
ALLOWED_EXTENSIONS = {'conllu'}
# Files in the media directory older than this many days are removed when a
# result is downloaded.
DAYS_BEFORE_DELETION = 1
# Interface languages: the fallback, and everything ``get_locale`` accepts.
DEFAULT_LANGUAGE = 'en'
LANGUAGES = ['en', 'sl']
# One (initially empty) dictionary per language, passed to the index template
# as ``translations``.
_translations = {language: {} for language in ('en', 'sl')}
def get_locale():
    """Choose the interface language for the current request.

    Returns:
        The value of the ``lang`` query parameter when it is listed in
        ``LANGUAGES``; ``DEFAULT_LANGUAGE`` otherwise (including when the
        parameter is absent).
    """
    requested = request.args.get('lang')
    return requested if requested in LANGUAGES else DEFAULT_LANGUAGE
# Directory (relative to the working directory) that holds uploaded or
# downloaded treebanks as well as the generated result tables.
_MEDIA_DIR = 'media'

# Seconds to wait for a remote treebank before the download is abandoned.
_DOWNLOAD_TIMEOUT = 60

# Alphabet and length of the random names given to stored files.
_TOKEN_ALPHABET = string.ascii_uppercase + string.digits
_TOKEN_LENGTH = 60

# Node types that can be ticked in the form, in the order they are joined.
_FORM_NODE_TYPES = ('upos', 'form', 'lemma')
# Node types accepted by the validator.
_NODE_TYPE_OPTIONS = frozenset({'upos', 'form', 'lemma', 'xpos', 'feats', 'deprel'})

# Optionally signed decimal number with optional fraction and exponent; every
# string it matches is accepted by float().
_NUMERIC_CELL = re.compile(r'^-?\d+(?:\.\d+)?(?:[eE][-+]?\d+)?$')


def _random_token():
    """Return a random name drawn from a cryptographically secure source."""
    return ''.join(secrets.choice(_TOKEN_ALPHABET) for _ in range(_TOKEN_LENGTH))


def _media_path(raw_name):
    """Map an untrusted file name to a path directly inside the media directory.

    The name is sanitised with ``secure_filename``; when nothing usable is
    left, a random name is used instead so that saving cannot fail on an
    empty file name.
    """
    return os.path.join(_MEDIA_DIR, secure_filename(raw_name) or _random_token())


def _download_to_media(url):
    """Download *url* into the media directory and return the local path.

    Raises:
        requests.RequestException: the URL is malformed or unreachable, the
            request timed out, or the server answered with an error status.
        OSError: the downloaded content could not be written.
    """
    # NOTE(review): the URL is user supplied and fetched server-side, so it
    # can point at internal hosts. Restricting the reachable hosts is a
    # deployment decision and is not enforced here.
    response = requests.get(url, timeout=_DOWNLOAD_TIMEOUT)
    # Without this an HTTP error page would be stored as if it were a treebank.
    response.raise_for_status()
    target = _media_path(url.split('/')[-1])
    with open(target, 'wb') as wf:
        wf.write(response.content)
    return target


def _resolve_treebank(file_field, url_field, validation, conflict_message):
    """Store the treebank submitted as an upload or as a URL.

    Args:
        file_field: name of the file input in the submitted form.
        url_field: name of the URL input in the submitted form.
        validation: dict collecting validation messages; updated in place.
        conflict_message: message recorded for both fields when both are set.

    Returns:
        Path of the stored file, or ``None`` when nothing was submitted or a
        validation message was recorded.
    """
    uploaded = request.files.get(file_field)
    url = request.form.get(url_field)
    if uploaded and url:
        validation[file_field] = conflict_message
        validation[url_field] = conflict_message
        return None
    if uploaded:
        target = _media_path(uploaded.filename)
        uploaded.save(target)
        return target
    if url:
        try:
            return _download_to_media(url)
        except (requests.RequestException, OSError):
            validation[url_field] = gettext('Incorrect URL!')
    return None


def _parse_tree_size(raw_min, raw_max, validation):
    """Validate the tree size bounds and return STARK's ``tree_size`` value.

    Returns:
        ``'min-max'`` (or just ``'min'`` when both bounds are equal), or
        ``None`` after recording a message under ``validation['tree_size']``.
    """
    missing = gettext('Please provide information about minimum and maximum tree size.')
    if raw_min is None or raw_max is None:
        validation['tree_size'] = missing
        return None
    try:
        smallest, largest = int(raw_min), int(raw_max)
    except ValueError:
        # Empty or non-numeric fields are reported like missing ones instead
        # of crashing the request.
        validation['tree_size'] = missing
        return None
    if smallest > largest:
        validation['tree_size'] = gettext('Tree size minimum should be smaller than tree size maximum.')
        return None
    return f'{smallest}-{largest}' if smallest != largest else f'{smallest}'


def _validate_node_type(node_type, validation):
    """Return True when *node_type* is a non-empty list of supported types.

    Otherwise a message is recorded under ``validation['node_type']``.
    """
    # TODO EXPAND NODE TYPE
    if not node_type:
        validation['node_type'] = gettext('Please select at least one node type.')
        return False
    for el in node_type:
        if el not in _NODE_TYPE_OPTIONS:
            validation['node_type'] = gettext('Node option') + f' {el} ' + gettext('is not supported. Please enter valid options.')
            return False
    return True


def _cell_sort_key(cell):
    """Return a sort key under which numeric and textual cells can be mixed.

    Numeric cells come first and are negated, so a plain ascending sort lists
    the largest number first; textual cells follow in alphabetical order.
    """
    if _NUMERIC_CELL.match(cell):
        return (0, -float(cell), '')
    return (1, 0.0, cell)


def _purge_expired_media():
    """Delete media files older than ``DAYS_BEFORE_DELETION`` days."""
    cutoff = time.time() - DAYS_BEFORE_DELETION * 86400
    for filename in os.listdir(_MEDIA_DIR):
        file_path = os.path.join(_MEDIA_DIR, filename)
        try:
            if os.path.isfile(file_path) and os.path.getmtime(file_path) < cutoff:
                os.remove(file_path)
        except OSError:
            # Best-effort clean-up: the file may have been removed by a
            # concurrent request in the meantime.
            continue


def create_app():
    """Build the Flask application with its ``/stark/`` routes.

    Returns:
        The configured ``Flask`` instance.
    """
    app = Flask(__name__, static_url_path='/stark/static')
    babel = Babel(app, locale_selector=get_locale, default_translation_directories='translations')
    babel.list_translations = ['en', 'sl']

    app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER

    # The views below list, read and write this directory unconditionally.
    os.makedirs(_MEDIA_DIR, exist_ok=True)

    def create_default_configs():
        """Return a stock STARK configuration (not called in the visible code)."""
        # 'compare' is None here, so no 'other_input_path' entry is derived.
        return {
            # mandatory parameters
            'input_path': 'data/sl_ssj-ud_v2.4.conllu',
            'output': 'results/out_official.tsv',
            'tree_size': '2-4',
            'node_type': 'upos',
            # mandatory parameters with default value
            'internal_saves': './internal_saves',
            'cpu_cores': 12,
            'complete_tree_type': True,
            'dependency_type': True,
            'node_order': True,
            'association_measures': False,
            'label_whitelist': [],
            'root_whitelist': [],
            'query': None,
            'compare': None,
            'frequency_threshold': 0,
            'lines_threshold': None,
            'continuation_processing': False,
            'nodes_number': True,
            'print_root': True,
        }

    def allowed_file(filename):
        """Tell whether *filename* has an extension from ``ALLOWED_EXTENSIONS``."""
        return '.' in filename and \
            filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS

    @app.route('/stark/about', methods=['GET'])
    def about():
        """Render the about page."""
        return render_template('about.html')

    @app.route('/stark/result/<result_id>', methods=['GET', 'POST'])
    def result(result_id):
        """Show a stored result table (GET) or send it as a download (POST).

        The GET view honours the ``order_by`` (displayed column label followed
        by one extra character) and ``order_type`` (``'asc'`` or anything else
        for descending) query parameters. Unknown or missing results yield a
        404 response.
        """
        # result_id comes straight from the URL: accept plain file names only.
        if secure_filename(result_id) != result_id:
            abort(404)
        result_path = os.path.join(_MEDIA_DIR, result_id)

        if request.method == 'POST':
            _purge_expired_media()
            # The clean-up above may just have removed an expired result.
            if not os.path.isfile(result_path):
                abort(404)
            # send_file resolves relative paths against the application root,
            # while every other access here is relative to the working
            # directory; an absolute path keeps the two consistent.
            return send_file(os.path.abspath(result_path), as_attachment=True, download_name='results.tsv')

        if not os.path.isfile(result_path):
            abort(404)
        with open(result_path, 'r', newline='') as rf:
            content = list(csv.reader(rf, delimiter='\t'))
        head = content[0] if content else []
        rows = content[1:]

        # Result file column name -> translated label; the order of this
        # mapping is the order in which columns are displayed.
        table_columns2displayed_table_columns = {
            'Tree': gettext('Tree'),
            'Absolute frequency': gettext('Frequency'),
            'Absolute frequency in second treebank': gettext('Frequency in B'),
            'Order': gettext('Order'),
            'Number of nodes': gettext('Number of nodes'),
            'Head node': gettext('Head node'),
            'Grew-match URL': gettext('Grew-match URL'),
            '%DIFF': gettext('%DIFF'),
            'OR': gettext('OR'),
            'BIC': gettext('BIC'),
            'MI': gettext('MI'),
            'logDice': gettext('logDice'),
            't-score': gettext('t-score')
        }
        if 'Absolute frequency in second treebank' in head:
            table_columns2displayed_table_columns['Absolute frequency'] = gettext('Frequency in A')
        else:
            del table_columns2displayed_table_columns['Absolute frequency in second treebank']
        displayed_table_columns2table_columns = {v: k for k, v in table_columns2displayed_table_columns.items()}

        order_by_display = request.args.get('order_by')
        # The last character of the label is dropped before the lookup (the
        # index view sends e.g. 'Frequency '); unknown labels mean no sorting.
        order_by = displayed_table_columns2table_columns.get(order_by_display[:-1]) if order_by_display else None
        order_type = request.args.get('order_type')

        if order_by is not None and order_by in head:
            sort_id = head.index(order_by)
            # Keys are negated for numbers, so the plain sort is descending
            # and the reversed one ascending.
            rows = sorted(
                rows,
                key=lambda row: _cell_sort_key(row[sort_id] if sort_id < len(row) else ''),
                reverse=(order_type == 'asc'),
            )

        content_dict = {h: [] for h in head}
        for row in rows:
            for column, value in zip(head, row):
                content_dict[column].append(value)

        displayed_head = [v for k, v in table_columns2displayed_table_columns.items() if k in head]
        displayed_content_dict = {
            v: content_dict[k]
            for k, v in table_columns2displayed_table_columns.items()
            if k in head
        }

        return render_template('result.html', head_row=displayed_head, content=displayed_content_dict)

    @app.route('/stark/', methods=['GET', 'POST'])
    def index():
        """Show the query form (GET) or run STARK on the submitted form (POST).

        On POST the form is validated; any validation message re-renders the
        form. Otherwise STARK is run and the client is redirected to the
        result view, ordered by descending frequency.
        """
        translations = _translations[get_locale()]
        if request.method != 'POST':
            return render_template('index.html', translations=translations)

        form = request.form
        validation = {}
        # mandatory parameters
        configs = {'input_path': ''}

        # handling input
        # TODO OPTIONALLY ADD conllu FILE CHECK
        if request.files.get('file') or form.get('input_url'):
            input_path = _resolve_treebank(
                'file', 'input_url', validation,
                gettext('Please insert either input url or file, not both of them.'))
            if input_path is not None:
                configs['input_path'] = input_path
        else:
            message = gettext('Please insert either input url or provide a file.')
            validation['file'] = message
            validation['input_url'] = message

        tree_size = _parse_tree_size(form.get('tree_size_min'), form.get('tree_size_max'), validation)
        if tree_size is not None:
            configs['tree_size'] = tree_size

        node_type = [kind for kind in _FORM_NODE_TYPES if f'node_type_{kind}' in form]
        if _validate_node_type(node_type, validation):
            configs['node_type'] = '+'.join(node_type)

        # mandatory parameters with default value
        configs['internal_saves'] = None
        # TODO depends on computer
        configs['cpu_cores'] = 12
        # TODO FINALIZE THIS!
        configs['complete_tree_type'] = True
        configs['dependency_type'] = form.get('labeled_trees') == 'on'
        configs['node_order'] = form.get('fixed_order') == 'on'
        configs['association_measures'] = form.get('association_measures') == 'on'
        configs['label_whitelist'] = []
        configs['root_whitelist'] = []
        if form.get('root_restriction'):
            configs['root_whitelist'] = form['root_restriction'].split('|')

        if form.get('query'):
            configs['query'] = form['query']
            configs['tree_size'] = '0'
        else:
            configs['query'] = None

        # handling the optional second treebank
        configs['compare'] = _resolve_treebank(
            'compare_file', 'compare_url', validation,
            gettext('Please insert either compare url or file, not both of them.'))

        configs['sentence_count_file'] = None
        configs['detailed_results_file'] = None

        configs['frequency_threshold'] = 0
        if form.get('frequency_threshold'):
            try:
                configs['frequency_threshold'] = int(form['frequency_threshold'])
            except ValueError:
                validation['frequency_threshold'] = gettext('Please insert an Integer.')

        configs['lines_threshold'] = None
        configs['continuation_processing'] = False
        configs['label_subtypes'] = True
        configs['nodes_number'] = True
        configs['print_root'] = True

        if configs['compare'] is not None:
            configs['other_input_path'] = configs['compare']

        configs['grew_match'] = True
        configs['depsearch'] = False
        configs['example'] = False

        # The file name is the only thing protecting a result from other
        # visitors, so it is drawn from a secure random source.
        name = _random_token()
        configs['output'] = os.path.join(_MEDIA_DIR, name)

        if validation:
            return render_template('index.html', validation=validation, translations=translations)

        try:
            run(configs)
        except Exception:
            # Top-level boundary: show a generic message to the user, but
            # keep the traceback in the application log.
            app.logger.exception('STARK processing failed')
            validation['general'] = gettext('Processing failed! Please recheck your settings, e.g. input format or head node description.')
            return render_template('index.html', validation=validation, translations=translations)

        # check if there are no results (header only, or nothing at all)
        with open(configs['output'], 'r', newline='') as rf:
            content = list(csv.reader(rf, delimiter='\t'))
        if len(content) <= 1:
            validation['results'] = False
            return render_template('index.html', validation=validation, translations=translations)

        order_by = gettext('Frequency ') if not configs['compare'] else gettext('Frequency in A ')
        # Pass the active language code on directly instead of depending on
        # a translation catalogue entry to supply it.
        return redirect(url_for('result', result_id=name, order_by=order_by, order_type='desc', lang=get_locale()))

    return app
if __name__ == '__main__':
    # Script entry point: build the application and serve it with Flask's
    # built-in server in debug mode.
    app = create_app()
    app.run(debug=True)