commit fd092e9cc62e9761aea4faad754ddb9ed745b97c Author: Ozbolt Menegatti Date: Tue May 19 22:12:41 2020 +0200 First commit diff --git a/app.py b/app.py new file mode 100644 index 0000000..8dbd952 --- /dev/null +++ b/app.py @@ -0,0 +1,107 @@ +import pathlib +import os +import xml.etree.ElementTree as etree + +def check_sentence_id(sentence_id): + assert(sentence_id.startswith("GF")) + assert(len(sentence_id.split(".")) == 3) + +def get_file_contents(sentence_id): + word_base_id = sentence_id.split(".")[0] + gf_folder = os.environ["GF_FOLDER"] + + gf_folder = pathlib.Path(gf_folder) + gf_inner_folder = gf_folder.joinpath(word_base_id[:4]) + gf_file = gf_inner_folder.joinpath(word_base_id + "-dedup.xml") + + assert(gf_file.exists()) + + with gf_file.open('r') as fp: + return fp.read() + +def obtain_sentence(sentence_id, content): + substring = '", idx_start) + 4 + + sentence = content[idx_start:idx_end] + sentence = sentence.replace(" xml:", " ") + + return sentence + +def as_example(sentence_id, headword, sentence): + root = etree.XML(sentence) + result = {} + current_text = "" + + for element in root: + if element.text == headword: + # first, lets pack left part + result["left"] = current_text + # then, lets pack the mid - headword part + result["mid"] = element.text + # and cleanup the current_text + current_text = "" + + else: + # else, lets just add + current_text += element.text + + # lastly add the right part + if "mid" not in result: + return "headword not found" + elif "left" not in result: + result["left"] = "" + + result["right"] = current_text + return result + +def obtain_example_from_gf2(sentence_id, headword): + try: + check_sentence_id(sentence_id) + contents = get_file_contents(sentence_id) + sentence = obtain_sentence(sentence_id, contents) + return as_example(sentence_id, headword, sentence) + except ValueError: + return "Could not find word id" + except IndexError: + return "Word id index after last dot should be an integer" + except AssertionError: + return "Bad word id" + except KeyError: + return "GF2 location not set on the server" + + +from flask import Response, Flask, request, jsonify +app = Flask(__name__) + +@app.route('/') +def home(): + return "An api to get examples from gigafida, just use /get_example/$WORD_ID to get them" + +@app.route('/get_example//') +def test(sentence_id, headword): + result = obtain_example_from_gf2(sentence_id, headword) + mimetype = "application/json" if type(result) is dict else "text/plain" + return Response(result, mimetype=mimetype) + +@app.route('/get_examples', methods=["POST"]) +def get_examples(): + data = request.get_json() + if data is None or type(data) is not dict: + return "", 500 + + bad_ones = {} + good_ones = {} + for sentence_id, headword in data.items(): + headword = headword.strip() + example = obtain_example_from_gf2(sentence_id, headword) + if type(example) is dict: + good_ones[sentence_id] = example + else: + bad_ones[sentence_id] = example + + return jsonify({"good": good_ones, "bad": bad_ones}) + + diff --git a/run.sh b/run.sh new file mode 100755 index 0000000..fb6bb52 --- /dev/null +++ b/run.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +# cd to script dir +cd "$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" + +# activate python venv +source ./venv/bin/activate + +# set gf2 location +export GF_FOLDER="/net/nas/resources/corpus/gigafida/v2.0/gigafida_dedup/tei/data/" + +# run the app +flask run --port=2001 --host=0.0.0.0 --debugger --reload