First commit

master
Ozbolt Menegatti 4 years ago
commit fd092e9cc6

107
app.py

@ -0,0 +1,107 @@
import pathlib
import os
import xml.etree.ElementTree as etree
def check_sentence_id(sentence_id):
    """Validate the shape of a sentence id.

    An id is accepted when it starts with ``"GF"`` and consists of exactly
    three dot-separated parts.

    Args:
        sentence_id: The id string to validate.

    Raises:
        AssertionError: If the id does not start with ``"GF"`` or does not
            have exactly three dot-separated parts.
    """
    # Raise explicitly instead of using ``assert`` statements: asserts are
    # removed when Python runs with -O, which would silently disable the check.
    if not sentence_id.startswith("GF"):
        raise AssertionError("sentence id must start with 'GF'")
    if len(sentence_id.split(".")) != 3:
        raise AssertionError("sentence id must have three dot-separated parts")
def get_file_contents(sentence_id):
    """Return the full text of the corpus file that contains a sentence.

    The file is looked up at
    ``$GF_FOLDER/<first 4 chars of base id>/<base id>-dedup.xml``, where the
    base id is the part of ``sentence_id`` before its first dot.

    Args:
        sentence_id: Id of the sentence whose file should be read.

    Returns:
        The whole file content as a string.

    Raises:
        KeyError: If the ``GF_FOLDER`` environment variable is not set.
        AssertionError: If the expected file does not exist.
    """
    word_base_id = sentence_id.split(".")[0]
    gf_folder = pathlib.Path(os.environ["GF_FOLDER"])
    gf_file = gf_folder / word_base_id[:4] / (word_base_id + "-dedup.xml")

    # Explicit raise (not ``assert``) so the check survives ``python -O``;
    # the caller maps AssertionError to a "bad id" message.
    if not gf_file.exists():
        raise AssertionError("corpus file not found: " + gf_file.name)

    # NOTE(review): assumes the corpus files are UTF-8 (the XML default) so
    # that reading does not depend on the server's locale - confirm.
    with gf_file.open("r", encoding="utf-8") as fp:
        return fp.read()
def obtain_sentence(sentence_id, content):
    """Cut the ``<s>`` element with the given id out of a file's text.

    The element is located by plain string search: from the opening
    ``<s xml:id="<sentence_id>"`` up to and including the next ``</s>``.
    In the returned fragment every `` xml:`` (space + prefix) is replaced by
    a single space, which turns attributes such as ``xml:id`` into ``id``.

    Args:
        sentence_id: Id of the sentence to extract.
        content: Full text of the corpus file.

    Returns:
        The sentence element as an XML string.

    Raises:
        ValueError: If the opening tag or a following ``</s>`` is not found.
    """
    # The closing quote is part of the needle so that an id such as "X.1.1"
    # cannot match the longer id "X.1.10".
    opening = '<s xml:id="' + sentence_id + '"'
    closing = "</s>"
    idx_start = content.index(opening)
    idx_end = content.index(closing, idx_start) + len(closing)
    return content[idx_start:idx_end].replace(" xml:", " ")
def as_example(sentence_id, headword, sentence):
    """Split a sentence around the headword into left / mid / right parts.

    The sentence XML is parsed and the ``.text`` of each direct child of the
    root element is taken as one token (child tails and nested elements are
    not looked at). The token equal to ``headword`` becomes ``"mid"``; the
    concatenated tokens before and after it become ``"left"`` and ``"right"``.
    If the headword occurs more than once, the last occurrence is used as
    the split point, so ``left + mid + right`` is always the full token text.

    Args:
        sentence_id: Unused; kept for interface compatibility.
        headword: Exact token text to look for.
        sentence: XML string of a single sentence element.

    Returns:
        A dict with keys ``"left"``, ``"mid"`` and ``"right"``, or the string
        ``"headword not found"`` when no token equals ``headword``.

    Raises:
        xml.etree.ElementTree.ParseError: If ``sentence`` is not well-formed.
    """
    root = etree.XML(sentence)

    texts = []
    split_at = None
    for position, element in enumerate(root):
        if element.text == headword:
            split_at = position
        # Elements without text (e.g. ``<c/>``) have ``.text`` of None;
        # treat them as empty instead of failing on string concatenation.
        texts.append(element.text or "")

    if split_at is None:
        return "headword not found"

    return {
        "left": "".join(texts[:split_at]),
        "mid": headword,
        "right": "".join(texts[split_at + 1:]),
    }
def obtain_example_from_gf2(sentence_id, headword):
    """Look up a sentence by id and split it around the headword.

    Runs the whole pipeline: validate the id, read the corpus file, cut out
    the sentence and split it. Failures are reported as a plain message
    string rather than raised, so callers tell success from failure by the
    type of the result.

    Args:
        sentence_id: Id of the sentence to fetch.
        headword: Token text to split the sentence around.

    Returns:
        The dict produced by ``as_example`` on success, otherwise a string
        describing what went wrong.
    """
    try:
        check_sentence_id(sentence_id)
        contents = get_file_contents(sentence_id)
        sentence = obtain_sentence(sentence_id, contents)
        return as_example(sentence_id, headword, sentence)
    except etree.ParseError:
        # A fragment that is not well-formed XML would otherwise escape as an
        # unhandled exception; report it like every other failure.
        return "Could not parse sentence"
    except ValueError:
        return "Could not find word id"
    except IndexError:
        return "Word id index after last dot should be an integer"
    except AssertionError:
        return "Bad word id"
    except KeyError:
        return "GF2 location not set on the server"
# Create the Flask application object; the route decorators below register
# their view functions on it.
from flask import Response, Flask, request, jsonify
app = Flask(__name__)
@app.route('/')
def home():
    """Return a short plain-text usage hint for the API."""
    # The hint names both path segments that the /get_example route requires.
    return "An api to get examples from gigafida, just use /get_example/$SENTENCE_ID/$HEADWORD to get them"
@app.route('/get_example/<string:sentence_id>/<string:headword>')
def test(sentence_id, headword):
    """Serve one example: JSON on success, a plain-text message on failure.

    Args:
        sentence_id: Sentence id taken from the URL path.
        headword: Headword taken from the URL path.

    Returns:
        A JSON response with the left/mid/right dict when the lookup
        succeeded, otherwise a ``text/plain`` response with the error message.
    """
    result = obtain_example_from_gf2(sentence_id, headword)
    if isinstance(result, dict):
        # Passing a dict straight to Response() would treat it as an iterable
        # and send only its keys; jsonify serializes it properly.
        return jsonify(result)
    return Response(result, mimetype="text/plain")
@app.route('/get_examples', methods=["POST"])
def get_examples():
    """Serve examples for a batch of sentence ids.

    Expects a JSON object in the request body that maps sentence ids to
    headwords. Each pair is looked up independently.

    Returns:
        A JSON object ``{"good": {...}, "bad": {...}}``: ``good`` maps ids to
        their left/mid/right dicts, ``bad`` maps ids to an error message.
        If the body is not a JSON object, an empty response with status 500.
    """
    data = request.get_json()
    if not isinstance(data, dict):
        # NOTE(review): 400 would be the conventional status for a malformed
        # body; 500 is kept so existing clients see no change.
        return "", 500

    bad_ones = {}
    good_ones = {}
    for sentence_id, headword in data.items():
        # JSON values need not be strings; report such entries instead of
        # letting .strip() fail and abort the whole batch.
        if not isinstance(headword, str):
            bad_ones[sentence_id] = "Headword should be a string"
            continue

        example = obtain_example_from_gf2(sentence_id, headword.strip())
        if isinstance(example, dict):
            good_ones[sentence_id] = example
        else:
            bad_ones[sentence_id] = example

    return jsonify({"good": good_ones, "bad": bad_ones})

@ -0,0 +1,13 @@
#!/bin/bash
# Launch the example API with Flask's built-in server on port 2001.

# cd to script dir; stop if it cannot be resolved or entered, so the venv
# below is never picked up relative to the wrong directory
script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" || exit 1
cd "$script_dir" || exit 1
# activate python venv
source ./venv/bin/activate
# set gf2 location
export GF_FOLDER="/net/nas/resources/corpus/gigafida/v2.0/gigafida_dedup/tei/data/"
# run the app
# NOTE(review): --debugger combined with --host=0.0.0.0 makes the interactive
# debugger reachable from other machines; confirm this is only used on a
# trusted network, or drop --debugger for shared deployments.
flask run --port=2001 --host=0.0.0.0 --debugger --reload
Loading…
Cancel
Save