First commit
This commit is contained in:
commit
fd092e9cc6
107
app.py
Normal file
107
app.py
Normal file
|
@ -0,0 +1,107 @@
|
|||
import pathlib
|
||||
import os
|
||||
import xml.etree.ElementTree as etree
|
||||
|
||||
def check_sentence_id(sentence_id):
    """Validate the shape of a sentence id.

    An id is accepted when it starts with ``"GF"`` and consists of exactly
    three dot-separated parts.

    Args:
        sentence_id: The id string to validate.

    Raises:
        AssertionError: If the id does not have the expected shape.
    """
    # Raise explicitly instead of using ``assert``: assert statements are
    # removed when Python runs with ``-O``, which would silently disable
    # this validation. The exception type is kept for existing callers.
    if not sentence_id.startswith("GF"):
        raise AssertionError('sentence id must start with "GF"')
    if len(sentence_id.split(".")) != 3:
        raise AssertionError("sentence id must have three dot-separated parts")
|
||||
|
||||
def get_file_contents(sentence_id):
    """Read the XML file that the given sentence id points into.

    The file is looked up as
    ``$GF_FOLDER/<first 4 chars of base id>/<base id>-dedup.xml``, where the
    base id is the part of ``sentence_id`` before the first dot.

    Args:
        sentence_id: Dot-separated id; only the part before the first dot is
            used to locate the file.

    Returns:
        The whole file content as a string.

    Raises:
        KeyError: If the ``GF_FOLDER`` environment variable is not set.
        AssertionError: If the expected file does not exist.
    """
    word_base_id = sentence_id.split(".")[0]
    gf_folder = pathlib.Path(os.environ["GF_FOLDER"])
    gf_file = gf_folder / word_base_id[:4] / (word_base_id + "-dedup.xml")

    # Explicit raise instead of ``assert`` so the check survives ``python -O``;
    # the exception type is kept because callers handle AssertionError.
    if not gf_file.is_file():
        raise AssertionError("no file for id " + repr(word_base_id))

    # Decode as UTF-8 explicitly rather than relying on the process locale.
    with gf_file.open("r", encoding="utf-8") as fp:
        return fp.read()
|
||||
|
||||
def obtain_sentence(sentence_id, content):
    """Cut a single ``<s>`` element out of raw XML text.

    The element is located by plain string search for its opening tag
    (``<s xml:id="<sentence_id>"``) and the first ``</s>`` that follows it.

    Args:
        sentence_id: Exact value of the ``xml:id`` attribute to look for.
        content: The XML text to search in.

    Returns:
        The ``<s>...</s>`` substring, with every occurrence of ``" xml:"``
        replaced by ``" "`` (so ``xml:id`` becomes ``id``).

    Raises:
        ValueError: If the opening tag or a following ``</s>`` is not found.
    """
    # Include the closing quote of the attribute value; without it an id such
    # as "X.1.1" would also match the sentence "X.1.10".
    opening = '<s xml:id="' + sentence_id + '"'
    closing = "</s>"

    idx_start = content.index(opening)
    idx_end = content.index(closing, idx_start) + len(closing)

    sentence = content[idx_start:idx_end]
    return sentence.replace(" xml:", " ")
|
||||
|
||||
def as_example(sentence_id, headword, sentence):
    """Split a sentence into the text left of, at, and right of the headword.

    The sentence XML is parsed and the texts of the root's direct children
    are walked in order. The first child whose text equals ``headword``
    becomes ``"mid"``; the texts before it are concatenated into ``"left"``
    and the texts after it into ``"right"``, so that
    ``left + mid + right`` is the concatenation of all child texts.

    Args:
        sentence_id: Not used by this function; kept for interface
            compatibility.
        headword: Text that a child element must match exactly.
        sentence: XML string of a single element whose children carry the
            tokens.

    Returns:
        A dict with the keys ``"left"``, ``"mid"`` and ``"right"``, or the
        string ``"headword not found"`` when no child text equals
        ``headword``.
    """
    root = etree.XML(sentence)

    left_parts = []
    right_parts = []
    mid = None

    for element in root:
        if mid is None and element.text == headword:
            # Only the first match splits the sentence; later repetitions of
            # the headword are ordinary text, so no part of it is dropped.
            mid = element.text
            continue

        # Elements without text (e.g. self-closing tags) have ``text is None``.
        text = element.text or ""
        if mid is None:
            left_parts.append(text)
        else:
            right_parts.append(text)

    if mid is None:
        return "headword not found"

    return {
        "left": "".join(left_parts),
        "mid": mid,
        "right": "".join(right_parts),
    }
|
||||
|
||||
def obtain_example_from_gf2(sentence_id, headword):
    """Build an example for ``headword`` from the sentence ``sentence_id``.

    Runs the full pipeline: validate the id, read the file, cut out the
    sentence and split it around the headword.

    Args:
        sentence_id: Id of the sentence to look up.
        headword: Word to split the sentence around.

    Returns:
        Whatever ``as_example`` returns on success. If one of the steps
        raises ValueError, IndexError, AssertionError or KeyError, the
        matching error message string is returned instead of raising.
    """
    # Exception type -> message handed back to the caller.
    failure_messages = (
        (ValueError, "Could not find word id"),
        (IndexError, "Word id index after last dot should be an integer"),
        (AssertionError, "Bad word id"),
        (KeyError, "GF2 location not set on the server"),
    )
    handled = tuple(exc_type for exc_type, _ in failure_messages)

    try:
        check_sentence_id(sentence_id)
        raw_xml = get_file_contents(sentence_id)
        sentence_xml = obtain_sentence(sentence_id, raw_xml)
        return as_example(sentence_id, headword, sentence_xml)
    except handled as err:
        # First matching entry wins, mirroring the order of except clauses.
        return next(
            message
            for exc_type, message in failure_messages
            if isinstance(err, exc_type)
        )
|
||||
|
||||
|
||||
from flask import Response, Flask, request, jsonify
|
||||
app = Flask(__name__)
|
||||
|
||||
@app.route('/')
def home():
    """Return a short plain-text description of how to call the API."""
    # The single-example route takes both a sentence id and a headword, so the
    # help text names both path segments.
    return (
        "An api to get examples from gigafida, just use "
        "/get_example/$SENTENCE_ID/$HEADWORD to get them"
    )
|
||||
|
||||
@app.route('/get_example/<string:sentence_id>/<string:headword>')
def test(sentence_id, headword):
    """Return the example for one sentence id / headword pair.

    Responds with JSON when an example dict was produced, and with the
    plain-text error message otherwise.
    """
    result = obtain_example_from_gf2(sentence_id, headword)

    if isinstance(result, dict):
        # A dict handed directly to Response is treated as an iterable of
        # body chunks (its keys), so it must be serialized to JSON first.
        return jsonify(result)

    return Response(result, mimetype="text/plain")
|
||||
|
||||
@app.route('/get_examples', methods=["POST"])
def get_examples():
    """Look up examples for a batch of sentence ids.

    Expects a JSON object in the request body that maps sentence ids to
    headwords. Responds with a JSON object holding two maps keyed by sentence
    id: ``"good"`` with the example dicts and ``"bad"`` with error messages.
    A body that is not a JSON object yields an empty response with status 500.
    """
    data = request.get_json()
    if not isinstance(data, dict):
        # NOTE(review): this is a client error, so 400 would be the more
        # accurate status; 500 is kept so existing clients see no change.
        return "", 500

    bad_ones = {}
    good_ones = {}
    for sentence_id, headword in data.items():
        if not isinstance(headword, str):
            # JSON values may be numbers, null, lists, ...; report them per
            # entry instead of failing the whole batch on ``.strip()``.
            bad_ones[sentence_id] = "Headword should be a string"
            continue

        example = obtain_example_from_gf2(sentence_id, headword.strip())
        if isinstance(example, dict):
            good_ones[sentence_id] = example
        else:
            bad_ones[sentence_id] = example

    return jsonify({"good": good_ones, "bad": bad_ones})
|
||||
|
||||
|
13
run.sh
Executable file
13
run.sh
Executable file
|
@ -0,0 +1,13 @@
|
|||
#!/bin/bash

# cd to script dir; stop if it cannot be resolved or entered, so the venv
# and the app are never started from an unexpected working directory
script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" || exit 1
cd "$script_dir" || exit 1

# activate python venv
source ./venv/bin/activate || exit 1

# set gf2 location (app.py reads it from the GF_FOLDER environment variable)
export GF_FOLDER="/net/nas/resources/corpus/gigafida/v2.0/gigafida_dedup/tei/data/"

# run the app
# WARNING(review): --debugger together with --host=0.0.0.0 exposes the
# interactive debugger on every network interface; confirm this script is
# only used on a trusted network, or drop --debugger / bind to 127.0.0.1.
flask run --port=2001 --host=0.0.0.0 --debugger --reload
|
Loading…
Reference in New Issue
Block a user