108 lines
3.1 KiB
Python
108 lines
3.1 KiB
Python
|
import pathlib
|
||
|
import os
|
||
|
import xml.etree.ElementTree as etree
|
||
|
|
||
|
def check_sentence_id(sentence_id):
    """Validate the shape of a sentence id.

    A valid id starts with ``"GF"`` and consists of exactly three
    dot-separated parts.

    Args:
        sentence_id: The id string to validate.

    Raises:
        AssertionError: If the id does not start with ``"GF"`` or does not
            split into exactly three parts on ``"."``.
    """
    # Raise explicitly rather than using ``assert`` statements: asserts are
    # removed when Python runs with -O, which would silently turn this
    # validation off. The exception type is kept so existing handlers that
    # catch AssertionError keep working.
    if not sentence_id.startswith("GF"):
        raise AssertionError('sentence id must start with "GF"')
    if len(sentence_id.split(".")) != 3:
        raise AssertionError(
            "sentence id must have exactly three dot-separated parts"
        )
|
||
|
|
||
|
def get_file_contents(sentence_id):
    """Read the whole XML file that holds the given sentence.

    The file path is
    ``$GF_FOLDER/<first 4 chars of base id>/<base id>-dedup.xml``, where the
    base id is the part of ``sentence_id`` before the first dot.

    Args:
        sentence_id: Sentence id; only the part before the first dot is used
            to locate the file.

    Returns:
        The complete file contents as a string.

    Raises:
        KeyError: If the ``GF_FOLDER`` environment variable is not set.
        AssertionError: If the expected file does not exist.
    """
    word_base_id = sentence_id.split(".")[0]
    gf_folder = pathlib.Path(os.environ["GF_FOLDER"])
    gf_file = gf_folder / word_base_id[:4] / (word_base_id + "-dedup.xml")

    # Raise explicitly rather than using ``assert``: asserts are removed
    # under -O, and the open() below would then fail with FileNotFoundError
    # instead of the AssertionError this function has always raised.
    if not gf_file.exists():
        raise AssertionError("file does not exist: {}".format(gf_file))

    # Decode explicitly so the result does not depend on the server locale.
    # NOTE(review): assumes the corpus files are UTF-8 (the XML default) -
    # confirm against the actual data.
    with gf_file.open("r", encoding="utf-8") as fp:
        return fp.read()
|
||
|
|
||
|
def obtain_sentence(sentence_id, content):
    """Cut the ``<s>`` element with the given id out of raw XML text.

    The element is located by plain string search, not by XML parsing: it
    starts at ``<s xml:id="<sentence_id>"`` and ends at the first ``</s>``
    that follows.

    Args:
        sentence_id: Exact value of the ``xml:id`` attribute to look for.
        content: XML text to search in.

    Returns:
        The matched fragment with every ``" xml:"`` replaced by ``" "``, so
        that ``xml:id`` becomes ``id``.

    Raises:
        ValueError: If the opening tag, or a ``</s>`` after it, is not found.
    """
    # Include the closing quote of the attribute value; without it the id
    # "X.1.1" would also match the tag of sentence "X.1.10".
    opening_tag = '<s xml:id="' + sentence_id + '"'
    closing_tag = "</s>"

    idx_start = content.index(opening_tag)
    idx_end = content.index(closing_tag, idx_start) + len(closing_tag)

    # This is a textual replace, so it also affects any " xml:" that occurs
    # inside the sentence text, not only attribute names.
    return content[idx_start:idx_end].replace(" xml:", " ")
|
||
|
|
||
|
def as_example(sentence_id, headword, sentence):
    """Split a sentence into the text left of, at, and right of a headword.

    The sentence is parsed as XML and the text of the root's direct children
    is concatenated in document order. The first child whose text equals
    ``headword`` becomes the split point.

    Args:
        sentence_id: Not used by this function; kept for the existing
            call signature.
        headword: Text that a child element must match exactly.
        sentence: XML fragment containing a single root element.

    Returns:
        A dict with keys ``"left"``, ``"mid"`` and ``"right"`` when the
        headword is found, otherwise the string ``"headword not found"``.
    """
    root = etree.XML(sentence)
    result = {}
    pieces = []

    for element in root:
        # Split on the first occurrence only. Later occurrences are ordinary
        # text; treating them as a new split point would overwrite "left"
        # and drop everything before them.
        if "mid" not in result and element.text == headword:
            result["left"] = "".join(pieces)
            result["mid"] = element.text
            pieces = []
        else:
            # Elements without text (e.g. self-closing tags) have
            # ``text is None``; they contribute nothing instead of raising.
            pieces.append(element.text or "")

    if "mid" not in result:
        return "headword not found"

    # Whatever was collected after the headword is the right-hand context.
    result["right"] = "".join(pieces)
    return result
|
||
|
|
||
|
def obtain_example_from_gf2(sentence_id, headword):
    """Look up a sentence by id and split it around the headword.

    Runs the full pipeline: validate the id, read the file that contains the
    sentence, cut the sentence out, and split it around ``headword``.

    Args:
        sentence_id: Id of the sentence to fetch.
        headword: Word to split the sentence around.

    Returns:
        Whatever ``as_example`` returns on success, or a fixed error message
        string when one of the steps raises ValueError, IndexError,
        AssertionError or KeyError.
    """
    # Exception type -> message returned to the caller. None of these types
    # is a subclass of another, so the lookup order does not matter.
    failure_messages = (
        (ValueError, "Could not find word id"),
        (IndexError, "Word id index after last dot should be an integer"),
        (AssertionError, "Bad word id"),
        (KeyError, "GF2 location not set on the server"),
    )
    handled_types = tuple(exc_type for exc_type, _ in failure_messages)

    try:
        check_sentence_id(sentence_id)
        raw_xml = get_file_contents(sentence_id)
        fragment = obtain_sentence(sentence_id, raw_xml)
        return as_example(sentence_id, headword, fragment)
    except handled_types as error:
        for exc_type, message in failure_messages:
            if isinstance(error, exc_type):
                return message
        raise
|
||
|
|
||
|
|
||
|
from flask import Response, Flask, request, jsonify
|
||
|
# Flask application object; the @app.route decorators below register the
# request handlers on it.
app = Flask(__name__)
|
||
|
|
||
|
@app.route('/')
def home():
    """Return a short plain-text usage hint for the API root."""
    # The hint names both path segments, because the get_example route
    # requires a sentence id and a headword, not a single id.
    return "An api to get examples from gigafida, just use /get_example/$SENTENCE_ID/$HEADWORD to get them"
|
||
|
|
||
|
@app.route('/get_example/<string:sentence_id>/<string:headword>')
def test(sentence_id, headword):
    """Return the example for one sentence id and headword.

    Args:
        sentence_id: Sentence id taken from the URL path.
        headword: Headword taken from the URL path.

    Returns:
        A JSON response with the example dict on success, otherwise a
        ``text/plain`` response carrying the error message.
    """
    result = obtain_example_from_gf2(sentence_id, headword)

    if isinstance(result, dict):
        # Serialize explicitly: a Response built directly from a dict
        # iterates it and would send only the keys, labelled as JSON.
        return jsonify(result)

    return Response(result, mimetype="text/plain")
|
||
|
|
||
|
@app.route('/get_examples', methods=["POST"])
def get_examples():
    """Return examples for a batch of sentence ids.

    Expects a JSON object in the request body that maps each sentence id to
    its headword.

    Returns:
        A JSON object with two maps keyed by sentence id: ``"good"`` holds
        the example dicts, ``"bad"`` holds the error message for each id that
        could not be resolved. If the body is not a JSON object, an empty
        body with status 500 is returned.
    """
    data = request.get_json()
    if not isinstance(data, dict):
        # NOTE(review): 400 would describe a malformed request more
        # accurately; 500 is kept so existing clients see no change.
        return "", 500

    bad_ones = {}
    good_ones = {}
    for sentence_id, headword in data.items():
        # JSON values may be numbers, null, lists, ...; report those per item
        # instead of letting .strip() fail the whole batch.
        if not isinstance(headword, str):
            bad_ones[sentence_id] = "Headword should be a string"
            continue

        example = obtain_example_from_gf2(sentence_id, headword.strip())
        if isinstance(example, dict):
            good_ones[sentence_id] = example
        else:
            bad_ones[sentence_id] = example

    return jsonify({"good": good_ones, "bad": bad_ones})
|
||
|
|
||
|
|