From 522904150e1bddc05bd577ac2592124d6befe7d2 Mon Sep 17 00:00:00 2001 From: lkrsnik Date: Sun, 9 Mar 2025 13:49:59 +0100 Subject: [PATCH] Updated classla to v2.2 + Added spoken endpoint --- .gitignore | 1 + README.md | 20 ++++++++++++++++++++ app.py | 9 +++++++++ requirements.txt | 14 +++++++------- 4 files changed, 37 insertions(+), 7 deletions(-) diff --git a/.gitignore b/.gitignore index 429f144..0f515d4 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,4 @@ data/ __pycache__/ venv/ +.venv diff --git a/README.md b/README.md index 4986790..b0a5d9a 100644 --- a/README.md +++ b/README.md @@ -64,6 +64,26 @@ Usage example: curl -X POST -d '{"text": "kva smo mi zurali zadnje leto v zagrebu..."}' https://orodja.cjvt.si/oznacevalnik/nonstandard-jos ``` +## Slovenian Spoken +Preset classla settings: +```json +{ + "lang": "sl", + "pos_use_lexicon": false, + "processors": { + "tokenize": "standard", + "lemma": "spoken", + "pos": "spoken", + "depparse": "spoken", + "ner": "nonstandard" + } +} +``` +Usage example: +```commandline +curl -X POST -d '{"text": "kva smo mi zurali zadnje leto v zagrebu..."}' https://orodja.cjvt.si/oznacevalnik/sl-spoken +``` + ## Croatian Standard UD Preset classla settings: diff --git a/app.py b/app.py index 4cfcc3d..da8517d 100644 --- a/app.py +++ b/app.py @@ -6,6 +6,7 @@ import torch classla.download('sl') classla.download('sl', type='standard_jos') classla.download('sl', type='nonstandard') +classla.download('sl', type='spoken') classla.download('hr') classla.download('hr', type='nonstandard') classla.download('sr') @@ -23,6 +24,7 @@ nlp_nonstandard_JOS = classla.Pipeline('sl', processors={ "depparse": "standard_jos", "ner": "nonstandard" }) +nlp_standard_SPOKEN = classla.Pipeline('sl', type='spoken') nlp_hr_standard_UD = classla.Pipeline('hr') nlp_hr_nonstandard_UD = classla.Pipeline('hr', type='nonstandard') nlp_sr_standard_UD = classla.Pipeline('sr') @@ -80,6 +82,13 @@ def nonstandard_jos(): return doc.to_conll() +@app.route('/sl-spoken', methods=["POST"]) +def sl_spoken(): + input_json = request.get_json(force=True) + doc = nlp_standard_SPOKEN(input_json['text']) + return doc.to_conll() + + @app.route('/hr-standard-ud', methods=["POST"]) def hr_standard_ud(): input_json = request.get_json(force=True) diff --git a/requirements.txt b/requirements.txt index 24210e0..d7ea679 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ certifi==2021.10.8 charset-normalizer==2.0.8 -classla==1.1.0 +classla==2.2 click==8.0.3 Flask==2.0.2 idna==3.3 @@ -9,13 +9,13 @@ itsdangerous==2.0.1 Jinja2==3.0.3 lxml==4.6.4 MarkupSafe==2.0.1 -numpy==1.21.4 -obeliks==1.1.3 -protobuf==3.19.1 +numpy==1.23.0 +obeliks==1.1.6 +protobuf==4.21.2 regex==2021.11.10 -reldi-tokeniser==1.0.0 -requests==2.26.0 -torch==1.10.0 +reldi-tokeniser==1.0.3 +requests==2.28.0 +torch==1.12.0 tqdm==4.62.3 typing_extensions==4.0.1 urllib3==1.26.7