Updated classla to v2.2 + Added spoken endpoint

This commit is contained in:
lkrsnik 2025-03-09 13:49:59 +01:00
parent d81b8dd513
commit 522904150e
4 changed files with 37 additions and 7 deletions

1
.gitignore vendored
View File

@ -2,3 +2,4 @@
data/
__pycache__/
venv/
.venv

View File

@ -64,6 +64,26 @@ Usage example:
curl -X POST -d '{"text": "kva smo mi zurali zadnje leto v zagrebu..."}' https://orodja.cjvt.si/oznacevalnik/nonstandard-jos
```
## Slovenian Spoken
Preset classla settings:
```json
{
"lang": "sl",
"pos_use_lexicon": false,
"processors": {
"tokenize": "standard",
"lemma": "spoken",
"pos": "spoken",
"depparse": "spoken",
"ner": "nonstandard"
}
}
```
Usage example:
```commandline
curl -X POST -d '{"text": "kva smo mi zurali zadnje leto v zagrebu..."}' https://orodja.cjvt.si/oznacevalnik/sl-spoken
```
## Croatian Standard UD
Preset classla settings:

9
app.py
View File

@ -6,6 +6,7 @@ import torch
# Fetch the classla resources for each language / model type that the
# pipelines in this file are built from.
classla.download('sl')
classla.download('sl', type='standard_jos')
classla.download('sl', type='nonstandard')
# The 'spoken' resources back the Slovenian spoken pipeline (/sl-spoken).
classla.download('sl', type='spoken')
classla.download('hr')
classla.download('hr', type='nonstandard')
classla.download('sr')
@ -23,6 +24,7 @@ nlp_nonstandard_JOS = classla.Pipeline('sl', processors={
"depparse": "standard_jos",
"ner": "nonstandard"
})
# Slovenian spoken-language pipeline, used by the /sl-spoken endpoint.
# NOTE(review): the README documents this preset as an explicit processor map
# (tokenize=standard, lemma/pos/depparse=spoken, ner=nonstandard,
# pos_use_lexicon=false) — confirm that type='spoken' resolves to the same
# configuration in the pinned classla version.
nlp_standard_SPOKEN = classla.Pipeline('sl', type='spoken')
# Croatian and Serbian pipelines built from classla's per-language presets.
nlp_hr_standard_UD = classla.Pipeline('hr')
nlp_hr_nonstandard_UD = classla.Pipeline('hr', type='nonstandard')
nlp_sr_standard_UD = classla.Pipeline('sr')
@ -80,6 +82,13 @@ def nonstandard_jos():
return doc.to_conll()
@app.route('/sl-spoken', methods=["POST"])
def sl_spoken():
    """Annotate posted text with the Slovenian spoken pipeline.

    The request body is parsed as JSON (``force=True``, so the Content-Type
    header is not required to be JSON) and must contain a ``"text"`` key.
    That text is run through ``nlp_standard_SPOKEN`` and the document's
    ``to_conll()`` output is returned as the response body.
    """
    payload = request.get_json(force=True)
    return nlp_standard_SPOKEN(payload['text']).to_conll()
@app.route('/hr-standard-ud', methods=["POST"])
def hr_standard_ud():
input_json = request.get_json(force=True)

View File

@ -1,6 +1,6 @@
certifi==2021.10.8
charset-normalizer==2.0.8
classla==1.1.0
classla==2.2
click==8.0.3
Flask==2.0.2
idna==3.3
@ -9,13 +9,13 @@ itsdangerous==2.0.1
Jinja2==3.0.3
lxml==4.6.4
MarkupSafe==2.0.1
numpy==1.21.4
obeliks==1.1.3
protobuf==3.19.1
numpy==1.23.0
obeliks==1.1.6
protobuf==4.21.2
regex==2021.11.10
reldi-tokeniser==1.0.0
requests==2.26.0
torch==1.10.0
reldi-tokeniser==1.0.3
requests==2.28.0
torch==1.12.0
tqdm==4.62.3
typing_extensions==4.0.1
urllib3==1.26.7