16 Commits

41 changed files with 2514 additions and 119 deletions

11
.gitignore vendored
View File

@@ -7,6 +7,17 @@ data/appindex.json
src/frontend_vue/node_modules/ src/frontend_vue/node_modules/
src/frontend_vue/dist/ src/frontend_vue/dist/
dockerfiles/database/create.js dockerfiles/database/create.js
dockerfiles/database/create_mongo.js
dockerfiles/database/create_postgres.js
dockerfiles/database/mongo_db.gz
dockerfiles/database/postgres_db.tar
dockerfiles/database/postgres_db_OLD.tar
*__pycache__/ *__pycache__/
env.local env.local
logs/* logs/*
.idea/
venv*
data/
data
deploy_instructions/
run.sh

3
.gitmodules vendored
View File

@@ -1,3 +1,6 @@
[submodule "src/pkg/cjvt-corpusparser"] [submodule "src/pkg/cjvt-corpusparser"]
path = src/pkg/cjvt-corpusparser path = src/pkg/cjvt-corpusparser
url = git@gitea.cjvt.si:kristjan/cjvt-corpusparser.git url = git@gitea.cjvt.si:kristjan/cjvt-corpusparser.git
[submodule "src/pkg/luscenje_struktur"]
path = src/pkg/luscenje_struktur
url = https://gitea.cjvt.si/ozbolt/luscenje_struktur.git

View File

@@ -13,10 +13,11 @@ SSJ_FILE = "$(MAKE_ROOT)/data/ssj_file_link"
# KRES_FOLDER = "$(MAKE_ROOT)/data/samples/kres_xml" # KRES_FOLDER = "$(MAKE_ROOT)/data/samples/kres_xml"
# KRES_FOLDER = "$(MAKE_ROOT)/data/kres_xml_folder_link" # KRES_FOLDER = "$(MAKE_ROOT)/data/kres_xml_folder_link"
KRES_FOLDER = "/home/kristjan/kres_data/payload/kres_xml" KRES_FOLDER = "/home/kristjan/kres_data/payload/kres_xml"
GIGAFIDA_FOLDER = "/home/lukakrsnik/cjvt-valency/data_all/giga_orig"
# KRES_SRL_FOLDER = "$(MAKE_ROOT)/data/samples/kres_srl_json" # KRES_SRL_FOLDER = "$(MAKE_ROOT)/data/samples/kres_srl_json"
# KRES_SRL_FOLDER = "$(MAKE_ROOT)/data/kres_json_folder_link" # KRES_SRL_FOLDER = "$(MAKE_ROOT)/data/kres_json_folder_link"
KRES_SRL_FOLDER = "/home/kristjan/kres_data/payload/kres_json" KRES_SRL_FOLDER = "/home/kristjan/kres_data/payload/kres_json"
GIGAFIDA_SRL_FOLDER = "/home/lukakrsnik/cjvt-valency/data_all/final_json"
# This file comes with the source code. Make sure you unpack it and name it right. # This file comes with the source code. Make sure you unpack it and name it right.
SSKJ_WORDLIST = "$(MAKE_ROOT)/data/wordlist.json" SSKJ_WORDLIST = "$(MAKE_ROOT)/data/wordlist.json"
SSKJ_JSON = "$(MAKE_ROOT)/data/sskj_senses.json" SSKJ_JSON = "$(MAKE_ROOT)/data/sskj_senses.json"
@@ -26,14 +27,14 @@ APPINDEX_PATH = "$(MAKE_ROOT)/data/appindex.json"
OUTPUT = "db" OUTPUT = "db"
# OUTPUT = "file" # OUTPUT = "file"
OUTDIR = "/tmp/three" # if you're running this in docker, make sure to mount the volume OUTDIR = "/project/data" # if you're running this in docker, make sure to mount the volume
DBADDR = "0.0.0.0:27017" # don't use localhost DBADDR = "0.0.0.0:27017" # don't use localhost
# credentials from .gitignored file # credentials from .gitignored file
# create it from env.default # create it from env.default
include env.local include env.local
N_CORES = 3 N_CORES = 4
# insert kres files into database in chunks, for fewer connections # insert kres files into database in chunks, for fewer connections
KRES_CHUNK_SIZE = 30 KRES_CHUNK_SIZE = 30
@@ -56,6 +57,12 @@ database-service:
database-users: database-users:
cd dockerfiles/database; $(MAKE) create_users cd dockerfiles/database; $(MAKE) create_users
database-restore:
cd dockerfiles/database; $(MAKE) restore_db
database-restore-postgres:
cd dockerfiles/database; $(MAKE) restore_postgres_db
# also useful, if we want to restart the db # also useful, if we want to restart the db
database-clean: database-clean:
cd dockerfiles/database; $(MAKE) clean_stack cd dockerfiles/database; $(MAKE) clean_stack
@@ -69,6 +76,7 @@ python-env-install:
pip3 install -e src/pkg/cjvt-corpusparser/. pip3 install -e src/pkg/cjvt-corpusparser/.
pip3 install -e src/pkg/valency/. pip3 install -e src/pkg/valency/.
pip3 install -e src/pkg/seqparser/. pip3 install -e src/pkg/seqparser/.
pip3 install -e src/pkg/luscenje_struktur/.
# from inside python-env container: # from inside python-env container:
data/samples: data/samples:
@@ -93,7 +101,14 @@ fill-database-kres: data/samples
--chunk-size $(KRES_CHUNK_SIZE) \ --chunk-size $(KRES_CHUNK_SIZE) \
--cores $(N_CORES) --cores $(N_CORES)
fill-database-gigafida: data/samples
python3 src/pkg/cjvt-corpusparser/corpusparser/main.py --kres-folder $(GIGAFIDA_FOLDER) \
--corpus="gigafida" \
--ssj-file $(SSJ_FILE) --kres-srl-folder $(GIGAFIDA_SRL_FOLDER) \
--output $(OUTPUT) --outdir $(OUTDIR) --dbaddr $(DBADDR) \
--dbuser $(DB_USR_USER) --dbpass $(DB_USR_PASS) \
--chunk-size $(KRES_CHUNK_SIZE) \
--cores $(N_CORES)
## Frontend ## Frontend
@@ -146,4 +161,5 @@ sskj-senses:
--dbpass $(DB_USR_PASS) --dbpass $(DB_USR_PASS)
deploy-prod-stack: deploy-prod-stack:
docker stack deploy -c production.yaml val - docker network create val-backend
docker stack deploy -c production.yaml val

View File

@@ -111,13 +111,17 @@ Prerequisite: machine with free ports 80 and 8084.
### Database ### Database
Either build the database from scratch (lengthy process) using the above instructions or just migrate the database from the faculty server (recommended). Either build the database from scratch (lengthy process) using the above instructions or just migrate the database from the faculty server (recommended).
TODO: build my-mongo Build the my-mongo container:
```bash
# run once and destroy containers
$ make database-service
```
### Backend ### Backend
Set database connection details in `/src/backend_flask/db_config.py`. Set database connection details in `/src/backend_flask/db_config.py`.
``` Change 'valuser' and 'valuserpass' to the database user.
mongodb://valuser:valuserpass@127.0.0.1:27017/valdb ```bash
mongodb://valuser:valuserpass@my_mongo/valdb
``` ```
In the above line, replace `valuser` with the username and `valuserpass` with the password that was used to create the database tables (the values were set in the root Makefile). In the above line, replace `valuser` with the username and `valuserpass` with the password that was used to create the database tables (the values were set in the root Makefile).
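For reference, `db_config.py` is a plain Python module loaded via `app.config.from_object("db_config")`; a minimal sketch (the credentials below are placeholders):
```python
# src/backend_flask/db_config.py -- minimal sketch, credentials are placeholders
MONGO_URI = "mongodb://valuser:valuserpass@my_mongo/valdb"
MONGO_AUTH_SOURCE = 'admin'
```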
@@ -140,7 +144,82 @@ $ make build-frontend-prod
``` ```
All set, now run the stack. All set, now run the stack.
Stack configuration in `production.yaml`.
```bash ```bash
# From git root # From git root
$ make deploy-prod-stack $ make deploy-prod-stack
```
## Uploading a mongo dump
There's a 15 GB mongo dump containing the fully processed kres and ssj data.
We can use that file to deploy our application.
With this database, we need a minimum of 8 GB of RAM to serve the app.
If the server is struggling, the frontend will throw "Network error" messages.
Check `0.0.0.0:8081` and remove (or back up) the current example database `valdb`.
Run the stack with the mongo port mapped; uncomment these lines in `production.yaml`:
```yml
ports:
- 27017:27017
```
Run a separate my-mongo container with the dump directory mounted:
```bash
$ docker run -it --net host -v <local_dump_path>:/dumps my-mongo /bin/bash
```
Inside the container (edit the username and password):
```bash
$ mongorestore /dumps/valdb --db valdb --uri=mongodb://valuser:valuserpass@0.0.0.0:27017
```
After uploading, restart the stack with the `27017` port mapping commented out.
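Before closing the port again, a quick pymongo check can confirm the restore went through (a sketch; the address and credentials follow the steps above, and the collection names are assumed to match the corpus names used by the backend):
```python
# restore sanity check -- run while 27017 is still mapped on the host
from pymongo import MongoClient

client = MongoClient("mongodb://valuser:valuserpass@127.0.0.1:27017/valdb")
db = client["valdb"]
print(db.list_collection_names())            # expect the restored collections, e.g. "ssj", "kres"
print(db["ssj"].estimated_document_count())  # rough document count for one corpus collection
```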
## Running the scripts
### Environment setup
```bash
pip install -r requirements.txt
pip install git+https://gitea.cjvt.si/ozbolt/luscenje_struktur.git
pip install git+https://gitea.cjvt.si/kristjan/cjvt-corpusparser.git
```
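To confirm both packages resolve in the environment, a quick import check helps (a sketch; the module names are assumed from the repository names above):
```python
# environment smoke test -- module names assumed from the repositories above
import importlib

for mod in ("corpusparser", "luscenje_struktur"):
    print(mod, "->", importlib.import_module(mod).__file__)
```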
### Running in an already set-up environment
```bash
make database-service
```
### Setting up the environment to run on a ramdisk
```bash
# create ramdisk
sudo mount -t tmpfs tmpfs /mnt/tmp
sudo mount -o remount,size=120G,noexec,nosuid,nodev,noatime /mnt/tmp
# change volumes to /mnt/tmp:/data/db
vim dockerfiles/database/valency-stack.yml
# change the mkdir path in the Makefile run target to /mnt/tmp
vim dockerfiles/database/Makefile
# run service
make database-service
# run ONLY ONCE to create users and restore database
make database-users
make database-restore
# double-check that the restore worked
docker exec -it ef0a /bin/bash
# then, inside the container:
mongo --username <REGULAR USER> --password --authenticationDatabase valdb
db.getRoles()
``` ```

View File

@@ -1 +0,0 @@
/home/kristjan/workdir/final_json/

View File

@@ -1 +0,0 @@
/home/kristjan/kres_mount/kres_parsed/tei/

Binary file not shown.

View File

@@ -1 +0,0 @@
/home/kristjan/git/diploma/data/ssj500k-sl.TEI/ssj500k-sl.body.xml

Binary file not shown.

File diff suppressed because one or more lines are too long

View File

@@ -1,5 +1,5 @@
FROM mongo:latest FROM mongo:4.2.9
WORKDIR / WORKDIR /
COPY init_inside_container.sh /. COPY init_inside_mongo_container.sh /.
COPY create.js /. COPY create_mongo.js /.

View File

@@ -2,33 +2,62 @@
# collection names: lower case, plural # collection names: lower case, plural
# user names? # user names?
# mongo admin -u root -p password --eval "db.getSiblingDB('vlDB').addUser('vluser', 'password')"
STACKNAME = dbstack
.PHONY: start_db FORCE
all: build_run create_users all: build_run create_users
build_run: build_mongo run_stack build_run: build_mongo run_docker_compose
create.js: FORCE postgres_create_roles:
echo 'psql -v ON_ERROR_STOP=OFF --username $(DB_ADM_USER) <<-EOSQL' > create_postgres.js
echo "create user $(DB_USR_USER) with encrypted password '$(DB_USR_PASS)';" >> create_postgres.js
echo "create database superdb_small;" >> create_postgres.js
echo "grant all privileges on database superdb_small to $(DB_USR_USER);" >> create_postgres.js
echo "grant usage on schema public to $(DB_USR_USER);" >> create_postgres.js
echo "grant select on all tables in schema public to $(DB_USR_USER);" >> create_postgres.js
echo "EOSQL" >> create_postgres.js
chmod +x create_postgres.js
FORCE: mongo_create_roles:
echo 'db.auth("$(DB_ADM_USER)", "$(DB_ADM_PASS)")' > create.js echo 'db.auth("$(DB_ADM_USER)", "$(DB_ADM_PASS)")' > create_mongo.js
echo 'use valdb' >> create.js echo 'use valdb' >> create_mongo.js
echo 'db.createUser({user: "$(DB_USR_USER)", pwd: "$(DB_USR_PASS)", roles: ["readWrite"]})' >> create.js echo 'db.createUser({user: "$(DB_USR_USER)", pwd: "$(DB_USR_PASS)", roles: ["readWrite"]})' >> create_mongo.js
echo 'db.grantRolesToUser("$(DB_USR_USER)", [{ role: "readWrite", db: "extvaldb"}])' >> create_mongo.js
build_mongo: create.js build_mongo: mongo_create_roles
docker build . -t my-mongo --no-cache docker build . -t my-mongo --no-cache
clean_stack: # build_postgres: postgres_create_roles
docker stack rm $(STACKNAME) # docker build . -t my-mongo --no-cache
run_stack: run_docker_compose:
mkdir -p ${HOME}/mongo_container/data/ mkdir -p ${HOME}/valency_data/mongo_container/data/
docker stack deploy --compose-file mongodb-stack.yml $(STACKNAME) #docker kill $(shell ./get_mongo_container_name.sh)
#docker kill $(shell ./get_postgres_container_name.sh)
#docker-compose stop
docker-compose -f valency-stack.yml up -d --force-recreate
# docker stack deploy --compose-file mongodb-stack.yml $(STACKNAME)
create_users: create.js create_users: create_mongo_users create_postgres_users
docker exec $(shell ./get_container_name.sh) /init_inside_container.sh
create_mongo_users: mongo_create_roles
docker exec $(shell ./get_mongo_container_name.sh) /init_inside_mongo_container.sh
# rm create.js # rm create.js
create_postgres_users: postgres_create_roles
docker exec $(shell ./get_postgres_container_name.sh) /scripts/init_inside_postgres_container.sh
restore_db: restore_mongo_db restore_postgres_db
restore_mongo_db:
ifeq (,$(wildcard ./mongo_db.gz))
$(error "mongo_db.gz does not exist. Make sure to have a dump of the mongo db at 'dockerfiles/database/mongo_db.gz'")
else
docker exec $(shell ./get_mongo_container_name.sh) sh -c 'mongorestore --gzip --archive=/scripts/mongo_db.gz --db valdb --username $(DB_USR_USER) --password $(DB_USR_PASS) --authenticationDatabase valdb'
endif
restore_postgres_db:
ifeq (,$(wildcard ./postgres_db.tar))
$(error "postgres_db.tar does not exist. Make sure to have a dump of the postgres db at 'dockerfiles/database/postgres_db.tar'")
else
docker exec $(shell ./get_postgres_container_name.sh) sh -c 'pg_restore -U $(DB_ADM_USER) --dbname=superdb_small --create --verbose /scripts/postgres_db.tar'
endif

View File

@@ -0,0 +1,2 @@
#!/bin/bash
docker ps | grep postgres | awk '{print $1}'

View File

@@ -1,3 +0,0 @@
#!/bin/bash
mongo admin < /create.js

View File

@@ -0,0 +1,3 @@
#!/bin/bash
mongo admin < /create_mongo.js

View File

@@ -0,0 +1,3 @@
#!/bin/bash
/scripts/create_postgres.js

View File

@@ -1,26 +0,0 @@
version: '3.1'
services:
my_mongo:
image: my-mongo
restart: always
ports:
- 27017:27017
environment:
MONGO_INITDB_ROOT_USERNAME: ${DB_ADM_USER}
MONGO_INITDB_ROOT_PASSWORD: ${DB_ADM_PASS}
volumes:
- ${HOME}/mongo_container/data/:/data/db
mongo_express:
image: mongo-express
restart: always
ports:
- 8087:8081
environment:
ME_CONFIG_BASICAUTH_USERNAME: ${MONGOEXPRESS_USER}
ME_CONFIG_BASICAUTH_PASSWORD: ${MONGOEXPRESS_PASS}
ME_CONFIG_MONGODB_ADMINUSERNAME: ${DB_ADM_USER}
ME_CONFIG_MONGODB_ADMINPASSWORD: ${DB_ADM_PASS}
ME_CONFIG_MONGODB_SERVER: my_mongo

View File

@@ -0,0 +1,27 @@
version: '3.1'
services:
my_mongo:
image: my-mongo
restart: always
ports:
- 127.0.0.1:27017:27017
environment:
MONGO_INITDB_ROOT_USERNAME: ${DB_ADM_USER}
MONGO_INITDB_ROOT_PASSWORD: ${DB_ADM_PASS}
volumes:
- ${HOME}/valency_data/mongo_container/data/:/data/db
- ./:/scripts
my_postgres:
image: postgres
restart: always
ports:
- 127.0.0.1:5432:5432
environment:
POSTGRES_USER: ${DB_ADM_USER}
POSTGRES_PASSWORD: ${DB_ADM_PASS}
volumes:
- ${HOME}/valency_data/postgres_container/data/:/var/lib/postgresql/data
- ./:/scripts

View File

@@ -6,7 +6,8 @@ vim \
python3 \ python3 \
python3-pip \ python3-pip \
sshfs \ sshfs \
curl curl \
locales
RUN pip3 install --upgrade pip RUN pip3 install --upgrade pip
@@ -21,6 +22,16 @@ RUN pip3 install \
flask_cors \ flask_cors \
pymongo \ pymongo \
flask-pymongo \ flask-pymongo \
gunicorn gunicorn \
SQLAlchemy \
tqdm \
psycopg2-binary
# Set the locale
RUN sed -i -e 's/# en_US.UTF-8 UTF-8/en_US.UTF-8 UTF-8/' /etc/locale.gen && \
locale-gen
ENV LANG en_US.UTF-8
ENV LANGUAGE en_US:en
ENV LC_ALL en_US.UTF-8
ENV PYTHONIOENCODING UTF-8 ENV PYTHONIOENCODING UTF-8

View File

@@ -7,6 +7,10 @@ server {
root /srv/dist; root /srv/dist;
index index.html index.htm; index index.html index.htm;
} }
location /home {
return 301 /;
}
} }
# backend # backend
@@ -19,4 +23,7 @@ server {
proxy_set_header Host $http_host; proxy_set_header Host $http_host;
proxy_pass http://backend_flask:8084; proxy_pass http://backend_flask:8084;
} }
} }
# https://vezljivostni.cjvt.si/api/* -> http://vezljivostni-host.cjvt.si:8084/api/*
# https://vezljivostni.cjvt.si/* -> http://vezljivostni-host.cjvt.si:80/*
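A minimal reachability sketch for the two routes above (the `/api/...` path below is a placeholder, not a documented endpoint):
```python
# check that / and /api/* are proxied as described above
from urllib import request, error

for url in ("https://vezljivostni.cjvt.si/",
            "https://vezljivostni.cjvt.si/api/placeholder"):
    try:
        with request.urlopen(url, timeout=10) as resp:
            print(url, resp.status)
    except error.HTTPError as exc:
        print(url, exc.code)  # a backend 404 still shows the /api route reaches the backend
```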

View File

@@ -18,9 +18,8 @@ services:
mongo_express: mongo_express:
image: mongo-express image: mongo-express
restart: always restart: always
# TODO comment this out
ports: ports:
- 8087:8081 - 8081:8081
environment: environment:
ME_CONFIG_BASICAUTH_USERNAME: test ME_CONFIG_BASICAUTH_USERNAME: test
ME_CONFIG_BASICAUTH_PASSWORD: test ME_CONFIG_BASICAUTH_PASSWORD: test
@@ -41,3 +40,4 @@ services:
volumes: volumes:
- ./nginx.conf:/etc/nginx/conf.d/default.conf - ./nginx.conf:/etc/nginx/conf.d/default.conf
- ./src/frontend_vue/dist:/srv/dist - ./src/frontend_vue/dist:/srv/dist

37
requirements.txt Normal file
View File

@@ -0,0 +1,37 @@
asn1crypto==0.24.0
beautifulsoup4==4.8.0
bs4==0.0.1
cffi==1.12.3
Click==7.0
cryptography==2.1.4
Flask==1.1.1
Flask-Cors==3.0.8
Flask-PyMongo==2.3.0
gunicorn==19.9.0
idna==2.6
itsdangerous==1.1.0
Jinja2==2.10.1
joblib==0.13.2
keyring==10.6.0
keyrings.alt==3.0
lxml==4.4.0
MarkupSafe==1.1.1
numpy==1.17.0
pandas==0.25.0
pathlib==1.0.1
psycopg2==2.8.4
pycparser==2.19
pycrypto==2.6.1
pymongo==3.8.0
python-dateutil==2.8.0
pytz==2019.2
pyxdg==0.25
PyYAML==5.1.2
scikit-learn==0.21.3
scipy==1.3.0
SecretStorage==2.3.1
six==1.11.0
sklearn==0.0
soupsieve==1.9.3
SQLAlchemy==1.3.12
Werkzeug==0.15.5

1708
scripts/create_xml.py Normal file

File diff suppressed because it is too large

189
scripts/extract_keywords.py Normal file
View File

@@ -0,0 +1,189 @@
import copy
import csv
from xml.etree import ElementTree
import re
import sys
import logging
import argparse
import pickle
import time
import gc
import subprocess
import concurrent.futures
import tempfile
def read_gigafida(path):
words = {}
with open(path) as tsvfile:
reader = csv.reader(tsvfile, delimiter='\t')
for row in reader:
words[row[0]] = int(row[2])
return words
def read_sloleks(path):
words = set()
with open(path) as tsvfile:
reader = csv.reader(tsvfile, delimiter='\t')
for row in reader:
words.add(row[1])
return words
def read_zele(path):
with open(path) as f:
content = f.readlines()
# fix content
content[0] = content[0][1:]
# a = content[2]
# a = content[2].split()
# a = content[2].split()[0].split('<IZT>')[1]
# a = content[2].split()[0].split('<IZT>')[1].split('</IZT>')[0]
content = [x.split()[0].split('<IZT>')[1].split('</IZT>')[0] for x in content]
# content = [x.split() for x in content]
return set(content)
def read_wordlist(path):
with open(path) as f:
content = [line[:-1] for line in f.readlines()]
print(content[-1])
return set(content)
def filter_gigafida(gigafida_raw, min_limit, max_limit):
return {word[0]: word[1] for word in gigafida_raw.items() if (word[0][-2:] == 'ti' or word[0][-2:] == 'či') and word[1] > min_limit and word[1] <= max_limit}
def set_list_intersection(gigafida_filtered, sloleks):
intersection = {}
for word, num in gigafida_filtered.items():
if word in sloleks:
intersection[word] = num
return intersection
def list_list_union(list1, list2):
union = copy.copy(list1)
for w, n in list2.items():
if w not in list1:
union[w] = list2[w]
return union
def list_list_subtraction(list1, list2):
subtraction = {}
for w, n in list2.items():
# if w == 'dejati':
# print('here')
if w not in list1:
subtraction[w] = n
return subtraction
def set_set_subtraction(set1, set2):
subtraction = {}
for w in set2:
if w not in set1:
subtraction[w] = -1
return subtraction
def create_document(list1, path):
with open(path, "w") as text_file:
for w, n in list1.items():
text_file.write("%s\t%d\n" % (w, n))
def create_document_set(list1, path):
with open(path, "w") as text_file:
for w in sorted(list(list1)):
text_file.write("%s\n" % w)
def gigafida_merge(sloleks, zele, gigafida_raw, giga_min, giga_max):
gigafida_filtered = filter_gigafida(gigafida_raw, giga_min, giga_max)
sloleks_gf_intersect = set_list_intersection(gigafida_filtered, sloleks)
gigafida_filtered1 = filter_gigafida(gigafida_raw, 1, sys.maxsize)
zele_gf_intersect = set_list_intersection(gigafida_filtered1, zele)
sloleks_zele_union = list_list_union(sloleks_gf_intersect, zele_gf_intersect)
sloleks_zele_subtraction = list_list_subtraction(sloleks_zele_union, gigafida_filtered)
return sloleks_zele_subtraction
def main(args):
gigafida_raw = read_gigafida(args.gigafida_verb_list)
sloleks = read_sloleks(args.sloleks)
zele = read_zele(args.zele)
if args.wordlist is not None:
sloleks_wordlist = set()
# sloleks_wordlist = set()
for el in sloleks:
if el in gigafida_raw:
sloleks_wordlist.add(el)
filtered_wordlist = read_wordlist(args.wordlist)
# sloleks_wordlist = set()
for el in sloleks:
if el in gigafida_raw:
filtered_wordlist.add(el)
create_document_set(filtered_wordlist, 'wordlist.tsv')
# gigafida_merge(sloleks, zele, gigafida_raw, 3, sys.maxsize)
gigafida_filtered3 = filter_gigafida(gigafida_raw, 2, sys.maxsize)
sloleks_gf_intersect = set_list_intersection(gigafida_filtered3, sloleks)
nouns_sloleks_gf_intersect = sorted(sloleks_gf_intersect.items(), key=lambda x: x[1], reverse=True)
res = [el[0] for el in nouns_sloleks_gf_intersect]
gigafida_filtered1 = filter_gigafida(gigafida_raw, 0, sys.maxsize)
zele_gf_intersect = set_list_intersection(gigafida_filtered1, zele)
sloleks_zele_union = list_list_union(sloleks_gf_intersect, zele_gf_intersect)
sloleks_zele_subtraction = set_set_subtraction(sloleks, zele)
create_document(gigafida_filtered3, 'gigafida_3+.tsv')
# create_document(sloleks_gf_intersect, 'gigafida_3+-sloleks-presek.tsv')
create_document(sloleks_zele_union, 'gigafida_3+-sloleks_zele-presek.tsv')
create_document(sloleks_zele_subtraction, 'sloleks-zele-razlika.tsv')
# gigafida_filtered = filter_gigafida(gigafida_raw, 10, sys.maxsize)
# sloleks_zele_subtraction = list_list_subtraction(sloleks_zele_union, gigafida_filtered)
gigafida_10 = gigafida_merge(sloleks, zele, gigafida_raw, 10, sys.maxsize)
create_document(gigafida_10, 'gigafida_10+-sloleks_zele-razlika.tsv')
# gigafida_filtered = filter_gigafida(gigafida_raw, 3, 10)
# sloleks_zele_subtraction = list_list_subtraction(sloleks_zele_union, gigafida_filtered)
gigafida_3_10 = gigafida_merge(sloleks, zele, gigafida_raw, 2, 10)
create_document(gigafida_3_10, 'gigafida_3-10-sloleks_zele-razlika.tsv')
# pass
if __name__ == '__main__':
parser = argparse.ArgumentParser(
description='Extract keywords from multiple lists.')
parser.add_argument('gigafida_verb_list',
help='Path to gigafida list of verbs in tsv format.')
parser.add_argument('sloleks',
help='Path to Sloleks in tsv format.')
parser.add_argument('--zele',
help='Path to zele valency dictionary.')
parser.add_argument('--wordlist', default=None,
help='Path to filtered wordlist.')
parser.add_argument('--handchecked_words', default=None,
help='Path to handchecked words.')
# parser.add_argument('--min_limit',
# help='Limit min number of ocurrences',
# type=int, default=0)
# parser.add_argument('--max_limit',
# help='Limit max number of ocurrences',
# type=int, default=sys.maxsize)
parser.add_argument('--verbose', help='Enable verbose output to stderr',
choices=["warning", "info", "debug"], default="info",
const="info", nargs='?')
args = parser.parse_args()
logging.basicConfig(stream=sys.stderr, level=args.verbose.upper())
start = time.time()
main(args)
logging.info("TIME: {}".format(time.time() - start))

117
scripts/form_csv.py Normal file
View File

@@ -0,0 +1,117 @@
import argparse
import csv
import os
from lxml import etree, objectify, html
def write_general_statistics(path, out_list):
if len(out_list) == 0:
return
with open(path, 'w') as csvfile:
writer = csv.writer(csvfile, delimiter='\t',
quotechar='"')
writer.writerow(['Semantic role', 'Valency pattern ratio', 'Valency sentence ratio'])
for line in out_list:
writer.writerow(line)
def write_statistics(path, out_list):
if len(out_list) == 0:
return
with open(path, 'w') as csvfile:
writer = csv.writer(csvfile, delimiter='\t',
quotechar='"')
writer.writerow(['Valency pattern id', 'Frequency all GF', 'Semantic role', 'Pattern representation', 'Corpus example'])
for line in out_list:
writer.writerow(line)
def main(args):
for file in sorted(os.listdir(args.input)):
path = os.path.join(args.input, file)
tree = etree.parse(path)
gf_output = []
ssj_output = []
head = next(tree.iter('head'))
headword = head.find('headword').find('lemma').text
#for div in root.iterfind('.//div'):
for elem in tree.iter('statisticsContainer'):
# for element in tree.iterfind('statisticsContainer'):
# for element in tree.find('statisticsContainer'):
semRole = elem.find('semanticRole').text
gf_pattern = None
gf_sentence = None
ssj_pattern = None
ssj_sentence = None
measure = elem.find('measureList')
for el in measure:
if el.attrib['type'] == 'valency_pattern_ratio' and el.attrib['source'] == 'Gigafida 2.0':
gf_pattern = el.text
if el.attrib['type'] == 'valency_sentence_ratio' and el.attrib['source'] == 'Gigafida 2.0':
gf_sentence = el.text
if el.attrib['type'] == 'valency_pattern_ratio' and el.attrib['source'] == 'ssj500k 2.2':
ssj_pattern = el.text
if el.attrib['type'] == 'valency_sentence_ratio' and el.attrib['source'] == 'ssj500k 2.2':
ssj_sentence = el.text
if gf_pattern is not None and gf_sentence is not None:
gf_output.append([semRole, gf_pattern, gf_sentence])
if ssj_pattern is not None and ssj_sentence is not None:
ssj_output.append([semRole, ssj_pattern, ssj_sentence])
print(file)
analyze_output = []
for elem in tree.iter('valencyPattern'):
valency_pattern_id = elem.attrib['id']
# get frequency
measure = ''
for measure_el in elem.find('measureList').findall('measure'):
if measure_el.attrib['source'] == 'Gigafida 2.0':
measure = measure_el.text
# get semantic roles
semantic_roles_list = []
for semantic_rol_con in elem.find('semanticRoleContainerList').findall('semanticRoleContainer'):
semantic_roles_list.append(semantic_rol_con.find('semanticRole').text)
semantic_roles = '_'.join(semantic_roles_list)
# pattern representation
pattern_representation = elem.find('patternRepresentation').text
# corpus example
if elem.find('exampleContainerList') is not None and elem.find('exampleContainerList').find('exampleContainer') is not None and elem.find('exampleContainerList').find('exampleContainer').find('corpusExample') is not None:
corpus_example_text = html.tostring(elem.find('exampleContainerList').find('exampleContainer').find('corpusExample'), encoding='unicode')
else:
continue
# ugly postprocessing to remove xmlns:xsi=... duh..
root = etree.fromstring(corpus_example_text)
# Remove namespace prefixes
for elem in root.getiterator():
elem.tag = etree.QName(elem).localname
# Remove unused namespace declarations
etree.cleanup_namespaces(root)
corpus_example = etree.tostring(root, encoding='unicode')
print(f"Valency pattern {valency_pattern_id}")
analyze_output.append([valency_pattern_id, measure, semantic_roles, pattern_representation, corpus_example])
write_general_statistics(os.path.join(args.output, headword + '_gf_stats.tsv'), gf_output)
write_general_statistics(os.path.join(args.output, headword + '_ssj_stats.tsv'), ssj_output)
write_statistics(os.path.join(args.output, headword + '_patterns.tsv'), analyze_output)
if __name__ == '__main__':
arg_parser = argparse.ArgumentParser(description='Export and validate collocation data from DDD database.')
arg_parser.add_argument('--input', type=str, help='Input directory')
arg_parser.add_argument('--output', type=str, help='Output directory')
args = arg_parser.parse_args()
main(args)

1
scripts/valency Symbolic link
View File

@@ -0,0 +1 @@
../src/pkg/valency/valency

8
scripts/xsd_checker.py Normal file
View File

@@ -0,0 +1,8 @@
from lxml import etree as lxml
with open('../data/inventory.xsd') as f:
xmlschema_doc = lxml.parse(f)
xmlschema = lxml.XMLSchema(xmlschema_doc)
with open('../data/xmls/output.xml') as op:
doc = lxml.parse(op)
print(xmlschema.validate(doc))

0
src/__init__.py Normal file
View File

View File

@@ -37,7 +37,8 @@ app = Flask(__name__)
app.config.from_object("db_config") app.config.from_object("db_config")
mongo = PyMongo(app) mongo = PyMongo(app)
app.config["CORPORA"] = ["ssj", "kres"] # app.config["CORPORA"] = ["ssj", "kres", "gigafida"]
app.config["CORPORA"] = ["gigafida"]
app.config["BANNED_HEADWORDS"] = ["biti"] app.config["BANNED_HEADWORDS"] = ["biti"]
app.config["QUERY_LIMIT"] = 1000 app.config["QUERY_LIMIT"] = 1000
@@ -247,20 +248,23 @@ def api_get_frames():
if corpus not in app.config["CORPORA"]: if corpus not in app.config["CORPORA"]:
return json.dumps({"error": "cor={kres,ssj}"}) return json.dumps({"error": "cor={kres,ssj}"})
log.info("Test1")
cur = mongo.db[corpus].find({"headwords": hw}) cur = mongo.db[corpus].find({"headwords": hw})
log.info("Test2")
frames = [] frames = []
for ent in cur[:app.config["QUERY_LIMIT"]]: for ent in cur[:app.config["QUERY_LIMIT"]]:
frames += frames_from_db_entry(ent) # pre-process this step for prod TODO frames += frames_from_db_entry(ent) # pre-process this step for prod TODO
cur.close() cur.close()
log.info("Test3")
# filter by relevant hw # filter by relevant hw
frames = [x for x in frames if x.hw == hw] frames = [x for x in frames if x.hw == hw]
ret_frames = RF(frames, mongo.db.sensemap) ret_frames = RF(frames, mongo.db.sensemap)
log.info("Test3")
json_ret = {"frames": []} json_ret = {"frames": []}
for frame in ret_frames: for frame in ret_frames:
json_ret["frames"].append(frame.to_json()) json_ret["frames"].append(frame.to_json())
log.info("Test4")
return json.dumps(json_ret) return json.dumps(json_ret)
# return prepare_frames(ret_frames) # return prepare_frames(ret_frames)
@@ -444,7 +448,7 @@ def _is_banned(hw):
banned = False banned = False
return banned return banned
def prepare_app_index(appindex_json, sskj_wordlist): def prepare_app_index(appindex_json):
log.info("[*] preparing app_index") log.info("[*] preparing app_index")
# create app_index (used in frontend, left side word index) # create app_index (used in frontend, left side word index)
tmp_app_index = {c: {} for c in app.config["CORPORA"]} tmp_app_index = {c: {} for c in app.config["CORPORA"]}
@@ -452,6 +456,14 @@ def prepare_app_index(appindex_json, sskj_wordlist):
res_hws = {} res_hws = {}
res_fns = {} res_fns = {}
# print('CORPUS...!!...')
# print(corpus)
# a = mongo.db[corpus]
# print('TEST_OK')
# print(a)
# print(mongo.db)
# a = mongo.db.list_collection_names()
# print('TEST_OK2')
nentries = mongo.db[corpus].count() nentries = mongo.db[corpus].count()
idx = 0 idx = 0
for e in mongo.db[corpus].find({}): for e in mongo.db[corpus].find({}):
@@ -484,6 +496,7 @@ def prepare_app_index(appindex_json, sskj_wordlist):
for letter, words in alphabetical.items(): for letter, words in alphabetical.items():
filtered_words = [x for x in words if not _is_banned(x[0])] filtered_words = [x for x in words if not _is_banned(x[0])]
# filtered_words = [x for x in words]
alphabetical[letter] = sorted(filtered_words, key=lambda x: x[0]) alphabetical[letter] = sorted(filtered_words, key=lambda x: x[0])
tmp_app_index[corpus]["words"] = alphabetical tmp_app_index[corpus]["words"] = alphabetical
@@ -560,12 +573,16 @@ if __name__ == "__main__":
if args.prepare_db: if args.prepare_db:
with Path(args.sskj_wordlist).open("r") as fp: with Path(args.sskj_wordlist).open("r") as fp:
sskj_wordlist = json.load(fp) sskj_wordlist = json.load(fp)
prepare_app_index(args.appindex_json, sskj_wordlist) prepare_app_index(args.appindex_json)
sys.exit() sys.exit()
# app index from db # app index from db
with Path(args.appindex_json).open("r") as fp: with Path(args.appindex_json).open("r") as fp:
app.config["app_index"] = json.load(fp) app.config["app_index"] = json.load(fp)
# a = app.config["app_index"]
# b = app.config["app_index"]["kres"]
# c = app.config["app_index"]["kres"]["words"]
# print('HERE')
# log.info("[*] Starting app.py with config:\n%s".format(config)) # log.info("[*] Starting app.py with config:\n%s".format(config))
log.info("[*] Starting app.py with config:\n{}".format(config)) log.info("[*] Starting app.py with config:\n{}".format(config))

View File

@@ -0,0 +1,106 @@
import argparse
import json
from flask import Flask
from flask_pymongo import PyMongo
from pathlib import Path
app = Flask(__name__)
app.config.from_object("db_config")
mongo = PyMongo(app)
app.config["BANNED_HEADWORDS"] = ["biti"]
def _is_banned(hw):
banned = True
if hw in app.config["BANNED_HEADWORDS"]:
banned = True
elif hw in sskj_wordlist["wordlist"]:
banned = False
elif (hw + " se") in sskj_wordlist["wordlist"]:
banned = False
return banned
def prepare_app_index(appindex_json, corporas, previous_json=None):
if previous_json:
with Path(previous_json).open("r") as fp:
tmp_app_index = json.load(fp)
else:
tmp_app_index = {}
# create app_index (used in frontend, left side word index)
for c in corporas:
tmp_app_index[c] = {}
for corpus in corporas:
res_hws = {}
res_fns = {}
# print('CORPUS...!!...')
# print(corpus)
# a = mongo.db[corpus]
# print('TEST_OK')
# print(a)
# print(mongo.db)
# a = mongo.db.list_collection_names()
# print('TEST_OK2')
nentries = mongo.db[corpus].count()
idx = 0
for e in mongo.db[corpus].find({}):
if "headwords" not in e:
continue
for hw in e["headwords"]:
if hw in res_hws:
res_hws[hw] += 1
else:
res_hws[hw] = 1
if "functors" not in e:
continue
for fn in e["functors"]:
if fn in res_fns:
res_fns[fn] += 1
else:
res_fns[fn] = 1
idx += 1
if idx % 10000 == 0:
print("indexing {}: {}/{}".format(
corpus, idx, nentries))
alphabetical = {}
for k, e in res_hws.items():
fst = k[0].lower()
if fst in alphabetical:
alphabetical[fst].append((k, e))
else:
alphabetical[fst] = [(k, e)]
for letter, words in alphabetical.items():
filtered_words = [x for x in words if not _is_banned(x[0])]
# filtered_words = [x for x in words]
alphabetical[letter] = sorted(filtered_words, key=lambda x: x[0])
tmp_app_index[corpus]["words"] = alphabetical
functors = [(k, e) for (k, e) in res_fns.items()]
functors = sorted(functors, key=lambda x: x[0])
tmp_app_index[corpus]["functors"] = functors
with Path(appindex_json).open("w") as fp:
json.dump(tmp_app_index, fp)
if __name__ == "__main__":
print("Starting app.py main()")
aparser = argparse.ArgumentParser(description="Arguments for app.py")
aparser.add_argument("--previous-json", type=str, default=None)
aparser.add_argument("--appindex-json", type=str)
aparser.add_argument("--sskj-wordlist", type=str)
args = aparser.parse_args()
corporas = ['gigafida']
with Path(args.sskj_wordlist).open("r") as fp:
sskj_wordlist = json.load(fp)
prepare_app_index(args.appindex_json, corporas, args.previous_json)

View File

@@ -1,2 +1,2 @@
MONGO_URI = "mongodb://valuser:valuserpass@my_mongo:27017/valdb" MONGO_URI = "mongodb://user:user@0.0.0.0:27017/valdb"
MONGO_AUTH_SOURCE = 'admin' MONGO_AUTH_SOURCE = 'admin'

View File

@@ -0,0 +1,18 @@
import json
import os
input_dir = "/media/luka/Portable Disk/Datasets/gigafida_jos/final_json"
output_file = "../../all_sentences.json"
results = {}
filenames = os.listdir(input_dir)
len(filenames)
for i, filename in enumerate(filenames):
if filename.endswith(".json"):
with open(os.path.join(input_dir, filename)) as json_file:
data = json.load(json_file)
results[filename.split('-')[0]] = list(data.keys())
print('Progress: %.2f %%' % (100 * i / len(filenames)))
with open(output_file, 'w') as f:
json.dump(results, f)

View File

@@ -1,3 +1,3 @@
{ {
"api_addr": "http://192.168.1.117:8084" "api_addr": "http://0.0.0.0:8084"
} }

View File

@@ -1,3 +1,3 @@
{ {
"api_addr": "http://192.168.1.117:8084" "api_addr": "http://193.2.76.103:8084"
} }

View File

@@ -120,7 +120,7 @@
"ansi-styles": { "ansi-styles": {
"version": "3.2.1", "version": "3.2.1",
"resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-3.2.1.tgz", "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-3.2.1.tgz",
"integrity": "sha512-VT0ZI6kZRdTh8YyJw3SMbYm/u+NqfsAxEpWO0Pf9sq8/e94WxxOpPKx9FR1FlyCtOVDNOQ+8ntlqFxiRc+r5qA==", "integrity": "sha1-QfuyAkPlCxK+DwS43tvwdSDOhB0=",
"requires": { "requires": {
"color-convert": "^1.9.0" "color-convert": "^1.9.0"
} }
@@ -138,7 +138,7 @@
"aproba": { "aproba": {
"version": "1.2.0", "version": "1.2.0",
"resolved": "https://registry.npmjs.org/aproba/-/aproba-1.2.0.tgz", "resolved": "https://registry.npmjs.org/aproba/-/aproba-1.2.0.tgz",
"integrity": "sha512-Y9J6ZjXtoYh8RnXVCMOU/ttDmk1aBjunq9vO0ta5x85WDQiQfUF9sIPBITdbiiIVcBo03Hi3jMxigBtsddlXRw==", "integrity": "sha1-aALmJk79GMeQobDVF/DyYnvyyUo=",
"dev": true "dev": true
}, },
"argparse": { "argparse": {
@@ -272,7 +272,7 @@
"async-limiter": { "async-limiter": {
"version": "1.0.0", "version": "1.0.0",
"resolved": "https://registry.npmjs.org/async-limiter/-/async-limiter-1.0.0.tgz", "resolved": "https://registry.npmjs.org/async-limiter/-/async-limiter-1.0.0.tgz",
"integrity": "sha512-jp/uFnooOiO+L211eZOoSyzpOITMXx1rBITauYykG3BRYPu8h0UcxsPNB04RR5vo4Tyz3+ay17tR6JVf9qzYWg==", "integrity": "sha1-ePrtjD0HSrgfIrTphdeehzj3IPg=",
"dev": true "dev": true
}, },
"atob": { "atob": {
@@ -1469,7 +1469,7 @@
"buffer-indexof": { "buffer-indexof": {
"version": "1.1.1", "version": "1.1.1",
"resolved": "https://registry.npmjs.org/buffer-indexof/-/buffer-indexof-1.1.1.tgz", "resolved": "https://registry.npmjs.org/buffer-indexof/-/buffer-indexof-1.1.1.tgz",
"integrity": "sha512-4/rOEg86jivtPTeOUUT61jJO1Ya1TrR/OkqCSZDyq84WJh3LuuiphBYJN+fm5xufIk4XAFcEwte/8WzC8If/1g==", "integrity": "sha1-Uvq8xqYG0aADAoAmSO9o9jnaJow=",
"dev": true "dev": true
}, },
"buffer-xor": { "buffer-xor": {
@@ -1493,7 +1493,7 @@
"cacache": { "cacache": {
"version": "10.0.4", "version": "10.0.4",
"resolved": "https://registry.npmjs.org/cacache/-/cacache-10.0.4.tgz", "resolved": "https://registry.npmjs.org/cacache/-/cacache-10.0.4.tgz",
"integrity": "sha512-Dph0MzuH+rTQzGPNT9fAnrPmMmjKfST6trxJeK7NQuHRaVw24VzPRWTmg9MpcwOVQZO0E1FBICUlFeNaKPIfHA==", "integrity": "sha1-ZFI2eZnv+dQYiu/ZoU6dfGomNGA=",
"dev": true, "dev": true,
"requires": { "requires": {
"bluebird": "^3.5.1", "bluebird": "^3.5.1",
@@ -1916,7 +1916,7 @@
"concat-stream": { "concat-stream": {
"version": "1.6.2", "version": "1.6.2",
"resolved": "https://registry.npmjs.org/concat-stream/-/concat-stream-1.6.2.tgz", "resolved": "https://registry.npmjs.org/concat-stream/-/concat-stream-1.6.2.tgz",
"integrity": "sha512-27HBghJxjiZtIk3Ycvn/4kbJk/1uZuJFfuPEns6LaEvpvG1f0hTea8lilrouyo9mVc2GWdcEZ8OLoGmSADlrCw==", "integrity": "sha1-kEvfGUzTEi/Gdcd/xKw9T/D9GjQ=",
"dev": true, "dev": true,
"requires": { "requires": {
"buffer-from": "^1.0.0", "buffer-from": "^1.0.0",
@@ -1969,7 +1969,7 @@
"content-type": { "content-type": {
"version": "1.0.4", "version": "1.0.4",
"resolved": "https://registry.npmjs.org/content-type/-/content-type-1.0.4.tgz", "resolved": "https://registry.npmjs.org/content-type/-/content-type-1.0.4.tgz",
"integrity": "sha512-hIP3EEPs8tB9AT1L+NUqtwOAps4mk2Zob89MWXMHjHWg9milF/j4osnnQLXBCBFBk/tvIG/tUc9mOUJiPBhPXA==", "integrity": "sha1-4TjMdeBAxyexlm/l5fjJruJW/js=",
"dev": true "dev": true
}, },
"convert-hex": { "convert-hex": {
@@ -2003,7 +2003,7 @@
"copy-concurrently": { "copy-concurrently": {
"version": "1.0.5", "version": "1.0.5",
"resolved": "https://registry.npmjs.org/copy-concurrently/-/copy-concurrently-1.0.5.tgz", "resolved": "https://registry.npmjs.org/copy-concurrently/-/copy-concurrently-1.0.5.tgz",
"integrity": "sha512-f2domd9fsVDFtaFcbaRZuYXwtdmnzqbADSwhSWYxYB/Q8zsdUUFMXVRwXGDMWmbEzAn1kdRrtI1T/KTFOL4X2A==", "integrity": "sha1-kilzmMrjSTf8r9bsgTnBgFHwteA=",
"dev": true, "dev": true,
"requires": { "requires": {
"aproba": "^1.1.1", "aproba": "^1.1.1",
@@ -2637,7 +2637,7 @@
"dns-packet": { "dns-packet": {
"version": "1.3.1", "version": "1.3.1",
"resolved": "https://registry.npmjs.org/dns-packet/-/dns-packet-1.3.1.tgz", "resolved": "https://registry.npmjs.org/dns-packet/-/dns-packet-1.3.1.tgz",
"integrity": "sha512-0UxfQkMhYAUaZI+xrNZOz/as5KgDU0M/fQ9b6SpkyLbk3GEswDi6PADJVaYJradtRVsRIlF1zLyOodbcTCDzUg==", "integrity": "sha1-EqpCaYEHW+UAuRDu3NC0fdfe2lo=",
"dev": true, "dev": true,
"requires": { "requires": {
"ip": "^1.1.0", "ip": "^1.1.0",
@@ -2746,7 +2746,7 @@
"ejs": { "ejs": {
"version": "2.6.1", "version": "2.6.1",
"resolved": "https://registry.npmjs.org/ejs/-/ejs-2.6.1.tgz", "resolved": "https://registry.npmjs.org/ejs/-/ejs-2.6.1.tgz",
"integrity": "sha512-0xy4A/twfrRCnkhfk8ErDi5DqdAsAqeGxht4xkCUrsvhhbQNs7E+4jV0CN7+NKIY0aHE72+XvqtBIXzD31ZbXQ==", "integrity": "sha1-SY7A1JVlWrxvI81hho2SZGQHGqA=",
"dev": true "dev": true
}, },
"electron-to-chromium": { "electron-to-chromium": {
@@ -2785,7 +2785,7 @@
"end-of-stream": { "end-of-stream": {
"version": "1.4.1", "version": "1.4.1",
"resolved": "https://registry.npmjs.org/end-of-stream/-/end-of-stream-1.4.1.tgz", "resolved": "https://registry.npmjs.org/end-of-stream/-/end-of-stream-1.4.1.tgz",
"integrity": "sha512-1MkrZNvWTKCaigbn+W15elq2BB/L22nqrSY5DKlo3X6+vclJm8Bb5djXJBmEX6fS3+zCh/F4VBK5Z2KxJt4s2Q==", "integrity": "sha1-7SljTRm6ukY7bOa4CjchPqtx7EM=",
"dev": true, "dev": true,
"requires": { "requires": {
"once": "^1.4.0" "once": "^1.4.0"
@@ -3284,7 +3284,7 @@
"filesize": { "filesize": {
"version": "3.6.1", "version": "3.6.1",
"resolved": "https://registry.npmjs.org/filesize/-/filesize-3.6.1.tgz", "resolved": "https://registry.npmjs.org/filesize/-/filesize-3.6.1.tgz",
"integrity": "sha512-7KjR1vv6qnicaPMi1iiTcI85CyYwRO/PSFCu6SvqL8jN2Wjt/NIYQTFtFs7fSDCYOstUkEWIQGFUg5YZQfjlcg==", "integrity": "sha1-CQuz7gG2+AGoqL6Z0xcQs0Irsxc=",
"dev": true "dev": true
}, },
"fill-range": { "fill-range": {
@@ -3313,7 +3313,7 @@
"finalhandler": { "finalhandler": {
"version": "1.1.1", "version": "1.1.1",
"resolved": "https://registry.npmjs.org/finalhandler/-/finalhandler-1.1.1.tgz", "resolved": "https://registry.npmjs.org/finalhandler/-/finalhandler-1.1.1.tgz",
"integrity": "sha512-Y1GUDo39ez4aHAw7MysnUD5JzYX+WaIj8I57kO3aEPT1fFRL4sr7mjei97FgnwhAyyzRYmQZaTHb2+9uZ1dPtg==", "integrity": "sha1-7r9O2EAHnIP0JJA4ydcDAIMBsQU=",
"dev": true, "dev": true,
"requires": { "requires": {
"debug": "2.6.9", "debug": "2.6.9",
@@ -4389,7 +4389,7 @@
"http-proxy": { "http-proxy": {
"version": "1.17.0", "version": "1.17.0",
"resolved": "https://registry.npmjs.org/http-proxy/-/http-proxy-1.17.0.tgz", "resolved": "https://registry.npmjs.org/http-proxy/-/http-proxy-1.17.0.tgz",
"integrity": "sha512-Taqn+3nNvYRfJ3bGvKfBSRwy1v6eePlm3oc/aWVxZp57DQr5Eq3xhKJi7Z4hZpS8PC3H4qI+Yly5EmFacGuA/g==", "integrity": "sha1-etOElGWPhGBeL220Q230EPTlvpo=",
"dev": true, "dev": true,
"requires": { "requires": {
"eventemitter3": "^3.0.0", "eventemitter3": "^3.0.0",
@@ -4500,7 +4500,7 @@
"import-local": { "import-local": {
"version": "1.0.0", "version": "1.0.0",
"resolved": "https://registry.npmjs.org/import-local/-/import-local-1.0.0.tgz", "resolved": "https://registry.npmjs.org/import-local/-/import-local-1.0.0.tgz",
"integrity": "sha512-vAaZHieK9qjGo58agRBg+bhHX3hoTZU/Oa3GESWLz7t1U62fk63aHuDJJEteXoDeTCcPmUT+z38gkHPZkkmpmQ==", "integrity": "sha1-Xk/9wD9P5sAJxnKb6yljHC+CJ7w=",
"dev": true, "dev": true,
"requires": { "requires": {
"pkg-dir": "^2.0.0", "pkg-dir": "^2.0.0",
@@ -4744,7 +4744,7 @@
"is-path-in-cwd": { "is-path-in-cwd": {
"version": "1.0.1", "version": "1.0.1",
"resolved": "https://registry.npmjs.org/is-path-in-cwd/-/is-path-in-cwd-1.0.1.tgz", "resolved": "https://registry.npmjs.org/is-path-in-cwd/-/is-path-in-cwd-1.0.1.tgz",
"integrity": "sha512-FjV1RTW48E7CWM7eE/J2NJvAEEVektecDBVBE5Hh3nM1Jd0kvhHtX68Pr3xsDf857xt3Y4AkwVULK1Vku62aaQ==", "integrity": "sha1-WsSLNF72dTOb1sekipEhELJBz1I=",
"dev": true, "dev": true,
"requires": { "requires": {
"is-path-inside": "^1.0.0" "is-path-inside": "^1.0.0"
@@ -5070,7 +5070,7 @@
"make-dir": { "make-dir": {
"version": "1.3.0", "version": "1.3.0",
"resolved": "https://registry.npmjs.org/make-dir/-/make-dir-1.3.0.tgz", "resolved": "https://registry.npmjs.org/make-dir/-/make-dir-1.3.0.tgz",
"integrity": "sha512-2w31R7SJtieJJnQtGc7RVL2StM2vGYVfqUOvUDxH6bC6aJTxPxTF0GnIgCyu7tjockiUWAYQRbxa7vKn34s5sQ==", "integrity": "sha1-ecEDO4BRW9bSTsmTPoYMp17ifww=",
"dev": true, "dev": true,
"requires": { "requires": {
"pify": "^3.0.0" "pify": "^3.0.0"
@@ -5352,7 +5352,7 @@
"mississippi": { "mississippi": {
"version": "2.0.0", "version": "2.0.0",
"resolved": "https://registry.npmjs.org/mississippi/-/mississippi-2.0.0.tgz", "resolved": "https://registry.npmjs.org/mississippi/-/mississippi-2.0.0.tgz",
"integrity": "sha512-zHo8v+otD1J10j/tC+VNoGK9keCuByhKovAvdn74dmxJl9+mWHnx6EMsDN4lgRoMI/eYo2nchAxniIbUPb5onw==", "integrity": "sha1-NEKlCPr8KFAEhv7qmUCWduTuWm8=",
"dev": true, "dev": true,
"requires": { "requires": {
"concat-stream": "^1.5.0", "concat-stream": "^1.5.0",
@@ -5419,7 +5419,7 @@
"multicast-dns": { "multicast-dns": {
"version": "6.2.3", "version": "6.2.3",
"resolved": "https://registry.npmjs.org/multicast-dns/-/multicast-dns-6.2.3.tgz", "resolved": "https://registry.npmjs.org/multicast-dns/-/multicast-dns-6.2.3.tgz",
"integrity": "sha512-ji6J5enbMyGRHIAkAOu3WdV8nggqviKCEKtXcOqfphZZtQrmHKycfynJ2V7eVPUA4NhJ6V7Wf4TmGbTwKE9B6g==", "integrity": "sha1-oOx72QVcQoL3kMPIL04o2zsxsik=",
"dev": true, "dev": true,
"requires": { "requires": {
"dns-packet": "^1.3.1", "dns-packet": "^1.3.1",
@@ -5502,7 +5502,7 @@
"node-forge": { "node-forge": {
"version": "0.7.5", "version": "0.7.5",
"resolved": "https://registry.npmjs.org/node-forge/-/node-forge-0.7.5.tgz", "resolved": "https://registry.npmjs.org/node-forge/-/node-forge-0.7.5.tgz",
"integrity": "sha512-MmbQJ2MTESTjt3Gi/3yG1wGpIMhUfcIypUCGtTizFR9IiccFwxSpfp0vtIZlkFclEqERemxfnSdZEMR9VqqEFQ==", "integrity": "sha1-bBUsNFzhHFL0ZcKr2VfoY5zWdN8=",
"dev": true "dev": true
}, },
"node-libs-browser": { "node-libs-browser": {
@@ -5698,7 +5698,7 @@
"obuf": { "obuf": {
"version": "1.1.2", "version": "1.1.2",
"resolved": "https://registry.npmjs.org/obuf/-/obuf-1.1.2.tgz", "resolved": "https://registry.npmjs.org/obuf/-/obuf-1.1.2.tgz",
"integrity": "sha512-PX1wu0AmAdPqOL1mWhqmlOd8kOIZQwGZw6rh7uby9fTc5lhaOWFLX3I6R1hrF9k3zUY40e6igsLGkDXK92LJNg==", "integrity": "sha1-Cb6jND1BhZ69RGKS0RydTbYZCE4=",
"dev": true "dev": true
}, },
"on-finished": { "on-finished": {
@@ -5836,7 +5836,7 @@
"p-map": { "p-map": {
"version": "1.2.0", "version": "1.2.0",
"resolved": "https://registry.npmjs.org/p-map/-/p-map-1.2.0.tgz", "resolved": "https://registry.npmjs.org/p-map/-/p-map-1.2.0.tgz",
"integrity": "sha512-r6zKACMNhjPJMTl8KcFH4li//gkrXWfbD6feV8l6doRHlzljFWGJ2AP6iKaCJXyZmAUMOPtvbW7EXkbWO/pLEA==", "integrity": "sha1-5OlPMR6rvIYzoeeZCBZfyiYkG2s=",
"dev": true "dev": true
}, },
"p-try": { "p-try": {
@@ -5957,7 +5957,7 @@
"path-type": { "path-type": {
"version": "3.0.0", "version": "3.0.0",
"resolved": "https://registry.npmjs.org/path-type/-/path-type-3.0.0.tgz", "resolved": "https://registry.npmjs.org/path-type/-/path-type-3.0.0.tgz",
"integrity": "sha512-T2ZUsdZFHgA3u4e5PfPbjd7HDDpxPnQb5jN0SrDsjNSuVXHJqtwTnWqG0B1jZrgmJ/7lj1EmVIByWt1gxGkWvg==", "integrity": "sha1-zvMdyOCho7sNEFwM2Xzzv0f0428=",
"dev": true, "dev": true,
"requires": { "requires": {
"pify": "^3.0.0" "pify": "^3.0.0"
@@ -8309,7 +8309,7 @@
"pump": { "pump": {
"version": "2.0.1", "version": "2.0.1",
"resolved": "https://registry.npmjs.org/pump/-/pump-2.0.1.tgz", "resolved": "https://registry.npmjs.org/pump/-/pump-2.0.1.tgz",
"integrity": "sha512-ruPMNRkN3MHP1cWJc9OWr+T/xDP0jhXYCLfJcBuX54hhfIBnaQmAUMfDcG4DM5UMWByBbJY69QSphm3jtDKIkA==", "integrity": "sha1-Ejma3W5M91Jtlzy8i1zi4pCLOQk=",
"dev": true, "dev": true,
"requires": { "requires": {
"end-of-stream": "^1.1.0", "end-of-stream": "^1.1.0",
@@ -8319,7 +8319,7 @@
"pumpify": { "pumpify": {
"version": "1.5.1", "version": "1.5.1",
"resolved": "https://registry.npmjs.org/pumpify/-/pumpify-1.5.1.tgz", "resolved": "https://registry.npmjs.org/pumpify/-/pumpify-1.5.1.tgz",
"integrity": "sha512-oClZI37HvuUJJxSKKrC17bZ9Cu0ZYhEAGPsPUy9KlMUmv9dKX2o77RUmq7f3XjIxbwyGwYzbzQ1L2Ks8sIradQ==", "integrity": "sha1-NlE74karJ1cLGjdKXOJ4v9dDcM4=",
"dev": true, "dev": true,
"requires": { "requires": {
"duplexify": "^3.6.0", "duplexify": "^3.6.0",
@@ -8813,7 +8813,7 @@
"sax": { "sax": {
"version": "1.2.4", "version": "1.2.4",
"resolved": "https://registry.npmjs.org/sax/-/sax-1.2.4.tgz", "resolved": "https://registry.npmjs.org/sax/-/sax-1.2.4.tgz",
"integrity": "sha512-NqVDv9TpANUjFm0N8uM5GxL36UgKi9/atZw+x7YFnQ8ckwFGKrl4xX4yWtrey3UJm5nP1kUbnYgLopqWNSRhWw==", "integrity": "sha1-KBYjTiN4vdxOU1T6tcqold9xANk=",
"dev": true "dev": true
}, },
"schema-utils": { "schema-utils": {
@@ -8863,7 +8863,7 @@
"send": { "send": {
"version": "0.16.2", "version": "0.16.2",
"resolved": "https://registry.npmjs.org/send/-/send-0.16.2.tgz", "resolved": "https://registry.npmjs.org/send/-/send-0.16.2.tgz",
"integrity": "sha512-E64YFPUssFHEFBvpbbjr44NCLtI1AohxQ8ZSiJjQLskAdKuriYEP6VyGEsRDH8ScozGpkaX1BGvhanqCwkcEZw==", "integrity": "sha1-bsyh4PjBVtFBWXVZhI32RzCmu8E=",
"dev": true, "dev": true,
"requires": { "requires": {
"debug": "2.6.9", "debug": "2.6.9",
@@ -8884,7 +8884,7 @@
"mime": { "mime": {
"version": "1.4.1", "version": "1.4.1",
"resolved": "https://registry.npmjs.org/mime/-/mime-1.4.1.tgz", "resolved": "https://registry.npmjs.org/mime/-/mime-1.4.1.tgz",
"integrity": "sha512-KI1+qOZu5DcW6wayYHSzR/tXKCDC5Om4s1z2QJjDULzLcmf3DvzS7oluY4HCTrc+9FiKmWUgeNLg7W3uIQvxtQ==", "integrity": "sha1-Eh+evEnjdm8xGnbh+hyAA8SwOqY=",
"dev": true "dev": true
} }
} }
@@ -8913,7 +8913,7 @@
"serve-static": { "serve-static": {
"version": "1.13.2", "version": "1.13.2",
"resolved": "https://registry.npmjs.org/serve-static/-/serve-static-1.13.2.tgz", "resolved": "https://registry.npmjs.org/serve-static/-/serve-static-1.13.2.tgz",
"integrity": "sha512-p/tdJrO4U387R9oMjb1oj7qSMaMfmOyd4j9hOFoxZe2baQszgHcSWjuya/CiT5kgZZKRudHNOA0pYXOl8rQ5nw==", "integrity": "sha1-CV6Ecv1bRiN9tQzkhqQ/S4bGzsE=",
"dev": true, "dev": true,
"requires": { "requires": {
"encodeurl": "~1.0.2", "encodeurl": "~1.0.2",
@@ -9011,7 +9011,7 @@
"shellwords": { "shellwords": {
"version": "0.1.1", "version": "0.1.1",
"resolved": "https://registry.npmjs.org/shellwords/-/shellwords-0.1.1.tgz", "resolved": "https://registry.npmjs.org/shellwords/-/shellwords-0.1.1.tgz",
"integrity": "sha512-vFwSUfQvqybiICwZY5+DAWIPLKsWO31Q91JSKl3UYv+K5c2QRPzn0qzec6QPu1Qc9eHYItiP3NdJqNVqetYAww==", "integrity": "sha1-1rkYHBpI05cyTISHHvvPxz/AZUs=",
"dev": true "dev": true
}, },
"signal-exit": { "signal-exit": {
@@ -9137,7 +9137,7 @@
"sockjs": { "sockjs": {
"version": "0.3.19", "version": "0.3.19",
"resolved": "https://registry.npmjs.org/sockjs/-/sockjs-0.3.19.tgz", "resolved": "https://registry.npmjs.org/sockjs/-/sockjs-0.3.19.tgz",
"integrity": "sha512-V48klKZl8T6MzatbLlzzRNhMepEys9Y4oGFpypBFFn1gLI/QQ9HtLLyWJNbPlwGLelOVOEijUbTTJeLLI59jLw==", "integrity": "sha1-2Xa76ACve9IK4IWY1YI5NQiZPA0=",
"dev": true, "dev": true,
"requires": { "requires": {
"faye-websocket": "^0.10.0", "faye-websocket": "^0.10.0",
@@ -9245,7 +9245,7 @@
"spdx-expression-parse": { "spdx-expression-parse": {
"version": "3.0.0", "version": "3.0.0",
"resolved": "https://registry.npmjs.org/spdx-expression-parse/-/spdx-expression-parse-3.0.0.tgz", "resolved": "https://registry.npmjs.org/spdx-expression-parse/-/spdx-expression-parse-3.0.0.tgz",
"integrity": "sha512-Yg6D3XpRD4kkOmTpdgbUiEJFKghJH03fiC1OPll5h/0sO6neh2jqRDVHOQ4o/LMea0tgCkbMgea5ip/e+MkWyg==", "integrity": "sha1-meEZt6XaAOBUkcn6M4t5BII7QdA=",
"dev": true, "dev": true,
"requires": { "requires": {
"spdx-exceptions": "^2.1.0", "spdx-exceptions": "^2.1.0",
@@ -9348,7 +9348,7 @@
"ssri": { "ssri": {
"version": "5.3.0", "version": "5.3.0",
"resolved": "https://registry.npmjs.org/ssri/-/ssri-5.3.0.tgz", "resolved": "https://registry.npmjs.org/ssri/-/ssri-5.3.0.tgz",
"integrity": "sha512-XRSIPqLij52MtgoQavH/x/dU1qVKtWUAAZeOHsR9c2Ddi4XerFy3mc1alf+dLJKl9EUIm/Ht+EowFkTUOA6GAQ==", "integrity": "sha1-ujhyycbTOgcEp9cf8EXl7EiZnQY=",
"dev": true, "dev": true,
"requires": { "requires": {
"safe-buffer": "^5.1.1" "safe-buffer": "^5.1.1"
@@ -10055,7 +10055,7 @@
"vendors": { "vendors": {
"version": "1.0.2", "version": "1.0.2",
"resolved": "https://registry.npmjs.org/vendors/-/vendors-1.0.2.tgz", "resolved": "https://registry.npmjs.org/vendors/-/vendors-1.0.2.tgz",
"integrity": "sha512-w/hry/368nO21AN9QljsaIhb9ZiZtZARoVH5f3CsFbawdLdayCgKRPup7CggujvySMxx0I91NOyxdVENohprLQ==", "integrity": "sha1-f8te759WI7FWvOqJ7DfWNnbyGAE=",
"dev": true "dev": true
}, },
"vm-browserify": { "vm-browserify": {
@@ -10211,7 +10211,7 @@
"wbuf": { "wbuf": {
"version": "1.7.3", "version": "1.7.3",
"resolved": "https://registry.npmjs.org/wbuf/-/wbuf-1.7.3.tgz", "resolved": "https://registry.npmjs.org/wbuf/-/wbuf-1.7.3.tgz",
"integrity": "sha512-O84QOnr0icsbFGLS0O3bI5FswxzRr8/gHwWkDlQFskhSPryQXvrTMxjxGP4+iWYoauLoBvfDpkrOauZ+0iZpDA==", "integrity": "sha1-wdjRSTFtPqhShIiVy2oL/oh7h98=",
"dev": true, "dev": true,
"requires": { "requires": {
"minimalistic-assert": "^1.0.0" "minimalistic-assert": "^1.0.0"
@@ -10344,7 +10344,7 @@
"webpack-dev-middleware": { "webpack-dev-middleware": {
"version": "1.12.2", "version": "1.12.2",
"resolved": "https://registry.npmjs.org/webpack-dev-middleware/-/webpack-dev-middleware-1.12.2.tgz", "resolved": "https://registry.npmjs.org/webpack-dev-middleware/-/webpack-dev-middleware-1.12.2.tgz",
"integrity": "sha512-FCrqPy1yy/sN6U/SaEZcHKRXGlqU0DUaEBL45jkUYoB8foVb6wCnbIJ1HKIx+qUFTW+3JpVcCJCxZ8VATL4e+A==", "integrity": "sha1-+PwRIM47T8VoDO7LQ9d3lmshEF4=",
"dev": true, "dev": true,
"requires": { "requires": {
"memory-fs": "~0.4.1", "memory-fs": "~0.4.1",
@@ -10605,7 +10605,7 @@
"websocket-extensions": { "websocket-extensions": {
"version": "0.1.3", "version": "0.1.3",
"resolved": "https://registry.npmjs.org/websocket-extensions/-/websocket-extensions-0.1.3.tgz", "resolved": "https://registry.npmjs.org/websocket-extensions/-/websocket-extensions-0.1.3.tgz",
"integrity": "sha512-nqHUnMXmBzT0w570r2JpJxfiSD1IzoI+HGVdd3aZ0yNi3ngvQ4jv1dtHt5VGxfI2yj5yqImPhOK4vmIh2xMbGg==", "integrity": "sha1-XS/yKXcAPsaHpLhwc9+7rBRszyk=",
"dev": true "dev": true
}, },
"whet.extend": { "whet.extend": {
@@ -10644,7 +10644,7 @@
"worker-farm": { "worker-farm": {
"version": "1.6.0", "version": "1.6.0",
"resolved": "https://registry.npmjs.org/worker-farm/-/worker-farm-1.6.0.tgz", "resolved": "https://registry.npmjs.org/worker-farm/-/worker-farm-1.6.0.tgz",
"integrity": "sha512-6w+3tHbM87WnSWnENBUvA2pxJPLhQUg5LKwUQHq3r+XPhIM+Gh2R5ycbwPCyuGbNg+lPgdcnQUhuC02kJCvffQ==", "integrity": "sha1-rsxAWXb6talVJhgIRvDboojzpKA=",
"dev": true, "dev": true,
"requires": { "requires": {
"errno": "~0.1.7" "errno": "~0.1.7"
@@ -10715,7 +10715,7 @@
"y18n": { "y18n": {
"version": "4.0.0", "version": "4.0.0",
"resolved": "https://registry.npmjs.org/y18n/-/y18n-4.0.0.tgz", "resolved": "https://registry.npmjs.org/y18n/-/y18n-4.0.0.tgz",
"integrity": "sha512-r9S/ZyXu/Xu9q1tYlpsLIsa3EeLXXk0VwlxqTcFRfg9EhMW+17kbt9G0NrgCmhGb5vT2hyhJZLfDGx+7+5Uj/w==", "integrity": "sha1-le+U+F7MgdAHwmThkKEg8KPIVms=",
"dev": true "dev": true
}, },
"yallist": { "yallist": {

View File

@@ -62,7 +62,7 @@ export default {
name: "Nav", name: "Nav",
props: ["appState"], props: ["appState"],
data() {return { data() {return {
optCorpora: ["kres", "ssj"], optCorpora: ["kres", "ssj", "gigafida"],
optIndexes: [ optIndexes: [
{key: "besede", val: "words"}, {key: "besede", val: "words"},
{key: "udeleženske vloge", val: "functors"}, {key: "udeleženske vloge", val: "functors"},

0
src/pkg/__init__.py Normal file
View File

View File

@@ -3,6 +3,41 @@ from corpusparser import enriched_lemma
log = logging.getLogger(__name__) log = logging.getLogger(__name__)
def frames_from_db_entry_headword(dbent, headword):
def _full_tid(tid):
return ".".join([dbent["sid"], str(tid)])
token_dict = {str(x["tid"]): x for x in dbent["tokens"]}
frames = []
if "srl_links" not in dbent:
return []
srldict = {}
for srl in dbent["srl_links"]:
key = str(srl["from"])
if enriched_lemma(token_dict[key]) != headword:
continue
if key not in srldict:
srldict[key] = [srl]
else:
srldict[key] += [srl]
for hwtid, srlarr in srldict.items():
frames += [Frame(
hw_lemma=enriched_lemma(token_dict[hwtid]),
tids=[_full_tid(hwtid)],
slots=[
Slot(
functor=srl["afun"],
tids=[_full_tid(srl["to"])]
) for srl in srlarr
],
# sentences=[(dbent["sid"], dbent["tokens"])],
sentences=[
[(_full_tid(t["tid"]), t) for t in dbent["tokens"]],
]
)]
return frames
def frames_from_db_entry(dbent): def frames_from_db_entry(dbent):
def _full_tid(tid): def _full_tid(tid):
return ".".join([dbent["sid"], str(tid)]) return ".".join([dbent["sid"], str(tid)])