forked from kristjan/cjvt-valency

Compare commits

14 Commits
| SHA1 |
|---|
| ec083a8d63 |
| 69c3521e4b |
| 75b015dcda |
| c18aaff11f |
| 34b776be11 |
| 26bca0b083 |
| 2551a9c6a8 |
| 5cdc963c2d |
| ce1fb46b4e |
| 220529b777 |
| ae5f2869bc |
| 931b3531b3 |
| 3d91251905 |
| c803057164 |
.gitignore (11 changes, vendored)
```diff
@@ -7,6 +7,17 @@ data/appindex.json
 src/frontend_vue/node_modules/
 src/frontend_vue/dist/
 dockerfiles/database/create.js
+dockerfiles/database/create_mongo.js
+dockerfiles/database/create_postgres.js
+dockerfiles/database/mongo_db.gz
+dockerfiles/database/postgres_db.tar
+dockerfiles/database/postgres_db_OLD.tar
 *__pycache__/
 env.local
 logs/*
+.idea/
+venv*
+data/
+data
+deploy_instructions/
+run.sh
```
.gitmodules (3 changes, vendored)
```diff
@@ -1,3 +1,6 @@
 [submodule "src/pkg/cjvt-corpusparser"]
 	path = src/pkg/cjvt-corpusparser
 	url = git@gitea.cjvt.si:kristjan/cjvt-corpusparser.git
+[submodule "src/pkg/luscenje_struktur"]
+	path = src/pkg/luscenje_struktur
+	url = https://gitea.cjvt.si/ozbolt/luscenje_struktur.git
```
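On an existing checkout, the newly registered submodule presumably has to be fetched before it can be pip-installed; a minimal sketch using standard git commands:

```bash
# fetch the newly added luscenje_struktur submodule
git submodule update --init src/pkg/luscenje_struktur
```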
Makefile (23 changes)
```diff
@@ -13,10 +13,11 @@ SSJ_FILE = "$(MAKE_ROOT)/data/ssj_file_link"
 # KRES_FOLDER = "$(MAKE_ROOT)/data/samples/kres_xml"
 # KRES_FOLDER = "$(MAKE_ROOT)/data/kres_xml_folder_link"
 KRES_FOLDER = "/home/kristjan/kres_data/payload/kres_xml"
+GIGAFIDA_FOLDER = "/home/lukakrsnik/cjvt-valency/data_all/giga_orig"
 # KRES_SRL_FOLDER = "$(MAKE_ROOT)/data/samples/kres_srl_json"
 # KRES_SRL_FOLDER = "$(MAKE_ROOT)/data/kres_json_folder_link"
 KRES_SRL_FOLDER = "/home/kristjan/kres_data/payload/kres_json"
-
+GIGAFIDA_SRL_FOLDER = "/home/lukakrsnik/cjvt-valency/data_all/final_json"
 # This file comes with the source code. Make sure you unpack it and name it right.
 SSKJ_WORDLIST = "$(MAKE_ROOT)/data/wordlist.json"
 SSKJ_JSON = "$(MAKE_ROOT)/data/sskj_senses.json"
@@ -26,14 +27,14 @@ APPINDEX_PATH = "$(MAKE_ROOT)/data/appindex.json"
 
 OUTPUT = "db"
 # OUTPUT = "file"
-OUTDIR = "/tmp/three"  # if you're running this in docker, make sure to mount the volume
+OUTDIR = "/project/data"  # if you're running this in docker, make sure to mount the volume
 DBADDR = "0.0.0.0:27017"  # don't use localhost
 
 # credentials from .gitignored file
 # create it from env.default
 include env.local
 
-N_CORES = 3
+N_CORES = 4
 # insert kres files into database in chunks, for fewer connections
 KRES_CHUNK_SIZE = 30
 
@@ -56,6 +57,12 @@ database-service:
 database-users:
 	cd dockerfiles/database; $(MAKE) create_users
 
+database-restore:
+	cd dockerfiles/database; $(MAKE) restore_db
+
+database-restore-postgres:
+	cd dockerfiles/database; $(MAKE) restore_postgres_db
+
 # also useful, if we want to restart the db
 database-clean:
 	cd dockerfiles/database; $(MAKE) clean_stack
@@ -69,6 +76,7 @@ python-env-install:
 	pip3 install -e src/pkg/cjvt-corpusparser/.
 	pip3 install -e src/pkg/valency/.
 	pip3 install -e src/pkg/seqparser/.
+	pip3 install -e src/pkg/luscenje_struktur/.
 
 # from inside python-env container:
 data/samples:
@@ -93,7 +101,14 @@ fill-database-kres: data/samples
 		--chunk-size $(KRES_CHUNK_SIZE) \
 		--cores $(N_CORES)
 
-
+fill-database-gigafida: data/samples
+	python3 src/pkg/cjvt-corpusparser/corpusparser/main.py --kres-folder $(GIGAFIDA_FOLDER) \
+		--corpus="gigafida" \
+		--ssj-file $(SSJ_FILE) --kres-srl-folder $(GIGAFIDA_SRL_FOLDER) \
+		--output $(OUTPUT) --outdir $(OUTDIR) --dbaddr $(DBADDR) \
+		--dbuser $(DB_USR_USER) --dbpass $(DB_USR_PASS) \
+		--chunk-size $(KRES_CHUNK_SIZE) \
+		--cores $(N_CORES)
 
 ## Frontend
 
```
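Given the variables and the new target above, a Gigafida import run would presumably look like this (a sketch; GIGAFIDA_FOLDER and GIGAFIDA_SRL_FOLDER are hard-coded to machine-specific paths and have to be adjusted first):

```bash
# install all packages, including the new luscenje_struktur dependency
make python-env-install

# parse the Gigafida XML and SRL JSON and insert them into the database
make fill-database-gigafida
```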
README.md (44 changes)
````diff
@@ -179,3 +179,47 @@ $ mongorestore /dumps/valdb --db valdb --uri=mongodb://valuser:valuserpass@0.0.0
 ```
 
 After uploading, restart the stack with `27017` commented out.
+
+## Script running
+
+### Environment setup
+```bash
+pip install -r requirements.txt
+pip install git+https://gitea.cjvt.si/ozbolt/luscenje_struktur.git
+pip install git+https://gitea.cjvt.si/kristjan/cjvt-corpusparser.git
+```
+
+### Running on an already set-up environment
+```bash
+make database-service
+```
+
+### Setting up the environment for running on a ramdisk
+
+```bash
+# create the ramdisk
+sudo mount -t tmpfs tmpfs /mnt/tmp
+sudo mount -o remount,size=120G,noexec,nosuid,nodev,noatime /mnt/tmp
+
+# change volumes to /mnt/tmp:/data/db
+vim dockerfiles/database/valency-stack.yml
+
+# change the Makefile's run target to mkdir -p /mnt/tmp
+vim dockerfiles/database/Makefile
+
+# run the service
+make database-service
+
+# run ONLY ONCE to create users and restore the database
+make database-users
+make database-restore
+
+# double-check that it worked
+docker exec -it ef0a /bin/bash
+
+# the following steps run inside the container's shell:
+    # check that it worked:
+    mongo --username <REGULAR USER> --password --authenticationDatabase valdb
+    db.getRoles()
+
+```
````
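A quick non-interactive sanity check after the restore might look like this (a sketch; the regular user comes from `make database-users`, and the `gigafida` collection name follows the corpora configured elsewhere in this changeset):

```bash
# list the collections in valdb; a 'gigafida' collection should be present
mongo valdb --username <REGULAR USER> --password \
    --authenticationDatabase valdb --eval 'db.getCollectionNames()'
```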
```diff
@@ -1 +0,0 @@
-/home/kristjan/workdir/final_json/
```

```diff
@@ -1 +0,0 @@
-/home/kristjan/kres_mount/kres_parsed/tei/
```

Binary file not shown.

```diff
@@ -1 +0,0 @@
-/home/kristjan/git/diploma/data/ssj500k-sl.TEI/ssj500k-sl.body.xml
```

Binary file not shown.

File diff suppressed because one or more lines are too long.
```diff
@@ -1,5 +1,5 @@
-FROM mongo:latest
+FROM mongo:4.2.9
 
 WORKDIR /
-COPY init_inside_container.sh /.
-COPY create.js /.
+COPY init_inside_mongo_container.sh /.
+COPY create_mongo.js /.
```
```diff
@@ -2,33 +2,62 @@
 # collection names: lower case, plural
 # user names?
 
-# mongo admin -u root -p password --eval "db.getSiblingDB('vlDB').addUser('vluser', 'password')"
-
-STACKNAME = dbstack
-
-.PHONY: start_db FORCE
-
 all: build_run create_users
 
-build_run: build_mongo run_stack
+build_run: build_mongo run_docker_compose
 
-create.js: FORCE
+postgres_create_roles:
+	echo 'psql -v ON_ERROR_STOP=OFF --username $(DB_ADM_USER) <<-EOSQL' > create_postgres.js
+	echo "create user $(DB_USR_USER) with encrypted password '$(DB_USR_PASS)';" >> create_postgres.js
+	echo "create database superdb_small;" >> create_postgres.js
+	echo "grant all privileges on database superdb_small to $(DB_USR_USER);" >> create_postgres.js
+	echo "grant usage on schema public to $(DB_USR_USER);" >> create_postgres.js
+	echo "grant select on all tables in schema public to $(DB_USR_USER);" >> create_postgres.js
+	echo "EOSQL" >> create_postgres.js
+	chmod +x create_postgres.js
 
-FORCE:
-	echo 'db.auth("$(DB_ADM_USER)", "$(DB_ADM_PASS)")' > create.js
-	echo 'use valdb' >> create.js
-	echo 'db.createUser({user: "$(DB_USR_USER)", pwd: "$(DB_USR_PASS)", roles: ["readWrite"]})' >> create.js
+mongo_create_roles:
+	echo 'db.auth("$(DB_ADM_USER)", "$(DB_ADM_PASS)")' > create_mongo.js
+	echo 'use valdb' >> create_mongo.js
+	echo 'db.createUser({user: "$(DB_USR_USER)", pwd: "$(DB_USR_PASS)", roles: ["readWrite"]})' >> create_mongo.js
+	echo 'db.grantRolesToUser("$(DB_USR_USER)", [{ role: "readWrite", db: "extvaldb"}])' >> create_mongo.js
 
-build_mongo: create.js
+build_mongo: mongo_create_roles
 	docker build . -t my-mongo --no-cache
 
-clean_stack:
-	docker stack rm $(STACKNAME)
+# build_postgres: postgres_create_roles
+# 	docker build . -t my-mongo --no-cache
 
-run_stack:
-	mkdir -p ${HOME}/mongo_container/data/
-	docker stack deploy --compose-file mongodb-stack.yml $(STACKNAME)
+run_docker_compose:
+	mkdir -p ${HOME}/valency_data/mongo_container/data/
+	#docker kill $(shell ./get_mongo_container_name.sh)
+	#docker kill $(shell ./get_postgres_container_name.sh)
+	#docker-compose stop
+	docker-compose -f valency-stack.yml up -d --force-recreate
+	# docker stack deploy --compose-file mongodb-stack.yml $(STACKNAME)
 
-create_users: create.js
-	docker exec $(shell ./get_container_name.sh) /init_inside_container.sh
+create_users: create_mongo_users create_postgres_users
+
+
+create_mongo_users: mongo_create_roles
+	docker exec $(shell ./get_mongo_container_name.sh) /init_inside_mongo_container.sh
 	# rm create.js
+
+create_postgres_users: postgres_create_roles
+	docker exec $(shell ./get_postgres_container_name.sh) /scripts/init_inside_postgres_container.sh
+
+restore_db: restore_mongo_db restore_postgres_db
+
+restore_mongo_db:
+ifeq (,$(wildcard ./mongo_db.gz))
+	$(error "mongo_db.gz does not exist. Make sure to have a dump of the mongo db in 'dockerfiles/database/mongo_db.gz'")
+else
+	docker exec $(shell ./get_mongo_container_name.sh) sh -c 'mongorestore --gzip --archive=/scripts/mongo_db.gz --db valdb --username $(DB_USR_USER) --password $(DB_USR_PASS) --authenticationDatabase valdb'
+endif
+
+restore_postgres_db:
+ifeq (,$(wildcard ./postgres_db.tar))
+	$(error "postgres_db.tar does not exist. Make sure to have a dump of the postgres db in 'dockerfiles/database/postgres_db.tar'")
+else
+	docker exec $(shell ./get_postgres_container_name.sh) sh -c 'pg_restore -U $(DB_ADM_USER) --dbname=superdb_small --create --verbose /scripts/postgres_db.tar'
+endif
```
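Taken together, the new targets suggest a bootstrap sequence roughly like the following (a sketch; it assumes `env.local` defines the DB_* variables and that the dump files are already in place):

```bash
cd dockerfiles/database
make build_run     # build the my-mongo image and start docker-compose
make create_users  # generate create_mongo.js / create_postgres.js and run them
make restore_db    # restore mongo_db.gz and postgres_db.tar into the containers
```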
dockerfiles/database/get_postgres_container_name.sh (2 changes, new executable file)
```diff
@@ -0,0 +1,2 @@
+#!/bin/bash
+docker ps | grep postgres | awk '{print $1}'
```
```diff
@@ -1,3 +0,0 @@
-#!/bin/bash
-
-mongo admin < /create.js
```
dockerfiles/database/init_inside_mongo_container.sh (3 changes, new executable file)
```diff
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+mongo admin < /create_mongo.js
```
dockerfiles/database/init_inside_postgres_container.sh (3 changes, new executable file)
```diff
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+/scripts/create_postgres.js
```
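Despite the `.js` extension, `create_postgres.js` is the executable shell script generated by the `postgres_create_roles` target above; with hypothetical credentials substituted for the Makefile variables, the generated file would look roughly like:

```bash
# sketch of the generated create_postgres.js; 'admin'/'valuser'/'secret' are placeholders
psql -v ON_ERROR_STOP=OFF --username admin <<-EOSQL
create user valuser with encrypted password 'secret';
create database superdb_small;
grant all privileges on database superdb_small to valuser;
grant usage on schema public to valuser;
grant select on all tables in schema public to valuser;
EOSQL
```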
| @ -1,26 +0,0 @@ | |||||||
| version: '3.1' |  | ||||||
| 
 |  | ||||||
| services: |  | ||||||
| 
 |  | ||||||
|   my_mongo: |  | ||||||
|     image: my-mongo |  | ||||||
|     restart: always |  | ||||||
|     ports: |  | ||||||
|       - 27017:27017 |  | ||||||
|     environment: |  | ||||||
|       MONGO_INITDB_ROOT_USERNAME: ${DB_ADM_USER} |  | ||||||
|       MONGO_INITDB_ROOT_PASSWORD: ${DB_ADM_PASS} |  | ||||||
|     volumes: |  | ||||||
|       - ${HOME}/mongo_container/data/:/data/db |  | ||||||
| 
 |  | ||||||
|   mongo_express: |  | ||||||
|     image: mongo-express |  | ||||||
|     restart: always |  | ||||||
|     ports: |  | ||||||
|       - 8087:8081 |  | ||||||
|     environment: |  | ||||||
|       ME_CONFIG_BASICAUTH_USERNAME: ${MONGOEXPRESS_USER} |  | ||||||
|       ME_CONFIG_BASICAUTH_PASSWORD: ${MONGOEXPRESS_PASS} |  | ||||||
|       ME_CONFIG_MONGODB_ADMINUSERNAME: ${DB_ADM_USER} |  | ||||||
|       ME_CONFIG_MONGODB_ADMINPASSWORD: ${DB_ADM_PASS} |  | ||||||
|       ME_CONFIG_MONGODB_SERVER: my_mongo |  | ||||||
dockerfiles/database/valency-stack.yml (27 changes, new file)
```diff
@@ -0,0 +1,27 @@
+version: '3.1'
+
+services:
+
+  my_mongo:
+    image: my-mongo
+    restart: always
+    ports:
+      - 127.0.0.1:27017:27017
+    environment:
+      MONGO_INITDB_ROOT_USERNAME: ${DB_ADM_USER}
+      MONGO_INITDB_ROOT_PASSWORD: ${DB_ADM_PASS}
+    volumes:
+      - ${HOME}/valency_data/mongo_container/data/:/data/db
+      - ./:/scripts
+
+  my_postgres:
+    image: postgres
+    restart: always
+    ports:
+      - 127.0.0.1:5432:5432
+    environment:
+      POSTGRES_USER: ${DB_ADM_USER}
+      POSTGRES_PASSWORD: ${DB_ADM_PASS}
+    volumes:
+      - ${HOME}/valency_data/postgres_container/data/:/var/lib/postgresql/data
+      - ./:/scripts
```
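A quick way to confirm that both services came up (a sketch; the postgres helper script above simply greps `docker ps`, and a mongo counterpart is referenced by the Makefile):

```bash
cd dockerfiles/database
docker-compose -f valency-stack.yml ps   # my_mongo and my_postgres should be Up
./get_mongo_container_name.sh            # prints the mongo container id
./get_postgres_container_name.sh         # prints the postgres container id
```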
```diff
@@ -6,7 +6,8 @@ vim \
 python3 \
 python3-pip \
 sshfs \
-curl
+curl \
+locales
 
 RUN pip3 install --upgrade pip
 
@@ -21,6 +22,16 @@ RUN pip3 install \
 	flask_cors \
 	pymongo \
 	flask-pymongo \
-	gunicorn
+	gunicorn \
+	SQLAlchemy \
+	tqdm \
+	psycopg2-binary
+
+# Set the locale
+RUN sed -i -e 's/# en_US.UTF-8 UTF-8/en_US.UTF-8 UTF-8/' /etc/locale.gen && \
+    locale-gen
+ENV LANG en_US.UTF-8
+ENV LANGUAGE en_US:en
+ENV LC_ALL en_US.UTF-8
 
 ENV PYTHONIOENCODING UTF-8
```
```diff
@@ -24,3 +24,6 @@ server {
         proxy_pass http://backend_flask:8084;
     }
 }
+
+https://vezljivostni.cjvt.si/api/* -> http://vezljivostni-host.cjvt.si:8084/api/*
+https://vezljivostni.cjvt.si/* -> http://vezljivostni-host.cjvt.si:80/*
```
requirements.txt (37 changes, new file)
```diff
@@ -0,0 +1,37 @@
+asn1crypto==0.24.0
+beautifulsoup4==4.8.0
+bs4==0.0.1
+cffi==1.12.3
+Click==7.0
+cryptography==2.1.4
+Flask==1.1.1
+Flask-Cors==3.0.8
+Flask-PyMongo==2.3.0
+gunicorn==19.9.0
+idna==2.6
+itsdangerous==1.1.0
+Jinja2==2.10.1
+joblib==0.13.2
+keyring==10.6.0
+keyrings.alt==3.0
+lxml==4.4.0
+MarkupSafe==1.1.1
+numpy==1.17.0
+pandas==0.25.0
+pathlib==1.0.1
+psycopg2==2.8.4
+pycparser==2.19
+pycrypto==2.6.1
+pymongo==3.8.0
+python-dateutil==2.8.0
+pytz==2019.2
+pyxdg==0.25
+PyYAML==5.1.2
+scikit-learn==0.21.3
+scipy==1.3.0
+SecretStorage==2.3.1
+six==1.11.0
+sklearn==0.0
+soupsieve==1.9.3
+SQLAlchemy==1.3.12
+Werkzeug==0.15.5
```
scripts/create_xml.py (1708 changes, new file)

File diff suppressed because it is too large.
scripts/extract_keywords.py (189 changes, new file)
```diff
@@ -0,0 +1,189 @@
+import copy
+import csv
+from xml.etree import ElementTree
+import re
+import sys
+import logging
+import argparse
+import pickle
+import time
+import gc
+import subprocess
+import concurrent.futures
+import tempfile
+
+
+def read_gigafida(path):
+    words = {}
+    with open(path) as tsvfile:
+        reader = csv.reader(tsvfile, delimiter='\t')
+        for row in reader:
+            words[row[0]] = int(row[2])
+    return words
+
+
+def read_sloleks(path):
+    words = set()
+    with open(path) as tsvfile:
+        reader = csv.reader(tsvfile, delimiter='\t')
+        for row in reader:
+            words.add(row[1])
+    return words
+
+
+def read_zele(path):
+    with open(path) as f:
+        content = f.readlines()
+        # fix content
+        content[0] = content[0][1:]
+    # a = content[2]
+    # a = content[2].split()
+    # a = content[2].split()[0].split('<IZT>')[1]
+    # a = content[2].split()[0].split('<IZT>')[1].split('</IZT>')[0]
+    content = [x.split()[0].split('<IZT>')[1].split('</IZT>')[0] for x in content]
+    # content = [x.split() for x in content]
+    return set(content)
+
+
+def read_wordlist(path):
+    with open(path) as f:
+        content = [line[:-1] for line in f.readlines()]
+    print(content[-1])
+    return set(content)
+
+
+def filter_gigafida(gigafida_raw, min_limit, max_limit):
+    return {word[0]: word[1] for word in gigafida_raw.items() if (word[0][-2:] == 'ti' or word[0][-2:] == 'či') and word[1] > min_limit and word[1] <= max_limit}
+
+
+def set_list_intersection(gigafida_filtered, sloleks):
+    intersection = {}
+    for word, num in gigafida_filtered.items():
+        if word in sloleks:
+            intersection[word] = num
+    return intersection
+
+
+def list_list_union(list1, list2):
+    union = copy.copy(list1)
+    for w, n in list2.items():
+        if w not in list1:
+            union[w] = list2[w]
+    return union
+
+
+def list_list_subtraction(list1, list2):
+    subtraction = {}
+    for w, n in list2.items():
+        # if w == 'dejati':
+        #     print('here')
+        if w not in list1:
+            subtraction[w] = n
+    return subtraction
+
+
+def set_set_subtraction(set1, set2):
+    subtraction = {}
+    for w in set2:
+        if w not in set1:
+            subtraction[w] = -1
+    return subtraction
+
+
+def create_document(list1, path):
+    with open(path, "w") as text_file:
+        for w, n in list1.items():
+            text_file.write("%s\t%d\n" % (w, n))
+
+
+def create_document_set(list1, path):
+    with open(path, "w") as text_file:
+        for w in sorted(list(list1)):
+            text_file.write("%s\n" % w)
+
+
+def gigafida_merge(sloleks, zele, gigafida_raw, giga_min, giga_max):
+    gigafida_filtered = filter_gigafida(gigafida_raw, giga_min, giga_max)
+    sloleks_gf_intersect = set_list_intersection(gigafida_filtered, sloleks)
+    gigafida_filtered1 = filter_gigafida(gigafida_raw, 1, sys.maxsize)
+    zele_gf_intersect = set_list_intersection(gigafida_filtered1, zele)
+    sloleks_zele_union = list_list_union(sloleks_gf_intersect, zele_gf_intersect)
+    sloleks_zele_subtraction = list_list_subtraction(sloleks_zele_union, gigafida_filtered)
+    return sloleks_zele_subtraction
+
+
+def main(args):
+    gigafida_raw = read_gigafida(args.gigafida_verb_list)
+    sloleks = read_sloleks(args.sloleks)
+    zele = read_zele(args.zele)
+    if args.wordlist is not None:
+        sloleks_wordlist = set()
+        # sloleks_wordlist = set()
+        for el in sloleks:
+            if el in gigafida_raw:
+                sloleks_wordlist.add(el)
+        filtered_wordlist = read_wordlist(args.wordlist)
+
+        # sloleks_wordlist = set()
+        for el in sloleks:
+            if el in gigafida_raw:
+                filtered_wordlist.add(el)
+
+        create_document_set(filtered_wordlist, 'wordlist.tsv')
+    # gigafida_merge(sloleks, zele, gigafida_raw, 3, sys.maxsize)
+    gigafida_filtered3 = filter_gigafida(gigafida_raw, 2, sys.maxsize)
+    sloleks_gf_intersect = set_list_intersection(gigafida_filtered3, sloleks)
+
+    nouns_sloleks_gf_intersect = sorted(sloleks_gf_intersect.items(), key=lambda x: x[1], reverse=True)
+    res = [el[0] for el in nouns_sloleks_gf_intersect]
+
+    gigafida_filtered1 = filter_gigafida(gigafida_raw, 0, sys.maxsize)
+    zele_gf_intersect = set_list_intersection(gigafida_filtered1, zele)
+    sloleks_zele_union = list_list_union(sloleks_gf_intersect, zele_gf_intersect)
+    sloleks_zele_subtraction = set_set_subtraction(sloleks, zele)
+    create_document(gigafida_filtered3, 'gigafida_3+.tsv')
+    # create_document(sloleks_gf_intersect, 'gigafida_3+-sloleks-presek.tsv')
+    create_document(sloleks_zele_union, 'gigafida_3+-sloleks_zele-presek.tsv')
+    create_document(sloleks_zele_subtraction, 'sloleks-zele-razlika.tsv')
+
+    # gigafida_filtered = filter_gigafida(gigafida_raw, 10, sys.maxsize)
+    # sloleks_zele_subtraction = list_list_subtraction(sloleks_zele_union, gigafida_filtered)
+    gigafida_10 = gigafida_merge(sloleks, zele, gigafida_raw, 10, sys.maxsize)
+    create_document(gigafida_10, 'gigafida_10+-sloleks_zele-razlika.tsv')
+
+    # gigafida_filtered = filter_gigafida(gigafida_raw, 3, 10)
+    # sloleks_zele_subtraction = list_list_subtraction(sloleks_zele_union, gigafida_filtered)
+    gigafida_3_10 = gigafida_merge(sloleks, zele, gigafida_raw, 2, 10)
+    create_document(gigafida_3_10, 'gigafida_3-10-sloleks_zele-razlika.tsv')
+    # pass
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(
+        description='Extract keywords from multiple lists.')
+    parser.add_argument('gigafida_verb_list',
+                        help='Path to gigafida list of verbs in tsv format.')
+    parser.add_argument('sloleks',
+                        help='Path to Sloleks in tsv format.')
+    parser.add_argument('--zele',
+                        help='Path to zele valency dictionary.')
+    parser.add_argument('--wordlist', default=None,
+                        help='Path to filtered wordlist.')
+    parser.add_argument('--handchecked_words', default=None,
+                        help='Path to handchecked words.')
+    # parser.add_argument('--min_limit',
+    #                     help='Limit min number of occurrences',
+    #                     type=int, default=0)
+    # parser.add_argument('--max_limit',
+    #                     help='Limit max number of occurrences',
+    #                     type=int, default=sys.maxsize)
+    parser.add_argument('--verbose', help='Enable verbose output to stderr',
+                        choices=["warning", "info", "debug"], default="info",
+                        const="info", nargs='?')
+
+    args = parser.parse_args()
+    logging.basicConfig(stream=sys.stderr, level=args.verbose.upper())
+
+    start = time.time()
+    main(args)
+    logging.info("TIME: {}".format(time.time() - start))
```
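A hypothetical invocation, following the script's argparse help (the file names are placeholders):

```bash
python3 scripts/extract_keywords.py gigafida_verbs.tsv sloleks.tsv \
    --zele zele_dictionary.txt \
    --wordlist wordlist.txt
```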
scripts/form_csv.py (117 changes, new file)
```diff
@@ -0,0 +1,117 @@
+import argparse
+import csv
+import os
+
+from lxml import etree, objectify, html
+
+
+def write_general_statistics(path, out_list):
+    if len(out_list) == 0:
+        return
+    with open(path, 'w') as csvfile:
+        writer = csv.writer(csvfile, delimiter='\t',
+                                quotechar='"')
+        writer.writerow(['Semantic role', 'Valency pattern ratio', 'Valency sentence ratio'])
+        for line in out_list:
+            writer.writerow(line)
+
+
+def write_statistics(path, out_list):
+    if len(out_list) == 0:
+        return
+    with open(path, 'w') as csvfile:
+        writer = csv.writer(csvfile, delimiter='\t',
+                            quotechar='"')
+        writer.writerow(['Valency pattern id', 'Frequency all GF', 'Semantic role', 'Pattern representation', 'Corpus example'])
+        for line in out_list:
+            writer.writerow(line)
+
+
+def main(args):
+    for file in sorted(os.listdir(args.input)):
+        path = os.path.join(args.input, file)
+        tree = etree.parse(path)
+        gf_output = []
+        ssj_output = []
+        head = next(tree.iter('head'))
+        headword = head.find('headword').find('lemma').text
+        #for div in root.iterfind('.//div'):
+        for elem in tree.iter('statisticsContainer'):
+        # for element in tree.iterfind('statisticsContainer'):
+        # for element in tree.find('statisticsContainer'):
+            semRole = elem.find('semanticRole').text
+            gf_pattern = None
+            gf_sentence = None
+            ssj_pattern = None
+            ssj_sentence = None
+            measure = elem.find('measureList')
+            for el in measure:
+                if el.attrib['type'] == 'valency_pattern_ratio' and el.attrib['source'] == 'Gigafida 2.0':
+                    gf_pattern = el.text
+                if el.attrib['type'] == 'valency_sentence_ratio' and el.attrib['source'] == 'Gigafida 2.0':
+                    gf_sentence = el.text
+                if el.attrib['type'] == 'valency_pattern_ratio' and el.attrib['source'] == 'ssj500k 2.2':
+                    ssj_pattern = el.text
+                if el.attrib['type'] == 'valency_sentence_ratio' and el.attrib['source'] == 'ssj500k 2.2':
+                    ssj_sentence = el.text
+            if gf_pattern is not None and gf_sentence is not None:
+                gf_output.append([semRole, gf_pattern, gf_sentence])
+            if ssj_pattern is not None and ssj_sentence is not None:
+                ssj_output.append([semRole, ssj_pattern, ssj_sentence])
+
+        print(file)
+
+        analyze_output = []
+        for elem in tree.iter('valencyPattern'):
+            valency_pattern_id = elem.attrib['id']
+
+            # get frequency
+            measure = ''
+            for measure_el in elem.find('measureList').findall('measure'):
+                if measure_el.attrib['source'] == 'Gigafida 2.0':
+                    measure = measure_el.text
+
+            # get semantic roles
+            semantic_roles_list = []
+            for semantic_rol_con in elem.find('semanticRoleContainerList').findall('semanticRoleContainer'):
+                semantic_roles_list.append(semantic_rol_con.find('semanticRole').text)
+            semantic_roles = '_'.join(semantic_roles_list)
+
+            # pattern representation
+            pattern_representation = elem.find('patternRepresentation').text
+
+            # corpus example
+            if elem.find('exampleContainerList') is not None and elem.find('exampleContainerList').find('exampleContainer') is not None and elem.find('exampleContainerList').find('exampleContainer').find('corpusExample') is not None:
+                corpus_example_text = html.tostring(elem.find('exampleContainerList').find('exampleContainer').find('corpusExample'), encoding='unicode')
+
+            else:
+                continue
+
+            # ugly postprocessing to remove xmlns:xsi=... duh..
+            root = etree.fromstring(corpus_example_text)
+
+            # Remove namespace prefixes
+            for elem in root.getiterator():
+                elem.tag = etree.QName(elem).localname
+            # Remove unused namespace declarations
+            etree.cleanup_namespaces(root)
+
+            corpus_example = etree.tostring(root, encoding='unicode')
+
+            print(f"Valency pattern {valency_pattern_id}")
+
+
+            analyze_output.append([valency_pattern_id, measure, semantic_roles, pattern_representation, corpus_example])
+
+        write_general_statistics(os.path.join(args.output, headword + '_gf_stats.tsv'), gf_output)
+        write_general_statistics(os.path.join(args.output, headword + '_ssj_stats.tsv'), ssj_output)
+        write_statistics(os.path.join(args.output, headword + '_patterns.tsv'), analyze_output)
+
+if __name__ == '__main__':
+    arg_parser = argparse.ArgumentParser(description='Export and validate collocation data from DDD database.')
+    arg_parser.add_argument('--input', type=str, help='Input directory')
+    arg_parser.add_argument('--output', type=str, help='Output directory')
+
+    args = arg_parser.parse_args()
+
+    main(args)
```
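A hypothetical run, per the argparse definitions (the directory names are placeholders; `--input` holds the per-headword XML files and `--output` receives the generated TSVs):

```bash
python3 scripts/form_csv.py --input data/xmls --output data/tsv_out
```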
scripts/valency (1 change, new symbolic link)

```diff
@@ -0,0 +1 @@
+../src/pkg/valency/valency
```
scripts/xsd_checker.py (8 changes, new file)
```diff
@@ -0,0 +1,8 @@
+from lxml import etree as lxml
+
+with open('../data/inventory.xsd') as f:
+    xmlschema_doc = lxml.parse(f)
+    xmlschema = lxml.XMLSchema(xmlschema_doc)
+    with open('../data/xmls/output.xml') as op:
+        doc = lxml.parse(op)
+        print(xmlschema.validate(doc))
```
src/__init__.py (0 changes, new empty file)
```diff
@@ -37,7 +37,8 @@ app = Flask(__name__)
 app.config.from_object("db_config")
 mongo = PyMongo(app)
 
-app.config["CORPORA"] = ["ssj", "kres"]
+# app.config["CORPORA"] = ["ssj", "kres", "gigafida"]
+app.config["CORPORA"] = ["gigafida"]
 app.config["BANNED_HEADWORDS"] = ["biti"]
 app.config["QUERY_LIMIT"] = 1000
 
@@ -247,20 +248,23 @@ def api_get_frames():
     if corpus not in app.config["CORPORA"]:
         return json.dumps({"error": "cor={kres,ssj}"})
 
+    log.info("Test1")
     cur = mongo.db[corpus].find({"headwords": hw})
+    log.info("Test2")
     frames = []
     for ent in cur[:app.config["QUERY_LIMIT"]]:
         frames += frames_from_db_entry(ent)  # pre-process this step for prod TODO
     cur.close()
-
+    log.info("Test3")
     # filter by relevant hw
     frames = [x for x in frames if x.hw == hw]
 
     ret_frames = RF(frames, mongo.db.sensemap)
-
+    log.info("Test3")
     json_ret = {"frames": []}
     for frame in ret_frames:
         json_ret["frames"].append(frame.to_json())
+    log.info("Test4")
     return json.dumps(json_ret)
     # return prepare_frames(ret_frames)
 
@@ -444,7 +448,7 @@ def _is_banned(hw):
         banned = False
     return banned
 
-def prepare_app_index(appindex_json, sskj_wordlist):
+def prepare_app_index(appindex_json):
     log.info("[*] preparing app_index")
     # create app_index (used in frontend, left side word index)
     tmp_app_index = {c: {} for c in app.config["CORPORA"]}
@@ -452,6 +456,14 @@ def prepare_app_index(appindex_json):
         res_hws = {}
         res_fns = {}
 
+        # print('CORPUS...!!...')
+        # print(corpus)
+        # a = mongo.db[corpus]
+        # print('TEST_OK')
+        # print(a)
+        # print(mongo.db)
+        # a = mongo.db.list_collection_names()
+        # print('TEST_OK2')
         nentries = mongo.db[corpus].count()
         idx = 0
         for e in mongo.db[corpus].find({}):
@@ -484,6 +496,7 @@ def prepare_app_index(appindex_json):
 
         for letter, words in alphabetical.items():
             filtered_words = [x for x in words if not _is_banned(x[0])]
+            # filtered_words = [x for x in words]
             alphabetical[letter] = sorted(filtered_words, key=lambda x: x[0])
 
         tmp_app_index[corpus]["words"] = alphabetical
@@ -560,12 +573,16 @@ if __name__ == "__main__":
     if args.prepare_db:
         with Path(args.sskj_wordlist).open("r") as fp:
             sskj_wordlist = json.load(fp)
-        prepare_app_index(args.appindex_json, sskj_wordlist)
+        prepare_app_index(args.appindex_json)
         sys.exit()
 
     # app index from db
     with Path(args.appindex_json).open("r") as fp:
         app.config["app_index"] = json.load(fp)
+        # a = app.config["app_index"]
+        # b = app.config["app_index"]["kres"]
+        # c = app.config["app_index"]["kres"]["words"]
+        # print('HERE')
 
     # log.info("[*] Starting app.py with config:\n%s".format(config))
     log.info("[*] Starting app.py with config:\n{}".format(config))
```
src/backend_flask/build_app_index.py (106 changes, new file)
```diff
@@ -0,0 +1,106 @@
+import argparse
+import json
+
+from flask import Flask
+from flask_pymongo import PyMongo
+from pathlib import Path
+
+app = Flask(__name__)
+
+app.config.from_object("db_config")
+mongo = PyMongo(app)
+
+app.config["BANNED_HEADWORDS"] = ["biti"]
+
+def _is_banned(hw):
+    banned = True
+    if hw in app.config["BANNED_HEADWORDS"]:
+        banned = True
+    elif hw in sskj_wordlist["wordlist"]:
+        banned = False
+    elif (hw + " se") in sskj_wordlist["wordlist"]:
+        banned = False
+    return banned
+
+
+def prepare_app_index(appindex_json, corporas, previous_json=None):
+    if previous_json:
+        with Path(previous_json).open("r") as fp:
+            tmp_app_index = json.load(fp)
+    else:
+        tmp_app_index = {}
+    # create app_index (used in frontend, left side word index)
+    for c in corporas:
+        tmp_app_index[c] = {}
+
+    for corpus in corporas:
+        res_hws = {}
+        res_fns = {}
+
+        # print('CORPUS...!!...')
+        # print(corpus)
+        # a = mongo.db[corpus]
+        # print('TEST_OK')
+        # print(a)
+        # print(mongo.db)
+        # a = mongo.db.list_collection_names()
+        # print('TEST_OK2')
+        nentries = mongo.db[corpus].count()
+        idx = 0
+        for e in mongo.db[corpus].find({}):
+            if "headwords" not in e:
+                continue
+            for hw in e["headwords"]:
+                if hw in res_hws:
+                    res_hws[hw] += 1
+                else:
+                    res_hws[hw] = 1
+            if "functors" not in e:
+                continue
+            for fn in e["functors"]:
+                if fn in res_fns:
+                    res_fns[fn] += 1
+                else:
+                    res_fns[fn] = 1
+            idx += 1
+            if idx % 10000 == 0:
+                print("indexing {}: {}/{}".format(
+                    corpus, idx, nentries))
+
+        alphabetical = {}
+        for k, e in res_hws.items():
+            fst = k[0].lower()
+            if fst in alphabetical:
+                alphabetical[fst].append((k, e))
+            else:
+                alphabetical[fst] = [(k, e)]
+
+        for letter, words in alphabetical.items():
+            filtered_words = [x for x in words if not _is_banned(x[0])]
+            # filtered_words = [x for x in words]
+            alphabetical[letter] = sorted(filtered_words, key=lambda x: x[0])
+
+        tmp_app_index[corpus]["words"] = alphabetical
+
+
+        functors = [(k, e) for (k, e) in res_fns.items()]
+        functors = sorted(functors, key=lambda x: x[0])
+        tmp_app_index[corpus]["functors"] = functors
+
+    with Path(appindex_json).open("w") as fp:
+        json.dump(tmp_app_index, fp)
+
+if __name__ == "__main__":
+    print("Starting app.py main()")
+    aparser = argparse.ArgumentParser(description="Arguments for app.py")
+    aparser.add_argument("--previous-json", type=str, default=None)
+    aparser.add_argument("--appindex-json", type=str)
+    aparser.add_argument("--sskj-wordlist", type=str)
+    args = aparser.parse_args()
+
+    corporas = ['gigafida']
+
+    with Path(args.sskj_wordlist).open("r") as fp:
+        sskj_wordlist = json.load(fp)
+
+    prepare_app_index(args.appindex_json, corporas, args.previous_json)
```
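Per its argument parser, a run might look like this (a sketch; the paths are placeholders, and `--previous-json` is optional for merging into an earlier index):

```bash
python3 src/backend_flask/build_app_index.py \
    --appindex-json data/appindex.json \
    --sskj-wordlist data/wordlist.json
```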
```diff
@@ -1,2 +1,2 @@
-MONGO_URI = "mongodb://sizif:p5e3r4u8t7@my_mongo:27017/valdb"
+MONGO_URI = "mongodb://user:user@0.0.0.0:27017/valdb"
 MONGO_AUTH_SOURCE = 'admin'
```
src/backend_flask/get_sentence_ids.py (18 changes, new file)
```diff
@@ -0,0 +1,18 @@
+import json
+import os
+
+input_dir = "/media/luka/Portable Disk/Datasets/gigafida_jos/final_json"
+output_file = "../../all_sentences.json"
+
+results = {}
+filenames = os.listdir(input_dir)
+len(filenames)
+for i, filename in enumerate(filenames):
+    if filename.endswith(".json"):
+        with open(os.path.join(input_dir, filename)) as json_file:
+            data = json.load(json_file)
+            results[filename.split('-')[0]] = list(data.keys())
+        print('Progress: %.2f %%' % (i/len(filenames)))
+
+with open(output_file, 'w') as f:
+    json.dump(results, f)
```
```diff
@@ -1,3 +1,3 @@
 {
-    "api_addr": "http://193.2.76.103:8084"
+    "api_addr": "http://0.0.0.0:8084"
 }
```
src/frontend_vue/package-lock.json (28 changes, generated)
```diff
@@ -3513,14 +3513,12 @@
         "balanced-match": {
           "version": "1.0.0",
           "bundled": true,
-          "dev": true,
-          "optional": true
+          "dev": true
         },
         "brace-expansion": {
           "version": "1.1.11",
           "bundled": true,
           "dev": true,
-          "optional": true,
           "requires": {
             "balanced-match": "^1.0.0",
             "concat-map": "0.0.1"
@@ -3535,20 +3533,17 @@
         "code-point-at": {
           "version": "1.1.0",
           "bundled": true,
-          "dev": true,
-          "optional": true
+          "dev": true
         },
         "concat-map": {
           "version": "0.0.1",
           "bundled": true,
-          "dev": true,
-          "optional": true
+          "dev": true
         },
         "console-control-strings": {
           "version": "1.1.0",
           "bundled": true,
-          "dev": true,
-          "optional": true
+          "dev": true
         },
         "core-util-is": {
           "version": "1.0.2",
@@ -3665,8 +3660,7 @@
         "inherits": {
           "version": "2.0.3",
           "bundled": true,
-          "dev": true,
-          "optional": true
+          "dev": true
         },
         "ini": {
           "version": "1.3.5",
@@ -3678,7 +3672,6 @@
           "version": "1.0.0",
           "bundled": true,
           "dev": true,
-          "optional": true,
           "requires": {
             "number-is-nan": "^1.0.0"
           }
@@ -3693,7 +3686,6 @@
           "version": "3.0.4",
           "bundled": true,
           "dev": true,
-          "optional": true,
           "requires": {
             "brace-expansion": "^1.1.7"
           }
@@ -3701,14 +3693,12 @@
         "minimist": {
           "version": "0.0.8",
           "bundled": true,
-          "dev": true,
-          "optional": true
+          "dev": true
         },
         "minipass": {
           "version": "2.3.5",
           "bundled": true,
           "dev": true,
-          "optional": true,
           "requires": {
             "safe-buffer": "^5.1.2",
             "yallist": "^3.0.0"
@@ -3727,7 +3717,6 @@
           "version": "0.5.1",
           "bundled": true,
           "dev": true,
-          "optional": true,
           "requires": {
             "minimist": "0.0.8"
           }
@@ -3808,8 +3797,7 @@
         "number-is-nan": {
           "version": "1.0.1",
           "bundled": true,
-          "dev": true,
-          "optional": true
+          "dev": true
         },
         "object-assign": {
           "version": "4.1.1",
@@ -3821,7 +3809,6 @@
           "version": "1.4.0",
           "bundled": true,
           "dev": true,
-          "optional": true,
           "requires": {
             "wrappy": "1"
           }
@@ -3943,7 +3930,6 @@
           "version": "1.0.2",
           "bundled": true,
           "dev": true,
-          "optional": true,
           "requires": {
             "code-point-at": "^1.0.0",
             "is-fullwidth-code-point": "^1.0.0",
```
|  | |||||||
| @ -62,7 +62,7 @@ export default { | |||||||
|     name: "Nav", |     name: "Nav", | ||||||
|     props: ["appState"], |     props: ["appState"], | ||||||
|     data() {return { |     data() {return { | ||||||
|         optCorpora: ["kres", "ssj"], |         optCorpora: ["kres", "ssj", "gigafida"], | ||||||
|         optIndexes: [ |         optIndexes: [ | ||||||
|             {key: "besede", val: "words"}, |             {key: "besede", val: "words"}, | ||||||
|             {key: "udeleženske vloge", val: "functors"}, |             {key: "udeleženske vloge", val: "functors"}, | ||||||
|  | |||||||
0 src/pkg/__init__.py (new file)

src/pkg/cjvt-corpusparser (Submodule)
							| @ -1 +1 @@ | |||||||
| Subproject commit 01adf47b9b63b43f86bff52429792b0de2327ddd | Subproject commit 92b3ac4ea3a73b93c25b363b5b9cb096d4d011cd | ||||||
1 src/pkg/luscenje_struktur (Submodule)
							| @ -0,0 +1 @@ | |||||||
|  | Subproject commit 8c87d07b8a3ca73faac2fac30c39969bc5f97d45 | ||||||
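Since this adds a second submodule alongside cjvt-corpusparser, an existing checkout presumably needs `git submodule update --init src/pkg/luscenje_struktur` (or a fresh `git clone --recursive`) before code that imports the package will resolve.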
| @ -3,6 +3,41 @@ from corpusparser import enriched_lemma | |||||||
| 
 | 
 | ||||||
| log = logging.getLogger(__name__) | log = logging.getLogger(__name__) | ||||||
| 
 | 
 | ||||||
|  | def frames_from_db_entry_headword(dbent, headword): | ||||||
|  |     def _full_tid(tid): | ||||||
|  |         return ".".join([dbent["sid"], str(tid)]) | ||||||
|  | 
 | ||||||
|  |     token_dict = {str(x["tid"]): x for x in dbent["tokens"]} | ||||||
|  | 
 | ||||||
|  |     frames = [] | ||||||
|  |     if "srl_links" not in dbent: | ||||||
|  |         return [] | ||||||
|  |     srldict = {} | ||||||
|  |     for srl in dbent["srl_links"]: | ||||||
|  |         key = str(srl["from"]) | ||||||
|  |         if enriched_lemma(token_dict[key]) != headword: | ||||||
|  |             continue | ||||||
|  |         if key not in srldict: | ||||||
|  |             srldict[key] = [srl] | ||||||
|  |         else: | ||||||
|  |             srldict[key] += [srl] | ||||||
|  |     for hwtid, srlarr in srldict.items(): | ||||||
|  |         frames += [Frame( | ||||||
|  |             hw_lemma=enriched_lemma(token_dict[hwtid]), | ||||||
|  |             tids=[_full_tid(hwtid)], | ||||||
|  |             slots=[ | ||||||
|  |                 Slot( | ||||||
|  |                     functor=srl["afun"], | ||||||
|  |                     tids=[_full_tid(srl["to"])] | ||||||
|  |                 ) for srl in srlarr | ||||||
|  |             ], | ||||||
|  |             # sentences=[(dbent["sid"], dbent["tokens"])], | ||||||
|  |             sentences=[ | ||||||
|  |                 [(_full_tid(t["tid"]), t) for t in dbent["tokens"]], | ||||||
|  |             ] | ||||||
|  |         )] | ||||||
|  |     return frames | ||||||
|  | 
 | ||||||
| def frames_from_db_entry(dbent): | def frames_from_db_entry(dbent): | ||||||
|     def _full_tid(tid): |     def _full_tid(tid): | ||||||
|         return ".".join([dbent["sid"], str(tid)]) |         return ".".join([dbent["sid"], str(tid)]) | ||||||
|  | |||||||
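To make the new helper concrete, here is a hedged usage sketch. The entry below is shaped the way frames_from_db_entry_headword() reads it (sid, tokens with tid, srl_links with from/to/afun); the field values, and the assumption that corpusparser's enriched_lemma() reduces to the plain lemma for these tokens, are illustrative only:

    # Minimal, made-up DB entry; real ones come from the corpus database.
    dbent = {
        "sid": "ssj1.2.3",
        "tokens": [
            {"tid": 1, "lemma": "on"},
            {"tid": 2, "lemma": "hoditi"},
            {"tid": 3, "lemma": "domov"},
        ],
        "srl_links": [
            {"from": 2, "to": 1, "afun": "ACT"},
            {"from": 2, "to": 3, "afun": "LOC"},
        ],
    }

    # Assuming enriched_lemma() returns the plain lemma here, only SRL
    # links whose 'from' token carries the requested headword survive, so
    # this yields a single Frame with an ACT slot and a LOC slot:
    frames = frames_from_db_entry_headword(dbent, "hoditi")

    # A headword that matches no source token yields an empty list:
    assert frames_from_db_entry_headword(dbent, "neobstojec") == []

Because the SRL links are grouped by their source token id, a sentence in which the same headword lemma governs links from two different tokens comes back as two separate frames.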