forked from kristjan/cjvt-valency

Compare commits (14 commits)
| SHA1 |
|---|
| ec083a8d63 |
| 69c3521e4b |
| 75b015dcda |
| c18aaff11f |
| 34b776be11 |
| 26bca0b083 |
| 2551a9c6a8 |
| 5cdc963c2d |
| ce1fb46b4e |
| 220529b777 |
| ae5f2869bc |
| 931b3531b3 |
| 3d91251905 |
| c803057164 |
**.gitignore** (vendored, 11 changes)

@@ -7,6 +7,17 @@ data/appindex.json
src/frontend_vue/node_modules/
src/frontend_vue/dist/
dockerfiles/database/create.js
dockerfiles/database/create_mongo.js
dockerfiles/database/create_postgres.js
dockerfiles/database/mongo_db.gz
dockerfiles/database/postgres_db.tar
dockerfiles/database/postgres_db_OLD.tar
*__pycache__/
env.local
logs/*
.idea/
venv*
data/
data
deploy_instructions/
run.sh
**.gitmodules** (vendored, 3 changes)

@@ -1,3 +1,6 @@
[submodule "src/pkg/cjvt-corpusparser"]
	path = src/pkg/cjvt-corpusparser
	url = git@gitea.cjvt.si:kristjan/cjvt-corpusparser.git
[submodule "src/pkg/luscenje_struktur"]
	path = src/pkg/luscenje_struktur
	url = https://gitea.cjvt.si/ozbolt/luscenje_struktur.git
**Makefile** (23 changes)

@@ -13,10 +13,11 @@ SSJ_FILE = "$(MAKE_ROOT)/data/ssj_file_link"
# KRES_FOLDER = "$(MAKE_ROOT)/data/samples/kres_xml"
# KRES_FOLDER = "$(MAKE_ROOT)/data/kres_xml_folder_link"
KRES_FOLDER = "/home/kristjan/kres_data/payload/kres_xml"
GIGAFIDA_FOLDER = "/home/lukakrsnik/cjvt-valency/data_all/giga_orig"
# KRES_SRL_FOLDER = "$(MAKE_ROOT)/data/samples/kres_srl_json"
# KRES_SRL_FOLDER = "$(MAKE_ROOT)/data/kres_json_folder_link"
KRES_SRL_FOLDER = "/home/kristjan/kres_data/payload/kres_json"

GIGAFIDA_SRL_FOLDER = "/home/lukakrsnik/cjvt-valency/data_all/final_json"
# This file comes with the source code. Make sure you unpack it and name it right.
SSKJ_WORDLIST = "$(MAKE_ROOT)/data/wordlist.json"
SSKJ_JSON = "$(MAKE_ROOT)/data/sskj_senses.json"

@@ -26,14 +27,14 @@ APPINDEX_PATH = "$(MAKE_ROOT)/data/appindex.json"

OUTPUT = "db"
# OUTPUT = "file"
OUTDIR = "/tmp/three"  # if you're running this in docker, make sure to mount the volume
OUTDIR = "/project/data"  # if you're running this in docker, make sure to mount the volume
DBADDR = "0.0.0.0:27017"  # don't use localhost

# credentials from .gitignored file
# create it from env.default
include env.local

N_CORES = 3
N_CORES = 4
# insert kres files into database in chunks, for fewer connections
KRES_CHUNK_SIZE = 30

@@ -56,6 +57,12 @@ database-service:
database-users:
	cd dockerfiles/database; $(MAKE) create_users

database-restore:
	cd dockerfiles/database; $(MAKE) restore_db

database-restore-postgres:
	cd dockerfiles/database; $(MAKE) restore_postgres_db

# also useful, if we want to restart the db
database-clean:
	cd dockerfiles/database; $(MAKE) clean_stack

@@ -69,6 +76,7 @@ python-env-install:
	pip3 install -e src/pkg/cjvt-corpusparser/.
	pip3 install -e src/pkg/valency/.
	pip3 install -e src/pkg/seqparser/.
	pip3 install -e src/pkg/luscenje_struktur/.

# from inside python-env container:
data/samples:

@@ -93,7 +101,14 @@ fill-database-kres: data/samples
		--chunk-size $(KRES_CHUNK_SIZE) \
		--cores $(N_CORES)


fill-database-gigafida: data/samples
	python3 src/pkg/cjvt-corpusparser/corpusparser/main.py --kres-folder $(GIGAFIDA_FOLDER) \
		--corpus="gigafida" \
		--ssj-file $(SSJ_FILE) --kres-srl-folder $(GIGAFIDA_SRL_FOLDER) \
		--output $(OUTPUT) --outdir $(OUTDIR) --dbaddr $(DBADDR) \
		--dbuser $(DB_USR_USER) --dbpass $(DB_USR_PASS) \
		--chunk-size $(KRES_CHUNK_SIZE) \
		--cores $(N_CORES)

## Frontend
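For reference, the new `fill-database-gigafida` target would typically be driven like this (a sketch only; per the Makefile's own comments the fill targets are meant to run inside the python-env container, the machine-specific GIGAFIDA_* paths need adjusting, and env.local must define DB_USR_USER and DB_USR_PASS):

```bash
# hypothetical session: database stack up first, then the fill target
make database-service
make fill-database-gigafida   # parses Gigafida and inserts it into MongoDB
```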
**README.md** (44 changes)

@@ -179,3 +179,47 @@ $ mongorestore /dumps/valdb --db valdb --uri=mongodb://valuser:valuserpass@0.0.0
```

After uploading, restart the stack with `27017` commented out.

## Script running

### Environment setup
```bash
pip install -r requirements.txt
pip install git+https://gitea.cjvt.si/ozbolt/luscenje_struktur.git
pip install git+https://gitea.cjvt.si/kristjan/cjvt-corpusparser.git
```

### Running on an already set-up environment
```bash
make database-service
```

### Setting up the environment for running on a ramdisk

```bash
# create the ramdisk
sudo mount -t tmpfs tmpfs /mnt/tmp
sudo mount -o remount,size=120G,noexec,nosuid,nodev,noatime /mnt/tmp

# change volumes to /mnt/tmp:/data/db
vim dockerfiles/database/valency-stack.yml

# change the run target in the Makefile to mkdir -p /mnt/tmp
vim dockerfiles/database/Makefile

# run the service
make database-service

# run ONLY ONCE, to create users and restore the database
make database-users
make database-restore

# double-check that it worked
docker exec -it ef0a /bin/bash

# then, inside the container:
    mongo --username <REGULAR USER> --password --authenticationDatabase valdb
    db.getRoles()

```
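A corresponding check for the Postgres side could look like this (a sketch, not part of the original instructions; it relies on get_postgres_container_name.sh and the superdb_small database introduced elsewhere in this change set, and assumes DB_ADM_USER is set in the host shell from env.local):

```bash
# list the tables of the restored database
docker exec -it $(./dockerfiles/database/get_postgres_container_name.sh) \
    psql -U "$DB_ADM_USER" -d superdb_small -c '\dt'
```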
(deleted file; name suppressed in this view)
@@ -1 +0,0 @@
/home/kristjan/workdir/final_json/

(deleted file; name suppressed in this view)
@@ -1 +0,0 @@
/home/kristjan/kres_mount/kres_parsed/tei/

Binary file not shown.

(deleted file; name suppressed in this view)
@@ -1 +0,0 @@
/home/kristjan/git/diploma/data/ssj500k-sl.TEI/ssj500k-sl.body.xml

Binary file not shown.

File diff suppressed because one or more lines are too long.
(file name suppressed in this view; presumably the mongo database Dockerfile)
@@ -1,5 +1,5 @@
FROM mongo:latest
FROM mongo:4.2.9

WORKDIR /
COPY init_inside_container.sh /.
COPY create.js /.
COPY init_inside_mongo_container.sh /.
COPY create_mongo.js /.
(file name suppressed in this view; presumably dockerfiles/database/Makefile, given the targets invoked from the root Makefile)
@@ -2,33 +2,62 @@
# collection names: lower case, plural
# user names?

# mongo admin -u root -p password --eval "db.getSiblingDB('vlDB').addUser('vluser', 'password')"

STACKNAME = dbstack

.PHONY: start_db FORCE

all: build_run create_users

build_run: build_mongo run_stack
build_run: build_mongo run_docker_compose

create.js: FORCE
postgres_create_roles:
	echo 'psql -v ON_ERROR_STOP=OFF --username $(DB_ADM_USER) <<-EOSQL' > create_postgres.js
	echo "create user $(DB_USR_USER) with encrypted password '$(DB_USR_PASS)';" >> create_postgres.js
	echo "create database superdb_small;" >> create_postgres.js
	echo "grant all privileges on database superdb_small to $(DB_USR_USER);" >> create_postgres.js
	echo "grant usage on schema public to $(DB_USR_USER);" >> create_postgres.js
	echo "grant select on all tables in schema public to $(DB_USR_USER);" >> create_postgres.js
	echo "EOSQL" >> create_postgres.js
	chmod +x create_postgres.js

FORCE:
	echo 'db.auth("$(DB_ADM_USER)", "$(DB_ADM_PASS)")' > create.js
	echo 'use valdb' >> create.js
	echo 'db.createUser({user: "$(DB_USR_USER)", pwd: "$(DB_USR_PASS)", roles: ["readWrite"]})' >> create.js
mongo_create_roles:
	echo 'db.auth("$(DB_ADM_USER)", "$(DB_ADM_PASS)")' > create_mongo.js
	echo 'use valdb' >> create_mongo.js
	echo 'db.createUser({user: "$(DB_USR_USER)", pwd: "$(DB_USR_PASS)", roles: ["readWrite"]})' >> create_mongo.js
	echo 'db.grantRolesToUser("$(DB_USR_USER)", [{ role: "readWrite", db: "extvaldb"}])' >> create_mongo.js

build_mongo: create.js
build_mongo: mongo_create_roles
	docker build . -t my-mongo --no-cache

clean_stack:
	docker stack rm $(STACKNAME)
# build_postgres: postgres_create_roles
# 	docker build . -t my-mongo --no-cache

run_stack:
	mkdir -p ${HOME}/mongo_container/data/
	docker stack deploy --compose-file mongodb-stack.yml $(STACKNAME)
run_docker_compose:
	mkdir -p ${HOME}/valency_data/mongo_container/data/
	#docker kill $(shell ./get_mongo_container_name.sh)
	#docker kill $(shell ./get_postgres_container_name.sh)
	#docker-compose stop
	docker-compose -f valency-stack.yml up -d --force-recreate
	# docker stack deploy --compose-file mongodb-stack.yml $(STACKNAME)

create_users: create.js
	docker exec $(shell ./get_container_name.sh) /init_inside_container.sh
create_users: create_mongo_users create_postgres_users


create_mongo_users: mongo_create_roles
	docker exec $(shell ./get_mongo_container_name.sh) /init_inside_mongo_container.sh
	# rm create.js

create_postgres_users: postgres_create_roles
	docker exec $(shell ./get_postgres_container_name.sh) /scripts/init_inside_postgres_container.sh

restore_db: restore_mongo_db restore_postgres_db

restore_mongo_db:
ifeq (,$(wildcard ./mongo_db.gz))
	$(error "mongo_db.gz does not exist. Make sure a dump of the mongo db is at 'dockerfiles/database/mongo_db.gz'")
else
	docker exec $(shell ./get_mongo_container_name.sh) sh -c 'mongorestore --gzip --archive=/scripts/mongo_db.gz --db valdb --username $(DB_USR_USER) --password $(DB_USR_PASS) --authenticationDatabase valdb'
endif

restore_postgres_db:
ifeq (,$(wildcard ./postgres_db.tar))
	$(error "postgres_db.tar does not exist. Make sure a dump of the postgres db is at 'dockerfiles/database/postgres_db.tar'")
else
	docker exec $(shell ./get_postgres_container_name.sh) sh -c 'pg_restore -U $(DB_ADM_USER) --dbname=superdb_small --create --verbose /scripts/postgres_db.tar'
endif
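For context, a restore would then be driven like this (a hypothetical session; /backups is a placeholder, and the dump file names are fixed by the wildcard checks above):

```bash
cp /backups/mongo_db.gz dockerfiles/database/       # gzipped mongodump archive
cp /backups/postgres_db.tar dockerfiles/database/   # tar-format postgres dump
cd dockerfiles/database && make restore_db          # restore_mongo_db + restore_postgres_db
```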
**dockerfiles/database/get_postgres_container_name.sh** (new executable file, 2 lines)

@@ -0,0 +1,2 @@
#!/bin/bash
docker ps | grep postgres | awk '{print $1}'
(deleted file; name suppressed in this view, presumably init_inside_container.sh)
@@ -1,3 +0,0 @@
#!/bin/bash

mongo admin < /create.js
**dockerfiles/database/init_inside_mongo_container.sh** (new executable file, 3 lines)

@@ -0,0 +1,3 @@
#!/bin/bash

mongo admin < /create_mongo.js
**dockerfiles/database/init_inside_postgres_container.sh** (new executable file, 3 lines)

@@ -0,0 +1,3 @@
#!/bin/bash

/scripts/create_postgres.js
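Note that despite its .js extension, create_postgres.js is generated by the Makefile above as an executable shell snippet (a psql heredoc), which is why this init script executes it directly instead of piping it into a client.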
(deleted file; name suppressed in this view, presumably dockerfiles/database/mongodb-stack.yml, referenced by the old run_stack target)
@@ -1,26 +0,0 @@
version: '3.1'

services:

  my_mongo:
    image: my-mongo
    restart: always
    ports:
      - 27017:27017
    environment:
      MONGO_INITDB_ROOT_USERNAME: ${DB_ADM_USER}
      MONGO_INITDB_ROOT_PASSWORD: ${DB_ADM_PASS}
    volumes:
      - ${HOME}/mongo_container/data/:/data/db

  mongo_express:
    image: mongo-express
    restart: always
    ports:
      - 8087:8081
    environment:
      ME_CONFIG_BASICAUTH_USERNAME: ${MONGOEXPRESS_USER}
      ME_CONFIG_BASICAUTH_PASSWORD: ${MONGOEXPRESS_PASS}
      ME_CONFIG_MONGODB_ADMINUSERNAME: ${DB_ADM_USER}
      ME_CONFIG_MONGODB_ADMINPASSWORD: ${DB_ADM_PASS}
      ME_CONFIG_MONGODB_SERVER: my_mongo
**dockerfiles/database/valency-stack.yml** (new file, 27 lines)

@@ -0,0 +1,27 @@
version: '3.1'

services:

  my_mongo:
    image: my-mongo
    restart: always
    ports:
      - 127.0.0.1:27017:27017
    environment:
      MONGO_INITDB_ROOT_USERNAME: ${DB_ADM_USER}
      MONGO_INITDB_ROOT_PASSWORD: ${DB_ADM_PASS}
    volumes:
      - ${HOME}/valency_data/mongo_container/data/:/data/db
      - ./:/scripts

  my_postgres:
    image: postgres
    restart: always
    ports:
      - 127.0.0.1:5432:5432
    environment:
      POSTGRES_USER: ${DB_ADM_USER}
      POSTGRES_PASSWORD: ${DB_ADM_PASS}
    volumes:
      - ${HOME}/valency_data/postgres_container/data/:/var/lib/postgresql/data
      - ./:/scripts
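Both services take their admin credentials from DB_ADM_USER and DB_ADM_PASS, which the root Makefile pulls in via `include env.local`. A hypothetical minimal env.local (placeholder values only; the real file is gitignored and created from env.default):

```bash
# hypothetical env.local; every value is a placeholder
DB_ADM_USER=admin
DB_ADM_PASS=changeme
DB_USR_USER=valuser
DB_USR_PASS=changeme2
```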
(file name suppressed in this view; presumably the python-env Dockerfile)
@@ -6,7 +6,8 @@ vim \
python3 \
python3-pip \
sshfs \
curl
curl \
locales

RUN pip3 install --upgrade pip

@@ -21,6 +22,16 @@ RUN pip3 install \
	flask_cors \
	pymongo \
	flask-pymongo \
	gunicorn
	gunicorn \
	SQLAlchemy \
	tqdm \
	psycopg2-binary

# Set the locale
RUN sed -i -e 's/# en_US.UTF-8 UTF-8/en_US.UTF-8 UTF-8/' /etc/locale.gen && \
    locale-gen
ENV LANG en_US.UTF-8
ENV LANGUAGE en_US:en
ENV LC_ALL en_US.UTF-8

ENV PYTHONIOENCODING UTF-8
(file name suppressed in this view; presumably the frontend nginx config)
@@ -23,4 +23,7 @@ server {
        proxy_set_header Host $http_host;
        proxy_pass http://backend_flask:8084;
    }
}

https://vezljivostni.cjvt.si/api/* -> http://vezljivostni-host.cjvt.si:8084/api/*
https://vezljivostni.cjvt.si/* -> http://vezljivostni-host.cjvt.si:80/*
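The two arrow lines appended after the closing brace read as a note documenting the intended reverse-proxy mapping rather than as nginx directives; in a live config they would presumably need to be commented out.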
**requirements.txt** (new file, 37 lines)

@@ -0,0 +1,37 @@
asn1crypto==0.24.0
beautifulsoup4==4.8.0
bs4==0.0.1
cffi==1.12.3
Click==7.0
cryptography==2.1.4
Flask==1.1.1
Flask-Cors==3.0.8
Flask-PyMongo==2.3.0
gunicorn==19.9.0
idna==2.6
itsdangerous==1.1.0
Jinja2==2.10.1
joblib==0.13.2
keyring==10.6.0
keyrings.alt==3.0
lxml==4.4.0
MarkupSafe==1.1.1
numpy==1.17.0
pandas==0.25.0
pathlib==1.0.1
psycopg2==2.8.4
pycparser==2.19
pycrypto==2.6.1
pymongo==3.8.0
python-dateutil==2.8.0
pytz==2019.2
pyxdg==0.25
PyYAML==5.1.2
scikit-learn==0.21.3
scipy==1.3.0
SecretStorage==2.3.1
six==1.11.0
sklearn==0.0
soupsieve==1.9.3
SQLAlchemy==1.3.12
Werkzeug==0.15.5
**scripts/create_xml.py** (new file, 1708 lines)

File diff suppressed because it is too large.
**scripts/extract_keywords.py** (new file, 189 lines)

@@ -0,0 +1,189 @@
import copy
import csv
from xml.etree import ElementTree
import re
import sys
import logging
import argparse
import pickle
import time
import gc
import subprocess
import concurrent.futures
import tempfile


def read_gigafida(path):
    words = {}
    with open(path) as tsvfile:
        reader = csv.reader(tsvfile, delimiter='\t')
        for row in reader:
            words[row[0]] = int(row[2])
    return words


def read_sloleks(path):
    words = set()
    with open(path) as tsvfile:
        reader = csv.reader(tsvfile, delimiter='\t')
        for row in reader:
            words.add(row[1])
    return words


def read_zele(path):
    with open(path) as f:
        content = f.readlines()
        # fix content
        content[0] = content[0][1:]
    # a = content[2]
    # a = content[2].split()
    # a = content[2].split()[0].split('<IZT>')[1]
    # a = content[2].split()[0].split('<IZT>')[1].split('</IZT>')[0]
    content = [x.split()[0].split('<IZT>')[1].split('</IZT>')[0] for x in content]
    # content = [x.split() for x in content]
    return set(content)


def read_wordlist(path):
    with open(path) as f:
        content = [line[:-1] for line in f.readlines()]
    print(content[-1])
    return set(content)


def filter_gigafida(gigafida_raw, min_limit, max_limit):
    return {word[0]: word[1] for word in gigafida_raw.items() if (word[0][-2:] == 'ti' or word[0][-2:] == 'či') and word[1] > min_limit and word[1] <= max_limit}


def set_list_intersection(gigafida_filtered, sloleks):
    intersection = {}
    for word, num in gigafida_filtered.items():
        if word in sloleks:
            intersection[word] = num
    return intersection


def list_list_union(list1, list2):
    union = copy.copy(list1)
    for w, n in list2.items():
        if w not in list1:
            union[w] = list2[w]
    return union


def list_list_subtraction(list1, list2):
    subtraction = {}
    for w, n in list2.items():
        # if w == 'dejati':
        #     print('here')
        if w not in list1:
            subtraction[w] = n
    return subtraction


def set_set_subtraction(set1, set2):
    subtraction = {}
    for w in set2:
        if w not in set1:
            subtraction[w] = -1
    return subtraction


def create_document(list1, path):
    with open(path, "w") as text_file:
        for w, n in list1.items():
            text_file.write("%s\t%d\n" % (w, n))


def create_document_set(list1, path):
    with open(path, "w") as text_file:
        for w in sorted(list(list1)):
            text_file.write("%s\n" % w)


def gigafida_merge(sloleks, zele, gigafida_raw, giga_min, giga_max):
    gigafida_filtered = filter_gigafida(gigafida_raw, giga_min, giga_max)
    sloleks_gf_intersect = set_list_intersection(gigafida_filtered, sloleks)
    gigafida_filtered1 = filter_gigafida(gigafida_raw, 1, sys.maxsize)
    zele_gf_intersect = set_list_intersection(gigafida_filtered1, zele)
    sloleks_zele_union = list_list_union(sloleks_gf_intersect, zele_gf_intersect)
    sloleks_zele_subtraction = list_list_subtraction(sloleks_zele_union, gigafida_filtered)
    return sloleks_zele_subtraction


def main(args):
    gigafida_raw = read_gigafida(args.gigafida_verb_list)
    sloleks = read_sloleks(args.sloleks)
    zele = read_zele(args.zele)
    if args.wordlist is not None:
        sloleks_wordlist = set()
        # sloleks_wordlist = set()
        for el in sloleks:
            if el in gigafida_raw:
                sloleks_wordlist.add(el)
        filtered_wordlist = read_wordlist(args.wordlist)

        # sloleks_wordlist = set()
        for el in sloleks:
            if el in gigafida_raw:
                filtered_wordlist.add(el)

        create_document_set(filtered_wordlist, 'wordlist.tsv')
    # gigafida_merge(sloleks, zele, gigafida_raw, 3, sys.maxsize)
    gigafida_filtered3 = filter_gigafida(gigafida_raw, 2, sys.maxsize)
    sloleks_gf_intersect = set_list_intersection(gigafida_filtered3, sloleks)

    nouns_sloleks_gf_intersect = sorted(sloleks_gf_intersect.items(), key=lambda x: x[1], reverse=True)
    res = [el[0] for el in nouns_sloleks_gf_intersect]

    gigafida_filtered1 = filter_gigafida(gigafida_raw, 0, sys.maxsize)
    zele_gf_intersect = set_list_intersection(gigafida_filtered1, zele)
    sloleks_zele_union = list_list_union(sloleks_gf_intersect, zele_gf_intersect)
    sloleks_zele_subtraction = set_set_subtraction(sloleks, zele)
    create_document(gigafida_filtered3, 'gigafida_3+.tsv')
    # create_document(sloleks_gf_intersect, 'gigafida_3+-sloleks-presek.tsv')
    create_document(sloleks_zele_union, 'gigafida_3+-sloleks_zele-presek.tsv')
    create_document(sloleks_zele_subtraction, 'sloleks-zele-razlika.tsv')

    # gigafida_filtered = filter_gigafida(gigafida_raw, 10, sys.maxsize)
    # sloleks_zele_subtraction = list_list_subtraction(sloleks_zele_union, gigafida_filtered)
    gigafida_10 = gigafida_merge(sloleks, zele, gigafida_raw, 10, sys.maxsize)
    create_document(gigafida_10, 'gigafida_10+-sloleks_zele-razlika.tsv')

    # gigafida_filtered = filter_gigafida(gigafida_raw, 3, 10)
    # sloleks_zele_subtraction = list_list_subtraction(sloleks_zele_union, gigafida_filtered)
    gigafida_3_10 = gigafida_merge(sloleks, zele, gigafida_raw, 2, 10)
    create_document(gigafida_3_10, 'gigafida_3-10-sloleks_zele-razlika.tsv')
    # pass


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Extract keywords from multiple lists.')
    parser.add_argument('gigafida_verb_list',
                        help='Path to gigafida list of verbs in tsv format.')
    parser.add_argument('sloleks',
                        help='Path to Sloleks in tsv format.')
    parser.add_argument('--zele',
                        help='Path to zele valency dictionary.')
    parser.add_argument('--wordlist', default=None,
                        help='Path to filtered wordlist.')
    parser.add_argument('--handchecked_words', default=None,
                        help='Path to handchecked words.')
    # parser.add_argument('--min_limit',
    #                     help='Limit min number of ocurrences',
    #                     type=int, default=0)
    # parser.add_argument('--max_limit',
    #                     help='Limit max number of ocurrences',
    #                     type=int, default=sys.maxsize)
    parser.add_argument('--verbose', help='Enable verbose output to stderr',
                        choices=["warning", "info", "debug"], default="info",
                        const="info", nargs='?')

    args = parser.parse_args()
    logging.basicConfig(stream=sys.stderr, level=args.verbose.upper())

    start = time.time()
    main(args)
    logging.info("TIME: {}".format(time.time() - start))
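A hypothetical invocation (all input file names here are placeholders; the script writes its .tsv outputs into the current working directory):

```bash
python3 scripts/extract_keywords.py gigafida_verbs.tsv sloleks.tsv \
    --zele zele.txt --wordlist wordlist.txt
```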
**scripts/form_csv.py** (new file, 117 lines)

@@ -0,0 +1,117 @@
import argparse
import csv
import os

from lxml import etree, objectify, html


def write_general_statistics(path, out_list):
    if len(out_list) == 0:
        return
    with open(path, 'w') as csvfile:
        writer = csv.writer(csvfile, delimiter='\t',
                                quotechar='"')
        writer.writerow(['Semantic role', 'Valency pattern ratio', 'Valency sentence ratio'])
        for line in out_list:
            writer.writerow(line)


def write_statistics(path, out_list):
    if len(out_list) == 0:
        return
    with open(path, 'w') as csvfile:
        writer = csv.writer(csvfile, delimiter='\t',
                            quotechar='"')
        writer.writerow(['Valency pattern id', 'Frequency all GF', 'Semantic role', 'Pattern representation', 'Corpus example'])
        for line in out_list:
            writer.writerow(line)


def main(args):
    for file in sorted(os.listdir(args.input)):
        path = os.path.join(args.input, file)
        tree = etree.parse(path)
        gf_output = []
        ssj_output = []
        head = next(tree.iter('head'))
        headword = head.find('headword').find('lemma').text
        #for div in root.iterfind('.//div'):
        for elem in tree.iter('statisticsContainer'):
        # for element in tree.iterfind('statisticsContainer'):
        # for element in tree.find('statisticsContainer'):
            semRole = elem.find('semanticRole').text
            gf_pattern = None
            gf_sentence = None
            ssj_pattern = None
            ssj_sentence = None
            measure = elem.find('measureList')
            for el in measure:
                if el.attrib['type'] == 'valency_pattern_ratio' and el.attrib['source'] == 'Gigafida 2.0':
                    gf_pattern = el.text
                if el.attrib['type'] == 'valency_sentence_ratio' and el.attrib['source'] == 'Gigafida 2.0':
                    gf_sentence = el.text
                if el.attrib['type'] == 'valency_pattern_ratio' and el.attrib['source'] == 'ssj500k 2.2':
                    ssj_pattern = el.text
                if el.attrib['type'] == 'valency_sentence_ratio' and el.attrib['source'] == 'ssj500k 2.2':
                    ssj_sentence = el.text
            if gf_pattern is not None and gf_sentence is not None:
                gf_output.append([semRole, gf_pattern, gf_sentence])
            if ssj_pattern is not None and ssj_sentence is not None:
                ssj_output.append([semRole, ssj_pattern, ssj_sentence])

        print(file)

        analyze_output = []
        for elem in tree.iter('valencyPattern'):
            valency_pattern_id = elem.attrib['id']

            # get frequency
            measure = ''
            for measure_el in elem.find('measureList').findall('measure'):
                if measure_el.attrib['source'] == 'Gigafida 2.0':
                    measure = measure_el.text

            # get semantic roles
            semantic_roles_list = []
            for semantic_rol_con in elem.find('semanticRoleContainerList').findall('semanticRoleContainer'):
                semantic_roles_list.append(semantic_rol_con.find('semanticRole').text)
            semantic_roles = '_'.join(semantic_roles_list)

            # pattern representation
            pattern_representation = elem.find('patternRepresentation').text

            # corpus example
            if elem.find('exampleContainerList') is not None and elem.find('exampleContainerList').find('exampleContainer') is not None and elem.find('exampleContainerList').find('exampleContainer').find('corpusExample') is not None:
                corpus_example_text = html.tostring(elem.find('exampleContainerList').find('exampleContainer').find('corpusExample'), encoding='unicode')

            else:
                continue

            # ugly postprocessing to remove xmlns:xsi=... duh..
            root = etree.fromstring(corpus_example_text)

            # Remove namespace prefixes
            for elem in root.getiterator():
                elem.tag = etree.QName(elem).localname
            # Remove unused namespace declarations
            etree.cleanup_namespaces(root)

            corpus_example = etree.tostring(root, encoding='unicode')

            print(f"Valency pattern {valency_pattern_id}")


            analyze_output.append([valency_pattern_id, measure, semantic_roles, pattern_representation, corpus_example])

        write_general_statistics(os.path.join(args.output, headword + '_gf_stats.tsv'), gf_output)
        write_general_statistics(os.path.join(args.output, headword + '_ssj_stats.tsv'), ssj_output)
        write_statistics(os.path.join(args.output, headword + '_patterns.tsv'), analyze_output)

if __name__ == '__main__':
    arg_parser = argparse.ArgumentParser(description='Export and validate collocation data from DDD database.')
    arg_parser.add_argument('--input', type=str, help='Input directory')
    arg_parser.add_argument('--output', type=str, help='Output directory')

    args = arg_parser.parse_args()

    main(args)
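A hypothetical run over a directory of dictionary-entry XML files (both directory names are placeholders):

```bash
python3 scripts/form_csv.py --input data/xmls --output out
```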
**scripts/valency** (new symbolic link)

@@ -0,0 +1 @@
../src/pkg/valency/valency
**scripts/xsd_checker.py** (new file, 8 lines)

@@ -0,0 +1,8 @@
from lxml import etree as lxml

with open('../data/inventory.xsd') as f:
    xmlschema_doc = lxml.parse(f)
    xmlschema = lxml.XMLSchema(xmlschema_doc)
    with open('../data/xmls/output.xml') as op:
        doc = lxml.parse(op)
        print(xmlschema.validate(doc))
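Both paths are relative to scripts/, so the checker appears intended to be run from inside that directory, e.g. `cd scripts && python3 xsd_checker.py`.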
**src/__init__.py** (new empty file)
(file name suppressed in this view; presumably src/backend_flask/app.py)
@@ -37,7 +37,8 @@ app = Flask(__name__)
app.config.from_object("db_config")
mongo = PyMongo(app)

app.config["CORPORA"] = ["ssj", "kres"]
# app.config["CORPORA"] = ["ssj", "kres", "gigafida"]
app.config["CORPORA"] = ["gigafida"]
app.config["BANNED_HEADWORDS"] = ["biti"]
app.config["QUERY_LIMIT"] = 1000

@@ -247,20 +248,23 @@ def api_get_frames():
    if corpus not in app.config["CORPORA"]:
        return json.dumps({"error": "cor={kres,ssj}"})

    log.info("Test1")
    cur = mongo.db[corpus].find({"headwords": hw})
    log.info("Test2")
    frames = []
    for ent in cur[:app.config["QUERY_LIMIT"]]:
        frames += frames_from_db_entry(ent)  # pre-process this step for prod TODO
    cur.close()

    log.info("Test3")
    # filter by relevant hw
    frames = [x for x in frames if x.hw == hw]

    ret_frames = RF(frames, mongo.db.sensemap)

    log.info("Test3")
    json_ret = {"frames": []}
    for frame in ret_frames:
        json_ret["frames"].append(frame.to_json())
    log.info("Test4")
    return json.dumps(json_ret)
    # return prepare_frames(ret_frames)

@@ -444,7 +448,7 @@ def _is_banned(hw):
        banned = False
    return banned

def prepare_app_index(appindex_json, sskj_wordlist):
def prepare_app_index(appindex_json):
    log.info("[*] preparing app_index")
    # create app_index (used in frontend, left side word index)
    tmp_app_index = {c: {} for c in app.config["CORPORA"]}

@@ -452,6 +456,14 @@ def prepare_app_index(appindex_json, sskj_wordlist):
        res_hws = {}
        res_fns = {}

        # print('CORPUS...!!...')
        # print(corpus)
        # a = mongo.db[corpus]
        # print('TEST_OK')
        # print(a)
        # print(mongo.db)
        # a = mongo.db.list_collection_names()
        # print('TEST_OK2')
        nentries = mongo.db[corpus].count()
        idx = 0
        for e in mongo.db[corpus].find({}):

@@ -484,6 +496,7 @@ def prepare_app_index(appindex_json, sskj_wordlist):

        for letter, words in alphabetical.items():
            filtered_words = [x for x in words if not _is_banned(x[0])]
            # filtered_words = [x for x in words]
            alphabetical[letter] = sorted(filtered_words, key=lambda x: x[0])

        tmp_app_index[corpus]["words"] = alphabetical

@@ -560,12 +573,16 @@ if __name__ == "__main__":
    if args.prepare_db:
        with Path(args.sskj_wordlist).open("r") as fp:
            sskj_wordlist = json.load(fp)
        prepare_app_index(args.appindex_json, sskj_wordlist)
        prepare_app_index(args.appindex_json)
        sys.exit()

    # app index from db
    with Path(args.appindex_json).open("r") as fp:
        app.config["app_index"] = json.load(fp)
        # a = app.config["app_index"]
        # b = app.config["app_index"]["kres"]
        # c = app.config["app_index"]["kres"]["words"]
        # print('HERE')

    # log.info("[*] Starting app.py with config:\n%s".format(config))
    log.info("[*] Starting app.py with config:\n{}".format(config))
**src/backend_flask/build_app_index.py** (new file, 106 lines)

@@ -0,0 +1,106 @@
import argparse
import json

from flask import Flask
from flask_pymongo import PyMongo
from pathlib import Path

app = Flask(__name__)

app.config.from_object("db_config")
mongo = PyMongo(app)

app.config["BANNED_HEADWORDS"] = ["biti"]

def _is_banned(hw):
    banned = True
    if hw in app.config["BANNED_HEADWORDS"]:
        banned = True
    elif hw in sskj_wordlist["wordlist"]:
        banned = False
    elif (hw + " se") in sskj_wordlist["wordlist"]:
        banned = False
    return banned


def prepare_app_index(appindex_json, corporas, previous_json=None):
    if previous_json:
        with Path(previous_json).open("r") as fp:
            tmp_app_index = json.load(fp)
    else:
        tmp_app_index = {}
    # create app_index (used in frontend, left side word index)
    for c in corporas:
        tmp_app_index[c] = {}

    for corpus in corporas:
        res_hws = {}
        res_fns = {}

        # print('CORPUS...!!...')
        # print(corpus)
        # a = mongo.db[corpus]
        # print('TEST_OK')
        # print(a)
        # print(mongo.db)
        # a = mongo.db.list_collection_names()
        # print('TEST_OK2')
        nentries = mongo.db[corpus].count()
        idx = 0
        for e in mongo.db[corpus].find({}):
            if "headwords" not in e:
                continue
            for hw in e["headwords"]:
                if hw in res_hws:
                    res_hws[hw] += 1
                else:
                    res_hws[hw] = 1
            if "functors" not in e:
                continue
            for fn in e["functors"]:
                if fn in res_fns:
                    res_fns[fn] += 1
                else:
                    res_fns[fn] = 1
            idx += 1
            if idx % 10000 == 0:
                print("indexing {}: {}/{}".format(
                    corpus, idx, nentries))

        alphabetical = {}
        for k, e in res_hws.items():
            fst = k[0].lower()
            if fst in alphabetical:
                alphabetical[fst].append((k, e))
            else:
                alphabetical[fst] = [(k, e)]

        for letter, words in alphabetical.items():
            filtered_words = [x for x in words if not _is_banned(x[0])]
            # filtered_words = [x for x in words]
            alphabetical[letter] = sorted(filtered_words, key=lambda x: x[0])

        tmp_app_index[corpus]["words"] = alphabetical


        functors = [(k, e) for (k, e) in res_fns.items()]
        functors = sorted(functors, key=lambda x: x[0])
        tmp_app_index[corpus]["functors"] = functors

    with Path(appindex_json).open("w") as fp:
        json.dump(tmp_app_index, fp)

if __name__ == "__main__":
    print("Starting app.py main()")
    aparser = argparse.ArgumentParser(description="Arguments for app.py")
    aparser.add_argument("--previous-json", type=str, default=None)
    aparser.add_argument("--appindex-json", type=str)
    aparser.add_argument("--sskj-wordlist", type=str)
    args = aparser.parse_args()

    corporas = ['gigafida']

    with Path(args.sskj_wordlist).open("r") as fp:
        sskj_wordlist = json.load(fp)

    prepare_app_index(args.appindex_json, corporas, args.previous_json)
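A hypothetical invocation, reusing the data paths the root Makefile already defines (APPINDEX_PATH and SSKJ_WORDLIST):

```bash
python3 src/backend_flask/build_app_index.py \
    --appindex-json data/appindex.json \
    --sskj-wordlist data/wordlist.json
```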
(file name suppressed in this view; presumably the backend db_config module)
@@ -1,2 +1,2 @@
MONGO_URI = "mongodb://sizif:p5e3r4u8t7@my_mongo:27017/valdb"
MONGO_URI = "mongodb://user:user@0.0.0.0:27017/valdb"
MONGO_AUTH_SOURCE = 'admin'
**src/backend_flask/get_sentence_ids.py** (new file, 18 lines)

@@ -0,0 +1,18 @@
import json
import os

input_dir = "/media/luka/Portable Disk/Datasets/gigafida_jos/final_json"
output_file = "../../all_sentences.json"

results = {}
filenames = os.listdir(input_dir)
len(filenames)
for i, filename in enumerate(filenames):
    if filename.endswith(".json"):
        with open(os.path.join(input_dir, filename)) as json_file:
            data = json.load(json_file)
            results[filename.split('-')[0]] = list(data.keys())
        print('Progress: %.2f %%' % (i/len(filenames)))

with open(output_file, 'w') as f:
    json.dump(results, f)
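input_dir and output_file are hard-coded to one machine and need editing before the script is run directly with python3. Note also that the progress message prints a fraction between 0 and 1 rather than a percentage, since the value is never multiplied by 100.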
(file name suppressed in this view; presumably the frontend config JSON with the API address)
@@ -1,3 +1,3 @@
{
    "api_addr": "http://193.2.76.103:8084"
    "api_addr": "http://0.0.0.0:8084"
}
**src/frontend_vue/package-lock.json** (generated, 28 changes)

@@ -3513,14 +3513,12 @@
        "balanced-match": {
          "version": "1.0.0",
          "bundled": true,
          "dev": true,
          "optional": true
          "dev": true
        },
        "brace-expansion": {
          "version": "1.1.11",
          "bundled": true,
          "dev": true,
          "optional": true,
          "requires": {
            "balanced-match": "^1.0.0",
            "concat-map": "0.0.1"

@@ -3535,20 +3533,17 @@
        "code-point-at": {
          "version": "1.1.0",
          "bundled": true,
          "dev": true,
          "optional": true
          "dev": true
        },
        "concat-map": {
          "version": "0.0.1",
          "bundled": true,
          "dev": true,
          "optional": true
          "dev": true
        },
        "console-control-strings": {
          "version": "1.1.0",
          "bundled": true,
          "dev": true,
          "optional": true
          "dev": true
        },
        "core-util-is": {
          "version": "1.0.2",

@@ -3665,8 +3660,7 @@
        "inherits": {
          "version": "2.0.3",
          "bundled": true,
          "dev": true,
          "optional": true
          "dev": true
        },
        "ini": {
          "version": "1.3.5",

@@ -3678,7 +3672,6 @@
          "version": "1.0.0",
          "bundled": true,
          "dev": true,
          "optional": true,
          "requires": {
            "number-is-nan": "^1.0.0"
          }

@@ -3693,7 +3686,6 @@
          "version": "3.0.4",
          "bundled": true,
          "dev": true,
          "optional": true,
          "requires": {
            "brace-expansion": "^1.1.7"
          }

@@ -3701,14 +3693,12 @@
        "minimist": {
          "version": "0.0.8",
          "bundled": true,
          "dev": true,
          "optional": true
          "dev": true
        },
        "minipass": {
          "version": "2.3.5",
          "bundled": true,
          "dev": true,
          "optional": true,
          "requires": {
            "safe-buffer": "^5.1.2",
            "yallist": "^3.0.0"

@@ -3727,7 +3717,6 @@
          "version": "0.5.1",
          "bundled": true,
          "dev": true,
          "optional": true,
          "requires": {
            "minimist": "0.0.8"
          }

@@ -3808,8 +3797,7 @@
        "number-is-nan": {
          "version": "1.0.1",
          "bundled": true,
          "dev": true,
          "optional": true
          "dev": true
        },
        "object-assign": {
          "version": "4.1.1",

@@ -3821,7 +3809,6 @@
          "version": "1.4.0",
          "bundled": true,
          "dev": true,
          "optional": true,
          "requires": {
            "wrappy": "1"
          }

@@ -3943,7 +3930,6 @@
          "version": "1.0.2",
          "bundled": true,
          "dev": true,
          "optional": true,
          "requires": {
            "code-point-at": "^1.0.0",
            "is-fullwidth-code-point": "^1.0.0",
(file name suppressed in this view; a frontend Vue component, presumably Nav.vue)
@@ -62,7 +62,7 @@ export default {
    name: "Nav",
    props: ["appState"],
    data() {return {
        optCorpora: ["kres", "ssj"],
        optCorpora: ["kres", "ssj", "gigafida"],
        optIndexes: [
            {key: "besede", val: "words"},
            {key: "udeleženske vloge", val: "functors"},
**src/pkg/__init__.py** (new empty file)

(submodule update; name suppressed in this view, presumably src/pkg/cjvt-corpusparser)
@@ -1 +1 @@
Subproject commit 01adf47b9b63b43f86bff52429792b0de2327ddd
Subproject commit 92b3ac4ea3a73b93c25b363b5b9cb096d4d011cd
**src/pkg/luscenje_struktur** (new submodule)

@@ -0,0 +1 @@
Subproject commit 8c87d07b8a3ca73faac2fac30c39969bc5f97d45
(file name suppressed in this view; a module defining Frame/Slot helpers, presumably in src/pkg/valency)
@@ -3,6 +3,41 @@ from corpusparser import enriched_lemma

log = logging.getLogger(__name__)

def frames_from_db_entry_headword(dbent, headword):
    def _full_tid(tid):
        return ".".join([dbent["sid"], str(tid)])

    token_dict = {str(x["tid"]): x for x in dbent["tokens"]}

    frames = []
    if "srl_links" not in dbent:
        return []
    srldict = {}
    for srl in dbent["srl_links"]:
        key = str(srl["from"])
        if enriched_lemma(token_dict[key]) != headword:
            continue
        if key not in srldict:
            srldict[key] = [srl]
        else:
            srldict[key] += [srl]
    for hwtid, srlarr in srldict.items():
        frames += [Frame(
            hw_lemma=enriched_lemma(token_dict[hwtid]),
            tids=[_full_tid(hwtid)],
            slots=[
                Slot(
                    functor=srl["afun"],
                    tids=[_full_tid(srl["to"])]
                ) for srl in srlarr
            ],
            # sentences=[(dbent["sid"], dbent["tokens"])],
            sentences=[
                [(_full_tid(t["tid"]), t) for t in dbent["tokens"]],
            ]
        )]
    return frames

def frames_from_db_entry(dbent):
    def _full_tid(tid):
        return ".".join([dbent["sid"], str(tid)])