Compare commits

...

14 Commits

39 changed files with 2434 additions and 93 deletions

11
.gitignore vendored
View File

@@ -7,6 +7,17 @@ data/appindex.json
src/frontend_vue/node_modules/
src/frontend_vue/dist/
dockerfiles/database/create.js
+dockerfiles/database/create_mongo.js
+dockerfiles/database/create_postgres.js
+dockerfiles/database/mongo_db.gz
+dockerfiles/database/postgres_db.tar
+dockerfiles/database/postgres_db_OLD.tar
*__pycache__/
env.local
logs/*
+.idea/
+venv*
+data/
+data
+deploy_instructions/
+run.sh

3
.gitmodules vendored
View File

@@ -1,3 +1,6 @@
[submodule "src/pkg/cjvt-corpusparser"]
	path = src/pkg/cjvt-corpusparser
	url = git@gitea.cjvt.si:kristjan/cjvt-corpusparser.git
+[submodule "src/pkg/luscenje_struktur"]
+	path = src/pkg/luscenje_struktur
+	url = https://gitea.cjvt.si/ozbolt/luscenje_struktur.git

View File

@@ -13,10 +13,11 @@ SSJ_FILE = "$(MAKE_ROOT)/data/ssj_file_link"
# KRES_FOLDER = "$(MAKE_ROOT)/data/samples/kres_xml"
# KRES_FOLDER = "$(MAKE_ROOT)/data/kres_xml_folder_link"
KRES_FOLDER = "/home/kristjan/kres_data/payload/kres_xml"
+GIGAFIDA_FOLDER = "/home/lukakrsnik/cjvt-valency/data_all/giga_orig"
# KRES_SRL_FOLDER = "$(MAKE_ROOT)/data/samples/kres_srl_json"
# KRES_SRL_FOLDER = "$(MAKE_ROOT)/data/kres_json_folder_link"
KRES_SRL_FOLDER = "/home/kristjan/kres_data/payload/kres_json"
+GIGAFIDA_SRL_FOLDER = "/home/lukakrsnik/cjvt-valency/data_all/final_json"
# This file comes with the source code. Make sure you unpack it and name it right.
SSKJ_WORDLIST = "$(MAKE_ROOT)/data/wordlist.json"
SSKJ_JSON = "$(MAKE_ROOT)/data/sskj_senses.json"
@@ -26,14 +27,14 @@ APPINDEX_PATH = "$(MAKE_ROOT)/data/appindex.json"
OUTPUT = "db"
# OUTPUT = "file"
-OUTDIR = "/tmp/three" # if you're running this in docker, make sure to mount the volume
+OUTDIR = "/project/data" # if you're running this in docker, make sure to mount the volume
DBADDR = "0.0.0.0:27017" # don't use localhost
# credentials from .gitignored file
# create it from env.default
include env.local
-N_CORES = 3
+N_CORES = 4
# insert kres files into database in chunks, for fewer connections
KRES_CHUNK_SIZE = 30
@@ -56,6 +57,12 @@ database-service:
database-users:
	cd dockerfiles/database; $(MAKE) create_users
+database-restore:
+	cd dockerfiles/database; $(MAKE) restore_db
+database-restore-postgres:
+	cd dockerfiles/database; $(MAKE) restore_postgres_db
# also useful, if we want to restart the db
database-clean:
	cd dockerfiles/database; $(MAKE) clean_stack
@@ -69,6 +76,7 @@ python-env-install:
	pip3 install -e src/pkg/cjvt-corpusparser/.
	pip3 install -e src/pkg/valency/.
	pip3 install -e src/pkg/seqparser/.
+	pip3 install -e src/pkg/luscenje_struktur/.
# from inside python-env container:
data/samples:
@@ -93,7 +101,14 @@ fill-database-kres: data/samples
	--chunk-size $(KRES_CHUNK_SIZE) \
	--cores $(N_CORES)
+fill-database-gigafida: data/samples
+	python3 src/pkg/cjvt-corpusparser/corpusparser/main.py --kres-folder $(GIGAFIDA_FOLDER) \
+	--corpus="gigafida" \
+	--ssj-file $(SSJ_FILE) --kres-srl-folder $(GIGAFIDA_SRL_FOLDER) \
+	--output $(OUTPUT) --outdir $(OUTDIR) --dbaddr $(DBADDR) \
+	--dbuser $(DB_USR_USER) --dbpass $(DB_USR_PASS) \
+	--chunk-size $(KRES_CHUNK_SIZE) \
+	--cores $(N_CORES)
## Frontend
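A minimal usage sketch for the new `fill-database-gigafida` target, assuming the database stack is already up (`make database-service`), `env.local` provides `DB_USR_USER`/`DB_USR_PASS`, and `GIGAFIDA_FOLDER`/`GIGAFIDA_SRL_FOLDER` point at the corpus dumps; like `fill-database-kres`, it is intended to be run from the python-env container:

```bash
# run from the repository root inside the python-env container
make fill-database-gigafida
```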

View File

@@ -179,3 +179,47 @@ $ mongorestore /dumps/valdb --db valdb --uri=mongodb://valuser:valuserpass@0.0.0
```
After uploading, restart the stack with `27017` commented out.
## Script running
### Environment setup
```bash
pip install -r requirements.txt
pip install git+https://gitea.cjvt.si/ozbolt/luscenje_struktur.git
pip install git+https://gitea.cjvt.si/kristjan/cjvt-corpusparser.git
```
### Running on an already set up environment
```bash
make database-service
```
### Setting up environment for running on ramdisk
```bash
# create ramdisk
sudo mount -t tmpfs tmpfs /mnt/tmp
sudo mount -o remount,size=120G,noexec,nosuid,nodev,noatime /mnt/tmp
# change volumes to /mnt/tmp:/data/db
vim dockerfiles/database/valency-stack.yml
# change Makefile -runStack to mkdir -p /mnt/tmp
vim dockerfiles/database/Makefile
# run service
make database-service
# run ONLY ONCE to create users and restore database
make database-users
make database-restore
# double check if it worked
docker exec -it ef0a /bin/bash
# following steps in docker bash:
# check if it worked by
mongo --username <REGULAR USER> --password --authenticationDatabase valdb
db.getRoles()
```
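The steps above only verify the Mongo restore. A comparable check for the Postgres side is sketched below, assuming the `my_postgres` service from `valency-stack.yml` is running and the dump was restored into `superdb_small` by `make database-restore`:

```bash
# <ADMIN USER> is the POSTGRES_USER (DB_ADM_USER) from env.local
docker exec -it $(./dockerfiles/database/get_postgres_container_name.sh) \
    psql -U <ADMIN USER> -d superdb_small -c '\dt'
```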

View File

@@ -1 +0,0 @@
/home/kristjan/workdir/final_json/

View File

@@ -1 +0,0 @@
/home/kristjan/kres_mount/kres_parsed/tei/

Binary file not shown.

View File

@@ -1 +0,0 @@
/home/kristjan/git/diploma/data/ssj500k-sl.TEI/ssj500k-sl.body.xml

Binary file not shown.

File diff suppressed because one or more lines are too long

View File

@@ -1,5 +1,5 @@
-FROM mongo:latest
+FROM mongo:4.2.9
WORKDIR /
-COPY init_inside_container.sh /.
-COPY create.js /.
+COPY init_inside_mongo_container.sh /.
+COPY create_mongo.js /.

View File

@@ -2,33 +2,62 @@
# collection names: lower case, plural
# user names?
# mongo admin -u root -p password --eval "db.getSiblingDB('vlDB').addUser('vluser', 'password')"
-STACKNAME = dbstack
-.PHONY: start_db FORCE
all: build_run create_users
-build_run: build_mongo run_stack
+build_run: build_mongo run_docker_compose
-create.js: FORCE
+postgres_create_roles:
+	echo 'psql -v ON_ERROR_STOP=OFF --username $(DB_ADM_USER) <<-EOSQL' > create_postgres.js
+	echo "create user $(DB_USR_USER) with encrypted password '$(DB_USR_PASS)';" >> create_postgres.js
+	echo "create database superdb_small;" >> create_postgres.js
+	echo "grant all privileges on database superdb_small to $(DB_USR_USER);" >> create_postgres.js
+	echo "grant usage on schema public to $(DB_USR_USER);" >> create_postgres.js
+	echo "grant select on all tables in schema public to $(DB_USR_USER);" >> create_postgres.js
+	echo "EOSQL" >> create_postgres.js
+	chmod +x create_postgres.js
-FORCE:
+mongo_create_roles:
-	echo 'db.auth("$(DB_ADM_USER)", "$(DB_ADM_PASS)")' > create.js
+	echo 'db.auth("$(DB_ADM_USER)", "$(DB_ADM_PASS)")' > create_mongo.js
-	echo 'use valdb' >> create.js
+	echo 'use valdb' >> create_mongo.js
-	echo 'db.createUser({user: "$(DB_USR_USER)", pwd: "$(DB_USR_PASS)", roles: ["readWrite"]})' >> create.js
+	echo 'db.createUser({user: "$(DB_USR_USER)", pwd: "$(DB_USR_PASS)", roles: ["readWrite"]})' >> create_mongo.js
+	echo 'db.grantRolesToUser("$(DB_USR_USER)", [{ role: "readWrite", db: "extvaldb"}])' >> create_mongo.js
-build_mongo: create.js
+build_mongo: mongo_create_roles
	docker build . -t my-mongo --no-cache
-clean_stack:
+# build_postgres: postgres_create_roles
-	docker stack rm $(STACKNAME)
+# 	docker build . -t my-mongo --no-cache
-run_stack:
+run_docker_compose:
-	mkdir -p ${HOME}/mongo_container/data/
+	mkdir -p ${HOME}/valency_data/mongo_container/data/
-	docker stack deploy --compose-file mongodb-stack.yml $(STACKNAME)
+	#docker kill $(shell ./get_mongo_container_name.sh)
+	#docker kill $(shell ./get_postgres_container_name.sh)
+	#docker-compose stop
+	docker-compose -f valency-stack.yml up -d --force-recreate
+	# docker stack deploy --compose-file mongodb-stack.yml $(STACKNAME)
-create_users: create.js
+create_users: create_mongo_users create_postgres_users
-	docker exec $(shell ./get_container_name.sh) /init_inside_container.sh
+create_mongo_users: mongo_create_roles
+	docker exec $(shell ./get_mongo_container_name.sh) /init_inside_mongo_container.sh
	# rm create.js
+create_postgres_users: postgres_create_roles
+	docker exec $(shell ./get_postgres_container_name.sh) /scripts/init_inside_postgres_container.sh
+restore_db: restore_mongo_db restore_postgres_db
+restore_mongo_db:
+ifeq (,$(wildcard ./mongo_db.gz))
+	$(error "mongo_db.gz does not exist. Make sure to have a dump of the mongo db in 'dockerfiles/database/mongo_db.gz'")
+else
+	docker exec $(shell ./get_mongo_container_name.sh) sh -c 'mongorestore --gzip --archive=/scripts/mongo_db.gz --db valdb --username $(DB_USR_USER) --password $(DB_USR_PASS) --authenticationDatabase valdb'
+endif
+restore_postgres_db:
+ifeq (,$(wildcard ./postgres_db.tar))
+	$(error "postgres_db.tar does not exist. Make sure to have a dump of the postgres db in 'dockerfiles/database/postgres_db.tar'")
+else
+	docker exec $(shell ./get_postgres_container_name.sh) sh -c 'pg_restore -U $(DB_ADM_USER) --dbname=superdb_small --create --verbose /scripts/postgres_db.tar'
+endif
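As a usage sketch (assuming the dumps already sit in `dockerfiles/database/`), these targets are normally driven through the top-level Makefile wrappers added above:

```bash
make database-restore            # restore_db: mongo and postgres dumps
make database-restore-postgres   # restore_postgres_db only
```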

View File

@@ -0,0 +1,2 @@
#!/bin/bash
docker ps | grep postgres | awk '{print $1}'

View File

@@ -1,3 +0,0 @@
#!/bin/bash
mongo admin < /create.js

View File

@@ -0,0 +1,3 @@
#!/bin/bash
mongo admin < /create_mongo.js

View File

@@ -0,0 +1,3 @@
#!/bin/bash
/scripts/create_postgres.js
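Despite the `.js` suffix, `create_postgres.js` is generated by the `postgres_create_roles` Makefile target as a shell script wrapping `psql`. A sketch of what the generated file looks like, with the credentials shown as placeholders substituted from `env.local`:

```bash
psql -v ON_ERROR_STOP=OFF --username <DB_ADM_USER> <<-EOSQL
create user <DB_USR_USER> with encrypted password '<DB_USR_PASS>';
create database superdb_small;
grant all privileges on database superdb_small to <DB_USR_USER>;
grant usage on schema public to <DB_USR_USER>;
grant select on all tables in schema public to <DB_USR_USER>;
EOSQL
```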

View File

@@ -1,26 +0,0 @@
version: '3.1'

services:
  my_mongo:
    image: my-mongo
    restart: always
    ports:
      - 27017:27017
    environment:
      MONGO_INITDB_ROOT_USERNAME: ${DB_ADM_USER}
      MONGO_INITDB_ROOT_PASSWORD: ${DB_ADM_PASS}
    volumes:
      - ${HOME}/mongo_container/data/:/data/db
  mongo_express:
    image: mongo-express
    restart: always
    ports:
      - 8087:8081
    environment:
      ME_CONFIG_BASICAUTH_USERNAME: ${MONGOEXPRESS_USER}
      ME_CONFIG_BASICAUTH_PASSWORD: ${MONGOEXPRESS_PASS}
      ME_CONFIG_MONGODB_ADMINUSERNAME: ${DB_ADM_USER}
      ME_CONFIG_MONGODB_ADMINPASSWORD: ${DB_ADM_PASS}
      ME_CONFIG_MONGODB_SERVER: my_mongo

View File

@@ -0,0 +1,27 @@
version: '3.1'

services:
  my_mongo:
    image: my-mongo
    restart: always
    ports:
      - 127.0.0.1:27017:27017
    environment:
      MONGO_INITDB_ROOT_USERNAME: ${DB_ADM_USER}
      MONGO_INITDB_ROOT_PASSWORD: ${DB_ADM_PASS}
    volumes:
      - ${HOME}/valency_data/mongo_container/data/:/data/db
      - ./:/scripts
  my_postgres:
    image: postgres
    restart: always
    ports:
      - 127.0.0.1:5432:5432
    environment:
      POSTGRES_USER: ${DB_ADM_USER}
      POSTGRES_PASSWORD: ${DB_ADM_PASS}
    volumes:
      - ${HOME}/valency_data/postgres_container/data/:/var/lib/postgresql/data
      - ./:/scripts
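The compose file expects `DB_ADM_USER`/`DB_ADM_PASS` in the environment. Outside of `make run_docker_compose`, one possible way to bring the stack up by hand is sketched below (assuming the credentials live in `env.local` at the repository root):

```bash
set -a; source env.local; set +a   # export the variables for docker-compose substitution
docker-compose -f dockerfiles/database/valency-stack.yml up -d --force-recreate
```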

View File

@@ -6,7 +6,8 @@ vim \
    python3 \
    python3-pip \
    sshfs \
-   curl
+   curl \
+   locales
RUN pip3 install --upgrade pip
@@ -21,6 +22,16 @@ RUN pip3 install \
    flask_cors \
    pymongo \
    flask-pymongo \
-   gunicorn
+   gunicorn \
+   SQLAlchemy \
+   tqdm \
+   psycopg2-binary
+# Set the locale
+RUN sed -i -e 's/# en_US.UTF-8 UTF-8/en_US.UTF-8 UTF-8/' /etc/locale.gen && \
+    locale-gen
+ENV LANG en_US.UTF-8
+ENV LANGUAGE en_US:en
+ENV LC_ALL en_US.UTF-8
ENV PYTHONIOENCODING UTF-8

View File

@@ -24,3 +24,6 @@ server {
        proxy_pass http://backend_flask:8084;
    }
}
+https://vezljivostni.cjvt.si/api/* -> http://vezljivostni-host.cjvt.si:8084/api/*
+https://vezljivostni.cjvt.si/* -> http://vezljivostni-host.cjvt.si:80/*

37
requirements.txt Normal file
View File

@@ -0,0 +1,37 @@
asn1crypto==0.24.0
beautifulsoup4==4.8.0
bs4==0.0.1
cffi==1.12.3
Click==7.0
cryptography==2.1.4
Flask==1.1.1
Flask-Cors==3.0.8
Flask-PyMongo==2.3.0
gunicorn==19.9.0
idna==2.6
itsdangerous==1.1.0
Jinja2==2.10.1
joblib==0.13.2
keyring==10.6.0
keyrings.alt==3.0
lxml==4.4.0
MarkupSafe==1.1.1
numpy==1.17.0
pandas==0.25.0
pathlib==1.0.1
psycopg2==2.8.4
pycparser==2.19
pycrypto==2.6.1
pymongo==3.8.0
python-dateutil==2.8.0
pytz==2019.2
pyxdg==0.25
PyYAML==5.1.2
scikit-learn==0.21.3
scipy==1.3.0
SecretStorage==2.3.1
six==1.11.0
sklearn==0.0
soupsieve==1.9.3
SQLAlchemy==1.3.12
Werkzeug==0.15.5

1708
scripts/create_xml.py Normal file

File diff suppressed because it is too large

189
scripts/extract_keywords.py Normal file
View File

@@ -0,0 +1,189 @@
import copy
import csv
from xml.etree import ElementTree
import re
import sys
import logging
import argparse
import pickle
import time
import gc
import subprocess
import concurrent.futures
import tempfile


def read_gigafida(path):
    words = {}
    with open(path) as tsvfile:
        reader = csv.reader(tsvfile, delimiter='\t')
        for row in reader:
            words[row[0]] = int(row[2])
    return words


def read_sloleks(path):
    words = set()
    with open(path) as tsvfile:
        reader = csv.reader(tsvfile, delimiter='\t')
        for row in reader:
            words.add(row[1])
    return words


def read_zele(path):
    with open(path) as f:
        content = f.readlines()
    # fix content
    content[0] = content[0][1:]
    # a = content[2]
    # a = content[2].split()
    # a = content[2].split()[0].split('<IZT>')[1]
    # a = content[2].split()[0].split('<IZT>')[1].split('</IZT>')[0]
    content = [x.split()[0].split('<IZT>')[1].split('</IZT>')[0] for x in content]
    # content = [x.split() for x in content]
    return set(content)


def read_wordlist(path):
    with open(path) as f:
        content = [line[:-1] for line in f.readlines()]
    print(content[-1])
    return set(content)


def filter_gigafida(gigafida_raw, min_limit, max_limit):
    return {word[0]: word[1] for word in gigafida_raw.items() if (word[0][-2:] == 'ti' or word[0][-2:] == 'či') and word[1] > min_limit and word[1] <= max_limit}


def set_list_intersection(gigafida_filtered, sloleks):
    intersection = {}
    for word, num in gigafida_filtered.items():
        if word in sloleks:
            intersection[word] = num
    return intersection


def list_list_union(list1, list2):
    union = copy.copy(list1)
    for w, n in list2.items():
        if w not in list1:
            union[w] = list2[w]
    return union


def list_list_subtraction(list1, list2):
    subtraction = {}
    for w, n in list2.items():
        # if w == 'dejati':
        #     print('here')
        if w not in list1:
            subtraction[w] = n
    return subtraction


def set_set_subtraction(set1, set2):
    subtraction = {}
    for w in set2:
        if w not in set1:
            subtraction[w] = -1
    return subtraction


def create_document(list1, path):
    with open(path, "w") as text_file:
        for w, n in list1.items():
            text_file.write("%s\t%d\n" % (w, n))


def create_document_set(list1, path):
    with open(path, "w") as text_file:
        for w in sorted(list(list1)):
            text_file.write("%s\n" % w)


def gigafida_merge(sloleks, zele, gigafida_raw, giga_min, giga_max):
    gigafida_filtered = filter_gigafida(gigafida_raw, giga_min, giga_max)
    sloleks_gf_intersect = set_list_intersection(gigafida_filtered, sloleks)
    gigafida_filtered1 = filter_gigafida(gigafida_raw, 1, sys.maxsize)
    zele_gf_intersect = set_list_intersection(gigafida_filtered1, zele)
    sloleks_zele_union = list_list_union(sloleks_gf_intersect, zele_gf_intersect)
    sloleks_zele_subtraction = list_list_subtraction(sloleks_zele_union, gigafida_filtered)
    return sloleks_zele_subtraction


def main(args):
    gigafida_raw = read_gigafida(args.gigafida_verb_list)
    sloleks = read_sloleks(args.sloleks)
    zele = read_zele(args.zele)
    if args.wordlist is not None:
        sloleks_wordlist = set()
        # sloleks_wordlist = set()
        for el in sloleks:
            if el in gigafida_raw:
                sloleks_wordlist.add(el)
        filtered_wordlist = read_wordlist(args.wordlist)
        # sloleks_wordlist = set()
        for el in sloleks:
            if el in gigafida_raw:
                filtered_wordlist.add(el)
        create_document_set(filtered_wordlist, 'wordlist.tsv')
    # gigafida_merge(sloleks, zele, gigafida_raw, 3, sys.maxsize)
    gigafida_filtered3 = filter_gigafida(gigafida_raw, 2, sys.maxsize)
    sloleks_gf_intersect = set_list_intersection(gigafida_filtered3, sloleks)
    nouns_sloleks_gf_intersect = sorted(sloleks_gf_intersect.items(), key=lambda x: x[1], reverse=True)
    res = [el[0] for el in nouns_sloleks_gf_intersect]
    gigafida_filtered1 = filter_gigafida(gigafida_raw, 0, sys.maxsize)
    zele_gf_intersect = set_list_intersection(gigafida_filtered1, zele)
    sloleks_zele_union = list_list_union(sloleks_gf_intersect, zele_gf_intersect)
    sloleks_zele_subtraction = set_set_subtraction(sloleks, zele)
    create_document(gigafida_filtered3, 'gigafida_3+.tsv')
    # create_document(sloleks_gf_intersect, 'gigafida_3+-sloleks-presek.tsv')
    create_document(sloleks_zele_union, 'gigafida_3+-sloleks_zele-presek.tsv')
    create_document(sloleks_zele_subtraction, 'sloleks-zele-razlika.tsv')
    # gigafida_filtered = filter_gigafida(gigafida_raw, 10, sys.maxsize)
    # sloleks_zele_subtraction = list_list_subtraction(sloleks_zele_union, gigafida_filtered)
    gigafida_10 = gigafida_merge(sloleks, zele, gigafida_raw, 10, sys.maxsize)
    create_document(gigafida_10, 'gigafida_10+-sloleks_zele-razlika.tsv')
    # gigafida_filtered = filter_gigafida(gigafida_raw, 3, 10)
    # sloleks_zele_subtraction = list_list_subtraction(sloleks_zele_union, gigafida_filtered)
    gigafida_3_10 = gigafida_merge(sloleks, zele, gigafida_raw, 2, 10)
    create_document(gigafida_3_10, 'gigafida_3-10-sloleks_zele-razlika.tsv')
    # pass


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Extract keywords from multiple lists.')
    parser.add_argument('gigafida_verb_list',
                        help='Path to gigafida list of verbs in tsv format.')
    parser.add_argument('sloleks',
                        help='Path to Sloleks in tsv format.')
    parser.add_argument('--zele',
                        help='Path to zele valency dictionary.')
    parser.add_argument('--wordlist', default=None,
                        help='Path to filtered wordlist.')
    parser.add_argument('--handchecked_words', default=None,
                        help='Path to handchecked words.')
    # parser.add_argument('--min_limit',
    #                     help='Limit min number of ocurrences',
    #                     type=int, default=0)
    # parser.add_argument('--max_limit',
    #                     help='Limit max number of ocurrences',
    #                     type=int, default=sys.maxsize)
    parser.add_argument('--verbose', help='Enable verbose output to stderr',
                        choices=["warning", "info", "debug"], default="info",
                        const="info", nargs='?')
    args = parser.parse_args()

    logging.basicConfig(stream=sys.stderr, level=args.verbose.upper())
    start = time.time()
    main(args)
    logging.info("TIME: {}".format(time.time() - start))
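A hypothetical invocation of the script, inferred from the argparse definition above (all file names are placeholders):

```bash
python3 scripts/extract_keywords.py gigafida_verbs.tsv sloleks.tsv \
    --zele zele_entries.txt --wordlist wordlist.txt --verbose info
```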

117
scripts/form_csv.py Normal file
View File

@@ -0,0 +1,117 @@
import argparse
import csv
import os

from lxml import etree, objectify, html


def write_general_statistics(path, out_list):
    if len(out_list) == 0:
        return
    with open(path, 'w') as csvfile:
        writer = csv.writer(csvfile, delimiter='\t',
                            quotechar='"')
        writer.writerow(['Semantic role', 'Valency pattern ratio', 'Valency sentence ratio'])
        for line in out_list:
            writer.writerow(line)


def write_statistics(path, out_list):
    if len(out_list) == 0:
        return
    with open(path, 'w') as csvfile:
        writer = csv.writer(csvfile, delimiter='\t',
                            quotechar='"')
        writer.writerow(['Valency pattern id', 'Frequency all GF', 'Semantic role', 'Pattern representation', 'Corpus example'])
        for line in out_list:
            writer.writerow(line)


def main(args):
    for file in sorted(os.listdir(args.input)):
        path = os.path.join(args.input, file)
        tree = etree.parse(path)
        gf_output = []
        ssj_output = []
        head = next(tree.iter('head'))
        headword = head.find('headword').find('lemma').text
        # for div in root.iterfind('.//div'):
        for elem in tree.iter('statisticsContainer'):
            # for element in tree.iterfind('statisticsContainer'):
            # for element in tree.find('statisticsContainer'):
            semRole = elem.find('semanticRole').text
            gf_pattern = None
            gf_sentence = None
            ssj_pattern = None
            ssj_sentence = None
            measure = elem.find('measureList')
            for el in measure:
                if el.attrib['type'] == 'valency_pattern_ratio' and el.attrib['source'] == 'Gigafida 2.0':
                    gf_pattern = el.text
                if el.attrib['type'] == 'valency_sentence_ratio' and el.attrib['source'] == 'Gigafida 2.0':
                    gf_sentence = el.text
                if el.attrib['type'] == 'valency_pattern_ratio' and el.attrib['source'] == 'ssj500k 2.2':
                    ssj_pattern = el.text
                if el.attrib['type'] == 'valency_sentence_ratio' and el.attrib['source'] == 'ssj500k 2.2':
                    ssj_sentence = el.text
            if gf_pattern is not None and gf_sentence is not None:
                gf_output.append([semRole, gf_pattern, gf_sentence])
            if ssj_pattern is not None and ssj_sentence is not None:
                ssj_output.append([semRole, ssj_pattern, ssj_sentence])
        print(file)
        analyze_output = []
        for elem in tree.iter('valencyPattern'):
            valency_pattern_id = elem.attrib['id']
            # get frequency
            measure = ''
            for measure_el in elem.find('measureList').findall('measure'):
                if measure_el.attrib['source'] == 'Gigafida 2.0':
                    measure = measure_el.text
            # get semantic roles
            semantic_roles_list = []
            for semantic_rol_con in elem.find('semanticRoleContainerList').findall('semanticRoleContainer'):
                semantic_roles_list.append(semantic_rol_con.find('semanticRole').text)
            semantic_roles = '_'.join(semantic_roles_list)
            # pattern representation
            pattern_representation = elem.find('patternRepresentation').text
            # corpus example
            if elem.find('exampleContainerList') is not None and elem.find('exampleContainerList').find('exampleContainer') is not None and elem.find('exampleContainerList').find('exampleContainer').find('corpusExample') is not None:
                corpus_example_text = html.tostring(elem.find('exampleContainerList').find('exampleContainer').find('corpusExample'), encoding='unicode')
            else:
                continue
            # ugly postprocessing to remove xmlns:xsi=... duh..
            root = etree.fromstring(corpus_example_text)
            # Remove namespace prefixes
            for elem in root.getiterator():
                elem.tag = etree.QName(elem).localname
            # Remove unused namespace declarations
            etree.cleanup_namespaces(root)
            corpus_example = etree.tostring(root, encoding='unicode')
            print(f"Valency pattern {valency_pattern_id}")
            analyze_output.append([valency_pattern_id, measure, semantic_roles, pattern_representation, corpus_example])
        write_general_statistics(os.path.join(args.output, headword + '_gf_stats.tsv'), gf_output)
        write_general_statistics(os.path.join(args.output, headword + '_ssj_stats.tsv'), ssj_output)
        write_statistics(os.path.join(args.output, headword + '_patterns.tsv'), analyze_output)


if __name__ == '__main__':
    arg_parser = argparse.ArgumentParser(description='Export and validate collocation data from DDD database.')
    arg_parser.add_argument('--input', type=str, help='Input directory')
    arg_parser.add_argument('--output', type=str, help='Output directory')
    args = arg_parser.parse_args()
    main(args)
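A hypothetical invocation, based on the `--input`/`--output` arguments above (directory names are placeholders); the script writes per-headword `*_gf_stats.tsv`, `*_ssj_stats.tsv` and `*_patterns.tsv` files into the output directory:

```bash
python3 scripts/form_csv.py --input data/xmls --output data/csv_export
```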

1
scripts/valency Symbolic link
View File

@@ -0,0 +1 @@
../src/pkg/valency/valency

8
scripts/xsd_checker.py Normal file
View File

@@ -0,0 +1,8 @@
from lxml import etree as lxml

with open('../data/inventory.xsd') as f:
    xmlschema_doc = lxml.parse(f)
    xmlschema = lxml.XMLSchema(xmlschema_doc)
with open('../data/xmls/output.xml') as op:
    doc = lxml.parse(op)
    print(xmlschema.validate(doc))

0
src/__init__.py Normal file
View File

View File

@@ -37,7 +37,8 @@ app = Flask(__name__)
app.config.from_object("db_config")
mongo = PyMongo(app)
-app.config["CORPORA"] = ["ssj", "kres"]
+# app.config["CORPORA"] = ["ssj", "kres", "gigafida"]
+app.config["CORPORA"] = ["gigafida"]
app.config["BANNED_HEADWORDS"] = ["biti"]
app.config["QUERY_LIMIT"] = 1000
@@ -247,20 +248,23 @@ def api_get_frames():
    if corpus not in app.config["CORPORA"]:
        return json.dumps({"error": "cor={kres,ssj}"})
+    log.info("Test1")
    cur = mongo.db[corpus].find({"headwords": hw})
+    log.info("Test2")
    frames = []
    for ent in cur[:app.config["QUERY_LIMIT"]]:
        frames += frames_from_db_entry(ent) # pre-process this step for prod TODO
    cur.close()
+    log.info("Test3")
    # filter by relevant hw
    frames = [x for x in frames if x.hw == hw]
    ret_frames = RF(frames, mongo.db.sensemap)
+    log.info("Test3")
    json_ret = {"frames": []}
    for frame in ret_frames:
        json_ret["frames"].append(frame.to_json())
+    log.info("Test4")
    return json.dumps(json_ret)
    # return prepare_frames(ret_frames)
@@ -444,7 +448,7 @@ def _is_banned(hw):
        banned = False
    return banned
-def prepare_app_index(appindex_json, sskj_wordlist):
+def prepare_app_index(appindex_json):
    log.info("[*] preparing app_index")
    # create app_index (used in frontend, left side word index)
    tmp_app_index = {c: {} for c in app.config["CORPORA"]}
@@ -452,6 +456,14 @@ def prepare_app_index(appindex_json, sskj_wordlist):
        res_hws = {}
        res_fns = {}
+        # print('CORPUS...!!...')
+        # print(corpus)
+        # a = mongo.db[corpus]
+        # print('TEST_OK')
+        # print(a)
+        # print(mongo.db)
+        # a = mongo.db.list_collection_names()
+        # print('TEST_OK2')
        nentries = mongo.db[corpus].count()
        idx = 0
        for e in mongo.db[corpus].find({}):
@@ -484,6 +496,7 @@ def prepare_app_index(appindex_json, sskj_wordlist):
        for letter, words in alphabetical.items():
            filtered_words = [x for x in words if not _is_banned(x[0])]
+            # filtered_words = [x for x in words]
            alphabetical[letter] = sorted(filtered_words, key=lambda x: x[0])
        tmp_app_index[corpus]["words"] = alphabetical
@@ -560,12 +573,16 @@ if __name__ == "__main__":
    if args.prepare_db:
        with Path(args.sskj_wordlist).open("r") as fp:
            sskj_wordlist = json.load(fp)
-        prepare_app_index(args.appindex_json, sskj_wordlist)
+        prepare_app_index(args.appindex_json)
        sys.exit()
    # app index from db
    with Path(args.appindex_json).open("r") as fp:
        app.config["app_index"] = json.load(fp)
+    # a = app.config["app_index"]
+    # b = app.config["app_index"]["kres"]
+    # c = app.config["app_index"]["kres"]["words"]
+    # print('HERE')
    # log.info("[*] Starting app.py with config:\n%s".format(config))
    log.info("[*] Starting app.py with config:\n{}".format(config))

View File

@@ -0,0 +1,106 @@
import argparse
import json

from flask import Flask
from flask_pymongo import PyMongo
from pathlib import Path

app = Flask(__name__)
app.config.from_object("db_config")
mongo = PyMongo(app)

app.config["BANNED_HEADWORDS"] = ["biti"]


def _is_banned(hw):
    banned = True
    if hw in app.config["BANNED_HEADWORDS"]:
        banned = True
    elif hw in sskj_wordlist["wordlist"]:
        banned = False
    elif (hw + " se") in sskj_wordlist["wordlist"]:
        banned = False
    return banned


def prepare_app_index(appindex_json, corporas, previous_json=None):
    if previous_json:
        with Path(previous_json).open("r") as fp:
            tmp_app_index = json.load(fp)
    else:
        tmp_app_index = {}

    # create app_index (used in frontend, left side word index)
    for c in corporas:
        tmp_app_index[c] = {}

    for corpus in corporas:
        res_hws = {}
        res_fns = {}

        # print('CORPUS...!!...')
        # print(corpus)
        # a = mongo.db[corpus]
        # print('TEST_OK')
        # print(a)
        # print(mongo.db)
        # a = mongo.db.list_collection_names()
        # print('TEST_OK2')
        nentries = mongo.db[corpus].count()
        idx = 0
        for e in mongo.db[corpus].find({}):
            if "headwords" not in e:
                continue
            for hw in e["headwords"]:
                if hw in res_hws:
                    res_hws[hw] += 1
                else:
                    res_hws[hw] = 1
            if "functors" not in e:
                continue
            for fn in e["functors"]:
                if fn in res_fns:
                    res_fns[fn] += 1
                else:
                    res_fns[fn] = 1
            idx += 1
            if idx % 10000 == 0:
                print("indexing {}: {}/{}".format(
                    corpus, idx, nentries))

        alphabetical = {}
        for k, e in res_hws.items():
            fst = k[0].lower()
            if fst in alphabetical:
                alphabetical[fst].append((k, e))
            else:
                alphabetical[fst] = [(k, e)]

        for letter, words in alphabetical.items():
            filtered_words = [x for x in words if not _is_banned(x[0])]
            # filtered_words = [x for x in words]
            alphabetical[letter] = sorted(filtered_words, key=lambda x: x[0])
        tmp_app_index[corpus]["words"] = alphabetical

        functors = [(k, e) for (k, e) in res_fns.items()]
        functors = sorted(functors, key=lambda x: x[0])
        tmp_app_index[corpus]["functors"] = functors

    with Path(appindex_json).open("w") as fp:
        json.dump(tmp_app_index, fp)


if __name__ == "__main__":
    print("Starting app.py main()")
    aparser = argparse.ArgumentParser(description="Arguments for app.py")
    aparser.add_argument("--previous-json", type=str, default=None)
    aparser.add_argument("--appindex-json", type=str)
    aparser.add_argument("--sskj-wordlist", type=str)
    args = aparser.parse_args()

    corporas = ['gigafida']
    with Path(args.sskj_wordlist).open("r") as fp:
        sskj_wordlist = json.load(fp)
    prepare_app_index(args.appindex_json, corporas, args.previous_json)
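A hypothetical invocation of this script (the diff does not name its path, so it is shown as a placeholder); the appindex and wordlist paths follow the defaults used in the top-level Makefile:

```bash
SCRIPT=path/to/the_prepare_db_script.py    # placeholder for the file shown above
python3 "$SCRIPT" \
    --appindex-json data/appindex.json \
    --sskj-wordlist data/wordlist.json \
    --previous-json data/appindex_previous.json   # optional; merges into an existing index
```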

View File

@@ -1,2 +1,2 @@
-MONGO_URI = "mongodb://sizif:p5e3r4u8t7@my_mongo:27017/valdb"
+MONGO_URI = "mongodb://user:user@0.0.0.0:27017/valdb"
MONGO_AUTH_SOURCE = 'admin'

View File

@@ -0,0 +1,18 @@
import json
import os

input_dir = "/media/luka/Portable Disk/Datasets/gigafida_jos/final_json"
output_file = "../../all_sentences.json"

results = {}
filenames = os.listdir(input_dir)
len(filenames)
for i, filename in enumerate(filenames):
    if filename.endswith(".json"):
        with open(os.path.join(input_dir, filename)) as json_file:
            data = json.load(json_file)
            results[filename.split('-')[0]] = list(data.keys())
    print('Progress: %.2f %%' % (i/len(filenames)))

with open(output_file, 'w') as f:
    json.dump(results, f)

View File

@@ -1,3 +1,3 @@
{
-    "api_addr": "http://193.2.76.103:8084"
+    "api_addr": "http://0.0.0.0:8084"
}

View File

@@ -3513,14 +3513,12 @@
"balanced-match": {
  "version": "1.0.0",
  "bundled": true,
-  "dev": true,
-  "optional": true
+  "dev": true
},
"brace-expansion": {
  "version": "1.1.11",
  "bundled": true,
  "dev": true,
-  "optional": true,
  "requires": {
    "balanced-match": "^1.0.0",
    "concat-map": "0.0.1"
@@ -3535,20 +3533,17 @@
"code-point-at": {
  "version": "1.1.0",
  "bundled": true,
-  "dev": true,
-  "optional": true
+  "dev": true
},
"concat-map": {
  "version": "0.0.1",
  "bundled": true,
-  "dev": true,
-  "optional": true
+  "dev": true
},
"console-control-strings": {
  "version": "1.1.0",
  "bundled": true,
-  "dev": true,
-  "optional": true
+  "dev": true
},
"core-util-is": {
  "version": "1.0.2",
@@ -3665,8 +3660,7 @@
"inherits": {
  "version": "2.0.3",
  "bundled": true,
-  "dev": true,
-  "optional": true
+  "dev": true
},
"ini": {
  "version": "1.3.5",
@@ -3678,7 +3672,6 @@
  "version": "1.0.0",
  "bundled": true,
  "dev": true,
-  "optional": true,
  "requires": {
    "number-is-nan": "^1.0.0"
  }
@@ -3693,7 +3686,6 @@
  "version": "3.0.4",
  "bundled": true,
  "dev": true,
-  "optional": true,
  "requires": {
    "brace-expansion": "^1.1.7"
  }
@@ -3701,14 +3693,12 @@
"minimist": {
  "version": "0.0.8",
  "bundled": true,
-  "dev": true,
-  "optional": true
+  "dev": true
},
"minipass": {
  "version": "2.3.5",
  "bundled": true,
  "dev": true,
-  "optional": true,
  "requires": {
    "safe-buffer": "^5.1.2",
    "yallist": "^3.0.0"
@@ -3727,7 +3717,6 @@
  "version": "0.5.1",
  "bundled": true,
  "dev": true,
-  "optional": true,
  "requires": {
    "minimist": "0.0.8"
  }
@@ -3808,8 +3797,7 @@
"number-is-nan": {
  "version": "1.0.1",
  "bundled": true,
-  "dev": true,
-  "optional": true
+  "dev": true
},
"object-assign": {
  "version": "4.1.1",
@@ -3821,7 +3809,6 @@
  "version": "1.4.0",
  "bundled": true,
  "dev": true,
-  "optional": true,
  "requires": {
    "wrappy": "1"
  }
@@ -3943,7 +3930,6 @@
  "version": "1.0.2",
  "bundled": true,
  "dev": true,
-  "optional": true,
  "requires": {
    "code-point-at": "^1.0.0",
    "is-fullwidth-code-point": "^1.0.0",

View File

@@ -62,7 +62,7 @@ export default {
  name: "Nav",
  props: ["appState"],
  data() {return {
-    optCorpora: ["kres", "ssj"],
+    optCorpora: ["kres", "ssj", "gigafida"],
    optIndexes: [
      {key: "besede", val: "words"},
      {key: "udeleženske vloge", val: "functors"},

0
src/pkg/__init__.py Normal file
View File

@@ -1 +1 @@
-Subproject commit 01adf47b9b63b43f86bff52429792b0de2327ddd
+Subproject commit 92b3ac4ea3a73b93c25b363b5b9cb096d4d011cd

@@ -0,0 +1 @@
Subproject commit 8c87d07b8a3ca73faac2fac30c39969bc5f97d45

View File

@@ -3,6 +3,41 @@ from corpusparser import enriched_lemma
log = logging.getLogger(__name__)
+def frames_from_db_entry_headword(dbent, headword):
+    def _full_tid(tid):
+        return ".".join([dbent["sid"], str(tid)])
+
+    token_dict = {str(x["tid"]): x for x in dbent["tokens"]}
+    frames = []
+    if "srl_links" not in dbent:
+        return []
+    srldict = {}
+    for srl in dbent["srl_links"]:
+        key = str(srl["from"])
+        if enriched_lemma(token_dict[key]) != headword:
+            continue
+        if key not in srldict:
+            srldict[key] = [srl]
+        else:
+            srldict[key] += [srl]
+    for hwtid, srlarr in srldict.items():
+        frames += [Frame(
+            hw_lemma=enriched_lemma(token_dict[hwtid]),
+            tids=[_full_tid(hwtid)],
+            slots=[
+                Slot(
+                    functor=srl["afun"],
+                    tids=[_full_tid(srl["to"])]
+                ) for srl in srlarr
+            ],
+            # sentences=[(dbent["sid"], dbent["tokens"])],
+            sentences=[
+                [(_full_tid(t["tid"]), t) for t in dbent["tokens"]],
+            ]
+        )]
+    return frames
def frames_from_db_entry(dbent):
    def _full_tid(tid):
        return ".".join([dbent["sid"], str(tid)])