forked from kristjan/cjvt-valency
Compare commits
1 Commits
sskj_sense
...
dev
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
c563df31ba |
1
.gitignore
vendored
1
.gitignore
vendored
@@ -1,5 +1,4 @@
|
|||||||
data/samples/
|
data/samples/
|
||||||
data/wordlist.json
|
|
||||||
*egg-info/
|
*egg-info/
|
||||||
*.pyc
|
*.pyc
|
||||||
src/frontend_vue/node_modules/
|
src/frontend_vue/node_modules/
|
||||||
|
|||||||
41
Makefile
41
Makefile
@@ -3,38 +3,21 @@
|
|||||||
|
|
||||||
MAKE_ROOT = $(shell pwd)
|
MAKE_ROOT = $(shell pwd)
|
||||||
|
|
||||||
### Input data
|
|
||||||
# I received ssj500k in one .xml file,
|
|
||||||
# kres is composed of many .xml files
|
|
||||||
# I generated srl tags for kres in separate .json files
|
|
||||||
# (for each kres.xml file there is a kres.json file with srl tags)
|
|
||||||
# SSJ_FILE = "$(MAKE_ROOT)/data/samples/ssj_xml/ssj500k-sl.body.sample.xml"
|
|
||||||
SSJ_FILE = "$(MAKE_ROOT)/data/ssj_file_link"
|
|
||||||
# KRES_FOLDER = "$(MAKE_ROOT)/data/samples/kres_xml"
|
|
||||||
# KRES_FOLDER = "$(MAKE_ROOT)/data/kres_xml_folder_link"
|
|
||||||
KRES_FOLDER = "/home/kristjan/kres_data/payload/kres_xml"
|
|
||||||
# KRES_SRL_FOLDER = "$(MAKE_ROOT)/data/samples/kres_srl_json"
|
|
||||||
# KRES_SRL_FOLDER = "$(MAKE_ROOT)/data/kres_json_folder_link"
|
|
||||||
KRES_SRL_FOLDER = "/home/kristjan/kres_data/payload/kres_json"
|
|
||||||
|
|
||||||
# This file comes with the source code. Make sure you unpack it and name it right.
|
|
||||||
SSKJ_WORDLIST = "$(MAKE_ROOT)/data/wordlist.json"
|
|
||||||
SSKJ_JSON = "$(MAKE_ROOT)/data/sskj_senses.json"
|
|
||||||
|
|
||||||
OUTPUT = "db"
|
OUTPUT = "db"
|
||||||
# OUTPUT = "file"
|
# OUTPUT = "file"
|
||||||
OUTDIR = "/tmp/three" # if you're running this in docker, make sure to mount the volume
|
OUTDIR = "/tmp/three" # if you're running this in docker, make sure to mount the volume
|
||||||
DBADDR = "0.0.0.0:27017" # don't use localhost
|
DBADDR = "0.0.0.0:27017" # don't use localhost
|
||||||
|
|
||||||
|
N_CORES = 5
|
||||||
|
# insert kres files into database in chunks, for fewer connections
|
||||||
|
KRES_CHUNK_SIZE = 30
|
||||||
|
|
||||||
|
# Some backend parameters can be found in conf file (see make backend)
|
||||||
|
|
||||||
# credentials from .gitignored file
|
# credentials from .gitignored file
|
||||||
# create it from env.default
|
# create it from env.default
|
||||||
include env.local
|
include env.local
|
||||||
|
|
||||||
N_CORES = 3
|
|
||||||
# insert kres files into database in chunks, for fewer connections
|
|
||||||
KRES_CHUNK_SIZE = 30
|
|
||||||
|
|
||||||
# Backend parameters found in conf file (see make backend)
|
|
||||||
export
|
export
|
||||||
|
|
||||||
.PHONY: python-env fill-database
|
.PHONY: python-env fill-database
|
||||||
@@ -65,7 +48,6 @@ python-env:
|
|||||||
python-env-install:
|
python-env-install:
|
||||||
pip3 install -e src/pkg/cjvt-corpusparser/.
|
pip3 install -e src/pkg/cjvt-corpusparser/.
|
||||||
pip3 install -e src/pkg/valency/.
|
pip3 install -e src/pkg/valency/.
|
||||||
pip3 install -e src/pkg/seqparser/.
|
|
||||||
|
|
||||||
# from inside python-env container:
|
# from inside python-env container:
|
||||||
data/samples:
|
data/samples:
|
||||||
@@ -106,12 +88,10 @@ frontend-prod:
|
|||||||
## Backend
|
## Backend
|
||||||
|
|
||||||
# runs once and exits before the app starts
|
# runs once and exits before the app starts
|
||||||
# need to extract ./data/sskj_data.tar.gz first
|
|
||||||
backend-prepare-db:
|
backend-prepare-db:
|
||||||
cd ./src/backend_flask; python3 app.py \
|
cd ./src/backend_flask; python3 app.py \
|
||||||
--config-file ./conf_files/dev_conf.yaml \
|
--config-file ./conf_files/dev_conf.yaml \
|
||||||
--dbuser $(DB_USR_USER) --dbpass $(DB_USR_PASS) --dbaddr $(DBADDR) \
|
--dbuser $(DB_USR_USER) --dbpass $(DB_USR_PASS) --dbaddr $(DBADDR) \
|
||||||
--sskj-wordlist $(SSKJ_WORDLIST) \
|
|
||||||
--prepare-db
|
--prepare-db
|
||||||
|
|
||||||
backend-dev:
|
backend-dev:
|
||||||
@@ -123,12 +103,3 @@ backend-prod:
|
|||||||
cd ./src/backend_flask; python3 app.py \
|
cd ./src/backend_flask; python3 app.py \
|
||||||
--config-file ./conf_files/prod_conf.yaml \
|
--config-file ./conf_files/prod_conf.yaml \
|
||||||
--dbuser $(DB_USR_USER) --dbpass $(DB_USR_PASS) --dbaddr $(DBADDR)
|
--dbuser $(DB_USR_USER) --dbpass $(DB_USR_PASS) --dbaddr $(DBADDR)
|
||||||
|
|
||||||
## add sskj senses to db (generated with pkg/seqparser)
|
|
||||||
sskj-senses:
|
|
||||||
python3 ./src/pkg/seqparser/seqparser/main.py \
|
|
||||||
--sskj-json $(SSKJ_JSON) \
|
|
||||||
--operation "senses_to_db" \
|
|
||||||
--dbaddr $(DBADDR) \
|
|
||||||
--dbuser $(DB_USR_USER) \
|
|
||||||
--dbpass $(DB_USR_PASS)
|
|
||||||
|
|||||||
15
README.md
15
README.md
@@ -81,21 +81,18 @@ $ make backend-dev
|
|||||||
$ make backend-prod
|
$ make backend-prod
|
||||||
```
|
```
|
||||||
|
|
||||||
API endpoints:
|
|
||||||
|
|
||||||
* GET word list (pre-cached)
|
|
||||||
* GET reduced frames (pre-cached)
|
|
||||||
* POST senses
|
|
||||||
* User auth logic
|
|
||||||
|
|
||||||
|
|
||||||
### Vue frontend (1 container)
|
### Vue frontend (1 container)
|
||||||
Relies on Flask backend.
|
Relies on Flask backend.
|
||||||
Before running `make`, you might need to set the correct api address.
|
Before running `make`, you might need to set the correct api address.
|
||||||
Check `./src/frontend_vue/config/config_prod.json`.
|
Check `./src/frontend_vue/config/config_prod.json`.
|
||||||
bash
|
bash
|
||||||
```
|
```
|
||||||
# $ make frontend-dev # development
|
# development
|
||||||
|
# ./config_dev.json
|
||||||
|
$ make frontend-dev # development
|
||||||
|
|
||||||
|
# production
|
||||||
|
# ./config_prod.json
|
||||||
$ make frontend-prod
|
$ make frontend-prod
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|||||||
1
data/kres_json_folder_link_t420
Symbolic link
1
data/kres_json_folder_link_t420
Symbolic link
@@ -0,0 +1 @@
|
|||||||
|
/home/kristjan/kres_data/payload/kres_json/
|
||||||
1
data/kres_xml_folder_link_proc
Symbolic link
1
data/kres_xml_folder_link_proc
Symbolic link
@@ -0,0 +1 @@
|
|||||||
|
/home/kristjan/kres_mount/kres_parsed/tei/
|
||||||
Binary file not shown.
File diff suppressed because one or more lines are too long
21
env.default
21
env.default
@@ -1,6 +1,27 @@
|
|||||||
|
### Credentials
|
||||||
|
|
||||||
MONGOEXPRESS_USER = mxuser
|
MONGOEXPRESS_USER = mxuser
|
||||||
MONGOEXPRESS_PASS = mxuserpassword
|
MONGOEXPRESS_PASS = mxuserpassword
|
||||||
DB_ADM_USER = valadmin
|
DB_ADM_USER = valadmin
|
||||||
DB_ADM_PASS = valadminpass
|
DB_ADM_PASS = valadminpass
|
||||||
DB_USR_USER = valuser
|
DB_USR_USER = valuser
|
||||||
DB_USR_PASS = valuserpass
|
DB_USR_PASS = valuserpass
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
### Input data
|
||||||
|
|
||||||
|
# I received ssj500k in one .xml file,
|
||||||
|
# kres is composed of many .xml files
|
||||||
|
# I generated srl tags for kres in separate .json files
|
||||||
|
# (for each kres.xml file there is a kres.json file with srl tags)
|
||||||
|
|
||||||
|
# Use the files from /data/samples.tar.gz for a quick app build with a subset of data.
|
||||||
|
|
||||||
|
SSJ_FILE = "$(MAKE_ROOT)/data/samples/ssj_xml/ssj500k-sl.body.sample.xml"
|
||||||
|
# SSJ_FILE = "$(MAKE_ROOT)/data/ssj_file_link"
|
||||||
|
KRES_FOLDER = "$(MAKE_ROOT)/data/samples/kres_xml"
|
||||||
|
# KRES_FOLDER = "$(MAKE_ROOT)/data/kres_xml_folder_link"
|
||||||
|
KRES_SRL_FOLDER = "$(MAKE_ROOT)/data/samples/kres_srl_json"
|
||||||
|
# KRES_SRL_FOLDER = "$(MAKE_ROOT)/data/kres_json_folder_link"
|
||||||
|
|
||||||
|
|||||||
25
makefile_vars.example
Normal file
25
makefile_vars.example
Normal file
@@ -0,0 +1,25 @@
|
|||||||
|
# Environment specific Makefile parameters
|
||||||
|
# Copy this file and name it makefile_args
|
||||||
|
# makefile_args gets .gitignored
|
||||||
|
|
||||||
|
# SSJ_FILE = "$(MAKE_ROOT)/data/samples/ssj_xml/ssj500k-sl.body.sample.xml"
|
||||||
|
SSJ_FILE = "$(MAKE_ROOT)/data/ssj_file_link"
|
||||||
|
# KRES_FOLDER = "$(MAKE_ROOT)/data/samples/kres_xml"
|
||||||
|
KRES_FOLDER = "$(MAKE_ROOT)/data/kres_xml_folder_link"
|
||||||
|
# KRES_SRL_FOLDER = "$(MAKE_ROOT)/data/samples/kres_srl_json"
|
||||||
|
KRES_SRL_FOLDER = "$(MAKE_ROOT)/data/kres_json_folder_link"
|
||||||
|
|
||||||
|
OUTPUT = "db"
|
||||||
|
# OUTPUT = "file"
|
||||||
|
OUTDIR = "/tmp/three" # if you're running this in docker, make sure to mount the volume
|
||||||
|
DBADDR = "0.0.0.0:27017" # don't use localhost
|
||||||
|
|
||||||
|
# credentials from .gitignored file
|
||||||
|
# create it from env.default
|
||||||
|
include env.local
|
||||||
|
|
||||||
|
N_CORES = 5
|
||||||
|
# insert kres files into database in chunks, for fewer connections
|
||||||
|
KRES_CHUNK_SIZE = 30
|
||||||
|
|
||||||
|
# Backend parameters found in conf file (see make backend)
|
||||||
@@ -38,8 +38,6 @@ SENSEMAP_COLL = "sensemap"
|
|||||||
# pre-generated data (gui leftside word index)
|
# pre-generated data (gui leftside word index)
|
||||||
CORPORA = ["ssj", "kres"]
|
CORPORA = ["ssj", "kres"]
|
||||||
app_index = None
|
app_index = None
|
||||||
sskj_wordlist = None # used by _is_banned(hw)
|
|
||||||
BANNED_HEADWORDS = ["biti"]
|
|
||||||
|
|
||||||
log = logging.getLogger(__name__)
|
log = logging.getLogger(__name__)
|
||||||
valdb = None
|
valdb = None
|
||||||
@@ -324,8 +322,6 @@ def api_get_functor_frames():
|
|||||||
|
|
||||||
|
|
||||||
# SENSES ----------------------------.
|
# SENSES ----------------------------.
|
||||||
# ssj_id is legacy notation, read
|
|
||||||
# it as general sentence_id
|
|
||||||
|
|
||||||
@app.route("/api/senses/get")
|
@app.route("/api/senses/get")
|
||||||
def api_senses_get():
|
def api_senses_get():
|
||||||
@@ -411,8 +407,6 @@ def api_senses_update():
|
|||||||
ns["date"] = tmp_dt
|
ns["date"] = tmp_dt
|
||||||
id_map[frontend_sense_id] = new_sense_id
|
id_map[frontend_sense_id] = new_sense_id
|
||||||
|
|
||||||
print(ns)
|
|
||||||
|
|
||||||
# insert into db
|
# insert into db
|
||||||
valdb[SENSES_COLL].insert(ns)
|
valdb[SENSES_COLL].insert(ns)
|
||||||
|
|
||||||
@@ -436,18 +430,6 @@ def api_senses_update():
|
|||||||
|
|
||||||
|
|
||||||
# APP PREFLIGHT ---------------------.
|
# APP PREFLIGHT ---------------------.
|
||||||
def _is_banned(hw):
|
|
||||||
banned = True
|
|
||||||
if hw in BANNED_HEADWORDS:
|
|
||||||
banned = True
|
|
||||||
elif hw in sskj_wordlist["wordlist"]:
|
|
||||||
banned = False
|
|
||||||
elif (hw + " se") in sskj_wordlist["wordlist"]:
|
|
||||||
banned = False
|
|
||||||
|
|
||||||
if hw[-1] == "_":
|
|
||||||
log.debug("hw: {}, banned: {}".format(hw, banned))
|
|
||||||
return banned
|
|
||||||
|
|
||||||
def prepare_app_index():
|
def prepare_app_index():
|
||||||
log.info("[*] preparing app_index")
|
log.info("[*] preparing app_index")
|
||||||
@@ -480,10 +462,8 @@ def prepare_app_index():
|
|||||||
else:
|
else:
|
||||||
alphabetical[fst] = [(k, e)]
|
alphabetical[fst] = [(k, e)]
|
||||||
|
|
||||||
for letter, words in alphabetical.items():
|
for k, e in alphabetical.items():
|
||||||
filtered_words = [x for x in words if not _is_banned(x[0])]
|
alphabetical[k] = sorted(e, key=lambda x: x[0])
|
||||||
alphabetical[letter] = sorted(filtered_words, key=lambda x: x[0])
|
|
||||||
|
|
||||||
tmp_app_index[corpus]["words"] = alphabetical
|
tmp_app_index[corpus]["words"] = alphabetical
|
||||||
|
|
||||||
functors = [(k, e) for (k, e) in res_fns.items()]
|
functors = [(k, e) for (k, e) in res_fns.items()]
|
||||||
@@ -503,7 +483,6 @@ if __name__ == "__main__":
|
|||||||
aparser.add_argument("--dbuser", type=str)
|
aparser.add_argument("--dbuser", type=str)
|
||||||
aparser.add_argument("--dbpass", type=str)
|
aparser.add_argument("--dbpass", type=str)
|
||||||
aparser.add_argument("--dbaddr", type=str)
|
aparser.add_argument("--dbaddr", type=str)
|
||||||
aparser.add_argument("--sskj-wordlist", type=str)
|
|
||||||
args = aparser.parse_args()
|
args = aparser.parse_args()
|
||||||
|
|
||||||
config = None
|
config = None
|
||||||
@@ -528,8 +507,6 @@ if __name__ == "__main__":
|
|||||||
valdb = client.valdb
|
valdb = client.valdb
|
||||||
|
|
||||||
if args.prepare_db:
|
if args.prepare_db:
|
||||||
with Path(args.sskj_wordlist).open("r") as fp:
|
|
||||||
sskj_wordlist = json.load(fp)
|
|
||||||
prepare_app_index()
|
prepare_app_index()
|
||||||
sys.exit()
|
sys.exit()
|
||||||
|
|
||||||
|
|||||||
@@ -9,7 +9,8 @@ info:
|
|||||||
echo "Pick either dev or prod."
|
echo "Pick either dev or prod."
|
||||||
|
|
||||||
clean:
|
clean:
|
||||||
- docker rm -f $(CONNAME)
|
- docker kill $(CONNAME)
|
||||||
|
- docker rm $(CONNAME)
|
||||||
|
|
||||||
build-container:
|
build-container:
|
||||||
docker build . -t $(IMGNAME)
|
docker build . -t $(IMGNAME)
|
||||||
@@ -20,5 +21,3 @@ dev: build-container clean
|
|||||||
prod: build-container clean
|
prod: build-container clean
|
||||||
docker run --name $(CONNAME) -d -p 8080:8080 -v $(shell pwd):/src $(IMGNAME) /src/ops_scripts/prod.sh
|
docker run --name $(CONNAME) -d -p 8080:8080 -v $(shell pwd):/src $(IMGNAME) /src/ops_scripts/prod.sh
|
||||||
|
|
||||||
node-env: clean
|
|
||||||
docker run --name $(CONNAME) -it -p 8080:8080 -v $(shell pwd):/src $(IMGNAME)
|
|
||||||
|
|||||||
5787
src/frontend_vue/package-lock.json
generated
5787
src/frontend_vue/package-lock.json
generated
File diff suppressed because it is too large
Load Diff
@@ -10,52 +10,50 @@
|
|||||||
"build": "node build/build.js"
|
"build": "node build/build.js"
|
||||||
},
|
},
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
"ajv": "^6.10.0",
|
|
||||||
"axios": "^0.18.0",
|
"axios": "^0.18.0",
|
||||||
"bootstrap-vue": "^2.0.0-rc.19",
|
"bootstrap-vue": "^2.0.0-rc.11",
|
||||||
"jquery": "^3.4.0",
|
|
||||||
"sha256": "^0.2.0",
|
"sha256": "^0.2.0",
|
||||||
"vue": "^2.6.10",
|
"vue": "^2.5.2",
|
||||||
"vue-cookies": "^1.5.13",
|
"vue-cookies": "^1.5.6",
|
||||||
"vue-router": "^3.0.6",
|
"vue-router": "^3.0.1",
|
||||||
"vue-spinner": "^1.0.3"
|
"vue-spinner": "^1.0.3"
|
||||||
},
|
},
|
||||||
"devDependencies": {
|
"devDependencies": {
|
||||||
"autoprefixer": "^7.1.2",
|
"autoprefixer": "^7.1.2",
|
||||||
"babel-core": "^6.22.1",
|
"babel-core": "^6.22.1",
|
||||||
"babel-helper-vue-jsx-merge-props": "^2.0.3",
|
"babel-helper-vue-jsx-merge-props": "^2.0.3",
|
||||||
"babel-loader": "^7.1.5",
|
"babel-loader": "^7.1.1",
|
||||||
"babel-plugin-syntax-jsx": "^6.18.0",
|
"babel-plugin-syntax-jsx": "^6.18.0",
|
||||||
"babel-plugin-transform-runtime": "^6.22.0",
|
"babel-plugin-transform-runtime": "^6.22.0",
|
||||||
"babel-plugin-transform-vue-jsx": "^3.5.0",
|
"babel-plugin-transform-vue-jsx": "^3.5.0",
|
||||||
"babel-preset-env": "^1.3.2",
|
"babel-preset-env": "^1.3.2",
|
||||||
"babel-preset-stage-2": "^6.22.0",
|
"babel-preset-stage-2": "^6.22.0",
|
||||||
"chalk": "^2.4.2",
|
"chalk": "^2.0.1",
|
||||||
"copy-webpack-plugin": "^4.6.0",
|
"copy-webpack-plugin": "^4.0.1",
|
||||||
"css-loader": "^2.1.1",
|
"css-loader": "^0.28.0",
|
||||||
"extract-text-webpack-plugin": "^3.0.0",
|
"extract-text-webpack-plugin": "^3.0.0",
|
||||||
"file-loader": "^1.1.4",
|
"file-loader": "^1.1.4",
|
||||||
"friendly-errors-webpack-plugin": "^1.6.1",
|
"friendly-errors-webpack-plugin": "^1.6.1",
|
||||||
"html-webpack-plugin": "^2.30.1",
|
"html-webpack-plugin": "^2.30.1",
|
||||||
"node-notifier": "^5.4.0",
|
"node-notifier": "^5.1.2",
|
||||||
"optimize-css-assets-webpack-plugin": "^5.0.1",
|
"optimize-css-assets-webpack-plugin": "^3.2.0",
|
||||||
"ora": "^1.2.0",
|
"ora": "^1.2.0",
|
||||||
"portfinder": "^1.0.20",
|
"portfinder": "^1.0.13",
|
||||||
"postcss-import": "^11.0.0",
|
"postcss-import": "^11.0.0",
|
||||||
"postcss-loader": "^2.1.6",
|
"postcss-loader": "^2.0.8",
|
||||||
"postcss-url": "^7.2.1",
|
"postcss-url": "^7.2.1",
|
||||||
"rimraf": "^2.6.3",
|
"rimraf": "^2.6.0",
|
||||||
"semver": "^5.7.0",
|
"semver": "^5.3.0",
|
||||||
"shelljs": "^0.7.6",
|
"shelljs": "^0.7.6",
|
||||||
"uglifyjs-webpack-plugin": "^1.3.0",
|
"uglifyjs-webpack-plugin": "^1.1.1",
|
||||||
"url-loader": "^1.1.2",
|
"url-loader": "^0.5.8",
|
||||||
"vue-loader": "^13.7.3",
|
"vue-loader": "^13.3.0",
|
||||||
"vue-style-loader": "^3.0.1",
|
"vue-style-loader": "^3.0.1",
|
||||||
"vue-template-compiler": "^2.6.10",
|
"vue-template-compiler": "^2.5.2",
|
||||||
"webpack": "^3.6.0",
|
"webpack": "^3.6.0",
|
||||||
"webpack-bundle-analyzer": "^3.3.2",
|
"webpack-bundle-analyzer": "^2.9.0",
|
||||||
"webpack-dev-server": "^2.11.5",
|
"webpack-dev-server": "^2.9.1",
|
||||||
"webpack-merge": "^4.2.1"
|
"webpack-merge": "^4.1.0"
|
||||||
},
|
},
|
||||||
"engines": {
|
"engines": {
|
||||||
"node": ">= 6.0.0",
|
"node": ">= 6.0.0",
|
||||||
|
|||||||
@@ -7,25 +7,3 @@ export default {
|
|||||||
name: 'App',
|
name: 'App',
|
||||||
}
|
}
|
||||||
</script>
|
</script>
|
||||||
|
|
||||||
<style>
|
|
||||||
body {
|
|
||||||
font-family: cambria;
|
|
||||||
}
|
|
||||||
.ulred {
|
|
||||||
color: #b71511;
|
|
||||||
color: rgb(183,21,17);
|
|
||||||
}
|
|
||||||
.lmenu td {
|
|
||||||
color: #9e9e9e;
|
|
||||||
}
|
|
||||||
.redlinks a {
|
|
||||||
color: #9e9e9e;
|
|
||||||
}
|
|
||||||
.redlinks a:hover {
|
|
||||||
color: #b71511;
|
|
||||||
}
|
|
||||||
.text-secondary {
|
|
||||||
color: #9e9e9e !important;
|
|
||||||
}
|
|
||||||
</style>
|
|
||||||
@@ -1,17 +1,20 @@
|
|||||||
<template>
|
<template>
|
||||||
<div>
|
<div>
|
||||||
|
<p
|
||||||
|
v-if="this.$root.store.api_error !== null"
|
||||||
|
class="text-warning"
|
||||||
|
>
|
||||||
|
api_error: {{ this.$root.store.api_error }}
|
||||||
|
</p>
|
||||||
<Nav></Nav>
|
<Nav></Nav>
|
||||||
<div class="my-home container-fluid">
|
<div class="my-home container-fluid">
|
||||||
<div class="row">
|
<div class="row">
|
||||||
<div id="search" class="col-sm-2 border-right fill" :key=this.$root.store.indexReloader>
|
<div id="serach" class="col-sm-2 border-right fill" :key=this.$root.store.indexReloader>
|
||||||
<LWords
|
<LWords
|
||||||
v-if="this.$root.store.selIndex.val === 'words'"></LWords>
|
v-if="this.$root.store.selIndex.val === 'words'"></LWords>
|
||||||
<LFunctors v-else></LFunctors>
|
<LFunctors v-else></LFunctors>
|
||||||
</div>
|
</div>
|
||||||
<div class="col-sm-10">
|
<div class="col-sm-10">
|
||||||
<p class="text-danger" v-if="this.$root.store.api_error != null">
|
|
||||||
{{ this.$root.store.api_error }}
|
|
||||||
</p>
|
|
||||||
<router-view></router-view>
|
<router-view></router-view>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
<template>
|
<template>
|
||||||
<div class="redlinks">
|
<div>
|
||||||
<table>
|
<table>
|
||||||
<tr v-for="functor in functors">
|
<tr v-for="functor in functors">
|
||||||
<td><a href="#" v-on:click="selectFunctor(functor)">{{ functor[0] }}</a></td>
|
<td><a href="#" v-on:click="selectFunctor(functor)">{{ functor[0] }}</a></td>
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
<template>
|
<template>
|
||||||
<div class="redlinks">
|
<div>
|
||||||
<select v-model="selectedLetter">
|
<select v-model="selectedLetter">
|
||||||
<option v-for="letter in alphabet" :value="letter">
|
<option v-for="letter in alphabet" :value="letter">
|
||||||
{{ letter.toUpperCase() }} ({{ getNumWords(letter) }})
|
{{ letter.toUpperCase() }} ({{ getNumWords(letter) }})
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
<template>
|
<template>
|
||||||
<div class="redlinks">
|
<div>
|
||||||
<div class="col-sm-2">
|
<div class="col-sm-2">
|
||||||
<a href="#" v-on:click="this.$root.routeBack">Nazaj</a>
|
<a href="#" v-on:click="this.$root.routeBack">Nazaj</a>
|
||||||
</div>
|
</div>
|
||||||
|
|||||||
@@ -1,11 +1,6 @@
|
|||||||
<template>
|
<template>
|
||||||
|
|
||||||
<!--in case of error-->
|
|
||||||
<div v-if="this.$root.store.api_error != null">
|
|
||||||
</div>
|
|
||||||
|
|
||||||
<!--load mode-->
|
<!--load mode-->
|
||||||
<div v-else-if="state === 'loading'">
|
<div v-if="show_loader">
|
||||||
<pulse-loader :color="loader_color"></pulse-loader>
|
<pulse-loader :color="loader_color"></pulse-loader>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
@@ -81,11 +76,16 @@ export default {
|
|||||||
},
|
},
|
||||||
state: "loading", // editing, normal
|
state: "loading", // editing, normal
|
||||||
request_reload: false,
|
request_reload: false,
|
||||||
loader_color: "#b71511",
|
loader_color: "#007bff",
|
||||||
}},
|
}},
|
||||||
created: function () {
|
created: function () {
|
||||||
this.reload()
|
this.reload()
|
||||||
},
|
},
|
||||||
|
computed: {
|
||||||
|
show_loader: function () {
|
||||||
|
return this.state === "loading" && this.$root.store.api_error !== null
|
||||||
|
}
|
||||||
|
},
|
||||||
watch: {
|
watch: {
|
||||||
hw: function () {
|
hw: function () {
|
||||||
this.reload()
|
this.reload()
|
||||||
@@ -118,7 +118,6 @@ export default {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
var component = this
|
var component = this
|
||||||
component.state = "loading"
|
|
||||||
this.$http.get(
|
this.$http.get(
|
||||||
this.$root.store.api_addr +
|
this.$root.store.api_addr +
|
||||||
"/api/functor-frames" +
|
"/api/functor-frames" +
|
||||||
@@ -132,7 +131,6 @@ export default {
|
|||||||
})
|
})
|
||||||
.catch(function(error) {
|
.catch(function(error) {
|
||||||
component.$root.store.api_error = error
|
component.$root.store.api_error = error
|
||||||
component.state = "error"
|
|
||||||
})
|
})
|
||||||
},
|
},
|
||||||
getFrames: function (hw, reduce_fun=null) {
|
getFrames: function (hw, reduce_fun=null) {
|
||||||
@@ -151,7 +149,6 @@ export default {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
var component = this
|
var component = this
|
||||||
component.state = "loading"
|
|
||||||
this.$http.get(
|
this.$http.get(
|
||||||
this.$root.store.api_addr + "/api/frames" +
|
this.$root.store.api_addr + "/api/frames" +
|
||||||
"?hw=" + hw + "&rf=" + reduce_fun +
|
"?hw=" + hw + "&rf=" + reduce_fun +
|
||||||
@@ -164,7 +161,6 @@ export default {
|
|||||||
})
|
})
|
||||||
.catch(function(error) {
|
.catch(function(error) {
|
||||||
component.$root.store.api_error = error
|
component.$root.store.api_error = error
|
||||||
component.state = "error"
|
|
||||||
})
|
})
|
||||||
},
|
},
|
||||||
buildSentences: function () {
|
buildSentences: function () {
|
||||||
|
|||||||
@@ -2,10 +2,7 @@
|
|||||||
<nav>
|
<nav>
|
||||||
<b-navbar toggleable="md" type="light" variant="light">
|
<b-navbar toggleable="md" type="light" variant="light">
|
||||||
<b-navbar-toggle target="nav_collapse"></b-navbar-toggle>
|
<b-navbar-toggle target="nav_collapse"></b-navbar-toggle>
|
||||||
<!--b-navbar-brand>Vezljivostni vzorci slovenskih glagolov</b-navbar-brand-->
|
<b-navbar-brand>Vezljivostni vzorci slovenskih glagolov</b-navbar-brand>
|
||||||
<b-navbar-brand class=cursorpointer v-on:click="goHome">
|
|
||||||
VEZLJIVOSTNI VZORCI SLOVENSKIH GLAGOLOV
|
|
||||||
</b-navbar-brand>
|
|
||||||
<b-collapse is-nav id="nav_collapse">
|
<b-collapse is-nav id="nav_collapse">
|
||||||
|
|
||||||
<b-navbar-nav>
|
<b-navbar-nav>
|
||||||
@@ -103,25 +100,7 @@ export default {
|
|||||||
this.$router.push({
|
this.$router.push({
|
||||||
name: "Home"
|
name: "Home"
|
||||||
})
|
})
|
||||||
},
|
|
||||||
goHome() {
|
|
||||||
this.$router.replace({path: "/home"})
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
</script>
|
</script>
|
||||||
|
|
||||||
<style>
|
|
||||||
.bg-light {
|
|
||||||
background-color: rgb(183,21,17,0.9) !important;
|
|
||||||
}
|
|
||||||
nav a {
|
|
||||||
color: white;
|
|
||||||
}
|
|
||||||
nav a:hover {
|
|
||||||
color: white;
|
|
||||||
}
|
|
||||||
.cursorpointer {
|
|
||||||
cursor: pointer;
|
|
||||||
}
|
|
||||||
</style>
|
|
||||||
@@ -1,5 +1,5 @@
|
|||||||
<template>
|
<template>
|
||||||
<div class=redlinks>
|
<div>
|
||||||
<div class="col-sm-2">
|
<div class="col-sm-2">
|
||||||
<a href="#" v-on:click="this.$root.routeBack">Nazaj</a>
|
<a href="#" v-on:click="this.$root.routeBack">Nazaj</a>
|
||||||
</div>
|
</div>
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
<template>
|
<template>
|
||||||
<div class="redlinks">
|
<div>
|
||||||
<div class="col-sm-2">
|
<div class="col-sm-2">
|
||||||
<a href="#" v-on:click="this.$root.routeBack">Nazaj</a>
|
<a href="#" v-on:click="this.$root.routeBack">Nazaj</a>
|
||||||
</div>
|
</div>
|
||||||
@@ -7,15 +7,6 @@
|
|||||||
<div class="alert alert-danger" v-if="error">
|
<div class="alert alert-danger" v-if="error">
|
||||||
<p>{{ error }}</p>
|
<p>{{ error }}</p>
|
||||||
</div>
|
</div>
|
||||||
<div class="form-group">
|
|
||||||
<input
|
|
||||||
type="email"
|
|
||||||
class="form-control"
|
|
||||||
placeholder="e-pošta"
|
|
||||||
v-model="credentials.email"
|
|
||||||
autocomplete="off"
|
|
||||||
>
|
|
||||||
</div>
|
|
||||||
<div class="form-group">
|
<div class="form-group">
|
||||||
<input
|
<input
|
||||||
type="text"
|
type="text"
|
||||||
@@ -25,6 +16,15 @@
|
|||||||
autocomplete="off"
|
autocomplete="off"
|
||||||
>
|
>
|
||||||
</div>
|
</div>
|
||||||
|
<div class="form-group">
|
||||||
|
<input
|
||||||
|
type="email"
|
||||||
|
class="form-control"
|
||||||
|
placeholder="e-pošta"
|
||||||
|
v-model="credentials.email"
|
||||||
|
autocomplete="off"
|
||||||
|
>
|
||||||
|
</div>
|
||||||
<div class="form-group">
|
<div class="form-group">
|
||||||
<input
|
<input
|
||||||
type="password"
|
type="password"
|
||||||
@@ -38,7 +38,7 @@
|
|||||||
<input
|
<input
|
||||||
type="password"
|
type="password"
|
||||||
class="form-control js-login__password "
|
class="form-control js-login__password "
|
||||||
placeholder="Ponovite geslo"
|
placeholder="Ponovite geslo."
|
||||||
v-model="credentials.snd_password"
|
v-model="credentials.snd_password"
|
||||||
autocomplete="off"
|
autocomplete="off"
|
||||||
>
|
>
|
||||||
|
|||||||
@@ -1,9 +0,0 @@
|
|||||||
SSKJ_HTML = /home/kristjan/git/diploma/data/sskj/sskj2_v1.html
|
|
||||||
SSKJ_JSON = "./sskj_senses.json"
|
|
||||||
WORDLIST = "./wordlist.json"
|
|
||||||
|
|
||||||
gen_json_files:
|
|
||||||
cd seqparser; python3 main.py \
|
|
||||||
--sskj-html=$(SSKJ_HTML) \
|
|
||||||
--sskj-json=$(SSKJ_JSON) \
|
|
||||||
--wordlist=$(WORDLIST)
|
|
||||||
@@ -1 +0,0 @@
|
|||||||
bs4
|
|
||||||
@@ -1,313 +0,0 @@
|
|||||||
from bs4 import BeautifulSoup as BS
|
|
||||||
import re
|
|
||||||
from collections import defaultdict
|
|
||||||
from time import time
|
|
||||||
import pickle
|
|
||||||
import json
|
|
||||||
from copy import deepcopy as DC
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
# Match sese ordinals (1., 2., ...)
|
|
||||||
rord = re.compile(r"^ *[0-9]+\. *$")
|
|
||||||
|
|
||||||
# Get rid of accented characters.
|
|
||||||
intab = "ÁÉÍÓÚàáäçèéêìíîñòóôöùúüčŔŕ"
|
|
||||||
outtb = "AEIOUaaaceeeiiinoooouuučRr"
|
|
||||||
transtab = str.maketrans(intab, outtb)
|
|
||||||
|
|
||||||
def d_time(fun):
|
|
||||||
def wrapper(*args, **kwargs):
|
|
||||||
tstart = time()
|
|
||||||
fun(*args, **kwargs)
|
|
||||||
duration = time() - tstart
|
|
||||||
print("Function {} ran for {:.2f} s.".format(
|
|
||||||
fun.__name__, duration))
|
|
||||||
return wrapper
|
|
||||||
|
|
||||||
class Seqparser:
|
|
||||||
def __init__(sskj_file):
|
|
||||||
pass
|
|
||||||
|
|
||||||
@d_time
|
|
||||||
def html_to_verb_adj_json(self, infile, outfile):
|
|
||||||
out_dict = defaultdict(list)
|
|
||||||
with Path(infile).open("rb") as fp:
|
|
||||||
for line in fp:
|
|
||||||
data = self.parse_line(line)
|
|
||||||
if data is None: continue
|
|
||||||
out_dict[data["izt_clean"]].append(data)
|
|
||||||
with Path(outfile).open("w") as fp:
|
|
||||||
json.dump(dict(out_dict), fp)
|
|
||||||
|
|
||||||
@d_time
|
|
||||||
def generate_sskj_wordlist(self, in_json_file, out_wordlist):
|
|
||||||
wordlist = None
|
|
||||||
with Path(in_json_file).open("r") as fp:
|
|
||||||
jdata = json.load(fp)
|
|
||||||
wordlist = list(jdata.keys())
|
|
||||||
with Path(out_wordlist).open("w") as fp:
|
|
||||||
json.dump({"wordlist": wordlist}, fp)
|
|
||||||
|
|
||||||
# main functions
|
|
||||||
def html_to_raw_pickle(self, sskj_html_filepath, raw_pickle_filepath):
|
|
||||||
entries = dict(self.parse_file(sskj_html_filepath, self.parse_line))
|
|
||||||
print("entries len: " + str(len(entries)))
|
|
||||||
with open(raw_pickle_filepath, "wb") as f:
|
|
||||||
tmpstr = json.dumps(dict(entries))
|
|
||||||
pickle.dump(tmpstr, f)
|
|
||||||
# debugging
|
|
||||||
|
|
||||||
def raw_pickle_to_parsed_pickle(
|
|
||||||
self, raw_pickle_filepath, parsed_pickle_filepath,
|
|
||||||
se_list_filepath
|
|
||||||
):
|
|
||||||
data = self.load_raw_pickle(raw_pickle_filepath)
|
|
||||||
print("raw_pickle data len: " + str(len(data)))
|
|
||||||
se_list = self.gen_se_list(data)
|
|
||||||
print("se_list len: " + str(len(se_list)))
|
|
||||||
with open(se_list_filepath, "wb") as f:
|
|
||||||
pickle.dump(se_list, f)
|
|
||||||
data1 = self.remove_se(data)
|
|
||||||
data2 = self.reorganize(data1, se_list)
|
|
||||||
print("data2 len: " + str(len(data2.keys())))
|
|
||||||
with open(parsed_pickle_filepath, "wb") as f:
|
|
||||||
pickle.dump(data2, f)
|
|
||||||
|
|
||||||
# helper html reading functions
|
|
||||||
def parse_file(self, path, f_parse_line):
|
|
||||||
tstart = time()
|
|
||||||
entries = defaultdict(list)
|
|
||||||
with open(path, "r") as f:
|
|
||||||
for line in f:
|
|
||||||
data = f_parse_line(line)
|
|
||||||
if data is not None:
|
|
||||||
entries[data["izt_clean"]].append(data)
|
|
||||||
print("parse_file({}) in {:.2f}s".format(path, time() - tstart))
|
|
||||||
return entries
|
|
||||||
|
|
||||||
def parse_line(self, line):
|
|
||||||
def helper_bv_set(g_or_p):
|
|
||||||
if g_or_p not in ["G", "P"]:
|
|
||||||
print("Err g_or_p.")
|
|
||||||
exit(1)
|
|
||||||
if data.get("bv") is not None:
|
|
||||||
if data["bv"] != g_or_p:
|
|
||||||
print(str(line))
|
|
||||||
# exit(1)
|
|
||||||
data["bv"] = g_or_p
|
|
||||||
data = {
|
|
||||||
"izt": "",
|
|
||||||
"izt_clean": "",
|
|
||||||
"senses": defaultdict(list)
|
|
||||||
}
|
|
||||||
soup = BS(line, "html.parser")
|
|
||||||
|
|
||||||
current_sense_id = "0"
|
|
||||||
for span in soup.find_all("span"):
|
|
||||||
|
|
||||||
# sense id
|
|
||||||
if span.string is not None:
|
|
||||||
rmatch = rord.match(span.string)
|
|
||||||
if rmatch is not None:
|
|
||||||
current_sense_id = rmatch.group().strip()
|
|
||||||
|
|
||||||
title = span.attrs.get("title")
|
|
||||||
if title is not None:
|
|
||||||
title = title.lower()
|
|
||||||
|
|
||||||
# only verbs and adjectives
|
|
||||||
if "glagol" in title:
|
|
||||||
helper_bv_set("G")
|
|
||||||
data["bv_full"] = title
|
|
||||||
elif "pridevn" in title:
|
|
||||||
helper_bv_set("P")
|
|
||||||
data["bv_full"] = title
|
|
||||||
|
|
||||||
# žšč
|
|
||||||
if title == "iztočnica":
|
|
||||||
data["izt"] = span.string
|
|
||||||
data["izt_clean"] = span.string.translate(transtab).lower()
|
|
||||||
|
|
||||||
# sense description
|
|
||||||
if title == "razlaga" and span.string is not None:
|
|
||||||
data["senses"][current_sense_id].append(
|
|
||||||
("razl", span.string))
|
|
||||||
if "pridevnik od" in span.string:
|
|
||||||
helper_bv_set("P")
|
|
||||||
|
|
||||||
if title == "sopomenka":
|
|
||||||
subspan = span.find_all("a")[0]
|
|
||||||
if subspan.string is not None:
|
|
||||||
data["senses"][current_sense_id].append(
|
|
||||||
("sopo", subspan.string))
|
|
||||||
|
|
||||||
# save verbs and adjectives
|
|
||||||
if (
|
|
||||||
("bv" not in data) or
|
|
||||||
(data["bv"] != "P" and data["bv"] != "G")
|
|
||||||
):
|
|
||||||
return None
|
|
||||||
|
|
||||||
# sanity check
|
|
||||||
if data["bv"] == "P" and " se" in data["izt_clean"]:
|
|
||||||
print(data)
|
|
||||||
exit(1)
|
|
||||||
|
|
||||||
# append _ to adjective keywords
|
|
||||||
if data["bv"] == "P":
|
|
||||||
data["izt_clean"] = data["izt_clean"] + "_"
|
|
||||||
|
|
||||||
# cleanup
|
|
||||||
if "bv" not in data:
|
|
||||||
print("Should not be here (no bv).")
|
|
||||||
exit(1)
|
|
||||||
del(data["bv"])
|
|
||||||
if "bv_full" in data:
|
|
||||||
del(data["bv_full"])
|
|
||||||
|
|
||||||
return data
|
|
||||||
|
|
||||||
# helper functions
|
|
||||||
def load_raw_pickle(self, raw_pickle_filepath):
    """Load a pickled JSON string from disk and parse it.

    The pickle at *raw_pickle_filepath* is expected to hold a single
    JSON-encoded string; the decoded Python object is returned.
    """
    with open(raw_pickle_filepath, "rb") as fin:
        payload = pickle.load(fin)
    return json.loads(payload)
|
|
||||||
|
|
||||||
def helper_loop(self, data, fnc):
    """Apply *fnc* to every element of every list in *data* (a dict of lists)."""
    for entries in data.values():
        for entry in entries:
            fnc(entry)
|
|
||||||
|
|
||||||
def gen_se_list(self, data):
    """Return headwords that occur only in their reflexive (" se") form.

    First gathers every ``izt_clean`` containing " se", strips the " se"
    suffix to produce candidate base forms, then discards any candidate
    that also appears as a standalone ``izt_clean`` entry.  The surviving
    base forms are returned sorted.
    """

    reflexive_forms = []

    def collect_reflexive(entry):
        # Pass 1: remember every reflexive headword variant.
        clean = entry["izt_clean"]
        if " se" in clean:
            reflexive_forms.append(clean)

    def drop_standalone(entry):
        # Pass 2: a base form that exists on its own is not "se-only".
        clean = entry["izt_clean"]
        if clean in candidates:
            candidates.remove(clean)

    self.helper_loop(data, collect_reflexive)
    candidates = {form.split(" se")[0] for form in reflexive_forms}
    self.helper_loop(data, drop_standalone)
    return sorted(candidates)
|
|
||||||
|
|
||||||
def remove_se(self, data):
    """Re-key reflexive entries under their base (non-" se") headword.

    Only entries whose ``izt_clean`` contains " se" are kept; each such
    entry is deep-copied, its ``izt_clean`` truncated at " se", and the
    copy grouped under the new key.  Entries without " se" are not
    carried over (they already exist under their own headword).
    """

    regrouped = defaultdict(list)

    def strip_reflexive(entry):
        clean = entry["izt_clean"]
        if " se" not in clean:
            return
        # Deep copy so the caller's data is never mutated.
        clone = DC(entry)
        clone["izt_clean"] = clean.split(" se")[0]
        regrouped[clone["izt_clean"]].append(clone)

    self.helper_loop(data, strip_reflexive)
    return dict(regrouped)
|
|
||||||
|
|
||||||
def reorganize(self, data, se_list):
    """Flatten the homonym/sense structure into one record per headword.

    Each output record carries the headword, a ``has_se`` flag
    (membership in *se_list*) and a flat list of senses, every sense
    tagged with its homonym id and a "<sense>-<subsense>" id.  A
    duplicate headword aborts the run; senses that end up without a
    description are printed during the final sanity pass.
    """

    def _trim(desc):
        # Strip space padding; single characters pass through untouched.
        desc = desc.strip()
        if len(desc) == 1:
            return desc
        # Drop one trailing punctuation character, if present.
        if desc[-1] in ":;.,-!?":
            return desc[:-1]
        return desc

    result = {}
    for headword, homonyms in data.items():
        record = {
            "hw": headword,
            "has_se": headword in se_list,
            "senses": [],
        }

        # A lone entry gets homonym_id 0; multiple entries count from 1.
        homonym_id = -1 if len(homonyms) == 1 else 0

        for entry in homonyms:
            homonym_id += 1
            # Top-level sense ids, each possibly holding several subsenses.
            for sense_id, subsenses in entry["senses"].items():
                only_one = len(subsenses) == 1
                base_id = sense_id.split(".")[0]
                for idx, sense in enumerate(subsenses):
                    suffix = "-0" if only_one else "-" + str(idx + 1)
                    record["senses"].append({
                        "homonym_id": homonym_id,
                        # sense_id: sense_id-subsense_id
                        "sense_id": base_id + suffix,
                        "sense_type": sense[0],
                        "sense_desc": _trim(sense[1]),
                    })

        if headword in result:
            print("Shouldn't be here.")
            print(record)
            exit(1)
        result[headword] = DC(record)

    # Sanity pass: surface any sense that lost its description.
    for headword, record in result.items():
        for sense in record["senses"]:
            if sense["sense_desc"] is None:
                print(sense)

    return result
|
|
||||||
|
|
||||||
|
|
||||||
def plst(lst):
    """Debug helper: print each element of *lst* on its own line."""
    for item in lst:
        print(item)
|
|
||||||
|
|
||||||
|
|
||||||
# Script entry point: convert the SSKJ HTML dump into pickle files.
if __name__ == "__main__":
    # NOTE(review): relative paths — assumes the script runs from its own
    # directory inside the repository tree.
    datapath = "../../../data"
    html_filepath = datapath + "/sskj/sskj2_v1.html"
    raw_pickle_filepath = datapath + "/tmp_pickles/raw_sskj.pickle"
    parsed_pickle_filepath = datapath + "/no_del_pickles/sskj_senses.pickle"
    se_list_filepath = datapath + "/no_del_pickles/se_list.pickle"

    p = Seqparser()

    # Stage 1 (enabled): parse the raw HTML dump into an intermediate pickle.
    # NOTE(review): `if True` / `if False` are manual stage toggles left by
    # the author; flip them to choose which stage runs.
    if True:
        print("html_to_raw_pickle({}, {})".format(
            html_filepath, raw_pickle_filepath))
        print("Big file, this might take a while (2 min).")
        tstart = time()
        p.html_to_raw_pickle(html_filepath, raw_pickle_filepath)
        print("Finished in {:.2f}.".format(time() - tstart))

    # Stage 2 (disabled): refine the raw pickle into parsed senses and the
    # "se" word list pickle.
    if False:
        print("raw_pickle_to_parsed_pickle({}, {}, {})".format(
            raw_pickle_filepath, parsed_pickle_filepath, se_list_filepath))
        tstart = time()
        p.raw_pickle_to_parsed_pickle(
            raw_pickle_filepath, parsed_pickle_filepath, se_list_filepath)
        print("Finished in {:.2f}.".format(time() - tstart))
    print("Done.")
|
|
||||||
@@ -1,68 +0,0 @@
|
|||||||
from Seqparser import Seqparser
|
|
||||||
import argparse
|
|
||||||
import sys
|
|
||||||
from pathlib import Path
|
|
||||||
import json
|
|
||||||
import datetime
|
|
||||||
import hashlib
|
|
||||||
from pymongo import MongoClient
|
|
||||||
|
|
||||||
# Author tag attached to every imported SSKJ sense entry.
SSKJ_USER = "sskj2"


# CLI driver: dispatches on --operation to generate JSON artifacts from the
# SSKJ html dump or to import parsed senses into the valdb Mongo database.
if __name__ == "__main__":
    aparser = argparse.ArgumentParser()
    aparser.add_argument("--sskj-html", type=str)
    aparser.add_argument("--sskj-json", type=str)
    aparser.add_argument("--wordlist", type=str)
    aparser.add_argument("--operation", type=str)
    aparser.add_argument("--dbaddr", type=str)
    aparser.add_argument("--dbuser", type=str)
    aparser.add_argument("--dbpass", type=str)
    args = aparser.parse_args()

    if args.operation == "gen_sskj_json":
        # Parse the html dump into a verb/adjective senses json file.
        sqp = Seqparser()
        sqp.html_to_verb_adj_json(args.sskj_html, args.sskj_json)
        sys.exit()

    if args.operation == "gen_wordlist":
        sqp = Seqparser()
        # BUG FIX: argparse defines --sskj-json, so the parsed attribute is
        # args.sskj_json; the original read args.sskj_senses, which does not
        # exist and raised AttributeError for this operation.
        sqp.generate_sskj_wordlist(args.sskj_json, args.wordlist)
        sys.exit()

    if args.operation == "senses_to_db":
        # Build one db entry per top-level sense of every headword.
        db_entries = []
        # NOTE(review): utcnow() returns a naive datetime and is deprecated
        # in 3.12+; switching to now(timezone.utc) would change what gets
        # stored in Mongo, so it is left as-is.
        tmp_dt = datetime.datetime.utcnow()
        with Path(args.sskj_json).open("r") as fp:
            jdata = json.load(fp)
        for hw, entry in jdata.items():
            # Only the first homonym (entry[0]) and the first subsense
            # (sense[0]) of each sense id are imported.
            for key, sense in entry[0]["senses"].items():
                desc = sense[0][1]
                if sense[0][0] == "razl":
                    desc = desc[:-1]  # for some reason, descriptions contain a ':'
                else:
                    # Non-"razl" senses keep their type as a prefix.
                    desc = sense[0][0] + ": " + desc
                tmp_entry = {
                    "desc": desc,
                    "hw": hw,
                    "author": SSKJ_USER
                }
                # Deterministic id derived from the entry contents; the date
                # is added afterwards so it does not affect the hash.
                tmp_entry["sense_id"] = "{}-{}".format(
                    SSKJ_USER,
                    hashlib.sha256(str(tmp_entry).encode("utf-8")).hexdigest()[:10]
                )
                tmp_entry["date"] = tmp_dt
                db_entries.append(tmp_entry)
        print(len(db_entries))

        # db login
        client = MongoClient(
            "mongodb://{}".format(args.dbaddr),
            username=args.dbuser,
            password=args.dbpass,
            authSource="valdb",
            authMechanism='SCRAM-SHA-1'
        )
        valdb = client.valdb
        valdb.senses.insert_many(db_entries)
|
|
||||||
@@ -1,11 +0,0 @@
|
|||||||
from setuptools import setup


# Packaging metadata for the sskj2 html-dump parser.
PKG_META = dict(
    name='seqparser',
    version='0.0.1',
    description='Parser for sskj2 html dump.',
    author='Kristjan Voje',
    author_email='kristjan.voje@gmail.com',
    license='MIT',
    packages=['seqparser'],
)

setup(**PKG_META)
|
|
||||||
@@ -33,8 +33,10 @@ def reduce_0(frames, valdb_sensemap=None):
|
|||||||
separated_frames = []
|
separated_frames = []
|
||||||
for frame in frames:
|
for frame in frames:
|
||||||
for tid in frame.tids:
|
for tid in frame.tids:
|
||||||
tmp_frame = DC(frame)
|
tmp_frame = frame
|
||||||
tmp_frame.tids = [tid]
|
tmp_frame.tids = [tid]
|
||||||
|
tmp_frame.sort_slots()
|
||||||
|
|
||||||
separated_frames.append(tmp_frame)
|
separated_frames.append(tmp_frame)
|
||||||
sorting_strings.append("".join(
|
sorting_strings.append("".join(
|
||||||
[slot.functor for slot in tmp_frame.slots]
|
[slot.functor for slot in tmp_frame.slots]
|
||||||
|
|||||||
Reference in New Issue
Block a user