forked from kristjan/cjvt-valency
First commit on scripts branch
This commit is contained in:
parent
c803057164
commit
3d91251905
5
.gitignore
vendored
5
.gitignore
vendored
|
@ -11,4 +11,7 @@ dockerfiles/database/create.js
|
|||
env.local
|
||||
logs/*
|
||||
.idea/
|
||||
venv/
|
||||
venv*
|
||||
data/
|
||||
data
|
||||
deploy_instructions/
|
||||
|
|
18
Makefile
18
Makefile
|
@ -13,10 +13,11 @@ SSJ_FILE = "$(MAKE_ROOT)/data/ssj_file_link"
|
|||
# KRES_FOLDER = "$(MAKE_ROOT)/data/samples/kres_xml"
|
||||
# KRES_FOLDER = "$(MAKE_ROOT)/data/kres_xml_folder_link"
|
||||
KRES_FOLDER = "/home/kristjan/kres_data/payload/kres_xml"
|
||||
GIGAFIDA_FOLDER = "/home/lukakrsnik/cjvt-valency/data_all/giga_orig"
|
||||
# KRES_SRL_FOLDER = "$(MAKE_ROOT)/data/samples/kres_srl_json"
|
||||
# KRES_SRL_FOLDER = "$(MAKE_ROOT)/data/kres_json_folder_link"
|
||||
KRES_SRL_FOLDER = "/home/kristjan/kres_data/payload/kres_json"
|
||||
|
||||
GIGAFIDA_SRL_FOLDER = "/home/lukakrsnik/cjvt-valency/data_all/final_json"
|
||||
# This file comes with the source code. Make sure you unpack it and name it right.
|
||||
SSKJ_WORDLIST = "$(MAKE_ROOT)/data/wordlist.json"
|
||||
SSKJ_JSON = "$(MAKE_ROOT)/data/sskj_senses.json"
|
||||
|
@ -24,8 +25,8 @@ SSKJ_JSON = "$(MAKE_ROOT)/data/sskj_senses.json"
|
|||
# for pre-generation the index of all headwords and functors
|
||||
APPINDEX_PATH = "$(MAKE_ROOT)/data/appindex.json"
|
||||
|
||||
# OUTPUT = "db"
|
||||
OUTPUT = "file"
|
||||
OUTPUT = "db"
|
||||
# OUTPUT = "file"
|
||||
OUTDIR = "/project/data" # if you're running this in docker, make sure to mount the volume
|
||||
DBADDR = "0.0.0.0:27017" # don't use localhost
|
||||
|
||||
|
@ -33,7 +34,7 @@ DBADDR = "0.0.0.0:27017" # don't use localhost
|
|||
# create it from env.default
|
||||
include env.local
|
||||
|
||||
N_CORES = 3
|
||||
N_CORES = 4
|
||||
# insert kres files into database in chunks, for fewer connections
|
||||
KRES_CHUNK_SIZE = 30
|
||||
|
||||
|
@ -93,7 +94,14 @@ fill-database-kres: data/samples
|
|||
--chunk-size $(KRES_CHUNK_SIZE) \
|
||||
--cores $(N_CORES)
|
||||
|
||||
|
||||
fill-database-gigafida: data/samples
|
||||
python3 src/pkg/cjvt-corpusparser/corpusparser/main.py --kres-folder $(GIGAFIDA_FOLDER) \
|
||||
--corpus="gigafida" \
|
||||
--ssj-file $(SSJ_FILE) --kres-srl-folder $(GIGAFIDA_SRL_FOLDER) \
|
||||
--output $(OUTPUT) --outdir $(OUTDIR) --dbaddr $(DBADDR) \
|
||||
--dbuser $(DB_USR_USER) --dbpass $(DB_USR_PASS) \
|
||||
--chunk-size $(KRES_CHUNK_SIZE) \
|
||||
--cores $(N_CORES)
|
||||
|
||||
## Frontend
|
||||
|
||||
|
|
|
@ -179,3 +179,9 @@ $ mongorestore /dumps/valdb --db valdb --uri=mongodb://valuser:valuserpass@0.0.0
|
|||
```
|
||||
|
||||
After uploading, restart the stack with `27017` commented out.
|
||||
|
||||
## When running the scripts
|
||||
|
||||
```bash
|
||||
make database-service
|
||||
```
|
||||
|
|
|
@ -1 +0,0 @@
|
|||
/home/kristjan/workdir/final_json/
|
|
@ -1 +0,0 @@
|
|||
/home/kristjan/kres_mount/kres_parsed/tei/
|
Binary file not shown.
1256152
data/ssj_file_link
1256152
data/ssj_file_link
File diff suppressed because it is too large
Load Diff
Binary file not shown.
File diff suppressed because one or more lines are too long
|
@ -1,4 +1,4 @@
|
|||
FROM mongo:latest
|
||||
FROM mongo:4.2.9
|
||||
|
||||
WORKDIR /
|
||||
COPY init_inside_container.sh /.
|
||||
|
|
|
@ -24,3 +24,6 @@ server {
|
|||
proxy_pass http://backend_flask:8084;
|
||||
}
|
||||
}
|
||||
|
||||
https://vezljivostni.cjvt.si/api/* -> http://vezljivostni-host.cjvt.si:8084/api/*
|
||||
https://vezljivostni.cjvt.si/* -> http://vezljivostni-host.cjvt.si:80/*
|
||||
|
|
|
@ -1,6 +1,8 @@
|
|||
asn1crypto==0.24.0
|
||||
beautifulsoup4==4.8.0
|
||||
bs4==0.0.1
|
||||
cffi==1.12.3
|
||||
Click==7.0
|
||||
# corpusparser==0.0.1
|
||||
cryptography==2.1.4
|
||||
Flask==1.1.1
|
||||
Flask-Cors==3.0.8
|
||||
|
@ -17,8 +19,9 @@ MarkupSafe==1.1.1
|
|||
numpy==1.17.0
|
||||
pandas==0.25.0
|
||||
pathlib==1.0.1
|
||||
psycopg2==2.8.4
|
||||
pycparser==2.19
|
||||
pycrypto==2.6.1
|
||||
# pygobject
|
||||
pymongo==3.8.0
|
||||
python-dateutil==2.8.0
|
||||
pytz==2019.2
|
||||
|
@ -27,10 +30,8 @@ PyYAML==5.1.2
|
|||
scikit-learn==0.21.3
|
||||
scipy==1.3.0
|
||||
SecretStorage==2.3.1
|
||||
# Editable install with no version control (seqparser==0.0.1)
|
||||
six==1.11.0
|
||||
sklearn==0.0
|
||||
# Editable install with no version control (valency==0.1.1)
|
||||
|
||||
soupsieve==1.9.3
|
||||
SQLAlchemy==1.3.12
|
||||
Werkzeug==0.15.5
|
||||
|
||||
|
|
1643
scripts/create_xml.py
Normal file
1643
scripts/create_xml.py
Normal file
File diff suppressed because it is too large
Load Diff
189
scripts/extract_keywords.py
Normal file
189
scripts/extract_keywords.py
Normal file
|
@ -0,0 +1,189 @@
|
|||
import copy
|
||||
import csv
|
||||
from xml.etree import ElementTree
|
||||
import re
|
||||
import sys
|
||||
import logging
|
||||
import argparse
|
||||
import pickle
|
||||
import time
|
||||
import gc
|
||||
import subprocess
|
||||
import concurrent.futures
|
||||
import tempfile
|
||||
|
||||
|
||||
def read_gigafida(path):
    """Read a tab-separated frequency list into a ``{word: count}`` dict.

    For every row the word is taken from the first column and its count
    (converted with ``int``) from the third column; the second column is
    not used.

    Args:
        path: Path to the TSV file.

    Returns:
        dict mapping each word to its integer count. If a word occurs on
        several rows, the last row wins.

    Raises:
        IndexError: if a non-empty row has fewer than three columns.
        ValueError: if the third column is not an integer.
    """
    words = {}
    # The explicit encoding keeps non-ASCII letters (e.g. "č") intact no matter
    # what the system locale is; newline='' is what the csv module expects.
    with open(path, encoding='utf-8', newline='') as tsvfile:
        for row in csv.reader(tsvfile, delimiter='\t'):
            if not row:
                # Blank lines (such as a trailing empty line) carry no data.
                continue
            words[row[0]] = int(row[2])
    return words
|
||||
|
||||
|
||||
def read_sloleks(path):
    """Collect the second column of a tab-separated file into a set.

    Args:
        path: Path to the TSV file.

    Returns:
        set of the distinct values found in the second column.

    Raises:
        IndexError: if a non-empty row has fewer than two columns.
    """
    words = set()
    # Explicit UTF-8 so the result does not depend on the system locale;
    # newline='' is what the csv module expects.
    with open(path, encoding='utf-8', newline='') as tsvfile:
        for row in csv.reader(tsvfile, delimiter='\t'):
            if not row:
                # Blank lines carry no data.
                continue
            words.add(row[1])
    return words
|
||||
|
||||
|
||||
def read_zele(path):
    """Extract the ``<IZT>...</IZT>`` entries from a line-oriented file.

    For every non-blank line only the first whitespace-separated token is
    inspected; the text between ``<IZT>`` and ``</IZT>`` inside that token
    is collected. Anything after the first whitespace is ignored, so a
    multi-word entry contributes only its first word.

    Args:
        path: Path to the input file.

    Returns:
        set of the extracted strings (empty for an empty file).

    Raises:
        IndexError: if the first token of a non-blank line has no ``<IZT>``.
    """
    headwords = set()
    # NOTE(review): the previous version always dropped the first character of
    # the file ("fix content"), presumably a byte-order mark - confirm.
    # 'utf-8-sig' removes a BOM when there is one and leaves the text alone
    # when there is none, so files without a BOM no longer break.
    with open(path, encoding='utf-8-sig') as f:
        for line in f:
            tokens = line.split()
            if not tokens:
                # Blank lines carry no entry.
                continue
            headwords.add(tokens[0].split('<IZT>')[1].split('</IZT>')[0])
    return headwords
|
||||
|
||||
|
||||
def read_wordlist(path):
    """Read a file with one entry per line into a set.

    Args:
        path: Path to the word list.

    Returns:
        set of the lines with their trailing newline removed.
    """
    with open(path, encoding='utf-8') as f:
        # Strip only the newline: slicing off the last character would eat a
        # real letter when the final line is not newline-terminated.
        content = [line.rstrip('\n') for line in f]
    if content:
        # Kept from the original: echo the last entry that was read.
        print(content[-1])
    return set(content)
|
||||
|
||||
|
||||
def filter_gigafida(gigafida_raw, min_limit, max_limit):
    """Keep entries ending in ``ti``/``či`` whose count lies in a range.

    Args:
        gigafida_raw: dict mapping words to counts.
        min_limit: exclusive lower bound for the count.
        max_limit: inclusive upper bound for the count.

    Returns:
        New dict with the entries that pass both the suffix and range checks.
    """
    selected = {}
    for lemma, freq in gigafida_raw.items():
        if not lemma.endswith(('ti', 'či')):
            continue
        if min_limit < freq <= max_limit:
            selected[lemma] = freq
    return selected
|
||||
|
||||
|
||||
def set_list_intersection(gigafida_filtered, sloleks):
    """Restrict a ``{word: count}`` dict to the words contained in *sloleks*.

    Args:
        gigafida_filtered: dict mapping words to counts.
        sloleks: container supporting ``in`` (a set in this script).

    Returns:
        New dict with the entries of *gigafida_filtered* whose key is in
        *sloleks*, in the original iteration order.
    """
    return {
        lemma: freq
        for lemma, freq in gigafida_filtered.items()
        if lemma in sloleks
    }
|
||||
|
||||
|
||||
def list_list_union(list1, list2):
    """Merge two ``{word: count}`` dicts, preferring *list1* on conflicts.

    Despite the parameter names both arguments are dicts.

    Args:
        list1: dict whose entries are always kept.
        list2: dict contributing only the keys missing from *list1*.

    Returns:
        A shallow copy of *list1* extended with the extra entries of *list2*;
        neither input is modified.
    """
    merged = copy.copy(list1)
    merged.update(
        (key, count) for key, count in list2.items() if key not in list1
    )
    return merged
|
||||
|
||||
|
||||
def list_list_subtraction(list1, list2):
    """Return the entries of *list2* whose key does not occur in *list1*.

    Note the argument order: the result is ``list2 - list1``, not the other
    way round. Despite the parameter names both arguments are dicts.

    Args:
        list1: dict (or other container) of keys to exclude.
        list2: dict mapping words to counts.

    Returns:
        New dict with the remaining entries of *list2*.
    """
    return {key: count for key, count in list2.items() if key not in list1}
|
||||
|
||||
|
||||
def set_set_subtraction(set1, set2):
    """Return the members of *set2* missing from *set1* as a dict.

    Note the argument order: the result is ``set2 - set1``. Every key is
    mapped to the placeholder count ``-1``.

    Args:
        set1: container of members to exclude.
        set2: iterable of candidate members.

    Returns:
        dict mapping each remaining member to ``-1``.
    """
    return dict.fromkeys((item for item in set2 if item not in set1), -1)
|
||||
|
||||
|
||||
def create_document(list1, path):
    """Write a ``{word: count}`` dict as ``word<TAB>count`` lines.

    Args:
        list1: dict mapping words to integer counts (a dict despite the name).
        path: Output file path; an existing file is overwritten.
    """
    # Explicit UTF-8 so non-ASCII words are written identically on every
    # platform instead of depending on the locale's default encoding.
    with open(path, "w", encoding="utf-8") as text_file:
        for w, n in list1.items():
            text_file.write("%s\t%d\n" % (w, n))
|
||||
|
||||
|
||||
def create_document_set(list1, path):
    """Write the members of a collection to a file, sorted, one per line.

    Args:
        list1: iterable of strings (a set in this script).
        path: Output file path; an existing file is overwritten.
    """
    # Explicit UTF-8 so non-ASCII words are written identically on every
    # platform instead of depending on the locale's default encoding.
    with open(path, "w", encoding="utf-8") as text_file:
        for w in sorted(list1):
            text_file.write("%s\n" % w)
|
||||
|
||||
|
||||
def gigafida_merge(sloleks, zele, gigafida_raw, giga_min, giga_max):
    """Return the frequency-filtered words that neither lexicon accounts for.

    The Gigafida list is filtered to counts in ``(giga_min, giga_max]``.
    From that selection every word is removed that is in *sloleks*, or that
    is in *zele* and has a count above 1.

    Args:
        sloleks: set of Sloleks words.
        zele: set of words from the zele dictionary.
        gigafida_raw: dict mapping words to counts.
        giga_min: exclusive lower count bound.
        giga_max: inclusive upper count bound.

    Returns:
        dict mapping the remaining words to their counts.
    """
    in_range = filter_gigafida(gigafida_raw, giga_min, giga_max)
    above_one = filter_gigafida(gigafida_raw, 1, sys.maxsize)
    covered = list_list_union(
        set_list_intersection(in_range, sloleks),
        set_list_intersection(above_one, zele),
    )
    # list_list_subtraction(a, b) yields b minus a.
    return list_list_subtraction(covered, in_range)
|
||||
|
||||
|
||||
def main(args):
    """Build the keyword lists and write them as TSV files.

    Reads the Gigafida frequency list, Sloleks and the zele dictionary, then
    writes the following files into the current working directory:

    * ``wordlist.tsv`` (only when ``args.wordlist`` is given): the supplied
      word list extended with every Sloleks word present in Gigafida.
    * ``gigafida_3+.tsv``: Gigafida entries passing ``filter_gigafida`` with
      a count above 2.
    * ``gigafida_3+-sloleks_zele-presek.tsv``: those entries restricted to
      Sloleks, united with the zele words found in Gigafida.
    * ``sloleks-zele-razlika.tsv``: result of
      ``set_set_subtraction(sloleks, zele)``.
    * ``gigafida_10+-sloleks_zele-razlika.tsv`` and
      ``gigafida_3-10-sloleks_zele-razlika.tsv``: results of
      ``gigafida_merge`` for counts above 10 and for counts in (2, 10].

    Args:
        args: Parsed command-line namespace providing ``gigafida_verb_list``,
            ``sloleks``, ``zele`` and ``wordlist``.
    """
    gigafida_raw = read_gigafida(args.gigafida_verb_list)
    sloleks = read_sloleks(args.sloleks)
    zele = read_zele(args.zele)

    # NOTE(review): the extent of this branch was reconstructed; only the
    # statements that need `filtered_wordlist` are assumed to belong to it.
    if args.wordlist is not None:
        filtered_wordlist = read_wordlist(args.wordlist)
        filtered_wordlist.update(el for el in sloleks if el in gigafida_raw)
        create_document_set(filtered_wordlist, 'wordlist.tsv')

    gigafida_filtered3 = filter_gigafida(gigafida_raw, 2, sys.maxsize)
    sloleks_gf_intersect = set_list_intersection(gigafida_filtered3, sloleks)

    gigafida_filtered1 = filter_gigafida(gigafida_raw, 0, sys.maxsize)
    zele_gf_intersect = set_list_intersection(gigafida_filtered1, zele)
    sloleks_zele_union = list_list_union(sloleks_gf_intersect, zele_gf_intersect)
    sloleks_zele_subtraction = set_set_subtraction(sloleks, zele)

    create_document(gigafida_filtered3, 'gigafida_3+.tsv')
    create_document(sloleks_zele_union, 'gigafida_3+-sloleks_zele-presek.tsv')
    create_document(sloleks_zele_subtraction, 'sloleks-zele-razlika.tsv')

    gigafida_10 = gigafida_merge(sloleks, zele, gigafida_raw, 10, sys.maxsize)
    create_document(gigafida_10, 'gigafida_10+-sloleks_zele-razlika.tsv')

    gigafida_3_10 = gigafida_merge(sloleks, zele, gigafida_raw, 2, 10)
    create_document(gigafida_3_10, 'gigafida_3-10-sloleks_zele-razlika.tsv')
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Command-line entry point: parse arguments, configure logging and time
    # the run of main().
    parser = argparse.ArgumentParser(
        description='Extract keywords from multiple lists.')
    parser.add_argument('gigafida_verb_list',
                        help='Path to gigafida list of verbs in tsv format.')
    parser.add_argument('sloleks',
                        help='Path to Sloleks in tsv format.')
    # main() always reads this file, so a missing value is reported here
    # instead of surfacing later as a TypeError from open(None).
    parser.add_argument('--zele', required=True,
                        help='Path to zele valency dictionary.')
    parser.add_argument('--wordlist', default=None,
                        help='Path to filtered wordlist.')
    # Accepted for command-line compatibility; main() does not read it.
    parser.add_argument('--handchecked_words', default=None,
                        help='Path to handchecked words.')
    parser.add_argument('--verbose', help='Enable verbose output to stderr',
                        choices=["warning", "info", "debug"], default="info",
                        const="info", nargs='?')

    args = parser.parse_args()
    logging.basicConfig(stream=sys.stderr, level=args.verbose.upper())

    start = time.time()
    main(args)
    logging.info("TIME: {}".format(time.time() - start))
|
8
scripts/xsd_checker.py
Normal file
8
scripts/xsd_checker.py
Normal file
|
@ -0,0 +1,8 @@
|
|||
from lxml import etree as lxml
|
||||
|
||||
# Validate the generated XML against the inventory schema and print the
# boolean result. Both paths are relative to the current working directory.
# The files are opened in binary mode so the parser decodes them according to
# their own XML declaration instead of the locale's default text encoding.
with open('../data/inventory.xsd', 'rb') as f:
    xmlschema_doc = lxml.parse(f)
xmlschema = lxml.XMLSchema(xmlschema_doc)

with open('../data/xmls/output.xml', 'rb') as op:
    doc = lxml.parse(op)

print(xmlschema.validate(doc))
|
|
@ -37,8 +37,8 @@ app = Flask(__name__)
|
|||
app.config.from_object("db_config")
|
||||
mongo = PyMongo(app)
|
||||
|
||||
# app.config["CORPORA"] = ["ssj", "kres"]
|
||||
app.config["CORPORA"] = ["ssj"]
|
||||
# app.config["CORPORA"] = ["ssj", "kres", "gigafida"]
|
||||
app.config["CORPORA"] = ["gigafida"]
|
||||
app.config["BANNED_HEADWORDS"] = ["biti"]
|
||||
app.config["QUERY_LIMIT"] = 1000
|
||||
|
||||
|
@ -248,20 +248,23 @@ def api_get_frames():
|
|||
if corpus not in app.config["CORPORA"]:
|
||||
return json.dumps({"error": "cor={kres,ssj}"})
|
||||
|
||||
log.info("Test1")
|
||||
cur = mongo.db[corpus].find({"headwords": hw})
|
||||
log.info("Test2")
|
||||
frames = []
|
||||
for ent in cur[:app.config["QUERY_LIMIT"]]:
|
||||
frames += frames_from_db_entry(ent) # pre-process this step for prod TODO
|
||||
cur.close()
|
||||
|
||||
log.info("Test3")
|
||||
# filter by relevant hw
|
||||
frames = [x for x in frames if x.hw == hw]
|
||||
|
||||
ret_frames = RF(frames, mongo.db.sensemap)
|
||||
|
||||
log.info("Test3")
|
||||
json_ret = {"frames": []}
|
||||
for frame in ret_frames:
|
||||
json_ret["frames"].append(frame.to_json())
|
||||
log.info("Test4")
|
||||
return json.dumps(json_ret)
|
||||
# return prepare_frames(ret_frames)
|
||||
|
||||
|
@ -445,7 +448,7 @@ def _is_banned(hw):
|
|||
banned = False
|
||||
return banned
|
||||
|
||||
def prepare_app_index(appindex_json, sskj_wordlist):
|
||||
def prepare_app_index(appindex_json):
|
||||
log.info("[*] preparing app_index")
|
||||
# create app_index (used in frontend, left side word index)
|
||||
tmp_app_index = {c: {} for c in app.config["CORPORA"]}
|
||||
|
@ -453,18 +456,17 @@ def prepare_app_index(appindex_json, sskj_wordlist):
|
|||
res_hws = {}
|
||||
res_fns = {}
|
||||
|
||||
print('CORPUS...!!...')
|
||||
print(corpus)
|
||||
a = mongo.db[corpus]
|
||||
print('TEST_OK')
|
||||
print(a)
|
||||
print(mongo.db)
|
||||
a = mongo.db.list_collection_names()
|
||||
print('TEST_OK2')
|
||||
# print('CORPUS...!!...')
|
||||
# print(corpus)
|
||||
# a = mongo.db[corpus]
|
||||
# print('TEST_OK')
|
||||
# print(a)
|
||||
# print(mongo.db)
|
||||
# a = mongo.db.list_collection_names()
|
||||
# print('TEST_OK2')
|
||||
nentries = mongo.db[corpus].count()
|
||||
idx = 0
|
||||
for e in mongo.db[corpus].find({}):
|
||||
print('aaa')
|
||||
if "headwords" not in e:
|
||||
continue
|
||||
for hw in e["headwords"]:
|
||||
|
@ -494,6 +496,7 @@ def prepare_app_index(appindex_json, sskj_wordlist):
|
|||
|
||||
for letter, words in alphabetical.items():
|
||||
filtered_words = [x for x in words if not _is_banned(x[0])]
|
||||
# filtered_words = [x for x in words]
|
||||
alphabetical[letter] = sorted(filtered_words, key=lambda x: x[0])
|
||||
|
||||
tmp_app_index[corpus]["words"] = alphabetical
|
||||
|
@ -570,12 +573,16 @@ if __name__ == "__main__":
|
|||
if args.prepare_db:
|
||||
with Path(args.sskj_wordlist).open("r") as fp:
|
||||
sskj_wordlist = json.load(fp)
|
||||
prepare_app_index(args.appindex_json, sskj_wordlist)
|
||||
prepare_app_index(args.appindex_json)
|
||||
sys.exit()
|
||||
|
||||
# app index from db
|
||||
with Path(args.appindex_json).open("r") as fp:
|
||||
app.config["app_index"] = json.load(fp)
|
||||
# a = app.config["app_index"]
|
||||
# b = app.config["app_index"]["kres"]
|
||||
# c = app.config["app_index"]["kres"]["words"]
|
||||
# print('HERE')
|
||||
|
||||
# log.info("[*] Starting app.py with config:\n%s".format(config))
|
||||
log.info("[*] Starting app.py with config:\n{}".format(config))
|
||||
|
|
106
src/backend_flask/build_app_index.py
Normal file
106
src/backend_flask/build_app_index.py
Normal file
|
@ -0,0 +1,106 @@
|
|||
import argparse
|
||||
import json
|
||||
|
||||
from flask import Flask
|
||||
from flask_pymongo import PyMongo
|
||||
from pathlib import Path
|
||||
|
||||
app = Flask(__name__)
|
||||
|
||||
app.config.from_object("db_config")
|
||||
mongo = PyMongo(app)
|
||||
|
||||
app.config["BANNED_HEADWORDS"] = ["biti"]
|
||||
|
||||
def _is_banned(hw):
    """Tell whether headword *hw* must be left out of the app index.

    A headword is banned when it is listed in
    ``app.config["BANNED_HEADWORDS"]``, or when neither ``hw`` nor
    ``hw + " se"`` occurs in ``sskj_wordlist["wordlist"]``.

    Relies on the module-level ``sskj_wordlist``, which is only assigned when
    this file is run as a script.

    Args:
        hw: Headword to check.

    Returns:
        True if the headword is banned, False otherwise.
    """
    if hw in app.config["BANNED_HEADWORDS"]:
        return True
    known = sskj_wordlist["wordlist"]
    return not (hw in known or (hw + " se") in known)
|
||||
|
||||
|
||||
def prepare_app_index(appindex_json, corporas, previous_json=None):
    """Build the headword/functor index for the given corpora and save it.

    For each corpus every document of ``mongo.db[corpus]`` is scanned and the
    occurrences of its ``headwords`` and ``functors`` are counted. Headwords
    are grouped by their lower-cased first character, filtered through
    ``_is_banned`` and sorted; functors are sorted by name. The result is
    written to *appindex_json* as JSON.

    Args:
        appindex_json: Path of the JSON file to write.
        corporas: Iterable of corpus (collection) names to index.
        previous_json: Optional path to an existing index; its entries are
            kept, except for the corpora being re-indexed, which are replaced.
    """
    if previous_json:
        with Path(previous_json).open("r") as fp:
            tmp_app_index = json.load(fp)
    else:
        tmp_app_index = {}

    # create app_index (used in frontend, left side word index)
    for corpus in corporas:
        res_hws = {}
        res_fns = {}

        # Only used for the progress message below. Collection.count() is
        # deprecated; the metadata-based estimate avoids scanning the
        # collection a second time.
        nentries = mongo.db[corpus].estimated_document_count()
        idx = 0
        for e in mongo.db[corpus].find({}):
            if "headwords" not in e:
                continue
            for hw in e["headwords"]:
                res_hws[hw] = res_hws.get(hw, 0) + 1
            if "functors" not in e:
                continue
            for fn in e["functors"]:
                res_fns[fn] = res_fns.get(fn, 0) + 1
            # Entries skipped by the `continue`s above are not counted.
            idx += 1
            if idx % 10000 == 0:
                print("indexing {}: {}/{}".format(
                    corpus, idx, nentries))

        alphabetical = {}
        for k, e in res_hws.items():
            if not k:
                # An empty headword has no first letter to file it under.
                continue
            alphabetical.setdefault(k[0].lower(), []).append((k, e))

        # Filter after grouping so a letter whose words are all banned still
        # appears in the index with an empty list.
        for letter, words in alphabetical.items():
            filtered_words = [x for x in words if not _is_banned(x[0])]
            alphabetical[letter] = sorted(filtered_words, key=lambda x: x[0])

        tmp_app_index[corpus] = {
            "words": alphabetical,
            "functors": sorted(res_fns.items(), key=lambda x: x[0]),
        }

    with Path(appindex_json).open("w") as fp:
        json.dump(tmp_app_index, fp)
|
||||
|
||||
if __name__ == "__main__":
    # Command-line entry point: load the SSKJ word list and (re)build the
    # app index for the corpora listed below.
    print("Starting build_app_index.py main()")
    aparser = argparse.ArgumentParser(
        description="Arguments for build_app_index.py")
    aparser.add_argument("--previous-json", type=str, default=None)
    # Both paths are needed below; marking them required gives a usage error
    # instead of a TypeError from Path(None).
    aparser.add_argument("--appindex-json", type=str, required=True)
    aparser.add_argument("--sskj-wordlist", type=str, required=True)
    args = aparser.parse_args()

    corporas = ['gigafida']

    # Must stay a module-level name: _is_banned() reads it as a global.
    with Path(args.sskj_wordlist).open("r") as fp:
        sskj_wordlist = json.load(fp)

    prepare_app_index(args.appindex_json, corporas, args.previous_json)
|
|
@ -1,2 +1,2 @@
|
|||
MONGO_URI = "mongodb://sizif:p5e3r4u8t7@my_mongo:27017/valdb"
|
||||
MONGO_URI = "mongodb://user:user@0.0.0.0:27017/valdb"
|
||||
MONGO_AUTH_SOURCE = 'admin'
|
||||
|
|
18
src/backend_flask/get_sentence_ids.py
Normal file
18
src/backend_flask/get_sentence_ids.py
Normal file
|
@ -0,0 +1,18 @@
|
|||
import json
|
||||
import os
|
||||
|
||||
# Collect the top-level keys of every *.json file in `input_dir` and store
# them in one JSON file, keyed by the part of the file name before the
# first '-'.
input_dir = "/media/luka/Portable Disk/Datasets/gigafida_jos/final_json"
output_file = "../../all_sentences.json"

results = {}
filenames = os.listdir(input_dir)
total = len(filenames)
for i, filename in enumerate(filenames, start=1):
    if filename.endswith(".json"):
        with open(os.path.join(input_dir, filename), encoding="utf-8") as json_file:
            data = json.load(json_file)
        results[filename.split('-')[0]] = list(data.keys())
    # Scale to 0-100 so the number matches the "%" label.
    print('Progress: %.2f %%' % (100 * i / total))

with open(output_file, 'w') as f:
    json.dump(results, f)
|
|
@ -1,3 +1,3 @@
|
|||
{
|
||||
"api_addr": "http://193.2.76.103:8084"
|
||||
"api_addr": "http://0.0.0.0:8084"
|
||||
}
|
||||
|
|
28
src/frontend_vue/package-lock.json
generated
28
src/frontend_vue/package-lock.json
generated
|
@ -3513,14 +3513,12 @@
|
|||
"balanced-match": {
|
||||
"version": "1.0.0",
|
||||
"bundled": true,
|
||||
"dev": true,
|
||||
"optional": true
|
||||
"dev": true
|
||||
},
|
||||
"brace-expansion": {
|
||||
"version": "1.1.11",
|
||||
"bundled": true,
|
||||
"dev": true,
|
||||
"optional": true,
|
||||
"requires": {
|
||||
"balanced-match": "^1.0.0",
|
||||
"concat-map": "0.0.1"
|
||||
|
@ -3535,20 +3533,17 @@
|
|||
"code-point-at": {
|
||||
"version": "1.1.0",
|
||||
"bundled": true,
|
||||
"dev": true,
|
||||
"optional": true
|
||||
"dev": true
|
||||
},
|
||||
"concat-map": {
|
||||
"version": "0.0.1",
|
||||
"bundled": true,
|
||||
"dev": true,
|
||||
"optional": true
|
||||
"dev": true
|
||||
},
|
||||
"console-control-strings": {
|
||||
"version": "1.1.0",
|
||||
"bundled": true,
|
||||
"dev": true,
|
||||
"optional": true
|
||||
"dev": true
|
||||
},
|
||||
"core-util-is": {
|
||||
"version": "1.0.2",
|
||||
|
@ -3665,8 +3660,7 @@
|
|||
"inherits": {
|
||||
"version": "2.0.3",
|
||||
"bundled": true,
|
||||
"dev": true,
|
||||
"optional": true
|
||||
"dev": true
|
||||
},
|
||||
"ini": {
|
||||
"version": "1.3.5",
|
||||
|
@ -3678,7 +3672,6 @@
|
|||
"version": "1.0.0",
|
||||
"bundled": true,
|
||||
"dev": true,
|
||||
"optional": true,
|
||||
"requires": {
|
||||
"number-is-nan": "^1.0.0"
|
||||
}
|
||||
|
@ -3693,7 +3686,6 @@
|
|||
"version": "3.0.4",
|
||||
"bundled": true,
|
||||
"dev": true,
|
||||
"optional": true,
|
||||
"requires": {
|
||||
"brace-expansion": "^1.1.7"
|
||||
}
|
||||
|
@ -3701,14 +3693,12 @@
|
|||
"minimist": {
|
||||
"version": "0.0.8",
|
||||
"bundled": true,
|
||||
"dev": true,
|
||||
"optional": true
|
||||
"dev": true
|
||||
},
|
||||
"minipass": {
|
||||
"version": "2.3.5",
|
||||
"bundled": true,
|
||||
"dev": true,
|
||||
"optional": true,
|
||||
"requires": {
|
||||
"safe-buffer": "^5.1.2",
|
||||
"yallist": "^3.0.0"
|
||||
|
@ -3727,7 +3717,6 @@
|
|||
"version": "0.5.1",
|
||||
"bundled": true,
|
||||
"dev": true,
|
||||
"optional": true,
|
||||
"requires": {
|
||||
"minimist": "0.0.8"
|
||||
}
|
||||
|
@ -3808,8 +3797,7 @@
|
|||
"number-is-nan": {
|
||||
"version": "1.0.1",
|
||||
"bundled": true,
|
||||
"dev": true,
|
||||
"optional": true
|
||||
"dev": true
|
||||
},
|
||||
"object-assign": {
|
||||
"version": "4.1.1",
|
||||
|
@ -3821,7 +3809,6 @@
|
|||
"version": "1.4.0",
|
||||
"bundled": true,
|
||||
"dev": true,
|
||||
"optional": true,
|
||||
"requires": {
|
||||
"wrappy": "1"
|
||||
}
|
||||
|
@ -3943,7 +3930,6 @@
|
|||
"version": "1.0.2",
|
||||
"bundled": true,
|
||||
"dev": true,
|
||||
"optional": true,
|
||||
"requires": {
|
||||
"code-point-at": "^1.0.0",
|
||||
"is-fullwidth-code-point": "^1.0.0",
|
||||
|
|
|
@ -62,7 +62,7 @@ export default {
|
|||
name: "Nav",
|
||||
props: ["appState"],
|
||||
data() {return {
|
||||
optCorpora: ["kres", "ssj"],
|
||||
optCorpora: ["kres", "ssj", "gigafida"],
|
||||
optIndexes: [
|
||||
{key: "besede", val: "words"},
|
||||
{key: "udeleženske vloge", val: "functors"},
|
||||
|
|
|
@ -1 +1 @@
|
|||
Subproject commit 2e1d8d06b33eb4e64d3558fea2161811e81f6a28
|
||||
Subproject commit 92b3ac4ea3a73b93c25b363b5b9cb096d4d011cd
|
Loading…
Reference in New Issue
Block a user