2 Commits

Author SHA1 Message Date
bf0970a90a make sskj-senses 2019-04-28 01:03:44 +02:00
81395890ab filtering some words in index 2019-04-27 20:24:11 +02:00
16 changed files with 466 additions and 10 deletions

1
.gitignore vendored
View File

@@ -1,4 +1,5 @@
data/samples/ data/samples/
data/wordlist.json
*egg-info/ *egg-info/
*.pyc *.pyc
src/frontend_vue/node_modules/ src/frontend_vue/node_modules/

View File

@@ -11,9 +11,15 @@ MAKE_ROOT = $(shell pwd)
# SSJ_FILE = "$(MAKE_ROOT)/data/samples/ssj_xml/ssj500k-sl.body.sample.xml" # SSJ_FILE = "$(MAKE_ROOT)/data/samples/ssj_xml/ssj500k-sl.body.sample.xml"
SSJ_FILE = "$(MAKE_ROOT)/data/ssj_file_link" SSJ_FILE = "$(MAKE_ROOT)/data/ssj_file_link"
# KRES_FOLDER = "$(MAKE_ROOT)/data/samples/kres_xml" # KRES_FOLDER = "$(MAKE_ROOT)/data/samples/kres_xml"
KRES_FOLDER = "$(MAKE_ROOT)/data/kres_xml_folder_link" # KRES_FOLDER = "$(MAKE_ROOT)/data/kres_xml_folder_link"
KRES_FOLDER = "/home/kristjan/kres_data/payload/kres_xml"
# KRES_SRL_FOLDER = "$(MAKE_ROOT)/data/samples/kres_srl_json" # KRES_SRL_FOLDER = "$(MAKE_ROOT)/data/samples/kres_srl_json"
KRES_SRL_FOLDER = "$(MAKE_ROOT)/data/kres_json_folder_link" # KRES_SRL_FOLDER = "$(MAKE_ROOT)/data/kres_json_folder_link"
KRES_SRL_FOLDER = "/home/kristjan/kres_data/payload/kres_json"
# This file ships with the source code: unpack data/sskj_data.tar.gz and name the result exactly as configured below.
SSKJ_WORDLIST = "$(MAKE_ROOT)/data/wordlist.json"
SSKJ_JSON = "$(MAKE_ROOT)/data/sskj_senses.json"
OUTPUT = "db" OUTPUT = "db"
# OUTPUT = "file" # OUTPUT = "file"
@@ -24,7 +30,7 @@ DBADDR = "0.0.0.0:27017" # don't use localhost
# create it from env.default # create it from env.default
include env.local include env.local
N_CORES = 5 N_CORES = 3
# insert kres files into database in chunks, for fewer connections # insert kres files into database in chunks, for fewer connections
KRES_CHUNK_SIZE = 30 KRES_CHUNK_SIZE = 30
@@ -59,6 +65,7 @@ python-env:
python-env-install: python-env-install:
pip3 install -e src/pkg/cjvt-corpusparser/. pip3 install -e src/pkg/cjvt-corpusparser/.
pip3 install -e src/pkg/valency/. pip3 install -e src/pkg/valency/.
pip3 install -e src/pkg/seqparser/.
# from inside python-env container: # from inside python-env container:
data/samples: data/samples:
@@ -99,10 +106,12 @@ frontend-prod:
## Backend ## Backend
# runs once and exits before the app starts # runs once and exits before the app starts
# need to extract ./data/sskj_data.tar.gz first
backend-prepare-db: backend-prepare-db:
cd ./src/backend_flask; python3 app.py \ cd ./src/backend_flask; python3 app.py \
--config-file ./conf_files/dev_conf.yaml \ --config-file ./conf_files/dev_conf.yaml \
--dbuser $(DB_USR_USER) --dbpass $(DB_USR_PASS) --dbaddr $(DBADDR) \ --dbuser $(DB_USR_USER) --dbpass $(DB_USR_PASS) --dbaddr $(DBADDR) \
--sskj-wordlist $(SSKJ_WORDLIST) \
--prepare-db --prepare-db
backend-dev: backend-dev:
@@ -114,3 +123,12 @@ backend-prod:
cd ./src/backend_flask; python3 app.py \ cd ./src/backend_flask; python3 app.py \
--config-file ./conf_files/prod_conf.yaml \ --config-file ./conf_files/prod_conf.yaml \
--dbuser $(DB_USR_USER) --dbpass $(DB_USR_PASS) --dbaddr $(DBADDR) --dbuser $(DB_USR_USER) --dbpass $(DB_USR_PASS) --dbaddr $(DBADDR)
## add sskj senses to db (generated with pkg/seqparser)
sskj-senses:
python3 ./src/pkg/seqparser/seqparser/main.py \
--sskj-json $(SSKJ_JSON) \
--operation "senses_to_db" \
--dbaddr $(DBADDR) \
--dbuser $(DB_USR_USER) \
--dbpass $(DB_USR_PASS)

View File

@@ -1 +0,0 @@
/home/kristjan/kres_data/payload/kres_json/

View File

@@ -1 +0,0 @@
/home/kristjan/kres_mount/kres_parsed/tei/

BIN
data/sskj_data.tar.gz Normal file

Binary file not shown.

1
data/sskj_senses.json Normal file

File diff suppressed because one or more lines are too long

View File

@@ -38,6 +38,8 @@ SENSEMAP_COLL = "sensemap"
# pre-generated data (gui leftside word index) # pre-generated data (gui leftside word index)
CORPORA = ["ssj", "kres"] CORPORA = ["ssj", "kres"]
app_index = None app_index = None
sskj_wordlist = None # used by _is_banned(hw)
BANNED_HEADWORDS = ["biti"]
log = logging.getLogger(__name__) log = logging.getLogger(__name__)
valdb = None valdb = None
@@ -322,6 +324,8 @@ def api_get_functor_frames():
# SENSES ----------------------------. # SENSES ----------------------------.
# ssj_id is legacy notation, read
# it as general sentence_id
@app.route("/api/senses/get") @app.route("/api/senses/get")
def api_senses_get(): def api_senses_get():
@@ -407,6 +411,8 @@ def api_senses_update():
ns["date"] = tmp_dt ns["date"] = tmp_dt
id_map[frontend_sense_id] = new_sense_id id_map[frontend_sense_id] = new_sense_id
print(ns)
# insert into db # insert into db
valdb[SENSES_COLL].insert(ns) valdb[SENSES_COLL].insert(ns)
@@ -430,6 +436,18 @@ def api_senses_update():
# APP PREFLIGHT ---------------------. # APP PREFLIGHT ---------------------.
def _is_banned(hw):
    """Return True if headword *hw* should be hidden from the word index.

    A headword is kept only when it (or its reflexive form ``"<hw> se"``)
    appears in the SSKJ wordlist and it is not explicitly blacklisted.
    NOTE(review): relies on the module globals ``sskj_wordlist`` (loaded at
    startup from --sskj-wordlist), ``BANNED_HEADWORDS`` and ``log``.
    """
    banned = True  # default: hide anything we cannot vouch for
    if not hw:
        # BUG FIX: an empty headword previously raised IndexError on hw[-1].
        return banned
    if hw in BANNED_HEADWORDS:
        banned = True
    elif hw in sskj_wordlist["wordlist"]:
        banned = False
    elif (hw + " se") in sskj_wordlist["wordlist"]:
        banned = False
    # adjective headwords carry a trailing "_" (see Seqparser.parse_line);
    # log those for debugging the filter
    if hw[-1] == "_":
        log.debug("hw: {}, banned: {}".format(hw, banned))
    return banned
def prepare_app_index(): def prepare_app_index():
log.info("[*] preparing app_index") log.info("[*] preparing app_index")
@@ -462,8 +480,10 @@ def prepare_app_index():
else: else:
alphabetical[fst] = [(k, e)] alphabetical[fst] = [(k, e)]
for k, e in alphabetical.items(): for letter, words in alphabetical.items():
alphabetical[k] = sorted(e, key=lambda x: x[0]) filtered_words = [x for x in words if not _is_banned(x[0])]
alphabetical[letter] = sorted(filtered_words, key=lambda x: x[0])
tmp_app_index[corpus]["words"] = alphabetical tmp_app_index[corpus]["words"] = alphabetical
functors = [(k, e) for (k, e) in res_fns.items()] functors = [(k, e) for (k, e) in res_fns.items()]
@@ -483,6 +503,7 @@ if __name__ == "__main__":
aparser.add_argument("--dbuser", type=str) aparser.add_argument("--dbuser", type=str)
aparser.add_argument("--dbpass", type=str) aparser.add_argument("--dbpass", type=str)
aparser.add_argument("--dbaddr", type=str) aparser.add_argument("--dbaddr", type=str)
aparser.add_argument("--sskj-wordlist", type=str)
args = aparser.parse_args() args = aparser.parse_args()
config = None config = None
@@ -507,6 +528,8 @@ if __name__ == "__main__":
valdb = client.valdb valdb = client.valdb
if args.prepare_db: if args.prepare_db:
with Path(args.sskj_wordlist).open("r") as fp:
sskj_wordlist = json.load(fp)
prepare_app_index() prepare_app_index()
sys.exit() sys.exit()

View File

@@ -9,7 +9,7 @@
<LFunctors v-else></LFunctors> <LFunctors v-else></LFunctors>
</div> </div>
<div class="col-sm-10"> <div class="col-sm-10">
<p v-if="this.$root.store.api_error != null"> <p class="text-danger" v-if="this.$root.store.api_error != null">
{{ this.$root.store.api_error }} {{ this.$root.store.api_error }}
</p> </p>
<router-view></router-view> <router-view></router-view>

View File

@@ -1,6 +1,11 @@
<template> <template>
<!--in case of error-->
<div v-if="this.$root.store.api_error != null">
</div>
<!--load mode--> <!--load mode-->
<div v-if="state === 'loading'"> <div v-else-if="state === 'loading'">
<pulse-loader :color="loader_color"></pulse-loader> <pulse-loader :color="loader_color"></pulse-loader>
</div> </div>

View File

@@ -3,7 +3,9 @@
<b-navbar toggleable="md" type="light" variant="light"> <b-navbar toggleable="md" type="light" variant="light">
<b-navbar-toggle target="nav_collapse"></b-navbar-toggle> <b-navbar-toggle target="nav_collapse"></b-navbar-toggle>
<!--b-navbar-brand>Vezljivostni vzorci slovenskih glagolov</b-navbar-brand--> <!--b-navbar-brand>Vezljivostni vzorci slovenskih glagolov</b-navbar-brand-->
<b-navbar-brand>VEZLJIVOSTNI VZORCI SLOVENSKIH GLAGOLOV</b-navbar-brand> <b-navbar-brand class=cursorpointer v-on:click="goHome">
VEZLJIVOSTNI VZORCI SLOVENSKIH GLAGOLOV
</b-navbar-brand>
<b-collapse is-nav id="nav_collapse"> <b-collapse is-nav id="nav_collapse">
<b-navbar-nav> <b-navbar-nav>
@@ -101,6 +103,9 @@ export default {
this.$router.push({ this.$router.push({
name: "Home" name: "Home"
}) })
},
goHome() {
this.$router.replace({path: "/home"})
} }
} }
} }
@@ -116,4 +121,7 @@ nav a {
nav a:hover { nav a:hover {
color: white; color: white;
} }
.cursorpointer {
cursor: pointer;
}
</style> </style>

View File

@@ -0,0 +1,9 @@
SSKJ_HTML = /home/kristjan/git/diploma/data/sskj/sskj2_v1.html
SSKJ_JSON = "./sskj_senses.json"
WORDLIST = "./wordlist.json"
gen_json_files:
cd seqparser; python3 main.py \
--sskj-html=$(SSKJ_HTML) \
--sskj-json=$(SSKJ_JSON) \
--wordlist=$(WORDLIST)

View File

@@ -0,0 +1 @@
bs4

View File

@@ -0,0 +1,313 @@
from bs4 import BeautifulSoup as BS
import re
from collections import defaultdict
from time import time
import pickle
import json
from copy import deepcopy as DC
from pathlib import Path
# Match sense ordinals ("1.", "2.", ...) optionally padded with spaces.
rord = re.compile(r"^ *[0-9]+\. *$")
# Translation table that strips accents from characters in ``intab``,
# mapping each to the corresponding character in ``outtb``.
# NOTE: "č" maps to itself, i.e. it is deliberately preserved.
intab = "ÁÉÍÓÚàáäçèéêìíîñòóôöùúüčŔŕ"
outtb = "AEIOUaaaceeeiiinoooouuučRr"
transtab = str.maketrans(intab, outtb)
def d_time(fun):
    """Timing decorator: print how long *fun* ran and pass its result through.

    BUG FIX: the original wrapper discarded the wrapped function's return
    value, so every ``@d_time``-decorated function returned ``None``.
    """
    from functools import wraps

    @wraps(fun)
    def wrapper(*args, **kwargs):
        tstart = time()
        result = fun(*args, **kwargs)
        duration = time() - tstart
        print("Function {} ran for {:.2f} s.".format(
            fun.__name__, duration))
        return result
    return wrapper


class Seqparser:
    """Parser for the SSKJ2 HTML dump.

    Extracts verb and adjective entries with their senses and produces
    JSON / pickle artifacts consumed by the valency backend.
    """

    def __init__(self, sskj_file=None):
        # BUG FIX: the original signature was ``__init__(sskj_file)`` —
        # missing ``self`` — which only worked because ``Seqparser()`` bound
        # the instance to ``sskj_file``. The parameter is kept (now truly
        # optional) for backward compatibility; it is currently unused.
        pass

    @d_time
    def html_to_verb_adj_json(self, infile, outfile):
        """Parse the SSKJ HTML dump *infile* line by line and dump a JSON
        mapping of cleaned headword -> list of parsed entries to *outfile*."""
        out_dict = defaultdict(list)
        with Path(infile).open("rb") as fp:
            for line in fp:
                data = self.parse_line(line)
                if data is None:
                    continue
                out_dict[data["izt_clean"]].append(data)
        with Path(outfile).open("w") as fp:
            json.dump(dict(out_dict), fp)

    @d_time
    def generate_sskj_wordlist(self, in_json_file, out_wordlist):
        """Write ``{"wordlist": [...]}`` containing all headwords (keys)
        of the JSON produced by :meth:`html_to_verb_adj_json`."""
        wordlist = None
        with Path(in_json_file).open("r") as fp:
            jdata = json.load(fp)
            wordlist = list(jdata.keys())
        with Path(out_wordlist).open("w") as fp:
            json.dump({"wordlist": wordlist}, fp)

    # main functions
    def html_to_raw_pickle(self, sskj_html_filepath, raw_pickle_filepath):
        """Parse the full HTML dump and pickle the entries as a JSON string."""
        entries = dict(self.parse_file(sskj_html_filepath, self.parse_line))
        print("entries len: " + str(len(entries)))
        with open(raw_pickle_filepath, "wb") as f:
            # stored as a JSON string inside the pickle (legacy format,
            # see load_raw_pickle)
            tmpstr = json.dumps(dict(entries))
            pickle.dump(tmpstr, f)

    # debugging
    def raw_pickle_to_parsed_pickle(
        self, raw_pickle_filepath, parsed_pickle_filepath,
        se_list_filepath
    ):
        """Second pipeline stage: load the raw pickle, extract the reflexive
        (" se") headword list, normalize headwords and reorganize senses."""
        data = self.load_raw_pickle(raw_pickle_filepath)
        print("raw_pickle data len: " + str(len(data)))
        se_list = self.gen_se_list(data)
        print("se_list len: " + str(len(se_list)))
        with open(se_list_filepath, "wb") as f:
            pickle.dump(se_list, f)
        data1 = self.remove_se(data)
        data2 = self.reorganize(data1, se_list)
        print("data2 len: " + str(len(data2.keys())))
        with open(parsed_pickle_filepath, "wb") as f:
            pickle.dump(data2, f)

    # helper html reading functions
    def parse_file(self, path, f_parse_line):
        """Apply *f_parse_line* to every line of *path*; group the non-None
        results by their ``izt_clean`` key. Returns a defaultdict(list)."""
        tstart = time()
        entries = defaultdict(list)
        with open(path, "r") as f:
            for line in f:
                data = f_parse_line(line)
                if data is not None:
                    entries[data["izt_clean"]].append(data)
        print("parse_file({}) in {:.2f}s".format(path, time() - tstart))
        return entries

    def parse_line(self, line):
        """Parse one HTML line (one dictionary entry).

        Returns a dict with keys ``izt`` (headword), ``izt_clean``
        (de-accented, lowercased; adjectives get a trailing "_") and
        ``senses`` (sense id -> list of ("razl"|"sopo", text) tuples),
        or ``None`` when the entry is not a verb or adjective.
        """
        def helper_bv_set(g_or_p):
            # Record part of speech: "G" (verb) or "P" (adjective);
            # warn (by printing the line) on conflicting assignments.
            if g_or_p not in ["G", "P"]:
                print("Err g_or_p.")
                exit(1)
            if data.get("bv") is not None:
                if data["bv"] != g_or_p:
                    print(str(line))
                    # exit(1)
            data["bv"] = g_or_p

        data = {
            "izt": "",
            "izt_clean": "",
            "senses": defaultdict(list)
        }
        soup = BS(line, "html.parser")
        current_sense_id = "0"
        for span in soup.find_all("span"):
            # sense id
            if span.string is not None:
                rmatch = rord.match(span.string)
                if rmatch is not None:
                    current_sense_id = rmatch.group().strip()
            title = span.attrs.get("title")
            if title is not None:
                title = title.lower()
                # only verbs and adjectives
                if "glagol" in title:
                    helper_bv_set("G")
                    data["bv_full"] = title
                elif "pridevn" in title:
                    helper_bv_set("P")
                    data["bv_full"] = title
                # žšč
                if title == "iztočnica":
                    data["izt"] = span.string
                    data["izt_clean"] = span.string.translate(transtab).lower()
                # sense description
                if title == "razlaga" and span.string is not None:
                    data["senses"][current_sense_id].append(
                        ("razl", span.string))
                    # "pridevnik od" in the gloss marks an adjective
                    if "pridevnik od" in span.string:
                        helper_bv_set("P")
                if title == "sopomenka":
                    subspan = span.find_all("a")[0]
                    if subspan.string is not None:
                        data["senses"][current_sense_id].append(
                            ("sopo", subspan.string))
        # save verbs and adjectives
        if (
            ("bv" not in data) or
            (data["bv"] != "P" and data["bv"] != "G")
        ):
            return None
        # sanity check
        if data["bv"] == "P" and " se" in data["izt_clean"]:
            print(data)
            exit(1)
        # append _ to adjective keywords
        if data["bv"] == "P":
            data["izt_clean"] = data["izt_clean"] + "_"
        # cleanup: bv/bv_full were only needed during parsing
        if "bv" not in data:
            print("Should not be here (no bv).")
            exit(1)
        del data["bv"]
        if "bv_full" in data:
            del data["bv_full"]
        return data

    # helper functions
    def load_raw_pickle(self, raw_pickle_filepath):
        """Inverse of html_to_raw_pickle: unpickle the JSON string and
        decode it back into a dict."""
        with open(raw_pickle_filepath, "rb") as f:
            tmpstr = pickle.load(f)
        return json.loads(tmpstr)

    def helper_loop(self, data, fnc):
        """Apply *fnc* to every entry in the hw -> [entries] mapping."""
        for k, lst in data.items():
            for el in lst:
                fnc(el)

    def gen_se_list(self, data):
        """Return the sorted list of headwords that only exist in their
        reflexive ("<hw> se") form."""
        def fnc1(el):
            ic = el["izt_clean"]
            if " se" in ic:
                se_list.append(ic)

        def fnc2(el):
            ic = el["izt_clean"]
            if ic in se_pruned:
                se_pruned.remove(ic)

        # hw entries that only exist with " se"
        se_list = []
        self.helper_loop(data, fnc1)
        se_pruned = set([hw.split(" se")[0] for hw in se_list])
        self.helper_loop(data, fnc2)
        return sorted(list(se_pruned))

    def remove_se(self, data):
        """Strip the " se" suffix from reflexive headwords, regrouping the
        affected entries under the base headword. Non-reflexive entries are
        not carried over (matches original behavior)."""
        def fnc1(el):
            nel = DC(el)
            ic = nel["izt_clean"]
            if " se" in ic:
                nic = ic.split(" se")[0]
                nel["izt_clean"] = nic
                data_new[nel["izt_clean"]].append(nel)

        data_new = defaultdict(list)
        self.helper_loop(data, fnc1)
        return dict(data_new)

    def reorganize(self, data, se_list):
        # some hw entries have several headwords,
        # some senses have subsenses
        # index everything, make 1 object per hw
        def helper_prune(sense_str):
            # remove space padding
            sense_str = sense_str.strip()
            if len(sense_str) == 1:
                return sense_str
            # remove banned characters from string ending
            banned = ": ; . , - ! ?".split(" ")
            if sense_str[-1] in banned:
                return sense_str[:-1]
            return sense_str

        data_new = {}
        for k, lst in data.items():
            new_el = {
                "hw": k,
                "has_se": k in se_list,
                "senses": []
            }
            # if there is a single hw entry, hw_id is 0
            if len(lst) == 1:
                homonym_id = -1
            else:
                homonym_id = 0
            # loop homonyms
            for el in lst:
                homonym_id += 1
                # loop top lvl sense ids
                for sense_id, sens_lst in el["senses"].items():
                    # loop subsenses
                    for i, sens in enumerate(sens_lst):
                        nsid = sense_id.split(".")[0]
                        if len(sens_lst) == 1:
                            nsid += "-0"
                        else:
                            nsid += ("-" + str(i + 1))
                        new_sense = {
                            "homonym_id": homonym_id,
                            # sense_id: sense_id-subsense_id
                            "sense_id": nsid,
                            "sense_type": sens[0],
                            "sense_desc": helper_prune(sens[1]),
                        }
                        new_el["senses"].append(new_sense)
            hw = new_el["hw"]
            if hw in data_new:
                print("Shouldn't be here.")
                print(new_el)
                exit(1)
            data_new[hw] = DC(new_el)
        # check: warn about senses that lost their description
        for hw, el in data_new.items():
            for sens in el["senses"]:
                if sens["sense_desc"] is None:
                    print(sens)
        return data_new
def plst(lst):
    """Debug helper: print each element of *lst* on its own line."""
    for item in lst:
        print(item)
if __name__ == "__main__":
    # Default data locations, relative to this package directory.
    datapath = "../../../data"
    html_filepath = datapath + "/sskj/sskj2_v1.html"
    raw_pickle_filepath = datapath + "/tmp_pickles/raw_sskj.pickle"
    parsed_pickle_filepath = datapath + "/no_del_pickles/sskj_senses.pickle"
    se_list_filepath = datapath + "/no_del_pickles/se_list.pickle"

    parser = Seqparser()

    # Stage toggles: flip the literals below to select a pipeline stage.
    if True:
        print("html_to_raw_pickle({}, {})".format(
            html_filepath, raw_pickle_filepath))
        print("Big file, this might take a while (2 min).")
        t0 = time()
        parser.html_to_raw_pickle(html_filepath, raw_pickle_filepath)
        print("Finished in {:.2f}.".format(time() - t0))

    if False:
        print("raw_pickle_to_parsed_pickle({}, {}, {})".format(
            raw_pickle_filepath, parsed_pickle_filepath, se_list_filepath))
        t0 = time()
        parser.raw_pickle_to_parsed_pickle(
            raw_pickle_filepath, parsed_pickle_filepath, se_list_filepath)
        print("Finished in {:.2f}.".format(time() - t0))

    print("Done.")

View File

View File

@@ -0,0 +1,68 @@
from Seqparser import Seqparser
import argparse
import sys
from pathlib import Path
import json
import datetime
import hashlib
from pymongo import MongoClient
# Author name under which imported SSKJ senses are stored in the senses
# collection.
SSKJ_USER = "sskj2"

if __name__ == "__main__":
    aparser = argparse.ArgumentParser()
    aparser.add_argument("--sskj-html", type=str)
    aparser.add_argument("--sskj-json", type=str)
    aparser.add_argument("--wordlist", type=str)
    aparser.add_argument("--operation", type=str)
    aparser.add_argument("--dbaddr", type=str)
    aparser.add_argument("--dbuser", type=str)
    aparser.add_argument("--dbpass", type=str)
    args = aparser.parse_args()

    # Parse the SSKJ HTML dump into the senses JSON file.
    if args.operation == "gen_sskj_json":
        sqp = Seqparser()
        sqp.html_to_verb_adj_json(args.sskj_html, args.sskj_json)
        sys.exit()

    # Derive the headword wordlist from the senses JSON file.
    if args.operation == "gen_wordlist":
        sqp = Seqparser()
        # BUG FIX: was ``args.sskj_senses`` — argparse defines no
        # ``--sskj-senses`` flag, so this raised AttributeError at runtime.
        # The wordlist is generated from the --sskj-json file.
        sqp.generate_sskj_wordlist(args.sskj_json, args.wordlist)
        sys.exit()

    # Import the parsed senses into the MongoDB "senses" collection.
    if args.operation == "senses_to_db":
        db_entries = []
        tmp_dt = datetime.datetime.utcnow()
        with Path(args.sskj_json).open("r") as fp:
            jdata = json.load(fp)
        for hw, entry in jdata.items():
            # NOTE(review): only the first homonym (entry[0]) and the first
            # tuple of each sense list are imported — confirm intentional.
            for key, sense in entry[0]["senses"].items():
                desc = sense[0][1]
                if sense[0][0] == "razl":
                    desc = desc[:-1]  # for some reason, descriptions contain a ':'
                else:
                    desc = sense[0][0] + ": " + desc
                tmp_entry = {
                    "desc": desc,
                    "hw": hw,
                    "author": SSKJ_USER
                }
                # Deterministic id derived from the entry content.
                tmp_entry["sense_id"] = "{}-{}".format(
                    SSKJ_USER,
                    hashlib.sha256(str(tmp_entry).encode("utf-8")).hexdigest()[:10]
                )
                tmp_entry["date"] = tmp_dt
                db_entries.append(tmp_entry)
        print(len(db_entries))
        # db login
        client = MongoClient(
            "mongodb://{}".format(args.dbaddr),
            username=args.dbuser,
            password=args.dbpass,
            authSource="valdb",
            authMechanism='SCRAM-SHA-1'
        )
        valdb = client.valdb
        valdb.senses.insert_many(db_entries)

View File

@@ -0,0 +1,11 @@
# Packaging metadata for the seqparser package; installed in editable mode
# via ``pip3 install -e src/pkg/seqparser/.`` (see the repository Makefile).
from setuptools import setup
setup(
name='seqparser',
version='0.0.1',
description='Parser for sskj2 html dump.',
author='Kristjan Voje',
author_email='kristjan.voje@gmail.com',
license='MIT',
packages=['seqparser'],
)