forked from kristjan/cjvt-valency
filtering some words in index
This commit is contained in:
parent
fd94627fdb
commit
81395890ab
.gitignore (vendored) | 1
@@ -1,4 +1,5 @@
data/samples/
data/wordlist.json
*egg-info/
*.pyc
src/frontend_vue/node_modules/
Makefile | 13
@@ -11,9 +11,14 @@ MAKE_ROOT = $(shell pwd)
# SSJ_FILE = "$(MAKE_ROOT)/data/samples/ssj_xml/ssj500k-sl.body.sample.xml"
SSJ_FILE = "$(MAKE_ROOT)/data/ssj_file_link"
# KRES_FOLDER = "$(MAKE_ROOT)/data/samples/kres_xml"
KRES_FOLDER = "$(MAKE_ROOT)/data/kres_xml_folder_link"
# KRES_FOLDER = "$(MAKE_ROOT)/data/kres_xml_folder_link"
KRES_FOLDER = "/home/kristjan/kres_data/payload/kres_xml"
# KRES_SRL_FOLDER = "$(MAKE_ROOT)/data/samples/kres_srl_json"
KRES_SRL_FOLDER = "$(MAKE_ROOT)/data/kres_json_folder_link"
# KRES_SRL_FOLDER = "$(MAKE_ROOT)/data/kres_json_folder_link"
KRES_SRL_FOLDER = "/home/kristjan/kres_data/payload/kres_json"

# This file comes with the source code. Make sure you unpack it and name it right.
SSKJ_WORDLIST = "$(MAKE_ROOT)/data/wordlist.json"

OUTPUT = "db"
# OUTPUT = "file"
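The comment above says the SSKJ wordlist ships with the sources (this commit adds data/wordlist.tar.gz) and has to be unpacked to the path given by SSKJ_WORDLIST. As a minimal sketch, not part of the commit, the unpacked file can be checked for the {"wordlist": [...]} shape that seqparser's generate_sskj_wordlist writes and that app.py later loads; the path is taken from the Makefile above:

import json
from pathlib import Path

# hypothetical check, not part of this commit; path is SSKJ_WORDLIST from the Makefile
wordlist_path = Path("data/wordlist.json")

with wordlist_path.open("r") as fp:
    data = json.load(fp)

# generate_sskj_wordlist() dumps {"wordlist": [...]}, so expect that shape here
assert isinstance(data.get("wordlist"), list), "unexpected wordlist.json format"
print("wordlist entries:", len(data["wordlist"]))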
@@ -24,7 +29,7 @@ DBADDR = "0.0.0.0:27017"  # don't use localhost
# create it from env.default
include env.local

N_CORES = 5
N_CORES = 3
# insert kres files into database in chunks, for fewer connections
KRES_CHUNK_SIZE = 30

@@ -59,6 +64,7 @@ python-env:
python-env-install:
	pip3 install -e src/pkg/cjvt-corpusparser/.
	pip3 install -e src/pkg/valency/.
	pip3 install -e src/pkg/seqparser/.

# from inside python-env container:
data/samples:

@@ -103,6 +109,7 @@ backend-prepare-db:
	cd ./src/backend_flask; python3 app.py \
		--config-file ./conf_files/dev_conf.yaml \
		--dbuser $(DB_USR_USER) --dbpass $(DB_USR_PASS) --dbaddr $(DBADDR) \
		--sskj-wordlist $(SSKJ_WORDLIST) \
		--prepare-db

backend-dev:
@@ -1 +0,0 @@
/home/kristjan/kres_data/payload/kres_json/

@@ -1 +0,0 @@
/home/kristjan/kres_mount/kres_parsed/tei/
data/wordlist.tar.gz | BIN (new file; binary not shown)
src/backend_flask/app.py
@@ -38,6 +38,8 @@ SENSEMAP_COLL = "sensemap"
# pre-generated data (gui leftside word index)
CORPORA = ["ssj", "kres"]
app_index = None
sskj_wordlist = None  # used by _is_banned(hw)
BANNED_HEADWORDS = ["biti"]

log = logging.getLogger(__name__)
valdb = None
@@ -430,6 +432,18 @@ def api_senses_update():


# APP PREFLIGHT ---------------------.
def _is_banned(hw):
    banned = True
    if hw in BANNED_HEADWORDS:
        banned = True
    elif hw in sskj_wordlist["wordlist"]:
        banned = False
    elif (hw + " se") in sskj_wordlist["wordlist"]:
        banned = False

    if banned:
        log.debug("Banned headword: {}".format(hw))
    return banned

def prepare_app_index():
    log.info("[*] preparing app_index")
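The new _is_banned(hw) helper treats a headword as banned unless it, or its reflexive form "<hw> se", appears in the SSKJ wordlist, and it always bans anything listed in BANNED_HEADWORDS; the next hunk uses it to drop such headwords from the left-side word index. A standalone sketch of that rule, with a toy wordlist in place of the JSON loaded from --sskj-wordlist:

# standalone sketch of the filtering rule above; toy data, not part of the commit
BANNED_HEADWORDS = ["biti"]
sskj_wordlist = {"wordlist": ["delati", "bati se"]}

def _is_banned(hw):
    banned = True
    if hw in BANNED_HEADWORDS:
        banned = True
    elif hw in sskj_wordlist["wordlist"]:
        banned = False
    elif (hw + " se") in sskj_wordlist["wordlist"]:
        banned = False
    return banned

print(_is_banned("delati"))  # False: listed directly in the wordlist
print(_is_banned("bati"))    # False: "bati se" is in the wordlist
print(_is_banned("biti"))    # True: explicitly banned
print(_is_banned("xyz"))     # True: not in the wordlist at all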
@@ -462,8 +476,10 @@ def prepare_app_index():
            else:
                alphabetical[fst] = [(k, e)]

        for k, e in alphabetical.items():
            alphabetical[k] = sorted(e, key=lambda x: x[0])
        for letter, words in alphabetical.items():
            filtered_words = [x for x in words if not _is_banned(x[0])]
            alphabetical[letter] = sorted(filtered_words, key=lambda x: x[0])

        tmp_app_index[corpus]["words"] = alphabetical

        functors = [(k, e) for (k, e) in res_fns.items()]

@@ -483,6 +499,7 @@ if __name__ == "__main__":
    aparser.add_argument("--dbuser", type=str)
    aparser.add_argument("--dbpass", type=str)
    aparser.add_argument("--dbaddr", type=str)
    aparser.add_argument("--sskj-wordlist", type=str)
    args = aparser.parse_args()

    config = None

@@ -507,6 +524,8 @@ if __name__ == "__main__":
    valdb = client.valdb

    if args.prepare_db:
        with Path(args.sskj_wordlist).open("r") as fp:
            sskj_wordlist = json.load(fp)
        prepare_app_index()
        sys.exit()
@@ -9,7 +9,7 @@
        <LFunctors v-else></LFunctors>
      </div>
      <div class="col-sm-10">
        <p v-if="this.$root.store.api_error != null">
        <p class="text-danger" v-if="this.$root.store.api_error != null">
          {{ this.$root.store.api_error }}
        </p>
        <router-view></router-view>
@@ -1,6 +1,11 @@
<template>

  <!--in case of error-->
  <div v-if="this.$root.store.api_error != null">
  </div>

  <!--load mode-->
  <div v-if="state === 'loading'">
  <div v-else-if="state === 'loading'">
    <pulse-loader :color="loader_color"></pulse-loader>
  </div>
@@ -3,7 +3,9 @@
  <b-navbar toggleable="md" type="light" variant="light">
    <b-navbar-toggle target="nav_collapse"></b-navbar-toggle>
    <!--b-navbar-brand>Vezljivostni vzorci slovenskih glagolov</b-navbar-brand-->
    <b-navbar-brand>VEZLJIVOSTNI VZORCI SLOVENSKIH GLAGOLOV</b-navbar-brand>
    <b-navbar-brand class=cursorpointer v-on:click="goHome">
      VEZLJIVOSTNI VZORCI SLOVENSKIH GLAGOLOV
    </b-navbar-brand>
    <b-collapse is-nav id="nav_collapse">

      <b-navbar-nav>

@@ -101,6 +103,9 @@ export default {
      this.$router.push({
        name: "Home"
      })
    },
    goHome() {
      this.$router.replace({path: "/home"})
    }
  }
}

@@ -116,4 +121,7 @@ nav a {
nav a:hover {
  color: white;
}
.cursorpointer {
  cursor: pointer;
}
</style>
src/pkg/seqparser/Makefile | 9 (new file)
@@ -0,0 +1,9 @@
SSKJ_HTML = /home/kristjan/git/diploma/data/sskj/sskj2_v1.html
SSKJ_JSON = "./sskj.json"
WORDLIST = "./wordlist.json"

gen_json_files:
	cd seqparser; python3 main.py \
		--sskj-html=$(SSKJ_HTML) \
		--sskj-json=$(SSKJ_JSON) \
		--wordlist=$(WORDLIST)
src/pkg/seqparser/requirements.txt | 1 (new file)
@@ -0,0 +1 @@
bs4
src/pkg/seqparser/seqparser/Seqparser.py | 313 (new file)
@@ -0,0 +1,313 @@
from bs4 import BeautifulSoup as BS
import re
from collections import defaultdict
from time import time
import pickle
import json
from copy import deepcopy as DC
from pathlib import Path

# Match sense ordinals (1., 2., ...)
rord = re.compile(r"^ *[0-9]+\. *$")

# Get rid of accented characters.
intab = "ÁÉÍÓÚàáäçèéêìíîñòóôöùúüčŔŕ"
outtb = "AEIOUaaaceeeiiinoooouuučRr"
transtab = str.maketrans(intab, outtb)


def d_time(fun):
    def wrapper(*args, **kwargs):
        tstart = time()
        fun(*args, **kwargs)
        duration = time() - tstart
        print("Function {} ran for {:.2f} s.".format(
            fun.__name__, duration))
    return wrapper


class Seqparser:
    def __init__(self, sskj_file=None):
        pass

    @d_time
    def html_to_verb_adj_json(self, infile, outfile):
        out_dict = defaultdict(list)
        with Path(infile).open("rb") as fp:
            for line in fp:
                data = self.parse_line(line)
                if data is None: continue
                out_dict[data["izt_clean"]].append(data)
        with Path(outfile).open("w") as fp:
            json.dump(dict(out_dict), fp)

    @d_time
    def generate_sskj_wordlist(self, in_json_file, out_wordlist):
        wordlist = None
        with Path(in_json_file).open("r") as fp:
            jdata = json.load(fp)
            wordlist = list(jdata.keys())
        with Path(out_wordlist).open("w") as fp:
            json.dump({"wordlist": wordlist}, fp)

    # main functions
    def html_to_raw_pickle(self, sskj_html_filepath, raw_pickle_filepath):
        entries = dict(self.parse_file(sskj_html_filepath, self.parse_line))
        print("entries len: " + str(len(entries)))
        with open(raw_pickle_filepath, "wb") as f:
            tmpstr = json.dumps(dict(entries))
            pickle.dump(tmpstr, f)
        # debugging

    def raw_pickle_to_parsed_pickle(
        self, raw_pickle_filepath, parsed_pickle_filepath,
        se_list_filepath
    ):
        data = self.load_raw_pickle(raw_pickle_filepath)
        print("raw_pickle data len: " + str(len(data)))
        se_list = self.gen_se_list(data)
        print("se_list len: " + str(len(se_list)))
        with open(se_list_filepath, "wb") as f:
            pickle.dump(se_list, f)
        data1 = self.remove_se(data)
        data2 = self.reorganize(data1, se_list)
        print("data2 len: " + str(len(data2.keys())))
        with open(parsed_pickle_filepath, "wb") as f:
            pickle.dump(data2, f)

    # helper html reading functions
    def parse_file(self, path, f_parse_line):
        tstart = time()
        entries = defaultdict(list)
        with open(path, "r") as f:
            for line in f:
                data = f_parse_line(line)
                if data is not None:
                    entries[data["izt_clean"]].append(data)
        print("parse_file({}) in {:.2f}s".format(path, time() - tstart))
        return entries

    def parse_line(self, line):
        def helper_bv_set(g_or_p):
            if g_or_p not in ["G", "P"]:
                print("Err g_or_p.")
                exit(1)
            if data.get("bv") is not None:
                if data["bv"] != g_or_p:
                    print(str(line))
                    # exit(1)
            data["bv"] = g_or_p

        data = {
            "izt": "",
            "izt_clean": "",
            "senses": defaultdict(list)
        }
        soup = BS(line, "html.parser")

        current_sense_id = "0"
        for span in soup.find_all("span"):

            # sense id
            if span.string is not None:
                rmatch = rord.match(span.string)
                if rmatch is not None:
                    current_sense_id = rmatch.group().strip()

            title = span.attrs.get("title")
            if title is not None:
                title = title.lower()

                # only verbs and adjectives
                if "glagol" in title:
                    helper_bv_set("G")
                    data["bv_full"] = title
                elif "pridevn" in title:
                    helper_bv_set("P")
                    data["bv_full"] = title

                # žšč
                if title == "iztočnica":
                    data["izt"] = span.string
                    data["izt_clean"] = span.string.translate(transtab).lower()

                # sense description
                if title == "razlaga" and span.string is not None:
                    data["senses"][current_sense_id].append(
                        ("razl", span.string))
                    if "pridevnik od" in span.string:
                        helper_bv_set("P")

                if title == "sopomenka":
                    subspan = span.find_all("a")[0]
                    if subspan.string is not None:
                        data["senses"][current_sense_id].append(
                            ("sopo", subspan.string))

        # save verbs and adjectives
        if (
            ("bv" not in data) or
            (data["bv"] != "P" and data["bv"] != "G")
        ):
            return None

        # sanity check
        if data["bv"] == "P" and " se" in data["izt_clean"]:
            print(data)
            exit(1)

        # append _ to adjective keywords
        if data["bv"] == "P":
            data["izt_clean"] = data["izt_clean"] + "_"

        # cleanup
        if "bv" not in data:
            print("Should not be here (no bv).")
            exit(1)
        del(data["bv"])
        if "bv_full" in data:
            del(data["bv_full"])

        return data

    # helper functions
    def load_raw_pickle(self, raw_pickle_filepath):
        with open(raw_pickle_filepath, "rb") as f:
            tmpstr = pickle.load(f)
            return json.loads(tmpstr)

    def helper_loop(self, data, fnc):
        for k, lst in data.items():
            for el in lst:
                fnc(el)

    def gen_se_list(self, data):

        def fnc1(el):
            ic = el["izt_clean"]
            if " se" in ic:
                se_list.append(ic)

        def fnc2(el):
            ic = el["izt_clean"]
            if ic in se_pruned:
                se_pruned.remove(ic)

        # hw entries that only exist with " se"
        se_list = []
        self.helper_loop(data, fnc1)
        se_pruned = set([hw.split(" se")[0] for hw in se_list])
        self.helper_loop(data, fnc2)
        return sorted(list(se_pruned))

    def remove_se(self, data):

        def fnc1(el):
            nel = DC(el)
            ic = nel["izt_clean"]
            if " se" in ic:
                nic = ic.split(" se")[0]
                nel["izt_clean"] = nic
            data_new[nel["izt_clean"]].append(nel)

        data_new = defaultdict(list)
        self.helper_loop(data, fnc1)
        return dict(data_new)

    def reorganize(self, data, se_list):
        # some hw entries have several headwords,
        # some senses have subsenses
        # index everything, make 1 object per hw

        def helper_prune(sense_str):
            # remove space padding
            sense_str = sense_str.strip()

            if len(sense_str) == 1:
                return sense_str

            # remove banned characters from string ending
            banned = ": ; . , - ! ?".split(" ")
            if sense_str[-1] in banned:
                return sense_str[:-1]

            return sense_str

        data_new = {}
        for k, lst in data.items():
            new_el = {
                "hw": k,
                "has_se": k in se_list,
                "senses": []
            }

            # if there is a single hw entry, hw_id is 0
            if len(lst) == 1:
                homonym_id = -1
            else:
                homonym_id = 0

            # loop homonyms
            for el in lst:
                homonym_id += 1
                # loop top lvl sense ids
                for sense_id, sens_lst in el["senses"].items():
                    # loop subsenses
                    for i, sens in enumerate(sens_lst):
                        nsid = sense_id.split(".")[0]
                        if len(sens_lst) == 1:
                            nsid += "-0"
                        else:
                            nsid += ("-" + str(i + 1))
                        new_sense = {
                            "homonym_id": homonym_id,
                            # sense_id: sense_id-subsense_id
                            "sense_id": nsid,
                            "sense_type": sens[0],
                            "sense_desc": helper_prune(sens[1]),
                        }
                        new_el["senses"].append(new_sense)
            hw = new_el["hw"]
            if hw in data_new:
                print("Shouldn't be here.")
                print(new_el)
                exit(1)
            data_new[hw] = DC(new_el)
        # return data_new

        # check
        for hw, el in data_new.items():
            for sens in el["senses"]:
                if sens["sense_desc"] is None:
                    print(sens)

        return data_new


def plst(lst):
    for el in lst:
        print(el)


if __name__ == "__main__":
    datapath = "../../../data"
    html_filepath = datapath + "/sskj/sskj2_v1.html"
    raw_pickle_filepath = datapath + "/tmp_pickles/raw_sskj.pickle"
    parsed_pickle_filepath = datapath + "/no_del_pickles/sskj_senses.pickle"
    se_list_filepath = datapath + "/no_del_pickles/se_list.pickle"

    p = Seqparser()

    if True:
        print("html_to_raw_pickle({}, {})".format(
            html_filepath, raw_pickle_filepath))
        print("Big file, this might take a while (2 min).")
        tstart = time()
        p.html_to_raw_pickle(html_filepath, raw_pickle_filepath)
        print("Finished in {:.2f}.".format(time() - tstart))

    if False:
        print("raw_pickle_to_parsed_pickle({}, {}, {})".format(
            raw_pickle_filepath, parsed_pickle_filepath, se_list_filepath))
        tstart = time()
        p.raw_pickle_to_parsed_pickle(
            raw_pickle_filepath, parsed_pickle_filepath, se_list_filepath)
        print("Finished in {:.2f}.".format(time() - tstart))
    print("Done.")
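parse_line() builds the izt_clean key that the generated wordlist (and therefore the index filtering in app.py) rests on: the headword is passed through transtab, which maps accented characters to plain ones while keeping č, then lower-cased, and adjective entries get a trailing "_" so they cannot collide with verb keys. A small illustration of that normalization, using made-up headwords:

# illustration of the izt_clean normalization; made-up inputs, not part of the commit
intab = "ÁÉÍÓÚàáäçèéêìíîñòóôöùúüčŔŕ"
outtb = "AEIOUaaaceeeiiinoooouuučRr"
transtab = str.maketrans(intab, outtb)

print("žréti".translate(transtab).lower())    # "žreti"  (accented é stripped, ž kept)
print("báti se".translate(transtab).lower())  # "bati se"  (reflexive form, see gen_se_list)
adj = "velík".translate(transtab).lower()     # "velik"
print(adj + "_")                              # "velik_"  (adjective keys get a "_" suffix)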
src/pkg/seqparser/seqparser/__init__.py | 0 (new, empty file)
src/pkg/seqparser/seqparser/main.py | 15 (new file)
@@ -0,0 +1,15 @@
from Seqparser import Seqparser
import argparse

if __name__ == "__main__":
    aparser = argparse.ArgumentParser()
    aparser.add_argument("--sskj-html", type=str)
    aparser.add_argument("--sskj-json", type=str)
    aparser.add_argument("--wordlist", type=str)
    args = aparser.parse_args()

    sqp = Seqparser()
    # sqp.html_to_verb_adj_json(args.sskj_html, args.sskj_json)

    sqp.generate_sskj_wordlist(args.sskj_json, args.wordlist)
src/pkg/seqparser/seqparser/sskj.json | 1 (new file; diff suppressed because one or more lines are too long)
src/pkg/seqparser/setup.py | 11 (new file)
@@ -0,0 +1,11 @@
from setuptools import setup

setup(
    name='seqparser',
    version='0.0.1',
    description='Parser for sskj2 html dump.',
    author='Kristjan Voje',
    author_email='kristjan.voje@gmail.com',
    license='MIT',
    packages=['seqparser'],
)