Filter banned and non-SSKJ headwords out of the word index

This commit is contained in:
2019-04-27 20:24:11 +02:00
parent fd94627fdb
commit 81395890ab
16 changed files with 398 additions and 10 deletions

View File

@@ -38,6 +38,8 @@ SENSEMAP_COLL = "sensemap"
# pre-generated data (gui leftside word index)
CORPORA = ["ssj", "kres"]
app_index = None
# Parsed JSON document consulted by _is_banned(hw), which reads its
# "wordlist" entry. Remains None until the __main__ block loads the file
# given by --sskj-wordlist (done when args.prepare_db is set).
sskj_wordlist = None
# Headwords that _is_banned(hw) rejects before it looks at the wordlist.
BANNED_HEADWORDS = ["biti"]
log = logging.getLogger(__name__)
# Rebound to client.valdb in the __main__ block.
valdb = None
@@ -430,6 +432,18 @@ def api_senses_update():
# APP PREFLIGHT ---------------------.
def _is_banned(hw):
    """Return True when headword ``hw`` must be left out of the word index.

    A headword is banned when it is listed in ``BANNED_HEADWORDS``, or when
    neither ``hw`` nor ``hw + " se"`` occurs in ``sskj_wordlist["wordlist"]``
    (the " se" variant is presumably the reflexive verb form - confirm).

    If ``sskj_wordlist`` has not been loaded yet (it is still ``None``), only
    ``BANNED_HEADWORDS`` is applied; previously this case raised ``TypeError``.

    Every banned headword is reported through ``log.debug``.
    """
    if hw in BANNED_HEADWORDS:
        banned = True
    elif sskj_wordlist is None:
        # No wordlist available: fall back to the explicit ban list only.
        banned = False
    else:
        known_words = sskj_wordlist["wordlist"]
        banned = hw not in known_words and (hw + " se") not in known_words
    if banned:
        # Logger-side %-formatting: the message is built only when DEBUG
        # records are actually emitted. The resulting text is unchanged.
        log.debug("Banned headword: %s", hw)
    return banned
def prepare_app_index():
log.info("[*] preparing app_index")
@@ -462,8 +476,10 @@ def prepare_app_index():
else:
alphabetical[fst] = [(k, e)]
for k, e in alphabetical.items():
alphabetical[k] = sorted(e, key=lambda x: x[0])
for letter, words in alphabetical.items():
filtered_words = [x for x in words if not _is_banned(x[0])]
alphabetical[letter] = sorted(filtered_words, key=lambda x: x[0])
tmp_app_index[corpus]["words"] = alphabetical
functors = [(k, e) for (k, e) in res_fns.items()]
@@ -483,6 +499,7 @@ if __name__ == "__main__":
aparser.add_argument("--dbuser", type=str)
aparser.add_argument("--dbpass", type=str)
aparser.add_argument("--dbaddr", type=str)
aparser.add_argument("--sskj-wordlist", type=str)
args = aparser.parse_args()
config = None
@@ -507,6 +524,8 @@ if __name__ == "__main__":
valdb = client.valdb
if args.prepare_db:
with Path(args.sskj_wordlist).open("r") as fp:
sskj_wordlist = json.load(fp)
prepare_app_index()
sys.exit()