forked from kristjan/cjvt-valency
filtering some words in index
This commit is contained in:
@@ -38,6 +38,8 @@ SENSEMAP_COLL = "sensemap"
|
||||
# pre-generated data (gui leftside word index)
|
||||
CORPORA = ["ssj", "kres"]
|
||||
app_index = None
|
||||
sskj_wordlist = None # used by _is_banned(hw)
|
||||
BANNED_HEADWORDS = ["biti"]
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
valdb = None
|
||||
@@ -430,6 +432,18 @@ def api_senses_update():
|
||||
|
||||
|
||||
# APP PREFLIGHT ---------------------.
|
||||
def _is_banned(hw):
    """Return True if headword ``hw`` is banned, False otherwise.

    A headword is banned when it is listed in BANNED_HEADWORDS, or when
    neither ``hw`` nor its ``hw + " se"`` form is present in
    ``sskj_wordlist["wordlist"]``. Every banned headword is reported at
    debug level.
    """
    if hw in BANNED_HEADWORDS:
        # Explicitly banned; the wordlist is not consulted at all.
        allowed = False
    else:
        known_words = sskj_wordlist["wordlist"]
        # Accept the plain headword or its " se" variant.
        allowed = hw in known_words or (hw + " se") in known_words

    if not allowed:
        log.debug("Banned headword: {}".format(hw))
    return not allowed
|
||||
|
||||
def prepare_app_index():
|
||||
log.info("[*] preparing app_index")
|
||||
@@ -462,8 +476,10 @@ def prepare_app_index():
|
||||
else:
|
||||
alphabetical[fst] = [(k, e)]
|
||||
|
||||
for k, e in alphabetical.items():
|
||||
alphabetical[k] = sorted(e, key=lambda x: x[0])
|
||||
for letter, words in alphabetical.items():
|
||||
filtered_words = [x for x in words if not _is_banned(x[0])]
|
||||
alphabetical[letter] = sorted(filtered_words, key=lambda x: x[0])
|
||||
|
||||
tmp_app_index[corpus]["words"] = alphabetical
|
||||
|
||||
functors = [(k, e) for (k, e) in res_fns.items()]
|
||||
@@ -483,6 +499,7 @@ if __name__ == "__main__":
|
||||
aparser.add_argument("--dbuser", type=str)
|
||||
aparser.add_argument("--dbpass", type=str)
|
||||
aparser.add_argument("--dbaddr", type=str)
|
||||
aparser.add_argument("--sskj-wordlist", type=str)
|
||||
args = aparser.parse_args()
|
||||
|
||||
config = None
|
||||
@@ -507,6 +524,8 @@ if __name__ == "__main__":
|
||||
valdb = client.valdb
|
||||
|
||||
if args.prepare_db:
|
||||
with Path(args.sskj_wordlist).open("r") as fp:
|
||||
sskj_wordlist = json.load(fp)
|
||||
prepare_app_index()
|
||||
sys.exit()
|
||||
|
||||
|
||||
Reference in New Issue
Block a user