48 lines
1.2 KiB
Python
48 lines
1.2 KiB
Python
# Deprecated!
|
|
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
from time import time
|
|
from valency import k_utils
|
|
|
|
# URL prefix of the SSKJ web lookup. The word to look up is appended
# directly after the trailing ``expression=`` query parameter.
SSKJ_BASE = (
    "http://bos.zrc-sazu.si/cgi/a03.exe?name=sskj_testa&expression="
)
|
|
|
|
|
|
class SskjScraper:
    """Fetch the SSKJ lookup page for a word and tokenize its first entry.

    The page is downloaded with ``requests``, parsed with BeautifulSoup and
    the collected text fragments are handed to ``k_utils.tokenize``.
    """

    # Seconds to wait for the server. Without a timeout ``requests.get``
    # may block forever when the server stops responding.
    DEFAULT_TIMEOUT = 10

    def __init__(self, timeout=DEFAULT_TIMEOUT):
        """Create a scraper.

        Args:
            timeout: Timeout in seconds passed to ``requests.get``.
        """
        self.base_url = SSKJ_BASE
        self.timeout = timeout

    def scrape(self, word):
        """Look up ``word`` and return the tokenized text of the first hit.

        Args:
            word: The expression to look up; it is appended to the base URL
                as-is.

        Returns:
            An empty list when the page signals "no hits" or contains no
            ``<li class="nounderline">`` element; otherwise the result of
            ``k_utils.tokenize`` on the collected texts (per the original
            note, a unique set of words -- confirm against ``k_utils``).

        Raises:
            requests.exceptions.RequestException: If the HTTP request fails
                or exceeds the configured timeout.
        """
        response = requests.get(self.base_url + word, timeout=self.timeout)
        soup = BeautifulSoup(response.content, "html.parser")

        # Check for failure: two or more <h2> elements are treated as the
        # "no hits" page (<h2>Zadetkov ni bilo: ...</h2>).
        if len(soup.find_all("h2")) >= 2:
            return []

        li_elements = soup.find_all("li", class_="nounderline")
        if not li_elements:
            return []
        li = li_elements[0]

        # The markup nests entries instead of closing them:
        #   <li> ... <li> ... <li> ...</li></li></li>
        # so walk the descendants of the first <li> and stop as soon as a
        # second <li> is reached.
        txts = []
        for el in li.find_all():
            if el.name == "li":
                break
            txts.append(el.get_text())

        print("sskj scraped {}.".format(word))
        return k_utils.tokenize(txts)
|
|
|
|
|
|
if __name__ == "__main__":
    # Manual smoke test: performs a real lookup and prints the tokens.
    scraper = SskjScraper()
    tokens = scraper.scrape("čaj")
    print(tokens)
|