frontend_devops fix
This commit is contained in:
@@ -0,0 +1,47 @@
|
||||
# Deprecated!
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
from time import time
|
||||
from valency import k_utils
|
||||
|
||||
SSKJ_BASE = "http://bos.zrc-sazu.si/cgi/a03.exe?name=sskj_testa&expression="
|
||||
|
||||
|
||||
class SskjScraper:
    """Scrape the text of the first SSKJ dictionary entry for a word.

    The search page is fetched from ``SSKJ_BASE`` with the query word
    appended verbatim to the URL, and the first result entry is handed to
    ``k_utils.tokenize``.
    """

    def __init__(self, timeout=30):
        """Create a scraper.

        Args:
            timeout: Seconds to wait for the server before giving up.
                Passed straight to ``requests.get``; ``None`` waits
                indefinitely (the behaviour before this parameter existed).
        """
        self.base_url = SSKJ_BASE
        self.timeout = timeout

    def scrape(self, word):
        """Fetch the SSKJ page for ``word`` and tokenize its first entry.

        Args:
            word: Search expression; it is concatenated to the base URL
                as-is.

        Returns:
            An empty list when the page reports no hits or contains no
            result entries; otherwise the result of ``k_utils.tokenize``
            on the collected texts (the original author describes it as a
            unique set of words -- see ``k_utils`` to confirm).

        Raises:
            requests.exceptions.RequestException: Propagated from
                ``requests.get`` on network failure or timeout.
        """
        # Without an explicit timeout requests would block forever on a
        # stalled connection.
        response = requests.get(self.base_url + word, timeout=self.timeout)
        soup = BeautifulSoup(response.content, "html.parser")

        # Check for failure: a page with two or more <h2> headings is
        # treated as the "no hits" page.
        # <h2>Zadetkov ni bilo: ...</h2>  ("There were no hits: ...")
        if len(soup.find_all("h2")) >= 2:
            return []

        li_elements = soup.find_all("li", class_="nounderline")
        if not li_elements:
            return []
        first_entry = li_elements[0]

        # The markup nests every following entry inside the previous one:
        # <li> ... <li> ... <li> ...</li></li></li>
        # so walk the descendants of the first <li> and stop as soon as a
        # second <li> is reached.
        txts = []
        for el in first_entry.find_all():
            if el.name == "li":
                break
            txts.append(el.get_text())

        print("sskj scraped {}.".format(word))
        return k_utils.tokenize(txts)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Manual smoke test: scrape one word from the live site and print the
    # tokenized result.
    scraper = SskjScraper()

    # The word actually looked up; the previous unused "tek" assignment
    # was removed because it never reached the scrape() call.
    word = "čaj"
    tokens = scraper.scrape(word)
    print(tokens)
|
||||
Reference in New Issue
Block a user