cjvt-valency/dip_src/valency/sskj_scraper.py

48 lines
1.2 KiB
Python
Raw Permalink Normal View History

2019-03-07 08:00:01 +00:00
# Deprecated!
import requests
from bs4 import BeautifulSoup
from time import time
from valency import k_utils
SSKJ_BASE = "http://bos.zrc-sazu.si/cgi/a03.exe?name=sskj_testa&expression="
class SskjScraper:
    """Fetch the SSKJ dictionary result page for a word and tokenize the first hit.

    Attributes:
        base_url: Query URL prefix; the looked-up word is appended to it verbatim.
        timeout: Seconds to wait for the server before ``requests`` raises a
            timeout error. ``None`` disables the limit (the old behaviour).
    """

    def __init__(self, timeout=10):
        """Create a scraper.

        Args:
            timeout: Per-request timeout in seconds passed to ``requests.get``.
        """
        self.base_url = SSKJ_BASE
        self.timeout = timeout

    def scrape(self, word):
        """Scrape the first dictionary entry found for ``word``.

        Args:
            word: The expression to look up; appended to ``base_url`` as-is.

        Returns:
            An empty list when the page signals failure or contains no entry.
            Otherwise whatever ``k_utils.tokenize`` returns for the collected
            text fragments (the original note says "unique set of words" --
            TODO confirm against ``k_utils``).

        Raises:
            requests.exceptions.RequestException: On connection problems or
                when the server does not answer within ``timeout`` seconds.
        """
        # Without a timeout an unresponsive server would block forever.
        response = requests.get(self.base_url + word, timeout=self.timeout)
        soup = BeautifulSoup(response.content, "html.parser")

        # Check for failure: a second heading is treated as the "no hits"
        # marker, e.g. <h2>Zadetkov ni bilo: ...</h2>
        if len(soup.find_all("h2")) >= 2:
            return []

        li_elements = soup.find_all("li", class_="nounderline")
        if not li_elements:
            return []

        # The entries are nested instead of being siblings:
        #   <li> ... <li> ... <li> ...</li></li></li>
        # so walk the tags inside the first <li> and stop at the second <li>.
        txts = []
        for el in li_elements[0].find_all():
            if el.name == "li":
                break
            txts.append(el.get_text())

        print("sskj scraped {}.".format(word))
        return k_utils.tokenize(txts)
if __name__ == "__main__":
    # Manual smoke test: scrape a single word and print the tokenized result.
    # Performs a live HTTP request against the SSKJ server.
    scraper = SskjScraper()
    word = "čaj"
    tp = scraper.scrape(word)
    print(tp)