cjvt-valency/dip_src/valency/sskj_scraper.py

# Deprecated!

import requests
from bs4 import BeautifulSoup
from time import time
from valency import k_utils

# Base URL of the SSKJ search endpoint. The query string ends with
# "expression=", so the search word is appended directly to this value.
SSKJ_BASE = "http://bos.zrc-sazu.si/cgi/a03.exe?name=sskj_testa&expression="


class SskjScraper:
    """Fetch an SSKJ search result page and extract the first entry's text.

    The module this class lives in is marked as deprecated.
    """

    # Seconds to wait for the server before giving up on a request.
    DEFAULT_TIMEOUT = 10

    def __init__(self, timeout=DEFAULT_TIMEOUT):
        """Create a scraper.

        Args:
            timeout: Seconds passed to ``requests.get`` as its timeout.
                Without one, a stalled server would block the caller
                indefinitely.
        """
        self.base_url = SSKJ_BASE
        self.timeout = timeout

    def scrape(self, word):
        """Scrape the SSKJ page for ``word`` and tokenize the first entry.

        Args:
            word: Search expression. It is appended verbatim to the base
                URL (no explicit escaping is done here).

        Returns:
            An empty list when the page signals "no hits" or contains no
            entry; otherwise the result of ``k_utils.tokenize`` applied to
            the collected text fragments (the original note describes this
            as a unique set of words -- TODO confirm against k_utils).

        Raises:
            requests.exceptions.RequestException: On network failure,
                including a timeout after ``self.timeout`` seconds.
        """
        response = requests.get(self.base_url + word, timeout=self.timeout)
        soup = BeautifulSoup(response.content, "html.parser")

        # Check for failure: a page with two or more <h2> elements is the
        # "no hits" page (<h2>Zadetkov ni bilo: ...</h2>).
        if len(soup.find_all("h2")) >= 2:
            return []

        li_elements = soup.find_all("li", class_="nounderline")
        if not li_elements:
            return []

        # The markup nests entries instead of closing them:
        #   <li> ... <li> ... <li> ...</li></li></li>
        # so walk the first entry's descendants and stop at the second <li>.
        txts = []
        for el in li_elements[0].find_all():
            if el.name == "li":
                break
            txts.append(el.get_text())

        print("sskj scraped {}.".format(word))
        return k_utils.tokenize(txts)


if __name__ == "__main__":
    # Manual smoke test: scrape a single word and print the tokenized result.
    scraper = SskjScraper()
    print(scraper.scrape("čaj"))
old files from diploma's poc 2019-03-07 08:00:01 +00:00			`# Deprecated!`

			`import requests`
			`from bs4 import BeautifulSoup`
			`from time import time`
			`from valency import k_utils`

			`SSKJ_BASE = "http://bos.zrc-sazu.si/cgi/a03.exe?name=sskj_testa&expression="`


			`class SskjScraper:`
			`def __init__(self):`
			`self.base_url = SSKJ_BASE`

			`def scrape(self, word):`
			`# returns unique set of words`
			`soup = BeautifulSoup(`
			`requests.get(self.base_url + word).content,`
			`"html.parser"`
			`)`
			`# Check for failure.`
			`h2 = soup.find_all("h2")`
			`if len(h2) >= 2:`
			`# <h2>Zadetkov ni bilo: ...</h2>`
			`return []`
			`li_elements = soup.find_all('li', class_="nounderline")`
			`if len(li_elements) == 0:`
			`return []`
			`li = li_elements[0]`
			`# It was horrible...`
			`# <li> ... <li> ... <li> ...</li></li></li>`
			`# Parse sequence until you find a sedond <li>`
			`txts = []`
			`for el in li.find_all():`
			`if el.name == "li":`
			`break`
			`txts.append(el.get_text())`
			`print("sskj scraped {}.".format(word))`
			`return k_utils.tokenize(txts)`


			`if __name__ == "__main__":`
			`sskjScr = SskjScraper()`

			`word = "tek"`
			`tp = sskjScr.scrape("čaj")`
			`print(tp)`