frontend_devops fix
This commit is contained in:
@@ -0,0 +1,247 @@
|
||||
import pymongo
|
||||
import xmltodict
|
||||
import xml.etree.ElementTree as ET
|
||||
from time import time
|
||||
import json
|
||||
from valency.sskj_scraper import SskjScraper
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
# Transliteration table used to strip diacritics from text.
# The character at position i of `intab` is replaced by the character at
# position i of `outtb`; "č" maps to itself, so it is left untouched.
intab = "ÁÉÍÓÚàáäçèéêìíîñòóôöùúüčŔŕ"
outtb = "AEIOUaaaceeeiiinoooouuučRr"
# Same mapping that str.maketrans(intab, outtb) produces: ordinal -> ordinal.
transtab = {ord(src): ord(dst) for src, dst in zip(intab, outtb)}
|
||||
|
||||
|
||||
def mongo_test():
    """Smoke test: print one document from the ``test`` collection.

    Opens a client for the ``texts`` database on 127.0.0.1:26633 using the
    credentials embedded below and prints whatever ``find_one()`` returns.
    """
    # NOTE(review): credentials are hard-coded in source.
    user, password = "kristjan", "simple567"
    uri = "mongodb://{}:{}@127.0.0.1:26633/texts".format(user, password)
    texts_db = pymongo.MongoClient(uri).texts
    print(texts_db.test.find_one())
|
||||
|
||||
|
||||
def basic_connection(ip_addr=None, port=None):
    """Connect to the ``texts`` database and check that the server answers.

    Args:
        ip_addr: Host of the MongoDB server; defaults to "127.0.0.1".
        port: Port of the MongoDB server; defaults to 26644.

    Returns:
        A ``(db, err_msg)`` tuple. On success ``db`` is the ``texts``
        database handle and ``err_msg`` is the string "OK". When server
        selection times out, ``db`` is None and ``err_msg`` is the caught
        ``ServerSelectionTimeoutError`` instance. Callers must unpack the
        tuple and check ``db`` before using it.
    """
    if ip_addr is None:
        ip_addr = "127.0.0.1"
    if port is None:
        port = 26644

    # NOTE(review): credentials are hard-coded in source.
    uri = "mongodb://{}:{}@{}:{}/texts".format(
        "kristjan", "simple567", ip_addr, port)
    client = pymongo.MongoClient(uri)
    try:
        # Ask the server for its info so an unreachable server is detected
        # here rather than on the caller's first query.
        client.server_info()
    except pymongo.errors.ServerSelectionTimeoutError as err:
        # Release the client's resources; it is not handed to the caller.
        client.close()
        return (None, err)
    return (client.texts, "OK")
|
||||
|
||||
|
||||
def check_collections(db, coll_names):
    """Make sure every collection named in ``coll_names`` exists in ``db``.

    Missing collections are created; collections that already exist are left
    untouched.

    Args:
        db: Database handle exposing ``create_collection``.
        coll_names: Iterable of collection names.
    """
    for cn in coll_names:
        try:
            # create_collection raises CollectionInvalid when the collection
            # already exists. Relying on that avoids the listing helper
            # Database.collection_names(), which PyMongo 4 no longer has.
            db.create_collection(cn)
        except pymongo.errors.CollectionInvalid:
            continue
|
||||
|
||||
|
||||
def prepare_user_tokens(db):
    """Recreate the ``v2_user_tokens`` collection with an expiring index.

    Drops the collection (discarding any stored tokens), creates it again and
    adds an index on the ``date`` field with ``expireAfterSeconds`` set, so
    the server can expire token documents based on that field.

    Args:
        db: Database handle for the target database.
    """
    CNAME = "v2_user_tokens"
    # 151200 s is 42 hours.
    # NOTE(review): the original comment read "2 days", which would be
    # 172800 s -- confirm which lifetime is intended.
    EXPIRE = 151200
    # EXPIRE = 10  # 10 sec, handy when testing expiry

    db[CNAME].drop()
    db.create_collection(CNAME)
    # create_index replaces Collection.ensure_index, which PyMongo 4 removed.
    db[CNAME].create_index("date", expireAfterSeconds=EXPIRE)

    # Use this when storing a token:
    #   utc_timestamp = datetime.datetime.utcnow()
    #   user_tokens.insert({
    #       '_id': 'utc_session', "date": utc_timestamp,
    #       "session": "test session"})
|
||||
|
||||
|
||||
def sskj_to_mongo(sskj_path):
    """Import a TEI-encoded SSKJ XML file into the ``sskj`` collection.

    Deprecated, use sskj2_to_mongo.

    The existing ``sskj`` collection is dropped, then every ``tei:entry``
    under ``tei:text/tei:body`` is converted with xmltodict and stored as one
    document. Prints the number of entries and the elapsed time.

    Args:
        sskj_path: Path to the XML file.

    Raises:
        ValueError: If the file has no ``tei:text/tei:body`` element.
        ConnectionError: If basic_connection() could not reach the server.
    """
    ns = {"tei": "http://www.tei-c.org/ns/1.0"}
    ts = time()
    sskj = ET.parse(sskj_path).getroot()

    # Validate the document before touching the database, so a malformed
    # file does not cost us the existing collection.
    text = sskj.find("tei:text", ns)
    body = text.find("tei:body", ns) if text is not None else None
    if body is None:
        raise ValueError(
            "No tei:text/tei:body element found in {}".format(sskj_path))

    # basic_connection() returns a (db, err_msg) tuple, not a bare handle.
    db, err_msg = basic_connection()
    if db is None:
        raise ConnectionError(
            "Could not connect to MongoDB: {}".format(err_msg))

    # Dropping a collection that does not exist is harmless, so no
    # existence check is needed.
    db["sskj"].drop()

    n_ent = 0
    for n_ent, entry in enumerate(body.findall("tei:entry", ns), start=1):
        datachunk = xmltodict.parse(ET.tostring(entry))
        # JSON round trip normalises the parsed mapping to plain
        # JSON-compatible dicts and lists before storing it.
        db.sskj.insert_one(json.loads(json.dumps(datachunk)))

    # Example lookup: db.sskj.find({'ns0:entry.ns0:form.ns0:orth':"kaplanček"})
    print("sskj to mongo: {} entries in {:.2f}s".format(n_ent, time() - ts))
|
||||
|
||||
|
||||
def _slownet_as_list(value):
    """Wrap a lone parsed child in a list so callers can always iterate."""
    return value if isinstance(value, list) else [value]


def _slownet_slovene_lemmas(synset):
    """Collect the Slovene ("sl") literal texts of one parsed SYNSET dict.

    Literals containing "." are skipped, as in the original import.
    """
    lemmas = []
    if "SYNONYM" not in synset:
        return lemmas
    for syn in _slownet_as_list(synset["SYNONYM"]):
        if syn["@xml:lang"] != "sl" or "LITERAL" not in syn:
            continue
        for lit in _slownet_as_list(syn["LITERAL"]):
            slo_keyword = lit["#text"]
            if "." in slo_keyword:
                continue
            lemmas.append(slo_keyword)
    return lemmas


def slownet_to_mongo(slw_path):
    """Import a sloWNet XML file into the ``slownet`` collection.

    The ``slownet_map`` and ``slownet`` collections are dropped, then every
    ``SYNSET`` element is stored as one document. When a synset has Slovene
    literals, they are added as a top-level ``slo_lemma`` list for faster
    querying. Prints the elapsed time.

    Args:
        slw_path: Path to the sloWNet XML file.

    Raises:
        ConnectionError: If basic_connection() could not reach the server.
    """
    ts = time()
    slownet = ET.parse(slw_path).getroot()

    # basic_connection() returns a (db, err_msg) tuple, not a bare handle.
    db, err_msg = basic_connection()
    if db is None:
        raise ConnectionError(
            "Could not connect to MongoDB: {}".format(err_msg))

    # Dropping a collection that does not exist is harmless, so no
    # existence check is needed.
    for cn in ("slownet_map", "slownet"):
        db[cn].drop()

    # NOTE(review): slo_to_id is never populated in this function, so the
    # document inserted below is empty, and the slo_to_id collection is not
    # dropped above. scrape_sskj() reads its keys -- confirm what should be
    # written here.
    slo_to_id = {}
    for synset in slownet.findall("SYNSET"):
        datachunk = xmltodict.parse(ET.tostring(synset))
        # JSON round trip normalises the parsed mapping to plain
        # JSON-compatible dicts and lists before storing it.
        dictchunk = json.loads(json.dumps(datachunk))["SYNSET"]

        lemmas = _slownet_slovene_lemmas(dictchunk)
        if lemmas:
            dictchunk["slo_lemma"] = lemmas
        db.slownet.insert_one(dictchunk)

    # create_index / insert_one replace ensure_index / insert, which
    # PyMongo 4 removed.
    db.slownet.create_index([("id", pymongo.ASCENDING)])
    db.slo_to_id.insert_one(slo_to_id)
    print("sloWNet to mongo in {:.2f}s".format(time() - ts))
|
||||
|
||||
|
||||
def scrape_sskj(last_word="nogometaš"):
    """Deprecated! Scrape SSKJ data for the keys stored in ``slo_to_id``.

    Takes the keys of the first ``slo_to_id`` document as the word list,
    sorts them, and resumes scraping at ``last_word``: results previously
    stored for ``last_word`` are deleted, then ``last_word`` and every word
    sorted after it are scraped. Non-empty results are stored in
    ``scraped_sskj`` as ``{"word": ..., "bag": ...}``. If ``last_word`` is
    not among the keys, nothing is scraped.

    Args:
        last_word: Word at which to resume scraping.
    """
    # NOTE(review): credentials are hard-coded, and the port (26633) differs
    # from basic_connection()'s default (26644).
    client = pymongo.MongoClient(
        "mongodb://{}:{}@127.0.0.1:26633/texts".format("kristjan", "simple567")
    )
    db = client.texts

    # find_one() returns None for an empty collection; treat that as "no
    # words". The document's own "_id" key is not a word, so it is skipped.
    mapping = db.slo_to_id.find_one() or {}
    words_list = sorted(key for key in mapping if key != "_id")
    print(len(words_list))

    sscraper = SskjScraper()

    # delete_many / insert_one replace remove / insert, which PyMongo 4
    # removed.
    db.scraped_sskj.delete_many({"word": last_word})
    resumed = False
    for word in words_list:
        if word == last_word:
            resumed = True
        if not resumed:
            continue
        res = sscraper.scrape(word)
        if len(res) > 0:
            db.scraped_sskj.insert_one({"word": word, "bag": res})
|
||||
|
||||
|
||||
# Part-of-speech labels looked for in <span> attribute values. The order
# matters: it decides ties in _sskj2_guess_pos.
_SSKJ2_POS_KEYWORDS = (
    "samostalnik",
    "pridevnik",
    "glagol",
    "prislov",
    "predlog",
    "členek",
    "veznik",
    "medmet",
    "povedkovnik",
)


def _sskj2_guess_pos(div):
    """Return the POS keyword that occurs most often in the div's span attributes."""
    counts = dict.fromkeys(_SSKJ2_POS_KEYWORDS, 0)
    for span in div.find_all("span"):
        for value in span.attrs.values():
            # BeautifulSoup returns multi-valued attributes (such as
            # "class") as lists, which have no split().
            tokens = value if isinstance(value, list) else value.split(" ")
            for token in tokens:
                key = token.lower()
                if key in counts:
                    counts[key] += 1
    # max() returns the first keyword with the highest count, which matches
    # a strict ">" scan over the keywords in order.
    best = max(counts, key=counts.get)
    # NOTE(review): "unknonw" looks like a typo for "unknown"; it is kept
    # byte-for-byte because stored documents may already carry this value.
    return best if counts[best] > 0 else "unknonw"


def _sskj2_clean_headword(text):
    """Return ``(izt_clean, has_se)`` for a raw headword string.

    The text is transliterated with ``transtab`` and lower-cased; a trailing
    "se" word is removed and reported through ``has_se``.
    """
    izt = text.translate(transtab).lower()
    parts = izt.split(" ")
    if parts[-1] == "se":
        return " ".join(parts[:-1]), True
    return izt, False


def sskj2_to_mongo(sskj2_path):
    """Import an SSKJ2 HTML file into the ``sskj`` collection.

    Every ``<div>`` that contains a ``<span title="Iztočnica">`` is parsed
    with xmltodict and stored with three extra fields: ``pos`` (most frequent
    part-of-speech keyword), ``izt_clean`` (normalised headword) and
    ``has_se`` (whether a trailing "se" was stripped). Divs without such a
    span are reported and skipped. A text index on ``izt_clean`` is created
    at the end. Prints progress every 100 divs and a final summary.

    Args:
        sskj2_path: Path to the HTML file.

    Returns:
        None.

    Raises:
        ConnectionError: If basic_connection() could not reach the server.
    """
    tstart = time()

    # basic_connection() returns a (db, err_msg) tuple, not a bare handle.
    db, err_msg = basic_connection()
    if db is None:
        raise ConnectionError(
            "Could not connect to MongoDB: {}".format(err_msg))

    # NOTE(review): "sskj2" is dropped here, but documents are written to
    # "sskj" below, so repeated runs append to "sskj" -- confirm which
    # collection is intended. Dropping a missing collection is harmless.
    db["sskj2"].drop()

    with open(sskj2_path) as f:
        soup = BeautifulSoup(f.read(), "html.parser")

    divs = soup.find_all("div")
    total = len(divs)
    for i, div in enumerate(divs):
        if i % 100 == 0:
            print("{}/{}".format(i, total))

        # Check for the headword first so that entries which are skipped
        # anyway are never parsed.
        izts = div.find_all("span", {"title": "Iztočnica"})
        if not izts:
            print("Entry {} has no Iztočnica.".format(i))
            continue

        datachunk = xmltodict.parse(str(div))["div"]

        # pos (part of speech)
        datachunk["pos"] = _sskj2_guess_pos(div)

        # izt_clean
        izt, has_se = _sskj2_clean_headword(izts[0].text)
        datachunk["izt_clean"] = izt
        datachunk["has_se"] = has_se

        # JSON round trip normalises the parsed mapping to plain
        # JSON-compatible dicts and lists; insert_one replaces insert,
        # which PyMongo 4 removed.
        db.sskj.insert_one(json.loads(json.dumps(datachunk)))

    db.sskj.create_index([("izt_clean", pymongo.TEXT)])
    print("sskj2 to mongo: {} entries in {:.2f}s".format(
        total, time() - tstart))
    return None
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # All import steps are disabled; uncomment the pair you need to run.
    #
    # sloWNet import:
    #   slownet_path = "../../data/slownet/slownet-2015-05-07.xml"
    #   slownet_to_mongo(slownet_path)
    #
    # SSKJ scraping:
    #   scrape_sskj()
    #
    # SSKJ (TEI XML) import:
    #   sskj_path = "../../data/sskj/sskj.p5.xml"
    #   sskj_to_mongo(sskj_path)
    #
    # SSKJ2 (HTML) import; the first file is for testing, the original
    # file takes up most of RAM:
    #   sskj2_path = "../../data/sskj/sskj2_200.html"
    #   sskj2_path = "../../data/sskj/sskj2_v1.html"
    #   sskj2_to_mongo(sskj2_path)

    print("nothing here")
|
||||
Reference in New Issue
Block a user