cjvt-valency/dip_src/valency/mongo_tools.py

248 lines
7.3 KiB
Python
Raw Permalink Normal View History

2019-03-07 08:00:01 +00:00
import pymongo
import xmltodict
import xml.etree.ElementTree as ET
from time import time
import json
from valency.sskj_scraper import SskjScraper
from bs4 import BeautifulSoup
# Translation table that replaces accented characters with unaccented ones.
# intab and outtb are parallel strings: the character at position i of intab
# is translated to the character at position i of outtb. Note that "č" maps
# to itself, so it is left unchanged by the translation.
# Used by sskj2_to_mongo to build the "izt_clean" field.
intab = "ÁÉÍÓÚàáäçèéêìíîñòóôöùúüčŔŕ"
outtb = "AEIOUaaaceeeiiinoooouuučRr"
transtab = str.maketrans(intab, outtb)
def mongo_test():
    """Smoke-test the MongoDB connection.

    Connects to the ``texts`` database on 127.0.0.1:26633 and prints whatever
    ``find_one()`` returns for its ``test`` collection.
    """
    # NOTE(review): the user name and password are hard-coded in the URI.
    uri = "mongodb://{}:{}@127.0.0.1:26633/texts".format("kristjan", "simple567")
    sample = pymongo.MongoClient(uri).texts.test.find_one()
    print(sample)
def basic_connection(ip_addr=None, port=None):
    """Connect to the ``texts`` database and check that the server answers.

    Args:
        ip_addr: Host of the MongoDB server; ``None`` selects "127.0.0.1".
        port: Port of the MongoDB server; ``None`` selects 26644.

    Returns:
        A ``(db, err_msg)`` tuple. On success ``db`` is the ``texts`` database
        handle and ``err_msg`` is the string "OK". When server selection times
        out, ``db`` is ``None`` and ``err_msg`` is the caught
        ``ServerSelectionTimeoutError`` instance.
    """
    host = "127.0.0.1" if ip_addr is None else ip_addr
    port_number = 26644 if port is None else port
    # NOTE(review): the user name and password are hard-coded in the URI.
    uri = "mongodb://{}:{}@{}:{}/texts".format(
        "kristjan", "simple567", host, str(port_number)
    )
    client = pymongo.MongoClient(uri)
    try:
        # Issue a real request so that an unreachable server is detected here
        # rather than on the caller's first query.
        client.server_info()
    except pymongo.errors.ServerSelectionTimeoutError as err:
        return (None, err)
    return (client.texts, "OK")
def check_collections(db, coll_names):
    """Create every collection named in *coll_names* that *db* does not have.

    The list of existing collections is read once, before anything is created.

    Args:
        db: Database handle providing ``collection_names()`` and
            ``create_collection(name)``.
        coll_names: Iterable of collection names that must exist.
    """
    existing = db.collection_names()
    missing = [name for name in coll_names if name not in existing]
    for name in missing:
        db.create_collection(name)
def prepare_user_tokens(db):
    """Recreate the ``v2_user_tokens`` collection with a TTL index on ``date``.

    The collection is dropped and created again, so all stored tokens are
    discarded. Documents then expire ``EXPIRE`` seconds after the time held in
    their ``date`` field.

    Intended usage when inserting a token (from the original notes):
        utc_timestamp = datetime.datetime.utcnow()
        user_tokens.insert({
            '_id': 'utc_session', "date": utc_timestamp,
            "session": "test session"})

    Args:
        db: Database handle in which the collection is recreated.
    """
    CNAME = "v2_user_tokens"
    db[CNAME].drop()
    db.create_collection(CNAME)
    # 151200 s is 42 hours. The original comment said "2 days", which would be
    # 172800 s; the value is kept as found.
    # NOTE(review): confirm which lifetime is actually intended.
    EXPIRE = 151200
    # EXPIRE = 10  # 10 sec, handy when testing expiry
    # create_index replaces the deprecated ensure_index and matches the call
    # used for the sskj text index elsewhere in this module.
    db[CNAME].create_index("date", expireAfterSeconds=EXPIRE)
def sskj_to_mongo(sskj_path):
    """Load a TEI XML dictionary file into the ``sskj`` collection.

    Deprecated, use sskj2_to_mongo.

    An existing ``sskj`` collection is dropped first. Every ``tei:entry``
    found under ``tei:text/tei:body`` is converted to a dict with xmltodict
    and inserted as one document.

    Lookup example:
        db.sskj.find({'ns0:entry.ns0:form.ns0:orth': "kaplanček"})

    Args:
        sskj_path: Path of the TEI XML file to import.

    Raises:
        ConnectionError: If basic_connection() could not reach the database.
    """
    ns = {"tei": "http://www.tei-c.org/ns/1.0"}
    ts = time()
    sskj = ET.parse(sskj_path).getroot()

    # basic_connection() returns a (db, err_msg) tuple; using the tuple itself
    # as the database handle fails on the first attribute access.
    db, err_msg = basic_connection()
    if db is None:
        raise ConnectionError(
            "Could not connect to MongoDB: {}".format(err_msg))

    for cn in ["sskj"]:
        if cn in db.collection_names():
            db[cn].drop()

    text = sskj.find("tei:text", ns)
    body = text.find("tei:body", ns)
    n_ent = 0
    for entry in body.findall("tei:entry", ns):
        n_ent += 1
        datachunk = xmltodict.parse(ET.tostring(entry))
        # JSON round trip; presumably to turn xmltodict's mapping type into
        # plain dicts before insertion.
        dictchunk = json.loads(json.dumps(datachunk))
        db.sskj.insert(dictchunk)
    print("sskj to mongo: {} entries in {:.2f}s".format(n_ent, time() - ts))
def slownet_to_mongo(slw_path):
    """Load a sloWNet XML file into the ``slownet`` collection.

    Each ``SYNSET`` element becomes one document. A top-level ``slo_lemma``
    list holding the synset's Slovene literals is added for faster querying;
    the key is absent when the synset has no usable Slovene literal.

    Args:
        slw_path: Path of the sloWNet XML file to import.

    Raises:
        ConnectionError: If basic_connection() could not reach the database.
    """
    ts = time()
    slownet = ET.parse(slw_path).getroot()

    # basic_connection() returns a (db, err_msg) tuple; using the tuple itself
    # as the database handle fails on the first attribute access.
    db, err_msg = basic_connection()
    if db is None:
        raise ConnectionError(
            "Could not connect to MongoDB: {}".format(err_msg))

    # NOTE(review): "slownet_map" is dropped here, but the mapping document is
    # written to "slo_to_id" below, which is never dropped. Left as found,
    # because dropping "slo_to_id" would destroy data other code reads.
    for cn in ["slownet_map", "slownet"]:
        if cn in db.collection_names():
            db[cn].drop()

    # NOTE(review): nothing in this function fills slo_to_id, so an empty
    # document is inserted at the end. Confirm whether population code is
    # missing.
    slo_to_id = {}
    for synset in slownet.findall("SYNSET"):
        datachunk = xmltodict.parse(ET.tostring(synset))
        dictchunk = json.loads(json.dumps(datachunk))["SYNSET"]
        lemmas = _slovene_lemmas(dictchunk)
        if lemmas:
            dictchunk.setdefault("slo_lemma", []).extend(lemmas)
        db.slownet.insert(dictchunk)

    # create_index replaces the deprecated ensure_index.
    db.slownet.create_index([("id", pymongo.ASCENDING)])
    db.slo_to_id.insert(slo_to_id)
    print("sloWNet to mongo in {:.2f}s".format(time() - ts))


def _slovene_lemmas(synset):
    """Return the Slovene literal texts of one parsed SYNSET dict, in order.

    xmltodict yields a single dict for one child element and a list for
    several, so both shapes are normalised to lists here.
    """
    lemmas = []
    if "SYNONYM" not in synset:
        return lemmas
    synonyms = synset["SYNONYM"]
    if not isinstance(synonyms, list):
        synonyms = [synonyms]
    for syn in synonyms:
        if syn["@xml:lang"] != "sl" or "LITERAL" not in syn:
            continue
        literals = syn["LITERAL"]
        if not isinstance(literals, list):
            literals = [literals]
        for lit in literals:
            slo_keyword = lit["#text"]
            # Literals containing "." are skipped; presumably because they
            # were meant to become MongoDB field names - verify.
            if "." not in slo_keyword:
                lemmas.append(slo_keyword)
    return lemmas
def scrape_sskj():
    """Deprecated! Scrape sskj data for the keys of the ``slo_to_id`` document.

    The sorted keys of the first ``slo_to_id`` document are walked, but words
    before ``last_word`` are skipped, so the run effectively resumes at that
    word. Any previously stored result for ``last_word`` is removed first.
    Non-empty scrape results are stored in ``scraped_sskj`` as
    ``{"word": ..., "bag": ...}``.
    """
    # NOTE(review): the user name and password are hard-coded in the URI.
    uri = "mongodb://{}:{}@127.0.0.1:26633/texts".format("kristjan", "simple567")
    db = pymongo.MongoClient(uri).texts

    words_list = sorted(db.slo_to_id.find_one())
    print(len(words_list))

    sscraper = SskjScraper()
    last_word = "nogometaš"
    db.scraped_sskj.remove({"word": last_word})

    resumed = False
    for word in words_list:
        # Once last_word has been seen, every following word is scraped too.
        resumed = resumed or word == last_word
        if not resumed:
            continue
        res = sscraper.scrape(word)
        if len(res) > 0:
            db.scraped_sskj.insert({"word": word, "bag": res})
# Part-of-speech labels (Slovene) looked for in the span attributes of an
# entry; order matters because ties are resolved in favour of the first one.
_SSKJ2_POS_KEYWORDS = (
    "samostalnik",
    "pridevnik",
    "glagol",
    "prislov",
    "predlog",
    "členek",
    "veznik",
    "medmet",
    "povedkovnik",
)


def _sskj2_guess_pos(div):
    """Return the POS keyword occurring most often in the div's span attributes."""
    counts = dict.fromkeys(_SSKJ2_POS_KEYWORDS, 0)
    for span in div.find_all("span"):
        for value in span.attrs.values():
            # BeautifulSoup returns multi-valued attributes (such as class)
            # as lists of tokens, which have no split() method.
            tokens = value if isinstance(value, list) else value.split(" ")
            for token in tokens:
                key = token.lower()
                if key in counts:
                    counts[key] += 1
    # NOTE(review): "unknonw" is misspelled, but it is a stored value, so it
    # is kept byte-for-byte in case consumers match on it.
    pos = "unknonw"
    pos_max = 0
    for keyword, count in counts.items():
        if count > pos_max:
            pos = keyword
            pos_max = count
    return pos


def _sskj2_clean_headword(text):
    """Return (headword, has_se): unaccented, lower-cased, trailing "se" removed."""
    izt = text.translate(transtab).lower()
    parts = izt.split(" ")
    if parts[-1] == "se":
        return " ".join(parts[:-1]), True
    return izt, False


def sskj2_to_mongo(sskj2_path):
    """Load an sskj2 HTML file into the ``sskj`` collection.

    Every ``div`` element of the file becomes one document, extended with:
      * ``pos``: the most frequent part-of-speech keyword found in the
        attributes of the div's spans, or "unknonw" if none occurs;
      * ``izt_clean``: text of the first span titled "Iztočnica", with
        accents removed, lower-cased and without a trailing "se";
      * ``has_se``: whether such a trailing "se" was removed.
    Divs without an "Iztočnica" span are reported and skipped. Finally a text
    index on ``izt_clean`` is created.

    Args:
        sskj2_path: Path of the HTML file to import.

    Returns:
        None.

    Raises:
        ConnectionError: If basic_connection() could not reach the database.
    """
    tstart = time()

    # basic_connection() returns a (db, err_msg) tuple; using the tuple itself
    # as the database handle fails on the first attribute access.
    db, err_msg = basic_connection()
    if db is None:
        raise ConnectionError(
            "Could not connect to MongoDB: {}".format(err_msg))

    # NOTE(review): "sskj2" is dropped here, but documents are inserted into
    # "sskj" below, so repeated runs add to "sskj" instead of replacing it.
    # Left as found; confirm the intended target before changing either side.
    for cn in ["sskj2"]:
        if cn in db.collection_names():
            db[cn].drop()

    # Explicit encoding so the non-ASCII content does not depend on the
    # platform's default locale.
    with open(sskj2_path, encoding="utf-8") as f:
        soup = BeautifulSoup(f.read(), "html.parser")

    divs = soup.find_all("div")
    for i, div in enumerate(divs):
        if i % 100 == 0:
            print("{}/{}".format(i, len(divs)))
        datachunk = xmltodict.parse(str(div))["div"]

        # pos (part of speech)
        datachunk["pos"] = _sskj2_guess_pos(div)

        # izt_clean (normalised headword)
        izts = div.find_all("span", {"title": "Iztočnica"})
        if not izts:
            print("Entry {} has no Iztočnica.".format(i))
            continue
        izt, has_se = _sskj2_clean_headword(izts[0].text)
        datachunk["izt_clean"] = izt
        datachunk["has_se"] = has_se

        dictchunk = json.loads(json.dumps(datachunk))
        db.sskj.insert(dictchunk)

    db.sskj.create_index([("izt_clean", pymongo.TEXT)])
    print("sskj2 to mongo: {} entries in {:.2f}s".format(
        len(divs), time() - tstart))
    return None
if __name__ == "__main__":
    # Running this module directly does nothing useful by default. To run an
    # import, uncomment the matching pair of lines below.
    #
    # sloWNet:
    #   slownet_path = "../../data/slownet/slownet-2015-05-07.xml"
    #   slownet_to_mongo(slownet_path)
    #
    # Scraper (deprecated):
    #   scrape_sskj()
    #
    # sskj (deprecated importer):
    #   sskj_path = "../../data/sskj/sskj.p5.xml"
    #   sskj_to_mongo(sskj_path)
    #
    # sskj2; the first file is for testing, the original file takes up most
    # of RAM:
    #   sskj2_path = "../../data/sskj/sskj2_200.html"
    #   sskj2_path = "../../data/sskj/sskj2_v1.html"
    #   sskj2_to_mongo(sskj2_path)
    print("nothing here")