forked from kristjan/cjvt-valency
248 lines
7.3 KiB
Python
248 lines
7.3 KiB
Python
|
import pymongo
|
||
|
import xmltodict
|
||
|
import xml.etree.ElementTree as ET
|
||
|
from time import time
|
||
|
import json
|
||
|
from valency.sskj_scraper import SskjScraper
|
||
|
from bs4 import BeautifulSoup
|
||
|
|
||
|
# Translation table that maps accented characters onto their plain
# counterparts, used to normalize dictionary headwords before indexing.
# ("č" intentionally maps to itself; only the other diacritics are dropped.)
intab = "ÁÉÍÓÚàáäçèéêìíîñòóôöùúüčŔŕ"
outtb = "AEIOUaaaceeeiiinoooouuučRr"
transtab = str.maketrans(dict(zip(intab, outtb)))
|
||
|
|
||
|
|
||
|
def mongo_test():
    """Smoke-test the MongoDB connection: fetch and print one document
    from the ``texts.test`` collection.

    NOTE(review): credentials are hard-coded in the URI — consider
    moving them to configuration.
    """
    uri = "mongodb://{}:{}@127.0.0.1:26633/texts".format("kristjan", "simple567")
    client = pymongo.MongoClient(uri)
    coll = client.texts.test
    print(coll.find_one())
|
||
|
|
||
|
|
||
|
def basic_connection(ip_addr=None, port=None):
    """Open a connection to the ``texts`` MongoDB database.

    :param ip_addr: server address; defaults to "127.0.0.1".
    :param port: server port; defaults to 26644.
    :return: tuple ``(db, err_msg)`` — ``db`` is the database handle or
        None when the server is unreachable; ``err_msg`` is "OK" or the
        caught ``ServerSelectionTimeoutError``.

    NOTE(review): credentials are hard-coded in the URI — consider
    moving them to configuration.
    """
    host = ip_addr if ip_addr is not None else "127.0.0.1"
    prt = port if port is not None else 26644
    uri = "mongodb://{}:{}@{}:{}/texts".format(
        "kristjan", "simple567", host, str(prt))
    client = pymongo.MongoClient(uri)
    try:
        # MongoClient is lazy; force an actual round-trip to the server.
        client.server_info()
    except pymongo.errors.ServerSelectionTimeoutError as err:
        return (None, err)
    return (client.texts, "OK")
|
||
|
|
||
|
|
||
|
def check_collections(db, coll_names):
    """Ensure every collection named in *coll_names* exists in *db*,
    creating the missing ones.

    NOTE(review): ``collection_names()`` is deprecated in pymongo 3.7+
    (removed in 4.0) in favour of ``list_collection_names()``.
    """
    existing = set(db.collection_names())
    missing = [name for name in coll_names if name not in existing]
    for name in missing:
        db.create_collection(name)
|
||
|
|
||
|
|
||
|
def prepare_user_tokens(db):
    """(Re)create the ``v2_user_tokens`` collection with a TTL index so
    that session-token documents expire automatically.

    :param db: an open MongoDB database handle.
    """
    CNAME = "v2_user_tokens"
    # Drop and recreate so stale tokens and old indexes are cleared.
    db[CNAME].drop()
    db.create_collection(CNAME)
    # NOTE(review): 151200 s is 42 hours (1.75 days), not the "2 days"
    # the original comment claimed (2 days = 172800 s) — confirm which
    # TTL was intended.
    EXPIRE = 151200
    # EXPIRE = 10  # 10 sec (debug value)
    # TTL index: Mongo removes a document EXPIRE seconds after its
    # "date" field.
    # NOTE(review): ensure_index() is deprecated (removed in pymongo 3+);
    # create_index() is the modern equivalent.
    db[CNAME].ensure_index("date", expireAfterSeconds=EXPIRE)

    # use this: utc_timestamp = datetime.datetime.utcnow()
    # user_tokens.insert({
    #     '_id': 'utc_session', "date": utc_timestamp,
    #     "session": "test session"})
|
||
|
|
||
|
|
||
|
def sskj_to_mongo(sskj_path):
    """Parse a TEI-encoded SSKJ dictionary XML file and insert every
    ``<entry>`` element as one document into the ``sskj`` collection.

    Deprecated: use :func:`sskj2_to_mongo` instead.

    :param sskj_path: path to the TEI P5 XML file.
    """
    ns = {"tei": "http://www.tei-c.org/ns/1.0"}
    ts = time()
    sskj = ET.parse(sskj_path).getroot()
    # BUG FIX: basic_connection() returns a (db, err_msg) tuple; the
    # original code bound the whole tuple to ``db`` and would crash on
    # db.collection_names() below.
    db, err_msg = basic_connection()
    if db is None:
        print("sskj to mongo: connection failed: {}".format(err_msg))
        return
    col_names = ["sskj"]
    for cn in col_names:
        if cn in db.collection_names():
            db[cn].drop()
    text = sskj.find("tei:text", ns)
    body = text.find("tei:body", ns)
    n_ent = 0
    for entry in body.findall("tei:entry", ns):
        n_ent += 1
        tmpstr = ET.tostring(entry)
        datachunk = xmltodict.parse(tmpstr)
        # Round-trip through JSON to turn xmltodict's OrderedDicts into
        # plain, Mongo-friendly dicts.
        dictchunk = json.loads(json.dumps(datachunk))
        db.sskj.insert(dictchunk)
    # query example: db.sskj.find({'ns0:entry.ns0:form.ns0:orth': "kaplanček"})
    print("sskj to mongo: {} entries in {:.2f}s".format(n_ent, time() - ts))
|
||
|
|
||
|
|
||
|
def slownet_to_mongo(slw_path):
    """Load sloWNet ``SYNSET`` entries from an XML file into the
    ``slownet`` MongoDB collection.

    Each document gets an added top-level ``slo_lemma`` field (list of
    Slovene literals) for faster querying.

    :param slw_path: path to the sloWNet XML file.
    """
    ts = time()
    slownet = ET.parse(slw_path).getroot()
    # BUG FIX: basic_connection() returns a (db, err_msg) tuple; the
    # original code bound the whole tuple to ``db`` and would crash on
    # db.collection_names() below.
    db, err_msg = basic_connection()
    if db is None:
        print("sloWNet to mongo: connection failed: {}".format(err_msg))
        return
    col_names = ["slownet_map", "slownet"]
    for cn in col_names:
        if cn in db.collection_names():
            db[cn].drop()

    # NOTE(review): slo_to_id is never populated before being inserted
    # below — kept as-is for compatibility, but likely leftover code.
    slo_to_id = {}
    for synset in slownet.findall("SYNSET"):
        tmpstr = ET.tostring(synset)
        datachunk = xmltodict.parse(tmpstr)
        # JSON round-trip converts OrderedDicts to plain dicts.
        dictchunk = json.loads(json.dumps(datachunk))
        dictchunk = dictchunk["SYNSET"]

        # Collect Slovene literals into a top-level "slo_lemma" list.
        if "SYNONYM" in dictchunk:
            synonyms = dictchunk["SYNONYM"]
            if not isinstance(synonyms, list):
                synonyms = [synonyms]
            for syn in synonyms:
                if syn["@xml:lang"] == "sl" and "LITERAL" in syn:
                    literals = syn["LITERAL"]
                    if not isinstance(literals, list):
                        literals = [literals]
                    for lit in literals:
                        slo_keyword = lit["#text"]
                        # Skip dotted multi-sense keys (invalid as
                        # Mongo field content here).
                        if "." in slo_keyword:
                            continue
                        dictchunk.setdefault("slo_lemma", []).append(
                            slo_keyword)
        db.slownet.insert(dictchunk)

    db.slownet.ensure_index([("id", pymongo.ASCENDING)])
    db.slo_to_id.insert(slo_to_id)
    print("sloWNet to mongo in {:.2f}s".format(time() - ts))
|
||
|
|
||
|
|
||
|
def scrape_sskj():
    """Scrape SSKJ web data for every key of the ``slo_to_id`` document
    and store results in the ``scraped_sskj`` collection.

    Deprecated!
    """
    # Walk through keys in slo_to_id and scrape sskj data.
    # NOTE(review): credentials are hard-coded in the URI.
    client = pymongo.MongoClient(
        "mongodb://{}:{}@127.0.0.1:26633/texts".format("kristjan", "simple567")
    )
    db = client.texts
    # find_one() returns a single document (a dict); sorted() over it
    # yields its keys — presumably the Slovene lemmas. NOTE(review):
    # this also includes the "_id" key; confirm against the writer of
    # slo_to_id whether that is intended.
    words_list = sorted(db.slo_to_id.find_one())

    print(len(words_list))
    sscraper = SskjScraper()

    # Resume mechanism: stay "locked" (skipping words) until last_word
    # is reached, then scrape from there on. last_word's previous result
    # is removed first so it gets re-scraped.
    last_word = "nogometaš"
    db.scraped_sskj.remove({"word": last_word})
    lock = True
    for word in words_list:
        if word == last_word:
            lock = False  # unlock scraping from this word onward

        if not lock:
            res = sscraper.scrape(word)
            # Only store words that produced at least one result.
            if len(res) > 0:
                db.scraped_sskj.insert({"word": word, "bag": res})
|
||
|
|
||
|
|
||
|
def sskj2_to_mongo(sskj2_path):
    """Parse an SSKJ2 HTML export and insert one document per ``<div>``
    entry into the ``sskj`` MongoDB collection.

    Each document is the xmltodict conversion of the div, plus three
    derived fields:
      - ``pos``: part of speech, chosen as the most frequent keyword
        found in the entry's span attributes ("unknown" if none match);
      - ``izt_clean``: lower-cased, de-accented headword ("iztočnica"),
        with a trailing reflexive "se" stripped;
      - ``has_se``: True when the headword ended with "se".

    :param sskj2_path: path to the SSKJ2 HTML file.
    :return: None
    """
    tstart = time()

    # BUG FIX: basic_connection() returns a (db, err_msg) tuple; the
    # original code bound the whole tuple to ``db`` and would crash on
    # db.collection_names() below.
    db, err_msg = basic_connection()
    if db is None:
        print("sskj2 to mongo: connection failed: {}".format(err_msg))
        return None
    # NOTE(review): collection "sskj2" is dropped here, yet documents
    # are inserted into ``db.sskj`` below — confirm which collection is
    # actually intended.
    col_names = ["sskj2"]
    for cn in col_names:
        if cn in db.collection_names():
            db[cn].drop()

    with open(sskj2_path) as f:
        soup = BeautifulSoup(f.read(), "html.parser")

    divs = soup.find_all("div")
    for i, div in enumerate(divs):
        if i % 100 == 0:
            print("{}/{}".format(i, len(divs)))
        datachunk = xmltodict.parse(str(div))
        datachunk = datachunk["div"]

        # pos (part of speech): count keyword occurrences in the span
        # attributes and keep the most frequent one.
        pos_keywords = {
            "samostalnik": 0,
            "pridevnik": 0,
            "glagol": 0,
            "prislov": 0,
            "predlog": 0,
            "členek": 0,
            "veznik": 0,
            "medmet": 0,
            "povedkovnik": 0
        }
        for span in div.find_all("span"):
            attrs = [e for k, e in span.attrs.items()]
            for attr in attrs:
                for ak in attr.split(" "):
                    akl = ak.lower()
                    if akl in pos_keywords:
                        pos_keywords[akl] += 1
        # BUG FIX: the original stored the typo "unknonw" in the DB.
        pos = "unknown"
        pos_max = 0
        for k, e in pos_keywords.items():
            if e > pos_max:
                pos = k
                pos_max = e
        datachunk["pos"] = pos

        # izt_clean: normalized headword used for the text index.
        izts = div.find_all("span", {"title": "Iztočnica"})
        if len(izts) == 0:
            print("Entry {} has no Iztočnica.".format(i))
            continue
        izt = ((izts[0].text).translate(transtab)).lower()
        # Strip a trailing reflexive "se" and remember it separately.
        ispl = izt.split(" ")
        has_se = False
        if len(ispl) and ispl[-1] == "se":
            izt = " ".join(ispl[:-1])
            has_se = True
        datachunk["izt_clean"] = izt
        datachunk["has_se"] = has_se

        # JSON round-trip converts OrderedDicts to plain dicts.
        dictchunk = json.loads(json.dumps(datachunk))
        db.sskj.insert(dictchunk)

    db.sskj.create_index([("izt_clean", pymongo.TEXT)])
    print("sskj2 to mongo: {} entries in {:.2f}s".format(
        len(divs), time() - tstart))
    return None
|
||
|
|
||
|
|
||
|
if __name__ == "__main__":
    # All one-off loaders are disabled; uncomment the one you need.

    # slownet_path = "../../data/slownet/slownet-2015-05-07.xml"
    # slownet_to_mongo(slownet_path)

    # scrape_sskj()

    # sskj_path = "../../data/sskj/sskj.p5.xml"
    # sskj_to_mongo(sskj_path)

    # The first (200-entry) file is for testing; the full original file
    # takes up most of RAM.
    # sskj2_path = "../../data/sskj/sskj2_200.html"
    # sskj2_path = "../../data/sskj/sskj2_v1.html"
    # sskj2_to_mongo(sskj2_path)

    print("nothing here")
|