forked from kristjan/cjvt-valency
parent
d84ad9e163
commit
1f83f96267
@ -0,0 +1,12 @@
|
||||
from setuptools import setup
|
||||
|
||||
setup(
|
||||
name='valency',
|
||||
version='0.1.1',
|
||||
description='Objects and functions for handling valency frames.',
|
||||
author='Kristjan Voje',
|
||||
author_email='kristjan.voje@gmail.com',
|
||||
license='MIT',
|
||||
packages=['valency'],
|
||||
install_requires=[],
|
||||
)
|
@ -0,0 +1,96 @@
|
||||
import logging
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class Frame():
|
||||
def __init__(self, tids, deep_links=None, slots=None, hw=None):
|
||||
self.hw = hw
|
||||
self.tids = tids # list of tokens with the same hw_lemma
|
||||
# Each tid = "S123.t123";
|
||||
# you can get sentence with vallex.get_sentence(S123)
|
||||
self.slots = []
|
||||
if slots is None:
|
||||
self.slots = self.init_slots(deep_links)
|
||||
else:
|
||||
self.slots = slots
|
||||
self.sense_info = {}
|
||||
self.sentences = None # Used for passing to view in app.py, get_frames
|
||||
self.aggr_sent = None # Dictionary { hw: self.sentences idx }
|
||||
|
||||
def to_json(self):
|
||||
ret = {
|
||||
"hw": self.hw,
|
||||
"tids": self.tids,
|
||||
"slots": [slot.to_json() for slot in self.slots],
|
||||
"sentences": self.sentences,
|
||||
"aggr_sent": self.aggr_sent,
|
||||
"sense_info": self.sense_info
|
||||
}
|
||||
return ret
|
||||
|
||||
def init_slots(self, deep):
|
||||
slots = []
|
||||
for link in deep:
|
||||
slots.append(Slot(
|
||||
functor=link["functor"],
|
||||
tids=[link["to"]]
|
||||
))
|
||||
return slots
|
||||
|
||||
def sort_slots(self):
|
||||
# ACT, PAT, alphabetically
|
||||
srt1 = [
|
||||
x for x in self.slots
|
||||
if (x.functor == "ACT" or
|
||||
x.functor == "PAT")
|
||||
]
|
||||
srt1 = sorted(srt1, key=lambda x: x.functor)
|
||||
srt2 = [
|
||||
x for x in self.slots
|
||||
if (x.functor != "ACT" and
|
||||
x.functor != "PAT")
|
||||
]
|
||||
srt2 = sorted(srt2, key=lambda x: x.functor)
|
||||
self.slots = (srt1 + srt2)
|
||||
|
||||
def to_string(self):
|
||||
ret = "Frame:\n"
|
||||
ret += "sense_info: {}\n".format(str(self.sense_info))
|
||||
ret += "tids: ["
|
||||
for t in self.tids:
|
||||
ret += (str(t) + ", ")
|
||||
ret += "]\n"
|
||||
if self.slots is not None:
|
||||
ret += "slots:\n"
|
||||
for sl in self.slots:
|
||||
ret += (sl.to_string() + "\n")
|
||||
return ret
|
||||
|
||||
|
||||
class Slot():
|
||||
# Each slot is identified by its functor (ACT, PAT, ...)
|
||||
# It consists of different tokens.
|
||||
def __init__(self, functor, tids=None, count=None):
|
||||
self.functor = functor
|
||||
self.tids = tids or [] # combining multiple sentences vertically
|
||||
self.count = count or 1
|
||||
|
||||
def to_string(self):
|
||||
ret = "---- Slot:\n"
|
||||
ret += "functor: {}\n".format(self.functor)
|
||||
ret += "tids: ["
|
||||
for t in self.tids:
|
||||
ret += (str(t) + ", ")
|
||||
ret += "]\n"
|
||||
ret += "]\n"
|
||||
ret += "----\n"
|
||||
return ret
|
||||
|
||||
def to_json(self):
|
||||
ret = {
|
||||
"functor": self.functor,
|
||||
"tids": self.tids,
|
||||
"count": self.count
|
||||
}
|
||||
return ret
|
@ -0,0 +1 @@
|
||||
from valency.Frame import Frame, Slot
|
@ -0,0 +1,96 @@
|
||||
import logging
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class Frame():
|
||||
def __init__(self, tids, deep_links=None, slots=None, hw=None):
|
||||
self.hw = hw
|
||||
self.tids = tids # list of tokens with the same hw_lemma
|
||||
# Each tid = "S123.t123";
|
||||
# you can get sentence with vallex.get_sentence(S123)
|
||||
self.slots = []
|
||||
if slots is None:
|
||||
self.slots = self.init_slots(deep_links)
|
||||
else:
|
||||
self.slots = slots
|
||||
self.sense_info = {}
|
||||
self.sentences = None # Used for passing to view in app.py, get_frames
|
||||
self.aggr_sent = None # Dictionary { hw: self.sentences idx }
|
||||
|
||||
def to_json(self):
|
||||
ret = {
|
||||
"hw": self.hw,
|
||||
"tids": self.tids,
|
||||
"slots": [slot.to_json() for slot in self.slots],
|
||||
"sentences": self.sentences,
|
||||
"aggr_sent": self.aggr_sent,
|
||||
"sense_info": self.sense_info
|
||||
}
|
||||
return ret
|
||||
|
||||
def init_slots(self, deep):
|
||||
slots = []
|
||||
for link in deep:
|
||||
slots.append(Slot(
|
||||
functor=link["functor"],
|
||||
tids=[link["to"]]
|
||||
))
|
||||
return slots
|
||||
|
||||
def sort_slots(self):
|
||||
# ACT, PAT, alphabetically
|
||||
srt1 = [
|
||||
x for x in self.slots
|
||||
if (x.functor == "ACT" or
|
||||
x.functor == "PAT")
|
||||
]
|
||||
srt1 = sorted(srt1, key=lambda x: x.functor)
|
||||
srt2 = [
|
||||
x for x in self.slots
|
||||
if (x.functor != "ACT" and
|
||||
x.functor != "PAT")
|
||||
]
|
||||
srt2 = sorted(srt2, key=lambda x: x.functor)
|
||||
self.slots = (srt1 + srt2)
|
||||
|
||||
def to_string(self):
|
||||
ret = "Frame:\n"
|
||||
ret += "sense_info: {}\n".format(str(self.sense_info))
|
||||
ret += "tids: ["
|
||||
for t in self.tids:
|
||||
ret += (str(t) + ", ")
|
||||
ret += "]\n"
|
||||
if self.slots is not None:
|
||||
ret += "slots:\n"
|
||||
for sl in self.slots:
|
||||
ret += (sl.to_string() + "\n")
|
||||
return ret
|
||||
|
||||
|
||||
class Slot():
|
||||
# Each slot is identified by its functor (ACT, PAT, ...)
|
||||
# It consists of different tokens.
|
||||
def __init__(self, functor, tids=None, count=None):
|
||||
self.functor = functor
|
||||
self.tids = tids or [] # combining multiple sentences vertically
|
||||
self.count = count or 1
|
||||
|
||||
def to_string(self):
|
||||
ret = "---- Slot:\n"
|
||||
ret += "functor: {}\n".format(self.functor)
|
||||
ret += "tids: ["
|
||||
for t in self.tids:
|
||||
ret += (str(t) + ", ")
|
||||
ret += "]\n"
|
||||
ret += "]\n"
|
||||
ret += "----\n"
|
||||
return ret
|
||||
|
||||
def to_json(self):
|
||||
ret = {
|
||||
"functor": self.functor,
|
||||
"tids": self.tids,
|
||||
"count": self.count
|
||||
}
|
||||
return ret
|
@ -0,0 +1,242 @@
|
||||
# Reduction function for frames.
|
||||
# Input: list of Frame objects, output: list of Frame objects.
|
||||
# App uses reduce_0, 1 and 5
|
||||
|
||||
from valency import Frame, Slot
|
||||
from copy import deepcopy as DC
|
||||
import logging
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
SENSE_UNDEFINED = "nedefinirano"
|
||||
|
||||
## TIDI: use frame.py
|
||||
## TODO: build a list of [Frame] with lists of [Slot]
|
||||
|
||||
|
||||
def sorted_by_len_tids(frames):
|
||||
return sorted(
|
||||
frames,
|
||||
key=lambda x: len(x.tids),
|
||||
reverse=True
|
||||
)
|
||||
|
||||
|
||||
def reduce_0(frames, vallex=None):
|
||||
# new request... frames should be sorded by
|
||||
# functors list (basically reduce_1, just each
|
||||
# sentence gets its own frame)
|
||||
r1_frames = reduce_1(frames)
|
||||
sorting_strings = []
|
||||
separated_frames = []
|
||||
for frame in r1_frames:
|
||||
for tid in frame.tids:
|
||||
tmp_frame = DC(frame)
|
||||
tmp_frame.tids = [tid]
|
||||
separated_frames.append(tmp_frame)
|
||||
sorting_strings.append("".join(
|
||||
[slot.functor for slot in tmp_frame.slots]
|
||||
))
|
||||
permutation = [x for _, x in sorted(
|
||||
zip(sorting_strings, range(len(sorting_strings))))]
|
||||
sorted_sep_frames = [separated_frames[i] for i in permutation]
|
||||
return sorted_sep_frames
|
||||
|
||||
|
||||
def reduce_1(frames, vallex=None):
|
||||
# Combine frames with the same set of functors.
|
||||
# The order of functors is not important.
|
||||
frame_sets = [] # [set of functors, list of frames]
|
||||
for frame in frames:
|
||||
functors = [slot.functor for slot in frame.slots]
|
||||
|
||||
for fs in frame_sets:
|
||||
if set(functors) == set(fs[0]):
|
||||
fs[1].append(frame)
|
||||
break
|
||||
else:
|
||||
# Python for else -> fires if loop has ended.
|
||||
frame_sets.append([functors, [frame]])
|
||||
|
||||
ret_frames = []
|
||||
for fs in frame_sets:
|
||||
tids = []
|
||||
slots = {}
|
||||
# All possible slots in this frame.
|
||||
for functor in fs[0]:
|
||||
slots[functor] = Slot(functor=functor)
|
||||
# Reduce slots from all frames. (Merge ACT from all frames, ...)
|
||||
for frame in fs[1]:
|
||||
tids += frame.tids
|
||||
for sl in frame.slots:
|
||||
slots[sl.functor].tids += sl.tids
|
||||
slots_list = []
|
||||
for k, e in slots.items():
|
||||
slots_list.append(e)
|
||||
rf = Frame(tids=tids, slots=slots_list)
|
||||
rf.sort_slots()
|
||||
ret_frames.append(rf)
|
||||
return sorted_by_len_tids(ret_frames)
|
||||
|
||||
|
||||
def reduce_3(raw_frames, vallex):
|
||||
# sskj simple lesk ids
|
||||
ssj_ids = [frame.tids[0] for frame in raw_frames]
|
||||
db_results = list(vallex.db.sskj_simple_lesk.find(
|
||||
{"ssj_id": {"$in": ssj_ids}}))
|
||||
id_map = {}
|
||||
for entry in db_results:
|
||||
id_map.update({entry["ssj_id"]: {
|
||||
"sense_id": entry.get("sense_id"),
|
||||
"sense_desc": entry.get("sense_desc")
|
||||
}})
|
||||
return frames_from_sense_ids(raw_frames, id_map)
|
||||
|
||||
|
||||
def reduce_4(raw_frames, vallex):
|
||||
# kmeans ids
|
||||
ssj_ids = [frame.tids[0] for frame in raw_frames]
|
||||
db_results = list(vallex.db.kmeans.find(
|
||||
{"ssj_id": {"$in": ssj_ids}}))
|
||||
id_map = {}
|
||||
for entry in db_results:
|
||||
id_map.update({entry["ssj_id"]: {
|
||||
"sense_id": entry["sense_id"]
|
||||
}})
|
||||
return frames_from_sense_ids(raw_frames, id_map)
|
||||
|
||||
|
||||
def reduce_5(raw_frames, vallex):
|
||||
USER_SENSE_COLL = "v2_sense_map"
|
||||
headword = raw_frames[0].hw
|
||||
ssj_ids_full = [frame.tids[0] for frame in raw_frames]
|
||||
# v2_sense_map stores only sentence half of ssj_id
|
||||
ssj_ids = [".".join(ssj_id.split(".")[:-1]) for ssj_id in ssj_ids_full]
|
||||
db_results = list(vallex.db[USER_SENSE_COLL].find({
|
||||
"ssj_id": {"$in": ssj_ids},
|
||||
"hw": headword,
|
||||
}))
|
||||
id_map = {}
|
||||
for entry in db_results:
|
||||
id_map[entry["ssj_id"]] = entry["sense_id"]
|
||||
|
||||
ret_frames = frames_from_sense_ids(raw_frames, id_map)
|
||||
|
||||
# sort: frames with senses to top
|
||||
senses_undefined = []
|
||||
senses_defined = []
|
||||
for frame in ret_frames:
|
||||
if frame.sense_info["sense_id"] == SENSE_UNDEFINED:
|
||||
senses_undefined.append(frame)
|
||||
else:
|
||||
senses_defined.append(frame)
|
||||
ret_frames = senses_defined + senses_undefined
|
||||
|
||||
return ret_frames
|
||||
|
||||
|
||||
def frames_from_sense_ids(raw_frames, id_map):
|
||||
# id map = dict {
|
||||
# ssj_id: sense_id
|
||||
# }
|
||||
# id_dict = dict {
|
||||
# sense_id: [frame, ...]
|
||||
# }
|
||||
id_dict = {}
|
||||
for frame in raw_frames:
|
||||
# long version ssj_id (S123.t12)
|
||||
frame_ssj_id = frame.tids[0]
|
||||
frame_sense_id = id_map.get(frame_ssj_id)
|
||||
if frame_sense_id is None:
|
||||
# try short version ssj_id (S123)
|
||||
frame_ssj_id = ".".join(frame_ssj_id.split(".")[:-1])
|
||||
frame_sense_id = id_map.get(frame_ssj_id)
|
||||
|
||||
# set default if sense_id not found
|
||||
if frame_sense_id is None:
|
||||
frame_sense_id = SENSE_UNDEFINED
|
||||
"""
|
||||
sense_id = id_map.get(frame.tids[0])
|
||||
if sense_id is not None:
|
||||
sense_id = sense_id.get("sense_id")
|
||||
else:
|
||||
sense_id = "nedefinirano"
|
||||
"""
|
||||
if frame_sense_id not in id_dict:
|
||||
id_dict[frame_sense_id] = []
|
||||
id_dict[frame_sense_id].append(DC(frame))
|
||||
|
||||
ret_frames = []
|
||||
for sense_id, frames in id_dict.items():
|
||||
tids = []
|
||||
reduced_slots = []
|
||||
for frame in frames:
|
||||
tids.extend(frame.tids)
|
||||
for slot in frame.slots:
|
||||
# if functor not in reduced slots,
|
||||
# add new slot; else increase count
|
||||
for rslot in reduced_slots:
|
||||
if slot.functor == rslot.functor:
|
||||
rslot.count += 1
|
||||
rslot.tids.extend(slot.tids)
|
||||
break
|
||||
else:
|
||||
# in case for loop didn't match a slot
|
||||
reduced_slots.append(Slot(
|
||||
functor=slot.functor,
|
||||
tids=slot.tids,
|
||||
count=1
|
||||
))
|
||||
reduced_frame = Frame(tids, slots=reduced_slots)
|
||||
id_map_entry = (
|
||||
id_map.get(tids[0]) or
|
||||
id_map.get(".".join(tids[0].split(".")[:-1]))
|
||||
)
|
||||
if id_map_entry is None:
|
||||
reduced_frame.sense_info = {
|
||||
"sense_id": SENSE_UNDEFINED,
|
||||
}
|
||||
else:
|
||||
reduced_frame.sense_info = {
|
||||
"sense_id": id_map_entry
|
||||
}
|
||||
reduced_frame.sort_slots()
|
||||
ret_frames.append(reduced_frame)
|
||||
return ret_frames
|
||||
|
||||
|
||||
reduce_functions = {
|
||||
"reduce_0": {
|
||||
"f": reduce_0,
|
||||
"desc":
|
||||
"Vsaka pojavitev glagola dobi svoj stavčni vzorec.",
|
||||
"simple_name": "posamezni stavki"
|
||||
},
|
||||
"reduce_1": {
|
||||
"f": reduce_1,
|
||||
"desc":
|
||||
"Združevanje stavčnih vzorcev z enako skupino udeleženskih vlog.",
|
||||
"simple_name": "združeni stavki"
|
||||
},
|
||||
"reduce_3": {
|
||||
"f": reduce_3,
|
||||
"desc":
|
||||
"Združevanje stavčnih vzorcev na osnovi pomenov povedi v SSKJ. "
|
||||
"Pomeni so dodeljeni s pomočjo algoritma Simple Lesk.",
|
||||
"simple_name": "SSKJ_pomeni"
|
||||
},
|
||||
"reduce_4": {
|
||||
"f": reduce_4,
|
||||
"desc":
|
||||
"Združevanje stavčnih vzorcev na osnovi pomenov povedi "
|
||||
"s pomočjo algoritma K-Means. Število predvidenih pomenov "
|
||||
"podano na osnovi SSKJ.",
|
||||
"simple_name": "KMeans_pomeni"
|
||||
},
|
||||
"reduce_5": {
|
||||
"f": reduce_5,
|
||||
"desc":
|
||||
"Uporabniško dodeljeni pomeni povedi.",
|
||||
"simple_name": "po meri"
|
||||
}
|
||||
}
|
Loading…
Reference in new issue