forked from kristjan/cjvt-valency
Adding processing improvements.
This commit is contained in:
parent
220529b777
commit
ce1fb46b4e
|
@ -26,7 +26,7 @@ import pymongo
|
||||||
|
|
||||||
# sys.path.insert(1, '/home/luka/Development/srl/cjvt_valency/src/pkg/valency')
|
# sys.path.insert(1, '/home/luka/Development/srl/cjvt_valency/src/pkg/valency')
|
||||||
# sys.path.insert(1, '/home/luka/Development/srl/cjvt_valency/src/pkg/cjvt-corpusparser')
|
# sys.path.insert(1, '/home/luka/Development/srl/cjvt_valency/src/pkg/cjvt-corpusparser')
|
||||||
from valency.Frame import frames_from_db_entry
|
from valency.Frame import frames_from_db_entry_headword
|
||||||
from valency.reduce_functions import reduce_functions
|
from valency.reduce_functions import reduce_functions
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
|
@ -140,11 +140,8 @@ def hws_generator(collection, headword_text, RF, mongo):
|
||||||
# print('tu2!')
|
# print('tu2!')
|
||||||
frames = []
|
frames = []
|
||||||
for ent in cur:
|
for ent in cur:
|
||||||
frames += frames_from_db_entry(ent) # pre-process this step for prod TODO
|
frames += frames_from_db_entry_headword(ent, headword_text) # pre-process this step for prod TODO
|
||||||
cur.close()
|
cur.close()
|
||||||
print('tu3!')
|
|
||||||
# filter by relevant hw
|
|
||||||
frames = [x for x in frames if x.hw == headword_text]
|
|
||||||
|
|
||||||
# if headword_text == 'brati':
|
# if headword_text == 'brati':
|
||||||
# print('here')
|
# print('here')
|
||||||
|
@ -197,11 +194,8 @@ def get_sentences_of_interest(headword_category, collection, w_collection, RF, m
|
||||||
# print('tu2!')
|
# print('tu2!')
|
||||||
frames = []
|
frames = []
|
||||||
for ent in cur:
|
for ent in cur:
|
||||||
frames += frames_from_db_entry(ent) # pre-process this step for prod TODO
|
frames += frames_from_db_entry_headword(ent, headword_text) # pre-process this step for prod TODO
|
||||||
cur.close()
|
cur.close()
|
||||||
# print('tu3!')
|
|
||||||
# filter by relevant hw
|
|
||||||
frames = [x for x in frames if x.hw == headword_text]
|
|
||||||
|
|
||||||
# if headword_text == 'brati':
|
# if headword_text == 'brati':
|
||||||
# print('here')
|
# print('here')
|
||||||
|
@ -302,7 +296,8 @@ def get_sentences_of_interest(headword_category, collection, w_collection, RF, m
|
||||||
# requests = [{'_id': k, 'connections': v} for k, v in sentences_of_interest.items()]
|
# requests = [{'_id': k, 'connections': v} for k, v in sentences_of_interest.items()]
|
||||||
# if 'GF0010453.1116.1' in sentences_of_interest:
|
# if 'GF0010453.1116.1' in sentences_of_interest:
|
||||||
# print('here')
|
# print('here')
|
||||||
status_collection.bulk_write(status_collection_update_list)
|
if len(status_collection_update_list) > 0:
|
||||||
|
status_collection.bulk_write(status_collection_update_list)
|
||||||
requests = [UpdateOne({'_id': k}, {'$set': v}, upsert=True) for k, v in sentences_of_interest.items()]
|
requests = [UpdateOne({'_id': k}, {'$set': v}, upsert=True) for k, v in sentences_of_interest.items()]
|
||||||
# print('print2:')
|
# print('print2:')
|
||||||
# print(time.time() - start_time)
|
# print(time.time() - start_time)
|
||||||
|
@ -359,10 +354,12 @@ def get_sentences_of_interest(headword_category, collection, w_collection, RF, m
|
||||||
# if 'GF0010453.1116.1' in sentences_of_interest:
|
# if 'GF0010453.1116.1' in sentences_of_interest:
|
||||||
# a = sentences_of_interest['GF0010453.1116.1']
|
# a = sentences_of_interest['GF0010453.1116.1']
|
||||||
# print('here')
|
# print('here')
|
||||||
status_collection.bulk_write(status_collection_update_list)
|
if len(status_collection_update_list) > 0:
|
||||||
|
status_collection.bulk_write(status_collection_update_list)
|
||||||
requests = [UpdateOne({'_id': k}, {'$set': v}, upsert=True) for k, v in sentences_of_interest.items()]
|
requests = [UpdateOne({'_id': k}, {'$set': v}, upsert=True) for k, v in sentences_of_interest.items()]
|
||||||
|
|
||||||
result = w_collection.bulk_write(requests)
|
if len(requests) > 0:
|
||||||
|
result = w_collection.bulk_write(requests)
|
||||||
|
|
||||||
# sentences_of_interest = [{'_id': k, 'connections': v} for k, v in sentences_of_interest.items()]
|
# sentences_of_interest = [{'_id': k, 'connections': v} for k, v in sentences_of_interest.items()]
|
||||||
# try:
|
# try:
|
||||||
|
@ -611,10 +608,8 @@ def obtain_xml_data(collection, w_a_collection, headword_text, RF, mongo, patter
|
||||||
cur = collection.find({"headwords": headword_text})
|
cur = collection.find({"headwords": headword_text})
|
||||||
frames = []
|
frames = []
|
||||||
for ent in cur:
|
for ent in cur:
|
||||||
frames += frames_from_db_entry(ent) # pre-process this step for prod TODO
|
frames += frames_from_db_entry_headword(ent, headword_text)
|
||||||
cur.close()
|
cur.close()
|
||||||
# filter by relevant hw
|
|
||||||
frames = [x for x in frames if x.hw == headword_text]
|
|
||||||
|
|
||||||
ret_frames = RF(frames, mongo.db.sensemap)
|
ret_frames = RF(frames, mongo.db.sensemap)
|
||||||
json_ret = {"frames": []}
|
json_ret = {"frames": []}
|
||||||
|
|
|
@ -3,6 +3,41 @@ from corpusparser import enriched_lemma
|
||||||
|
|
||||||
log = logging.getLogger(__name__)
|
log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
def frames_from_db_entry_headword(dbent, headword):
    """Build Frame objects from one DB sentence entry, keeping only SRL
    links whose source token's lemma equals *headword*.

    :param dbent: DB entry dict with keys ``"sid"``, ``"tokens"`` and,
        optionally, ``"srl_links"``.
    :param headword: lemma string used to filter SRL links by their
        source token.
    :return: list of ``Frame`` objects; empty when the entry carries no
        SRL annotations or none originate from the headword.
    """

    def _full_tid(tid):
        # Token ids are sentence-local; prefix with the sentence id to
        # produce a globally unique id of the form "<sid>.<tid>".
        return ".".join([dbent["sid"], str(tid)])

    # Entries without SRL annotations contribute no frames.
    if "srl_links" not in dbent:
        return []

    # Map token id (stringified) -> token dict for O(1) lookups below.
    token_dict = {str(x["tid"]): x for x in dbent["tokens"]}

    # Group SRL links by their source token id, keeping only links whose
    # source lemma matches the requested headword.
    srldict = {}
    for srl in dbent["srl_links"]:
        key = str(srl["from"])
        if enriched_lemma(token_dict[key]) != headword:
            continue
        # setdefault replaces the manual "not in / else" branching.
        srldict.setdefault(key, []).append(srl)

    frames = []
    for hwtid, srlarr in srldict.items():
        frames.append(Frame(
            hw_lemma=enriched_lemma(token_dict[hwtid]),
            tids=[_full_tid(hwtid)],
            slots=[
                Slot(
                    functor=srl["afun"],
                    tids=[_full_tid(srl["to"])],
                )
                for srl in srlarr
            ],
            # Each frame carries the full token list of its sentence,
            # keyed by globally unique token ids.
            sentences=[
                [(_full_tid(t["tid"]), t) for t in dbent["tokens"]],
            ],
        ))
    return frames
|
||||||
|
|
||||||
def frames_from_db_entry(dbent):
|
def frames_from_db_entry(dbent):
|
||||||
def _full_tid(tid):
|
def _full_tid(tid):
|
||||||
return ".".join([dbent["sid"], str(tid)])
|
return ".".join([dbent["sid"], str(tid)])
|
||||||
|
|
Loading…
Reference in New Issue
Block a user