Adding processing improvements.

Luka 2020-09-23 13:02:31 +02:00
parent 220529b777
commit ce1fb46b4e
2 changed files with 45 additions and 15 deletions

File 1

@@ -26,7 +26,7 @@ import pymongo
 # sys.path.insert(1, '/home/luka/Development/srl/cjvt_valency/src/pkg/valency')
 # sys.path.insert(1, '/home/luka/Development/srl/cjvt_valency/src/pkg/cjvt-corpusparser')
-from valency.Frame import frames_from_db_entry
+from valency.Frame import frames_from_db_entry_headword
 from valency.reduce_functions import reduce_functions
 import argparse
@@ -140,11 +140,8 @@ def hws_generator(collection, headword_text, RF, mongo):
     # print('tu2!')
     frames = []
     for ent in cur:
-        frames += frames_from_db_entry(ent) # pre-process this step for prod TODO
+        frames += frames_from_db_entry_headword(ent, headword_text) # pre-process this step for prod TODO
     cur.close()
-    print('tu3!')
-    # filter by relevant hw
-    frames = [x for x in frames if x.hw == headword_text]
     # if headword_text == 'brati':
     # print('here')
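
Note on the change above (the same edit recurs in the -197 and -611 hunks below): the callers used to build frames for every SRL head in a sentence and then post-filter on x.hw, while the new frames_from_db_entry_headword filters during extraction, so non-matching frames are never built. A minimal runnable sketch of the two call patterns, with hypothetical stubs standing in for the real valency.Frame functions (the stub headword variant post-filters only to keep the demo short; the real one, added in the second file below, skips non-matching SRL links up front):

# Hypothetical stand-ins, just so both patterns can run side by side.
class _F:
    def __init__(self, hw):
        self.hw = hw  # headword lemma carried by each frame

def frames_from_db_entry(ent):
    return [_F(t["lemma"]) for t in ent["tokens"]]  # one frame per token, for the demo

def frames_from_db_entry_headword(ent, headword):
    # Demo shortcut: the real function filters SRL links before
    # constructing any Frame objects at all.
    return [f for f in frames_from_db_entry(ent) if f.hw == headword]

cur = [{"tokens": [{"lemma": "brati"}, {"lemma": "pisati"}]}]  # fake cursor
headword_text = "brati"

# Old pattern: build every frame, then post-filter by headword.
frames_old = []
for ent in cur:
    frames_old += frames_from_db_entry(ent)
frames_old = [x for x in frames_old if x.hw == headword_text]

# New pattern: pass the headword down so filtering happens at extraction time.
frames_new = []
for ent in cur:
    frames_new += frames_from_db_entry_headword(ent, headword_text)

assert [f.hw for f in frames_old] == [f.hw for f in frames_new] == ["brati"]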
@@ -197,11 +194,8 @@ def get_sentences_of_interest(headword_category, collection, w_collection, RF, m
     # print('tu2!')
     frames = []
     for ent in cur:
-        frames += frames_from_db_entry(ent) # pre-process this step for prod TODO
+        frames += frames_from_db_entry_headword(ent, headword_text) # pre-process this step for prod TODO
     cur.close()
-    # print('tu3!')
-    # filter by relevant hw
-    frames = [x for x in frames if x.hw == headword_text]
     # if headword_text == 'brati':
     # print('here')
@@ -302,7 +296,8 @@ def get_sentences_of_interest(headword_category, collection, w_collection, RF, m
     # requests = [{'_id': k, 'connections': v} for k, v in sentences_of_interest.items()]
     # if 'GF0010453.1116.1' in sentences_of_interest:
     # print('here')
-    status_collection.bulk_write(status_collection_update_list)
+    if len(status_collection_update_list) > 0:
+        status_collection.bulk_write(status_collection_update_list)
     requests = [UpdateOne({'_id': k}, {'$set': v}, upsert=True) for k, v in sentences_of_interest.items()]
     # print('print2:')
     # print(time.time() - start_time)
@@ -359,10 +354,12 @@ def get_sentences_of_interest(headword_category, collection, w_collection, RF, m
     # if 'GF0010453.1116.1' in sentences_of_interest:
     # a = sentences_of_interest['GF0010453.1116.1']
     # print('here')
-    status_collection.bulk_write(status_collection_update_list)
+    if len(status_collection_update_list) > 0:
+        status_collection.bulk_write(status_collection_update_list)
     requests = [UpdateOne({'_id': k}, {'$set': v}, upsert=True) for k, v in sentences_of_interest.items()]
-    result = w_collection.bulk_write(requests)
+    if len(requests) > 0:
+        result = w_collection.bulk_write(requests)
     # sentences_of_interest = [{'_id': k, 'connections': v} for k, v in sentences_of_interest.items()]
     # try:
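
Note on the guards added in the two hunks above: pymongo's Collection.bulk_write raises InvalidOperation when handed an empty request list, which can happen here when a batch yields no status updates or no sentences of interest. A minimal sketch of the failure mode and the guard; the URI and database/collection names are placeholders, and since the empty-list check happens client-side, this runs without a live server:

from pymongo import MongoClient
from pymongo.errors import InvalidOperation

client = MongoClient("mongodb://localhost:27017")  # placeholder URI; connection is lazy
coll = client["valdb"]["example"]                  # placeholder database/collection names

requests = []  # e.g. a batch that produced no UpdateOne operations
try:
    coll.bulk_write(requests)                      # validated client-side
except InvalidOperation as exc:
    print(exc)                                     # "No operations to execute"

# The guard added in this commit sidesteps the exception:
if len(requests) > 0:
    coll.bulk_write(requests)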
@@ -611,10 +608,8 @@ def obtain_xml_data(collection, w_a_collection, headword_text, RF, mongo, patter
     cur = collection.find({"headwords": headword_text})
     frames = []
     for ent in cur:
-        frames += frames_from_db_entry(ent) # pre-process this step for prod TODO
+        frames += frames_from_db_entry_headword(ent, headword_text)
     cur.close()
-    # filter by relevant hw
-    frames = [x for x in frames if x.hw == headword_text]
     ret_frames = RF(frames, mongo.db.sensemap)
     json_ret = {"frames": []}

File 2

@@ -3,6 +3,41 @@ from corpusparser import enriched_lemma
 log = logging.getLogger(__name__)

+def frames_from_db_entry_headword(dbent, headword):
+    def _full_tid(tid):
+        return ".".join([dbent["sid"], str(tid)])
+
+    token_dict = {str(x["tid"]): x for x in dbent["tokens"]}
+    frames = []
+    if "srl_links" not in dbent:
+        return []
+    srldict = {}
+    for srl in dbent["srl_links"]:
+        key = str(srl["from"])
+        if enriched_lemma(token_dict[key]) != headword:
+            continue
+        if key not in srldict:
+            srldict[key] = [srl]
+        else:
+            srldict[key] += [srl]
+    for hwtid, srlarr in srldict.items():
+        frames += [Frame(
+            hw_lemma=enriched_lemma(token_dict[hwtid]),
+            tids=[_full_tid(hwtid)],
+            slots=[
+                Slot(
+                    functor=srl["afun"],
+                    tids=[_full_tid(srl["to"])]
+                ) for srl in srlarr
+            ],
+            # sentences=[(dbent["sid"], dbent["tokens"])],
+            sentences=[
+                [(_full_tid(t["tid"]), t) for t in dbent["tokens"]],
+            ]
+        )]
+    return frames
+
 def frames_from_db_entry(dbent):
     def _full_tid(tid):
         return ".".join([dbent["sid"], str(tid)])