diff --git a/scripts/create_xml.py b/scripts/create_xml.py
index 41984f4..8d25b04 100644
--- a/scripts/create_xml.py
+++ b/scripts/create_xml.py
@@ -26,7 +26,7 @@ import pymongo
 # sys.path.insert(1, '/home/luka/Development/srl/cjvt_valency/src/pkg/valency')
 # sys.path.insert(1, '/home/luka/Development/srl/cjvt_valency/src/pkg/cjvt-corpusparser')
 
-from valency.Frame import frames_from_db_entry
+from valency.Frame import frames_from_db_entry_headword
 from valency.reduce_functions import reduce_functions
 
 import argparse
@@ -140,11 +140,8 @@ def hws_generator(collection, headword_text, RF, mongo):
         # print('tu2!')
         frames = []
         for ent in cur:
-            frames += frames_from_db_entry(ent) # pre-process this step for prod TODO
+            frames += frames_from_db_entry_headword(ent, headword_text) # pre-process this step for prod TODO
         cur.close()
-        print('tu3!')
-        # filter by relevant hw
-        frames = [x for x in frames if x.hw == headword_text]
 
         # if headword_text == 'brati':
         #     print('here')
@@ -197,11 +194,8 @@ def get_sentences_of_interest(headword_category, collection, w_collection, RF, m
         # print('tu2!')
         frames = []
         for ent in cur:
-            frames += frames_from_db_entry(ent) # pre-process this step for prod TODO
+            frames += frames_from_db_entry_headword(ent, headword_text) # pre-process this step for prod TODO
         cur.close()
-        # print('tu3!')
-        # filter by relevant hw
-        frames = [x for x in frames if x.hw == headword_text]
 
         # if headword_text == 'brati':
         #     print('here')
@@ -302,7 +296,8 @@ def get_sentences_of_interest(headword_category, collection, w_collection, RF, m
     # requests = [{'_id': k, 'connections': v} for k, v in sentences_of_interest.items()]
     # if 'GF0010453.1116.1' in sentences_of_interest:
     #     print('here')
-    status_collection.bulk_write(status_collection_update_list)
+    if len(status_collection_update_list) > 0:
+        status_collection.bulk_write(status_collection_update_list)
     requests = [UpdateOne({'_id': k}, {'$set': v}, upsert=True) for k, v in sentences_of_interest.items()]
     # print('print2:')
     # print(time.time() - start_time)
@@ -359,10 +354,12 @@ def get_sentences_of_interest(headword_category, collection, w_collection, RF, m
     # if 'GF0010453.1116.1' in sentences_of_interest:
     #     a = sentences_of_interest['GF0010453.1116.1']
     #     print('here')
-    status_collection.bulk_write(status_collection_update_list)
+    if len(status_collection_update_list) > 0:
+        status_collection.bulk_write(status_collection_update_list)
     requests = [UpdateOne({'_id': k}, {'$set': v}, upsert=True) for k, v in sentences_of_interest.items()]
-    result = w_collection.bulk_write(requests)
+    if len(requests) > 0:
+        result = w_collection.bulk_write(requests)
 
     # sentences_of_interest = [{'_id': k, 'connections': v} for k, v in sentences_of_interest.items()]
 
     # try:
@@ -611,10 +608,8 @@ def obtain_xml_data(collection, w_a_collection, headword_text, RF, mongo, patter
     cur = collection.find({"headwords": headword_text})
     frames = []
     for ent in cur:
-        frames += frames_from_db_entry(ent) # pre-process this step for prod TODO
+        frames += frames_from_db_entry_headword(ent, headword_text)
     cur.close()
-    # filter by relevant hw
-    frames = [x for x in frames if x.hw == headword_text]
 
     ret_frames = RF(frames, mongo.db.sensemap)
     json_ret = {"frames": []}
diff --git a/src/pkg/valency/valency/Frame.py b/src/pkg/valency/valency/Frame.py
index 89b7116..6e945e8 100644
--- a/src/pkg/valency/valency/Frame.py
+++ b/src/pkg/valency/valency/Frame.py
@@ -3,6 +3,41 @@ from corpusparser import enriched_lemma
 
 log = logging.getLogger(__name__)
 
+def frames_from_db_entry_headword(dbent, headword):
+    def _full_tid(tid):
+        return ".".join([dbent["sid"], str(tid)])
+
+    token_dict = {str(x["tid"]): x for x in dbent["tokens"]}
+
+    frames = []
+    if "srl_links" not in dbent:
+        return []
+    srldict = {}
+    for srl in dbent["srl_links"]:
+        key = str(srl["from"])
+        if enriched_lemma(token_dict[key]) != headword:
+            continue
+        if key not in srldict:
+            srldict[key] = [srl]
+        else:
+            srldict[key] += [srl]
+    for hwtid, srlarr in srldict.items():
+        frames += [Frame(
+            hw_lemma=enriched_lemma(token_dict[hwtid]),
+            tids=[_full_tid(hwtid)],
+            slots=[
+                Slot(
+                    functor=srl["afun"],
+                    tids=[_full_tid(srl["to"])]
+                ) for srl in srlarr
+            ],
+            # sentences=[(dbent["sid"], dbent["tokens"])],
+            sentences=[
+                [(_full_tid(t["tid"]), t) for t in dbent["tokens"]],
+            ]
+        )]
+    return frames
+
 def frames_from_db_entry(dbent):
     def _full_tid(tid):
         return ".".join([dbent["sid"], str(tid)])