From 220529b777d6edeed2dfb604ac055376367d4281 Mon Sep 17 00:00:00 2001 From: Luka Date: Tue, 22 Sep 2020 19:31:31 +0200 Subject: [PATCH] Parameterized mongo_db + Added internal state saving for p1 --- README.md | 6 ++++++ scripts/create_xml.py | 27 +++++++++++++++++++++------ 2 files changed, 27 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index f2cb427..3a51a2a 100644 --- a/README.md +++ b/README.md @@ -186,6 +186,7 @@ After uploading, restart the stack with `27017` commented out. ```bash pip install -r requirements.txt pip install git+https://gitea.cjvt.si/ozbolt/luscenje_struktur.git +pip install git+https://gitea.cjvt.si/kristjan/cjvt-corpusparser.git ``` ### Running on already setup environment @@ -215,6 +216,11 @@ docker exec -it ef0a /bin/bash # following steps in docker bash: mongorestore --gzip --archive=dump.gz --db valdb --uri=mongodb://:@0.0.0.0:27017 + # add privileges for user to write into other databases like extvaldb + mongo --username --password --authenticationDatabase admin + use valdb + db.grantRolesToUser(, [{ role: "readWrite", db: "extvaldb"}]) + # check if it worked by mongo --username --password --authenticationDatabase valdb ``` \ No newline at end of file diff --git a/scripts/create_xml.py b/scripts/create_xml.py index 294ef6b..41984f4 100644 --- a/scripts/create_xml.py +++ b/scripts/create_xml.py @@ -158,7 +158,7 @@ def hws_generator(collection, headword_text, RF, mongo): yield frame_json -def get_sentences_of_interest(headword_category, collection, w_collection, RF, mongo, pbar): +def get_sentences_of_interest(headword_category, collection, w_collection, RF, mongo, pbar, status_collection, corpus_type): sentences_of_interest = {} # all_sentences = set() sorted(headword_category, key=lambda x: x[0]) @@ -174,8 +174,14 @@ def get_sentences_of_interest(headword_category, collection, w_collection, RF, m # last_processed_hw = 'aktivirati' # last_processed_hw = 'aktivirati' + status_collection_update_list = [] + # 
already_processed = False for headword_id, (headword_text, category_text) in enumerate(headword_category): + # check whether element has been processed + if status_collection.count_documents({'corpus_type': corpus_type, 'headword_text': headword_text, 'part': 'p1'}): + pbar.update(1) + continue # print(headword_text) # if already_processed: # if headword_text != last_processed_hw: @@ -296,6 +302,7 @@ def get_sentences_of_interest(headword_category, collection, w_collection, RF, m # requests = [{'_id': k, 'connections': v} for k, v in sentences_of_interest.items()] # if 'GF0010453.1116.1' in sentences_of_interest: # print('here') + status_collection.bulk_write(status_collection_update_list) requests = [UpdateOne({'_id': k}, {'$set': v}, upsert=True) for k, v in sentences_of_interest.items()] # print('print2:') # print(time.time() - start_time) @@ -305,7 +312,7 @@ def get_sentences_of_interest(headword_category, collection, w_collection, RF, m # print('print3:') # print(time.time() - start_time) # start_time = time.time() - + del status_collection_update_list del requests del sentences_of_interest gc.collect() @@ -339,17 +346,20 @@ def get_sentences_of_interest(headword_category, collection, w_collection, RF, m # w_collection.insert_many(sentences_of_interest, ordered=False) # except pymongo.errors.BulkWriteError as e: # print(e.details['writeErrors']) + status_collection_update_list = [] sentences_of_interest = {} # first_sentence = True sentences_in_ram += 1 pbar.update(1) + status_collection_update_list.append(InsertOne({'corpus_type': corpus_type, 'headword_text': headword_text, 'part': 'p1'})) + # TODO uncomment # if 'GF0010453.1116.1' in sentences_of_interest: # a = sentences_of_interest['GF0010453.1116.1'] # print('here') - + status_collection.bulk_write(status_collection_update_list) requests = [UpdateOne({'_id': k}, {'$set': v}, upsert=True) for k, v in sentences_of_interest.items()] result = w_collection.bulk_write(requests) @@ -1467,7 +1477,10 @@ def 
main(args): print('beginning chunk') start_time = time.time() # user:user:valdb:127.0.0.1 - mongo = MongoClient(username='user', password='user', authSource='valdb') + + [db_user, db_password, db_database, db_host] = args.mongo_db.split(':') + + mongo = MongoClient(username=db_user, password=db_password, authSource=db_database) db = mongo.valdb collection_ssj = db['ssj'] @@ -1479,6 +1492,7 @@ def main(args): w_collection_gigafida = db2['gigafida'] w_a_collection_ssj = db2['ssj' + '_all'] w_a_collection_gigafida = db2['gigafida' + '_all'] + status_collection = db2['status'] valency_pattern_id_collection = db2['valency_pattern_ids'] @@ -1527,10 +1541,10 @@ def main(args): # sentences_of_interest_stored = args.p1_processed if not args.p1_processed: with tqdm(total=len(headword_category)) as pbar: - get_sentences_of_interest(headword_category, collection_ssj, w_collection_ssj, RF, mongo, pbar) + get_sentences_of_interest(headword_category, collection_ssj, w_collection_ssj, RF, mongo, pbar, status_collection, 'ssj') if not args.ignore_gigafida: with tqdm(total=len(headword_category)) as pbar: - get_sentences_of_interest(headword_category, collection_gigafida, w_collection_gigafida, RF, mongo, pbar) + get_sentences_of_interest(headword_category, collection_gigafida, w_collection_gigafida, RF, mongo, pbar, status_collection, 'gigafida') # sentences_of_interest = OrderedDict(sorted(sentences_of_interest.items())) print(time.time() - start_time) # num_sentences = 0 @@ -1568,6 +1582,7 @@ def main(args): if __name__ == '__main__': arg_parser = argparse.ArgumentParser(description='Export and validate collocation data from DDD database.') arg_parser.add_argument('--sloleks_db', type=str, help='Database credentials') + arg_parser.add_argument('--mongo_db', type=str, help='Database credentials') arg_parser.add_argument('--schema', type=str, help='XML schema') arg_parser.add_argument('--infile', type=str, help='Input file') arg_parser.add_argument('--outdir', type=str, 
help='Output directory')