From 220529b777d6edeed2dfb604ac055376367d4281 Mon Sep 17 00:00:00 2001
From: Luka <krsnik.luka92@gmail.com>
Date: Tue, 22 Sep 2020 19:31:31 +0200
Subject: [PATCH] Parameterized mongo_db + Added internal state saving for p1

---
 README.md             |  6 ++++++
 scripts/create_xml.py | 27 +++++++++++++++++++++------
 2 files changed, 27 insertions(+), 6 deletions(-)
diff --git a/README.md b/README.md
index f2cb427..3a51a2a 100644
--- a/README.md
+++ b/README.md
@@ -186,6 +186,7 @@ After uploading, restart the stack with `27017` commented out.
 ```bash
 pip install -r requirements.txt
 pip install git+https://gitea.cjvt.si/ozbolt/luscenje_struktur.git
+pip install git+https://gitea.cjvt.si/kristjan/cjvt-corpusparser.git
 ```
 
 ### Running on already setup environment
@@ -215,6 +216,11 @@ docker exec -it ef0a /bin/bash
 # following steps in docker bash:
     mongorestore --gzip --archive=dump.gz --db valdb --uri=mongodb://<REGULAR USERNAME>:<REGULAR PASSWORD>@0.0.0.0:27017
 
+    # add privileges for the user to write into other databases like extvaldb
+    mongo --username <ADMIN USER> --password --authenticationDatabase admin
+    use valdb
+    db.grantRolesToUser("<REGULAR USER>", [{ role: "readWrite", db: "extvaldb"}])
+
     # check if it worked by
     mongo --username <REGULAR USER> --password --authenticationDatabase valdb
 ```
\ No newline at end of file
diff --git a/scripts/create_xml.py b/scripts/create_xml.py
index 294ef6b..41984f4 100644
--- a/scripts/create_xml.py
+++ b/scripts/create_xml.py
@@ -158,7 +158,7 @@ def hws_generator(collection, headword_text, RF, mongo):
         yield frame_json
 
 
-def get_sentences_of_interest(headword_category, collection, w_collection, RF, mongo, pbar):
+def get_sentences_of_interest(headword_category, collection, w_collection, RF, mongo, pbar, status_collection, corpus_type):
     sentences_of_interest = {}
     # all_sentences = set()
     sorted(headword_category, key=lambda x: x[0])
@@ -174,8 +174,14 @@ def get_sentences_of_interest(headword_category, collection, w_collection, RF, m
     # last_processed_hw = 'aktivirati'
     # last_processed_hw = 'aktivirati'
 
+    status_collection_update_list = []
+
     # already_processed = False
     for headword_id, (headword_text, category_text) in enumerate(headword_category):
+        # check whether element has been processed
+        if status_collection.count_documents({'corpus_type': corpus_type, 'headword_text': headword_text, 'part': 'p1'}):
+            pbar.update(1)
+            continue
         # print(headword_text)
         # if already_processed:
         #     if headword_text != last_processed_hw:
@@ -296,6 +302,7 @@ def get_sentences_of_interest(headword_category, collection, w_collection, RF, m
                         # requests = [{'_id': k, 'connections': v} for k, v in sentences_of_interest.items()]
                         # if 'GF0010453.1116.1' in sentences_of_interest:
                         #     print('here')
+                        status_collection.bulk_write(status_collection_update_list)
                         requests = [UpdateOne({'_id': k}, {'$set': v}, upsert=True) for k, v in sentences_of_interest.items()]
                         # print('print2:')
                         # print(time.time() - start_time)
@@ -305,7 +312,7 @@ def get_sentences_of_interest(headword_category, collection, w_collection, RF, m
                         # print('print3:')
                         # print(time.time() - start_time)
                         # start_time = time.time()
-
+                        del status_collection_update_list
                         del requests
                         del sentences_of_interest
                         gc.collect()
@@ -339,17 +346,20 @@ def get_sentences_of_interest(headword_category, collection, w_collection, RF, m
                         #     w_collection.insert_many(sentences_of_interest, ordered=False)
                         # except pymongo.errors.BulkWriteError as e:
                         #     print(e.details['writeErrors'])
+                        status_collection_update_list = []
                         sentences_of_interest = {}
 
                         # first_sentence = True
 
                     sentences_in_ram += 1
         pbar.update(1)
+        status_collection_update_list.append(InsertOne({'corpus_type': corpus_type, 'headword_text': headword_text, 'part': 'p1'}))
+
     # TODO uncomment
     # if 'GF0010453.1116.1' in sentences_of_interest:
     #     a = sentences_of_interest['GF0010453.1116.1']
     #     print('here')
-
+    status_collection.bulk_write(status_collection_update_list)
     requests = [UpdateOne({'_id': k}, {'$set': v}, upsert=True) for k, v in sentences_of_interest.items()]
 
     result = w_collection.bulk_write(requests)
@@ -1467,7 +1477,10 @@ def main(args):
     print('beginning chunk')
     start_time = time.time()
     # user:user:valdb:127.0.0.1
-    mongo = MongoClient(username='user', password='user', authSource='valdb')
+
+    [db_user, db_password, db_database, db_host] = args.mongo_db.split(':')
+
+    mongo = MongoClient(username=db_user, password=db_password, authSource=db_database)
 
     db = mongo.valdb
     collection_ssj = db['ssj']
@@ -1479,6 +1492,7 @@ def main(args):
     w_collection_gigafida = db2['gigafida']
     w_a_collection_ssj = db2['ssj' + '_all']
     w_a_collection_gigafida = db2['gigafida' + '_all']
+    status_collection = db2['status']
 
     valency_pattern_id_collection = db2['valency_pattern_ids']
 
@@ -1527,10 +1541,10 @@ def main(args):
     # sentences_of_interest_stored = args.p1_processed
     if not args.p1_processed:
         with tqdm(total=len(headword_category)) as pbar:
-            get_sentences_of_interest(headword_category, collection_ssj, w_collection_ssj, RF, mongo, pbar)
+            get_sentences_of_interest(headword_category, collection_ssj, w_collection_ssj, RF, mongo, pbar, status_collection, 'ssj')
         if not args.ignore_gigafida:
             with tqdm(total=len(headword_category)) as pbar:
-                get_sentences_of_interest(headword_category, collection_gigafida, w_collection_gigafida, RF, mongo, pbar)
+                get_sentences_of_interest(headword_category, collection_gigafida, w_collection_gigafida, RF, mongo, pbar, status_collection, 'gigafida')
     # sentences_of_interest = OrderedDict(sorted(sentences_of_interest.items()))
     print(time.time() - start_time)
     # num_sentences = 0
@@ -1568,6 +1582,7 @@ def main(args):
 if __name__ == '__main__':
     arg_parser = argparse.ArgumentParser(description='Export and validate collocation data from DDD database.')
     arg_parser.add_argument('--sloleks_db', type=str, help='Database credentials')
+    arg_parser.add_argument('--mongo_db', type=str, help='Database credentials')
     arg_parser.add_argument('--schema', type=str, help='XML schema')
     arg_parser.add_argument('--infile', type=str, help='Input file')
     arg_parser.add_argument('--outdir', type=str, help='Output directory')