forked from kristjan/cjvt-valency
Parameterized mongo_db + Added internal state saving for p1
This commit is contained in:
parent
ae5f2869bc
commit
220529b777
|
@ -186,6 +186,7 @@ After uploading, restart the stack with `27017` commented out.
|
||||||
```bash
|
```bash
|
||||||
pip install -r requirements.txt
|
pip install -r requirements.txt
|
||||||
pip install git+https://gitea.cjvt.si/ozbolt/luscenje_struktur.git
|
pip install git+https://gitea.cjvt.si/ozbolt/luscenje_struktur.git
|
||||||
|
pip install git+https://gitea.cjvt.si/kristjan/cjvt-corpusparser.git
|
||||||
```
|
```
|
||||||
|
|
||||||
### Running on already setup environment
|
### Running on already setup environment
|
||||||
|
@ -215,6 +216,11 @@ docker exec -it ef0a /bin/bash
|
||||||
# following steps in docker bash:
|
# following steps in docker bash:
|
||||||
mongorestore --gzip --archive=dump.gz --db valdb --uri=mongodb://<REGULAR USERNAME>:<REGULAR PASSWORD>@0.0.0.0:27017
|
mongorestore --gzip --archive=dump.gz --db valdb --uri=mongodb://<REGULAR USERNAME>:<REGULAR PASSWORD>@0.0.0.0:27017
|
||||||
|
|
||||||
|
# grant the user privileges to write into other databases such as extvaldb
|
||||||
|
mongo --username <ADMIN USER> --password --authenticationDatabase admin
|
||||||
|
use valdb
|
||||||
|
db.grantRolesToUser(<REGULAR USER>, [{ role: "readWrite", db: "extvaldb"}])
|
||||||
|
|
||||||
# check if it worked by
|
# check if it worked by
|
||||||
mongo --username <REGULAR USER> --password --authenticationDatabase valdb
|
mongo --username <REGULAR USER> --password --authenticationDatabase valdb
|
||||||
```
|
```
|
|
@ -158,7 +158,7 @@ def hws_generator(collection, headword_text, RF, mongo):
|
||||||
yield frame_json
|
yield frame_json
|
||||||
|
|
||||||
|
|
||||||
def get_sentences_of_interest(headword_category, collection, w_collection, RF, mongo, pbar):
|
def get_sentences_of_interest(headword_category, collection, w_collection, RF, mongo, pbar, status_collection, corpus_type):
|
||||||
sentences_of_interest = {}
|
sentences_of_interest = {}
|
||||||
# all_sentences = set()
|
# all_sentences = set()
|
||||||
sorted(headword_category, key=lambda x: x[0])
|
sorted(headword_category, key=lambda x: x[0])
|
||||||
|
@ -174,8 +174,14 @@ def get_sentences_of_interest(headword_category, collection, w_collection, RF, m
|
||||||
# last_processed_hw = 'aktivirati'
|
# last_processed_hw = 'aktivirati'
|
||||||
# last_processed_hw = 'aktivirati'
|
# last_processed_hw = 'aktivirati'
|
||||||
|
|
||||||
|
status_collection_update_list = []
|
||||||
|
|
||||||
# already_processed = False
|
# already_processed = False
|
||||||
for headword_id, (headword_text, category_text) in enumerate(headword_category):
|
for headword_id, (headword_text, category_text) in enumerate(headword_category):
|
||||||
|
# check whether element has been processed
|
||||||
|
if status_collection.count_documents({'corpus_type': corpus_type, 'headword_text': headword_text, 'part': 'p1'}):
|
||||||
|
pbar.update(1)
|
||||||
|
continue
|
||||||
# print(headword_text)
|
# print(headword_text)
|
||||||
# if already_processed:
|
# if already_processed:
|
||||||
# if headword_text != last_processed_hw:
|
# if headword_text != last_processed_hw:
|
||||||
|
@ -296,6 +302,7 @@ def get_sentences_of_interest(headword_category, collection, w_collection, RF, m
|
||||||
# requests = [{'_id': k, 'connections': v} for k, v in sentences_of_interest.items()]
|
# requests = [{'_id': k, 'connections': v} for k, v in sentences_of_interest.items()]
|
||||||
# if 'GF0010453.1116.1' in sentences_of_interest:
|
# if 'GF0010453.1116.1' in sentences_of_interest:
|
||||||
# print('here')
|
# print('here')
|
||||||
|
status_collection.bulk_write(status_collection_update_list)
|
||||||
requests = [UpdateOne({'_id': k}, {'$set': v}, upsert=True) for k, v in sentences_of_interest.items()]
|
requests = [UpdateOne({'_id': k}, {'$set': v}, upsert=True) for k, v in sentences_of_interest.items()]
|
||||||
# print('print2:')
|
# print('print2:')
|
||||||
# print(time.time() - start_time)
|
# print(time.time() - start_time)
|
||||||
|
@ -305,7 +312,7 @@ def get_sentences_of_interest(headword_category, collection, w_collection, RF, m
|
||||||
# print('print3:')
|
# print('print3:')
|
||||||
# print(time.time() - start_time)
|
# print(time.time() - start_time)
|
||||||
# start_time = time.time()
|
# start_time = time.time()
|
||||||
|
del status_collection_update_list
|
||||||
del requests
|
del requests
|
||||||
del sentences_of_interest
|
del sentences_of_interest
|
||||||
gc.collect()
|
gc.collect()
|
||||||
|
@ -339,17 +346,20 @@ def get_sentences_of_interest(headword_category, collection, w_collection, RF, m
|
||||||
# w_collection.insert_many(sentences_of_interest, ordered=False)
|
# w_collection.insert_many(sentences_of_interest, ordered=False)
|
||||||
# except pymongo.errors.BulkWriteError as e:
|
# except pymongo.errors.BulkWriteError as e:
|
||||||
# print(e.details['writeErrors'])
|
# print(e.details['writeErrors'])
|
||||||
|
status_collection_update_list = []
|
||||||
sentences_of_interest = {}
|
sentences_of_interest = {}
|
||||||
|
|
||||||
# first_sentence = True
|
# first_sentence = True
|
||||||
|
|
||||||
sentences_in_ram += 1
|
sentences_in_ram += 1
|
||||||
pbar.update(1)
|
pbar.update(1)
|
||||||
|
status_collection_update_list.append(InsertOne({'corpus_type': corpus_type, 'headword_text': headword_text, 'part': 'p1'}))
|
||||||
|
|
||||||
# TODO uncomment
|
# TODO uncomment
|
||||||
# if 'GF0010453.1116.1' in sentences_of_interest:
|
# if 'GF0010453.1116.1' in sentences_of_interest:
|
||||||
# a = sentences_of_interest['GF0010453.1116.1']
|
# a = sentences_of_interest['GF0010453.1116.1']
|
||||||
# print('here')
|
# print('here')
|
||||||
|
status_collection.bulk_write(status_collection_update_list)
|
||||||
requests = [UpdateOne({'_id': k}, {'$set': v}, upsert=True) for k, v in sentences_of_interest.items()]
|
requests = [UpdateOne({'_id': k}, {'$set': v}, upsert=True) for k, v in sentences_of_interest.items()]
|
||||||
|
|
||||||
result = w_collection.bulk_write(requests)
|
result = w_collection.bulk_write(requests)
|
||||||
|
@ -1467,7 +1477,10 @@ def main(args):
|
||||||
print('beginning chunk')
|
print('beginning chunk')
|
||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
# user:user:valdb:127.0.0.1
|
# user:user:valdb:127.0.0.1
|
||||||
mongo = MongoClient(username='user', password='user', authSource='valdb')
|
|
||||||
|
[db_user, db_password, db_database, db_host] = args.mongo_db.split(':')
|
||||||
|
|
||||||
|
mongo = MongoClient(username=db_user, password=db_password, authSource=db_database)
|
||||||
|
|
||||||
db = mongo.valdb
|
db = mongo.valdb
|
||||||
collection_ssj = db['ssj']
|
collection_ssj = db['ssj']
|
||||||
|
@ -1479,6 +1492,7 @@ def main(args):
|
||||||
w_collection_gigafida = db2['gigafida']
|
w_collection_gigafida = db2['gigafida']
|
||||||
w_a_collection_ssj = db2['ssj' + '_all']
|
w_a_collection_ssj = db2['ssj' + '_all']
|
||||||
w_a_collection_gigafida = db2['gigafida' + '_all']
|
w_a_collection_gigafida = db2['gigafida' + '_all']
|
||||||
|
status_collection = db2['status']
|
||||||
|
|
||||||
valency_pattern_id_collection = db2['valency_pattern_ids']
|
valency_pattern_id_collection = db2['valency_pattern_ids']
|
||||||
|
|
||||||
|
@ -1527,10 +1541,10 @@ def main(args):
|
||||||
# sentences_of_interest_stored = args.p1_processed
|
# sentences_of_interest_stored = args.p1_processed
|
||||||
if not args.p1_processed:
|
if not args.p1_processed:
|
||||||
with tqdm(total=len(headword_category)) as pbar:
|
with tqdm(total=len(headword_category)) as pbar:
|
||||||
get_sentences_of_interest(headword_category, collection_ssj, w_collection_ssj, RF, mongo, pbar)
|
get_sentences_of_interest(headword_category, collection_ssj, w_collection_ssj, RF, mongo, pbar, status_collection, 'ssj')
|
||||||
if not args.ignore_gigafida:
|
if not args.ignore_gigafida:
|
||||||
with tqdm(total=len(headword_category)) as pbar:
|
with tqdm(total=len(headword_category)) as pbar:
|
||||||
get_sentences_of_interest(headword_category, collection_gigafida, w_collection_gigafida, RF, mongo, pbar)
|
get_sentences_of_interest(headword_category, collection_gigafida, w_collection_gigafida, RF, mongo, pbar, status_collection, 'gigafida')
|
||||||
# sentences_of_interest = OrderedDict(sorted(sentences_of_interest.items()))
|
# sentences_of_interest = OrderedDict(sorted(sentences_of_interest.items()))
|
||||||
print(time.time() - start_time)
|
print(time.time() - start_time)
|
||||||
# num_sentences = 0
|
# num_sentences = 0
|
||||||
|
@ -1568,6 +1582,7 @@ def main(args):
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
arg_parser = argparse.ArgumentParser(description='Export and validate collocation data from DDD database.')
|
arg_parser = argparse.ArgumentParser(description='Export and validate collocation data from DDD database.')
|
||||||
arg_parser.add_argument('--sloleks_db', type=str, help='Database credentials')
|
arg_parser.add_argument('--sloleks_db', type=str, help='Database credentials')
|
||||||
|
arg_parser.add_argument('--mongo_db', type=str, help='Database credentials')
|
||||||
arg_parser.add_argument('--schema', type=str, help='XML schema')
|
arg_parser.add_argument('--schema', type=str, help='XML schema')
|
||||||
arg_parser.add_argument('--infile', type=str, help='Input file')
|
arg_parser.add_argument('--infile', type=str, help='Input file')
|
||||||
arg_parser.add_argument('--outdir', type=str, help='Output directory')
|
arg_parser.add_argument('--outdir', type=str, help='Output directory')
|
||||||
|
|
Loading…
Reference in New Issue
Block a user