forked from kristjan/cjvt-valency
		
	Added some progress bars and removed the logic that skipped headwords at the beginning of processing.
This commit is contained in:
		
							parent
							
								
									3d91251905
								
							
						
					
					
						commit
						931b3531b3
					
				
							
								
								
									
										35
									
								
								README.md
									
									
									
									
									
								
							
							
						
						
									
										35
									
								
								README.md
									
									
									
									
									
								
							| @ -180,8 +180,41 @@ $ mongorestore /dumps/valdb --db valdb --uri=mongodb://valuser:valuserpass@0.0.0 | |||||||
| 
 | 
 | ||||||
| After uploading, restart the stack with `27017` commented out.   | After uploading, restart the stack with `27017` commented out.   | ||||||
| 
 | 
 | ||||||
| ## When running script | ## Script running | ||||||
| 
 | 
 | ||||||
|  | ### Environment setup | ||||||
|  | ```bash | ||||||
|  | pip install -r requirements.txt | ||||||
|  | pip install git+https://gitea.cjvt.si/ozbolt/luscenje_struktur.git | ||||||
|  | ``` | ||||||
|  | 
 | ||||||
|  | ### Running in an already set-up environment | ||||||
| ```bash | ```bash | ||||||
| make database-service | make database-service | ||||||
| ``` | ``` | ||||||
|  | 
 | ||||||
|  | ### Setting up the environment for running on proc1 (ramdisk) | ||||||
|  | 
 | ||||||
|  | ```bash | ||||||
|  | # create ramdisk | ||||||
|  | sudo mount -t tmpfs tmpfs /mnt/tmp | ||||||
|  | sudo mount -o remount,size=120G,noexec,nosuid,nodev,noatime /mnt/tmp | ||||||
|  | 
 | ||||||
|  | # change volumes to /mnt/tmp:/data/db | ||||||
|  | vim dockerfiles/database/mongodb-stack.yml | ||||||
|  | 
 | ||||||
|  | # change the runStack target in the Makefile to run `mkdir -p /mnt/tmp` | ||||||
|  | vim dockerfiles/database/mongodb-stack.yml | ||||||
|  | 
 | ||||||
|  | docker swarm init | ||||||
|  | make database-service | ||||||
|  | make database-users | ||||||
|  | 
 | ||||||
|  | docker exec -it ef0a /bin/bash | ||||||
|  | 
 | ||||||
|  | # run the following steps inside the container's bash shell: | ||||||
|  |     mongorestore --gzip --archive=dump.gz --db valdb --uri=mongodb://<REGULAR USERNAME>:<REGULAR PASSWORD>@0.0.0.0:27017 | ||||||
|  | 
 | ||||||
|  |     # check that the restore worked by logging in: | ||||||
|  |     mongo --username <REGULAR USER> --password --authenticationDatabase valdb | ||||||
|  | ``` | ||||||
| @ -1,5 +1,17 @@ | |||||||
| #!/usr/bin/python3 | #!/usr/bin/python3 | ||||||
| 
 | 
 | ||||||
|  | #imports from luscenje_struktur | ||||||
|  | from luscenje_struktur.progress_bar import progress | ||||||
|  | from luscenje_struktur.word import Word, WordCompressed | ||||||
|  | from luscenje_struktur.syntactic_structure import build_structures | ||||||
|  | from luscenje_struktur.match_store import MatchStore | ||||||
|  | from luscenje_struktur.word_stats import WordStats | ||||||
|  | from luscenje_struktur.writer import Writer | ||||||
|  | from luscenje_struktur.loader import load_files, file_sentence_glue_generator | ||||||
|  | from luscenje_struktur.database import Database | ||||||
|  | from luscenje_struktur.time_info import TimeInfo | ||||||
|  | from luscenje_struktur.msd_translate import MSD_TRANSLATE | ||||||
|  | 
 | ||||||
| # make database-service | # make database-service | ||||||
| import gc | import gc | ||||||
| import re | import re | ||||||
| @ -12,8 +24,8 @@ from tqdm import tqdm | |||||||
| import pymongo | import pymongo | ||||||
| # import tqdm as tqdm | # import tqdm as tqdm | ||||||
| 
 | 
 | ||||||
| sys.path.insert(1, '/home/luka/Development/srl/cjvt_valency/src/pkg/valency') | # sys.path.insert(1, '/home/luka/Development/srl/cjvt_valency/src/pkg/valency') | ||||||
| sys.path.insert(1, '/home/luka/Development/srl/cjvt_valency/src/pkg/cjvt-corpusparser') | # sys.path.insert(1, '/home/luka/Development/srl/cjvt_valency/src/pkg/cjvt-corpusparser') | ||||||
| from valency.Frame import frames_from_db_entry | from valency.Frame import frames_from_db_entry | ||||||
| from valency.reduce_functions import reduce_functions | from valency.reduce_functions import reduce_functions | ||||||
| 
 | 
 | ||||||
| @ -151,25 +163,25 @@ def get_sentences_of_interest(headword_category, collection, w_collection, RF, m | |||||||
|     # all_sentences = set() |     # all_sentences = set() | ||||||
|     sorted(headword_category, key=lambda x: x[0]) |     sorted(headword_category, key=lambda x: x[0]) | ||||||
|     # num_sentences in RAM at once |     # num_sentences in RAM at once | ||||||
|     sentences_num_limit = 10000 |     sentences_num_limit = 15000 | ||||||
|     sentences_in_ram = 0 |     sentences_in_ram = 0 | ||||||
|     part = 0 |     # part = 0 | ||||||
|     start_time = time.time() |     # start_time = time.time() | ||||||
|     # first_sentence = True |     # first_sentence = True | ||||||
|     # section_included = False |     # section_included = False | ||||||
|     # last_processed_hw = 'pomeniti' |     # last_processed_hw = 'pomeniti' | ||||||
|     # last_processed_hw = 'iti' |     # last_processed_hw = 'iti' | ||||||
|     # last_processed_hw = 'aktivirati' |     # last_processed_hw = 'aktivirati' | ||||||
|     last_processed_hw = 'aktivirati' |     # last_processed_hw = 'aktivirati' | ||||||
| 
 | 
 | ||||||
|     already_processed = False |     # already_processed = False | ||||||
|     for headword_id, (headword_text, category_text) in enumerate(headword_category): |     for headword_id, (headword_text, category_text) in enumerate(headword_category): | ||||||
|         # print(headword_text) |         # print(headword_text) | ||||||
|         if already_processed: |         # if already_processed: | ||||||
|             if headword_text != last_processed_hw: |         #     if headword_text != last_processed_hw: | ||||||
|                 continue |         #         continue | ||||||
|             else: |         #     else: | ||||||
|                 already_processed = False |         #         already_processed = False | ||||||
|         # for headword_text, category_text in headword_category[15:20]: |         # for headword_text, category_text in headword_category[15:20]: | ||||||
|         # headword_text = 'zadovoljen' |         # headword_text = 'zadovoljen' | ||||||
|         # category_text = 'adjective' |         # category_text = 'adjective' | ||||||
| @ -306,7 +318,7 @@ def get_sentences_of_interest(headword_category, collection, w_collection, RF, m | |||||||
|                         # print('HEADWORD') |                         # print('HEADWORD') | ||||||
|                         # print(headword_text) |                         # print(headword_text) | ||||||
|                         # pbar.update(1) |                         # pbar.update(1) | ||||||
|                         part += 1 |                         # part += 1 | ||||||
|                         # |                         # | ||||||
|                         # w_collection.bulk_write( |                         # w_collection.bulk_write( | ||||||
|                         #     array.map((val) = > |                         #     array.map((val) = > | ||||||
| @ -724,7 +736,7 @@ def obtain_xml_data(collection, w_a_collection, headword_text, RF, mongo, patter | |||||||
|     return headword_patterns, semantic_role_stats, sentence_tot, pattern_tot, pattern_id_max |     return headword_patterns, semantic_role_stats, sentence_tot, pattern_tot, pattern_id_max | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo, session, w_a_collection_ssj, w_a_collection_gigafida, valency_pattern_id_collection, corpus_name, pattern_examples_limit, ignore_gigafida): | def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo, session, w_a_collection_ssj, w_a_collection_gigafida, valency_pattern_id_collection, corpus_name, pattern_examples_limit, ignore_gigafida, pbar): | ||||||
|     query_general = session.query(Lexeme.id, LexicalUnitLexeme.id, LexicalUnit.id, LexicalUnitMeasure.value, |     query_general = session.query(Lexeme.id, LexicalUnitLexeme.id, LexicalUnit.id, LexicalUnitMeasure.value, | ||||||
|                                   Lexeme.dummy, LexicalUnitType.name) \ |                                   Lexeme.dummy, LexicalUnitType.name) \ | ||||||
|         .join(Category, Category.id == Lexeme.category_id) \ |         .join(Category, Category.id == Lexeme.category_id) \ | ||||||
| @ -1138,6 +1150,7 @@ def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo, | |||||||
|         with lxml.xmlfile(os.path.join(args.outdir, 'VS10_' + headword_text + '_' + corpus_name + '.xml'), |         with lxml.xmlfile(os.path.join(args.outdir, 'VS10_' + headword_text + '_' + corpus_name + '.xml'), | ||||||
|                               encoding='utf-8') as xf: |                               encoding='utf-8') as xf: | ||||||
|             xf.write(dictionary, pretty_print=True) |             xf.write(dictionary, pretty_print=True) | ||||||
|  |         pbar.update(1) | ||||||
|             # xf.write(entry, pretty_print=True) |             # xf.write(entry, pretty_print=True) | ||||||
|             # tree.write(output_file_name, encoding='UTF-8', pretty_print=True) |             # tree.write(output_file_name, encoding='UTF-8', pretty_print=True) | ||||||
| 
 | 
 | ||||||
| @ -1546,7 +1559,8 @@ def main(args): | |||||||
|     print('write_xml') |     print('write_xml') | ||||||
|     start_time = time.time() |     start_time = time.time() | ||||||
|     # print('aa ' + 3) |     # print('aa ' + 3) | ||||||
|     write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo, session, w_a_collection_ssj, w_a_collection_gigafida, valency_pattern_id_collection, args.corpus_name, args.pattern_examples_limit, args.ignore_gigafida) |     with tqdm(total=len(headword_category)) as pbar: | ||||||
|  |         write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo, session, w_a_collection_ssj, w_a_collection_gigafida, valency_pattern_id_collection, args.corpus_name, args.pattern_examples_limit, args.ignore_gigafida, pbar) | ||||||
|     print(time.time() - start_time) |     print(time.time() - start_time) | ||||||
|     # input_file.close() |     # input_file.close() | ||||||
|     session.close() |     session.close() | ||||||
| @ -1621,20 +1635,20 @@ if __name__ == '__main__': | |||||||
|     args = arg_parser.parse_args() |     args = arg_parser.parse_args() | ||||||
|     logging.basicConfig(stream=sys.stderr, level=args.verbose.upper()) |     logging.basicConfig(stream=sys.stderr, level=args.verbose.upper()) | ||||||
| 
 | 
 | ||||||
|     try: |     # try: | ||||||
|         sys.path.insert(1, args.structure_extraction) |     #     sys.path.insert(1, args.structure_extraction) | ||||||
|         from progress_bar import progress |     #     from progress_bar import progress | ||||||
|         from word import Word, WordCompressed |     #     from word import Word, WordCompressed | ||||||
|         from syntactic_structure import build_structures |     #     from syntactic_structure import build_structures | ||||||
|         from match_store import MatchStore |     #     from match_store import MatchStore | ||||||
|         from word_stats import WordStats |     #     from word_stats import WordStats | ||||||
|         from writer import Writer |     #     from writer import Writer | ||||||
|         from loader import load_files, file_sentence_glue_generator |     #     from loader import load_files, file_sentence_glue_generator | ||||||
|         from database import Database |     #     from database import Database | ||||||
|         from time_info import TimeInfo |     #     from time_info import TimeInfo | ||||||
|         from msd_translate import MSD_TRANSLATE |     #     from msd_translate import MSD_TRANSLATE | ||||||
|     except: |     # except: | ||||||
|         raise |     #     raise | ||||||
| 
 | 
 | ||||||
|     start = time.time() |     start = time.time() | ||||||
|     main(args) |     main(args) | ||||||
|  | |||||||
							
								
								
									
										1
									
								
								scripts/valency
									
									
									
									
									
										Symbolic link
									
								
							
							
						
						
									
										1
									
								
								scripts/valency
									
									
									
									
									
										Symbolic link
									
								
							| @ -0,0 +1 @@ | |||||||
|  | ../src/pkg/valency/valency | ||||||
							
								
								
									
										0
									
								
								src/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										0
									
								
								src/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
								
								
									
										0
									
								
								src/pkg/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										0
									
								
								src/pkg/__init__.py
									
									
									
									
									
										Normal file
									
								
							
		Loading…
	
		Reference in New Issue
	
	Block a user