Added some progress bars + erased beginning skipping.

Luka 2020-09-18 10:21:05 +02:00
parent 3d91251905
commit 931b3531b3
5 changed files with 78 additions and 30 deletions


@@ -180,8 +180,41 @@ $ mongorestore /dumps/valdb --db valdb --uri=mongodb://valuser:valuserpass@0.0.0
 After uploading, restart the stack with `27017` commented out.
-## When running script
+## Script running
+### Environment setup
+```bash
+pip install -r requirements.txt
+pip install git+https://gitea.cjvt.si/ozbolt/luscenje_struktur.git
+```
+### Running on an already set-up environment
 ```bash
 make database-service
 ```
+### Setting up the environment for running on proc1 (ramdisk)
+```bash
+# create ramdisk
+sudo mount -t tmpfs tmpfs /mnt/tmp
+sudo mount -o remount,size=120G,noexec,nosuid,nodev,noatime /mnt/tmp
+# change volumes to /mnt/tmp:/data/db
+vim dockerfiles/database/mongodb-stack.yml
+# change Makefile runStack to mkdir -p /mnt/tmp
+vim Makefile
+docker swarm init
+make database-service
+make database-users
+docker exec -it ef0a /bin/bash
+# run the following steps inside the container:
+mongorestore --gzip --archive=dump.gz --db valdb --uri=mongodb://<REGULAR USERNAME>:<REGULAR PASSWORD>@0.0.0.0:27017
+# check if it worked:
+mongo --username <REGULAR USER> --password --authenticationDatabase valdb
+```
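As a sanity check on the restore, the same credentials can also be exercised from Python with pymongo. A minimal sketch: the username/password placeholders are the same `<REGULAR USERNAME>`/`<REGULAR PASSWORD>` as above, and the per-collection counts are only a rough smoke test, since the exact collection names depend on the dump.

```python
# minimal sketch: confirm the restored valdb is reachable and populated
import pymongo

# same placeholders as the mongorestore step; authSource=valdb comes from the URI path
client = pymongo.MongoClient('mongodb://REGULAR_USERNAME:REGULAR_PASSWORD@0.0.0.0:27017/valdb')
db = client['valdb']
for name in db.list_collection_names():
    print(name, db[name].estimated_document_count())  # rough count per restored collection
```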


@@ -1,5 +1,17 @@
 #!/usr/bin/python3
+#imports from luscenje_struktur
+from luscenje_struktur.progress_bar import progress
+from luscenje_struktur.word import Word, WordCompressed
+from luscenje_struktur.syntactic_structure import build_structures
+from luscenje_struktur.match_store import MatchStore
+from luscenje_struktur.word_stats import WordStats
+from luscenje_struktur.writer import Writer
+from luscenje_struktur.loader import load_files, file_sentence_glue_generator
+from luscenje_struktur.database import Database
+from luscenje_struktur.time_info import TimeInfo
+from luscenje_struktur.msd_translate import MSD_TRANSLATE
 # make database-service
 import gc
 import re
@@ -12,8 +24,8 @@ from tqdm import tqdm
 import pymongo
 # import tqdm as tqdm
-sys.path.insert(1, '/home/luka/Development/srl/cjvt_valency/src/pkg/valency')
-sys.path.insert(1, '/home/luka/Development/srl/cjvt_valency/src/pkg/cjvt-corpusparser')
+# sys.path.insert(1, '/home/luka/Development/srl/cjvt_valency/src/pkg/valency')
+# sys.path.insert(1, '/home/luka/Development/srl/cjvt_valency/src/pkg/cjvt-corpusparser')
 from valency.Frame import frames_from_db_entry
 from valency.reduce_functions import reduce_functions
@@ -151,25 +163,25 @@ def get_sentences_of_interest(headword_category, collection, w_collection, RF, m
     # all_sentences = set()
     sorted(headword_category, key=lambda x: x[0])
     # num_sentences in RAM at once
-    sentences_num_limit = 10000
+    sentences_num_limit = 15000
     sentences_in_ram = 0
-    part = 0
-    start_time = time.time()
+    # part = 0
+    # start_time = time.time()
     # first_sentence = True
     # section_included = False
     # last_processed_hw = 'pomeniti'
     # last_processed_hw = 'iti'
     # last_processed_hw = 'aktivirati'
-    last_processed_hw = 'aktivirati'
-    already_processed = False
+    # last_processed_hw = 'aktivirati'
+    # already_processed = False
     for headword_id, (headword_text, category_text) in enumerate(headword_category):
         # print(headword_text)
-        if already_processed:
-            if headword_text != last_processed_hw:
-                continue
-            else:
-                already_processed = False
+        # if already_processed:
+        #     if headword_text != last_processed_hw:
+        #         continue
+        #     else:
+        #         already_processed = False
         # for headword_text, category_text in headword_category[15:20]:
         # headword_text = 'zadovoljen'
         # category_text = 'adjective'
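For context on the limit bumped above (10000 → 15000): it caps how many sentences get_sentences_of_interest holds in RAM before flushing them to the working collection. Below is a condensed, self-contained sketch of that batching pattern; `fetch_sentences` and the `sentences_work` collection name are hypothetical stand-ins for the script's real per-headword query and its bulk_write.

```python
# condensed sketch of the RAM-bounded batching; not the script's exact code
import pymongo

client = pymongo.MongoClient('mongodb://0.0.0.0:27017')  # placeholder connection
w_collection = client['valdb']['sentences_work']         # hypothetical collection name

def fetch_sentences(headword_text, category_text):
    # hypothetical stand-in for the script's per-headword Mongo query
    return [{'headword': headword_text, 'category': category_text}]

headword_category = [('aktivirati', 'verb'), ('zadovoljen', 'adjective')]  # sample input

sentences_num_limit = 15000  # max sentences held in RAM before a flush (the value raised here)
sentences_in_ram = 0
batch = []

for headword_id, (headword_text, category_text) in enumerate(headword_category):
    for sentence in fetch_sentences(headword_text, category_text):
        batch.append(sentence)
        sentences_in_ram += 1
        if sentences_in_ram >= sentences_num_limit:
            w_collection.insert_many(batch)  # flush to Mongo, start a fresh batch
            batch, sentences_in_ram = [], 0

if batch:
    w_collection.insert_many(batch)  # flush the remainder
```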
@@ -306,7 +318,7 @@ def get_sentences_of_interest(headword_category, collection, w_collection, RF, m
         # print('HEADWORD')
         # print(headword_text)
         # pbar.update(1)
-        part += 1
+        # part += 1
         #
         # w_collection.bulk_write(
         #     array.map((val) = >
@@ -724,7 +736,7 @@ def obtain_xml_data(collection, w_a_collection, headword_text, RF, mongo, patter
     return headword_patterns, semantic_role_stats, sentence_tot, pattern_tot, pattern_id_max
-def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo, session, w_a_collection_ssj, w_a_collection_gigafida, valency_pattern_id_collection, corpus_name, pattern_examples_limit, ignore_gigafida):
+def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo, session, w_a_collection_ssj, w_a_collection_gigafida, valency_pattern_id_collection, corpus_name, pattern_examples_limit, ignore_gigafida, pbar):
     query_general = session.query(Lexeme.id, LexicalUnitLexeme.id, LexicalUnit.id, LexicalUnitMeasure.value,
                                   Lexeme.dummy, LexicalUnitType.name) \
         .join(Category, Category.id == Lexeme.category_id) \
@@ -1138,6 +1150,7 @@ def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo,
             with lxml.xmlfile(os.path.join(args.outdir, 'VS10_' + headword_text + '_' + corpus_name + '.xml'),
                               encoding='utf-8') as xf:
                 xf.write(dictionary, pretty_print=True)
+            pbar.update(1)
             # xf.write(entry, pretty_print=True)
             # tree.write(output_file_name, encoding='UTF-8', pretty_print=True)
@@ -1546,7 +1559,8 @@ def main(args):
     print('write_xml')
     start_time = time.time()
     # print('aa ' + 3)
-    write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo, session, w_a_collection_ssj, w_a_collection_gigafida, valency_pattern_id_collection, args.corpus_name, args.pattern_examples_limit, args.ignore_gigafida)
+    with tqdm(total=len(headword_category)) as pbar:
+        write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo, session, w_a_collection_ssj, w_a_collection_gigafida, valency_pattern_id_collection, args.corpus_name, args.pattern_examples_limit, args.ignore_gigafida, pbar)
     print(time.time() - start_time)
     # input_file.close()
     session.close()
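Taken together, the hunks above show the whole progress-bar wiring: main() owns a tqdm bar sized to the headword list, passes it into write_xml(), and write_xml() ticks it once per headword as each XML file is written out. The same pattern in isolation (names here are illustrative, not the script's):

```python
# caller owns the bar; the worker only ticks it
from tqdm import tqdm

def write_entries(entries, pbar):
    written = []
    for entry in entries:
        written.append(entry.upper())  # stand-in for writing one XML file
        pbar.update(1)                 # tick once per completed headword
    return written

entries = ['aktivirati', 'iti', 'pomeniti']
with tqdm(total=len(entries)) as pbar:  # sized up front, like len(headword_category)
    write_entries(entries, pbar)
```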
@@ -1621,20 +1635,20 @@ if __name__ == '__main__':
     args = arg_parser.parse_args()
     logging.basicConfig(stream=sys.stderr, level=args.verbose.upper())
-    try:
-        sys.path.insert(1, args.structure_extraction)
-        from progress_bar import progress
-        from word import Word, WordCompressed
-        from syntactic_structure import build_structures
-        from match_store import MatchStore
-        from word_stats import WordStats
-        from writer import Writer
-        from loader import load_files, file_sentence_glue_generator
-        from database import Database
-        from time_info import TimeInfo
-        from msd_translate import MSD_TRANSLATE
-    except:
-        raise
+    # try:
+    #     sys.path.insert(1, args.structure_extraction)
+    #     from progress_bar import progress
+    #     from word import Word, WordCompressed
+    #     from syntactic_structure import build_structures
+    #     from match_store import MatchStore
+    #     from word_stats import WordStats
+    #     from writer import Writer
+    #     from loader import load_files, file_sentence_glue_generator
+    #     from database import Database
+    #     from time_info import TimeInfo
+    #     from msd_translate import MSD_TRANSLATE
+    # except:
+    #     raise
     start = time.time()
     main(args)

scripts/valency Symbolic link
@@ -0,0 +1 @@
+../src/pkg/valency/valency

src/__init__.py Normal file (empty)

src/pkg/__init__.py Normal file (empty)