diff --git a/README.md b/README.md index 68d9b30..f2cb427 100644 --- a/README.md +++ b/README.md @@ -180,8 +180,41 @@ $ mongorestore /dumps/valdb --db valdb --uri=mongodb://valuser:valuserpass@0.0.0 After uploading, restart the stack with `27017` commented out. -## When running script +## Script running +### Environment setup +```bash +pip install -r requirements.txt +pip install git+https://gitea.cjvt.si/ozbolt/luscenje_struktur.git +``` + +### Running on already setup environment ```bash make database-service ``` + +### Setting up environment for running on proc1 - ramdisk + +```bash +# create ramdisk +sudo mount -t tmpfs tmpfs /mnt/tmp +sudo mount -o remount,size=120G,noexec,nosuid,nodev,noatime /mnt/tmp + +# change volumes to /mnt/tmp:/data/db +vim dockerfiles/database/mongodb-stack.yml + +# change Makefile -runStack to mkdir -p /mnt/tmp +vim Makefile + +docker swarm init +make database-service +make database-users + +docker exec -it ef0a /bin/bash + +# following steps in docker bash: + mongorestore --gzip --archive=dump.gz --db valdb --uri=mongodb://<user>:<password>@0.0.0.0:27017 + + # check if it worked by + mongo --username <user> --password <password> --authenticationDatabase valdb +``` \ No newline at end of file diff --git a/scripts/create_xml.py b/scripts/create_xml.py index 01deb92..3fd6af5 100644 --- a/scripts/create_xml.py +++ b/scripts/create_xml.py @@ -1,5 +1,17 @@ #!/usr/bin/python3 +#imports from luscenje_struktur +from luscenje_struktur.progress_bar import progress +from luscenje_struktur.word import Word, WordCompressed +from luscenje_struktur.syntactic_structure import build_structures +from luscenje_struktur.match_store import MatchStore +from luscenje_struktur.word_stats import WordStats +from luscenje_struktur.writer import Writer +from luscenje_struktur.loader import load_files, file_sentence_glue_generator +from luscenje_struktur.database import Database +from luscenje_struktur.time_info import TimeInfo +from luscenje_struktur.msd_translate
import MSD_TRANSLATE + # make database-service import gc import re @@ -12,8 +24,8 @@ from tqdm import tqdm import pymongo # import tqdm as tqdm -sys.path.insert(1, '/home/luka/Development/srl/cjvt_valency/src/pkg/valency') -sys.path.insert(1, '/home/luka/Development/srl/cjvt_valency/src/pkg/cjvt-corpusparser') +# sys.path.insert(1, '/home/luka/Development/srl/cjvt_valency/src/pkg/valency') +# sys.path.insert(1, '/home/luka/Development/srl/cjvt_valency/src/pkg/cjvt-corpusparser') from valency.Frame import frames_from_db_entry from valency.reduce_functions import reduce_functions @@ -151,25 +163,25 @@ def get_sentences_of_interest(headword_category, collection, w_collection, RF, m # all_sentences = set() sorted(headword_category, key=lambda x: x[0]) # num_sentences in RAM at once - sentences_num_limit = 10000 + sentences_num_limit = 15000 sentences_in_ram = 0 - part = 0 - start_time = time.time() + # part = 0 + # start_time = time.time() # first_sentence = True # section_included = False # last_processed_hw = 'pomeniti' # last_processed_hw = 'iti' # last_processed_hw = 'aktivirati' - last_processed_hw = 'aktivirati' + # last_processed_hw = 'aktivirati' - already_processed = False + # already_processed = False for headword_id, (headword_text, category_text) in enumerate(headword_category): # print(headword_text) - if already_processed: - if headword_text != last_processed_hw: - continue - else: - already_processed = False + # if already_processed: + # if headword_text != last_processed_hw: + # continue + # else: + # already_processed = False # for headword_text, category_text in headword_category[15:20]: # headword_text = 'zadovoljen' # category_text = 'adjective' @@ -306,7 +318,7 @@ def get_sentences_of_interest(headword_category, collection, w_collection, RF, m # print('HEADWORD') # print(headword_text) # pbar.update(1) - part += 1 + # part += 1 # # w_collection.bulk_write( # array.map((val) = > @@ -724,7 +736,7 @@ def obtain_xml_data(collection, w_a_collection, 
headword_text, RF, mongo, patter return headword_patterns, semantic_role_stats, sentence_tot, pattern_tot, pattern_id_max -def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo, session, w_a_collection_ssj, w_a_collection_gigafida, valency_pattern_id_collection, corpus_name, pattern_examples_limit, ignore_gigafida): +def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo, session, w_a_collection_ssj, w_a_collection_gigafida, valency_pattern_id_collection, corpus_name, pattern_examples_limit, ignore_gigafida, pbar): query_general = session.query(Lexeme.id, LexicalUnitLexeme.id, LexicalUnit.id, LexicalUnitMeasure.value, Lexeme.dummy, LexicalUnitType.name) \ .join(Category, Category.id == Lexeme.category_id) \ @@ -1138,6 +1150,7 @@ def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo, with lxml.xmlfile(os.path.join(args.outdir, 'VS10_' + headword_text + '_' + corpus_name + '.xml'), encoding='utf-8') as xf: xf.write(dictionary, pretty_print=True) + pbar.update(1) # xf.write(entry, pretty_print=True) # tree.write(output_file_name, encoding='UTF-8', pretty_print=True) @@ -1546,7 +1559,8 @@ def main(args): print('write_xml') start_time = time.time() # print('aa ' + 3) - write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo, session, w_a_collection_ssj, w_a_collection_gigafida, valency_pattern_id_collection, args.corpus_name, args.pattern_examples_limit, args.ignore_gigafida) + with tqdm(total=len(headword_category)) as pbar: + write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo, session, w_a_collection_ssj, w_a_collection_gigafida, valency_pattern_id_collection, args.corpus_name, args.pattern_examples_limit, args.ignore_gigafida, pbar) print(time.time() - start_time) # input_file.close() session.close() @@ -1621,20 +1635,20 @@ if __name__ == '__main__': args = arg_parser.parse_args() logging.basicConfig(stream=sys.stderr, level=args.verbose.upper()) 
- try: - sys.path.insert(1, args.structure_extraction) - from progress_bar import progress - from word import Word, WordCompressed - from syntactic_structure import build_structures - from match_store import MatchStore - from word_stats import WordStats - from writer import Writer - from loader import load_files, file_sentence_glue_generator - from database import Database - from time_info import TimeInfo - from msd_translate import MSD_TRANSLATE - except: - raise + # try: + # sys.path.insert(1, args.structure_extraction) + # from progress_bar import progress + # from word import Word, WordCompressed + # from syntactic_structure import build_structures + # from match_store import MatchStore + # from word_stats import WordStats + # from writer import Writer + # from loader import load_files, file_sentence_glue_generator + # from database import Database + # from time_info import TimeInfo + # from msd_translate import MSD_TRANSLATE + # except: + # raise start = time.time() main(args) diff --git a/scripts/valency b/scripts/valency new file mode 120000 index 0000000..c067bb0 --- /dev/null +++ b/scripts/valency @@ -0,0 +1 @@ +../src/pkg/valency/valency \ No newline at end of file diff --git a/src/__init__.py b/src/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/pkg/__init__.py b/src/pkg/__init__.py new file mode 100644 index 0000000..e69de29