diff --git a/README.md b/README.md
index 269b51b..8447b16 100644
--- a/README.md
+++ b/README.md
@@ -194,7 +194,7 @@ pip install git+https://gitea.cjvt.si/kristjan/cjvt-corpusparser.git
 make database-service
 ```
 
-### Setting up environment for running on proc1 - ramdisk
+### Setting up environment for running on ramdisk
 
 ```bash
 # create ramdisk
@@ -202,30 +202,24 @@ sudo mount -t tmpfs tmpfs /mnt/tmp
 sudo mount -o remount,size=120G,noexec,nosuid,nodev,noatime /mnt/tmp
 
 # change volumes to /mnt/tmp:/data/db
-vim dockerfiles/database/mongodb-stack.yml
+vim dockerfiles/database/valency-stack.yml
 
 # change Makefile -runStack to mkdir -p /mnt/tmp
-vim dockerfiles/database/mongodb-stack.yml
+vim dockerfiles/database/Makefile
 
-docker swarm init
+# run service
 make database-service
 
-make database-users
+# run ONLY ONCE to create users and restore database
+make database-users
+make database-restore
+
+# double-check that it worked
 docker exec -it ef0a /bin/bash
 # following steps in docker bash:
 
-  mongorestore --gzip --archive=dump.gz --db valdb --uri=mongodb://<user>:<password>@0.0.0.0:27017
-
-  # add privilegies for user to write into other databases like extvaldb
-  mongo --username <user> --password <password> --authenticationDatabase admin
-  use valdb
-  db.grantRolesToUser(<user>, [{ role: "readWrite", db: "extvaldb"}])
-  # check if it worked by
   mongo --username <user> --password <password> --authenticationDatabase valdb
+  db.getRoles()
 
-# make mongodb visible only privately
-docker stack rm dbstack
-cd dockerfiles/database/
-docker-compose up
 
 ```
\ No newline at end of file
diff --git a/scripts/create_xml.py b/scripts/create_xml.py
index 36c10b4..fdf6feb 100644
--- a/scripts/create_xml.py
+++ b/scripts/create_xml.py
@@ -1,6 +1,8 @@
 #!/usr/bin/python3
 
 #imports from luscenje_struktur
+import copy
+
 from luscenje_struktur.progress_bar import progress
 from luscenje_struktur.word import Word, WordCompressed
 from luscenje_struktur.syntactic_structure import build_structures
@@ -114,8 +116,6 @@ CASE_MAP = {
 
 Lexeme = None
 LexemeFeature = None
-SyntacticStructure = None
-StructureComponent = None
 Feature = None
 LexicalUnitLexeme = None
 LexicalUnit = None
@@ -191,6 +191,7 @@ def get_sentences_of_interest(headword_category, collection, w_collection, RF, m
     headword_patterns_ids = {}
     # print('tu1!')
     cur = collection.find({"headwords": headword_text})
+    # cur = collection.find({"headwords": headword_text}, no_cursor_timeout=True)
     # print('tu2!')
     frames = []
     for ent in cur:
@@ -371,11 +372,12 @@
     # return sentences_of_interest
 
 
-def create_sentence_output(sentence, headword_id, corpus):
+def create_sentence_output(sentence, headword_id, corpus, sent_id):
     glue_outside = False
     headword_id = str(headword_id)
     parent_node = etree.Element('corpusExample')
     parent_node.set('corpusName', corpus)
+    parent_node.set('id', sent_id)
     # parent_node.text = 'AAA'
     # parent_node.prefix = 'BBB'
     # parent_node.tail = 'CCC'
@@ -660,7 +662,9 @@ def obtain_xml_data(collection, w_a_collection, headword_text, RF, mongo, patter
         sent_id = sentence[0][0].rsplit('.', 1)[0]
 
         try:
-            db_sentence = next(iter(w_a_collection.find({'_id': sent_id})))['words']
+            cur = w_a_collection.find({'_id': sent_id})
+            db_sentence = next(iter(cur))['words']
+            cur.close()
         except StopIteration:
             continue
 
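The hunk above swaps an anonymous `find()` cursor for a named one that is closed as soon as the single document has been read; the commented-out `no_cursor_timeout=True` line earlier in the diff attacks the same problem from the other side. A minimal sketch of the pattern, assuming a local MongoDB and an illustrative collection name:

```python
from pymongo import MongoClient

# Assumed connection URI and collection name, for illustration only.
client = MongoClient("mongodb://localhost:27017")
w_a_collection = client["valdb"]["sentence_words"]

def fetch_sentence_words(sent_id):
    """Read one sentence document, releasing the cursor immediately.

    An anonymous cursor, as in the old code, stays registered on the
    server until it idles out; over a long GigaFida run that can pile
    up server-side cursors or surface as CursorNotFound errors.
    """
    cur = w_a_collection.find({"_id": sent_id})
    try:
        return next(iter(cur))["words"]
    except StopIteration:
        return None  # no sentence with this _id
    finally:
        cur.close()
```

The `no_cursor_timeout=True` variant instead disables the server's idle timeout for a long scan, at the cost of making the caller responsible for closing the cursor (PyMongo cursors can also be used as context managers), which is why the explicit `cur`/`cur.close()` shape is the safer default here.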
@@ -703,7 +707,7 @@ def obtain_xml_data(collection, w_a_collection, headword_text, RF, mongo, patter
             # sr_data = get_SRLcontainer_data(formatted_sentences[sent_id], hw_idi, sr_data)
             if len(headword_patterns[valency_pattern_key]['sentence_examples']) + ssj_len < examples_num and valid_valency_pattern(valency_pattern_key):
                 examples_included_num += 1
-                sentence_example = create_sentence_output(db_sentence, hw_idi, corpus)
+                sentence_example = create_sentence_output(db_sentence, hw_idi, corpus, sent_id)
                 # sentence_example = create_sentence_output(formatted_sentences[sent_id], hw_idi)
                 # sentence_example = ''.join(sentence_example)
 
@@ -762,7 +766,9 @@ def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo,
     # with lxml.xmlfile('data/output.xml', encoding='utf-8') as xf:
 
     # a = [a for a in valency_pattern_id_collection.find()]
-    patterns = {tuple(v_p['semantic_roles']): v_p['_id'] for v_p in [a for a in valency_pattern_id_collection.find()]}
+    cur = valency_pattern_id_collection.find()
+    patterns = {tuple(v_p['semantic_roles']): v_p['_id'] for v_p in [a for a in cur]}
+    cur.close()
     # patterns = {}
 
     pattern_id_max = len(patterns) + 1
@@ -817,35 +823,6 @@ def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo,
         # res = query.one_or_none()
         query_res = query.all()
 
-        # query2 = session.query(Lexeme.id) \
-        #     .join(Category, Category.id == Lexeme.category_id) \
-        #     .join(LexicalUnitLexeme, LexicalUnitLexeme.lexeme_id == Lexeme.id) \
-        #     .join(LexicalUnit, LexicalUnit.id == LexicalUnitLexeme.lexical_unit_id) \
-        #     .join(LexicalUnitType, LexicalUnitType.id == LexicalUnit.type_id) \
-        #     .join(LexicalUnitMeasure, LexicalUnitMeasure.lexical_unit_id == LexicalUnit.id) \
-        #     .join(Measure, Measure.id == LexicalUnitMeasure.measure_id) \
-        #     .join(Corpus, Corpus.id == LexicalUnitMeasure.corpus_id) \
-        #     .join(LexemeFeature, LexemeFeature.lexeme_id == Lexeme.id) \
-        #     .join(Feature, Feature.id == LexemeFeature.feature_id) \
-        #     .filter(LexicalUnitType.name == 'single_lexeme_unit') \
-        #     .filter(Measure.name == 'frequency') \
-        #     .filter(Category.name == 'preposition') \
-        #     .filter(Lexeme.lemma == 'za') \
-        #     .filter(Feature.name == 'case') \
-        #     .filter(LexemeFeature.value == 'instrumental') \
-        #     .group_by(Lexeme.id)
-
-        # query2 = session.query(Lexeme.id) \
-        #     .join(Category, Category.id == Lexeme.category_id) \
-        #     .join(LexemeFeature, LexemeFeature.lexeme_id == Lexeme.id) \
-        #     .join(Feature, Feature.id == LexemeFeature.feature_id) \
-        #     .filter(Lexeme.lemma == 'za') \
-        #     .filter(Feature.name == 'case') \
-        #     .filter(LexemeFeature.value == 'instrumental') \
-        #     .group_by(Lexeme.id)
-        #
-        # a = query2.all()
-
         if len(query_res) == 1:
             (lexeme_id, lexical_unit_lexeme_id, lexical_unit_id, frequency, _, lexical_unit_type_name) = \
                 query_res[0]
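Two details in the hunks above are worth spelling out. `patterns` is keyed on `tuple(v_p['semantic_roles'])` because MongoDB returns `semantic_roles` as a Python list, and lists are unhashable; the tuple conversion is what lets a role sequence act as a dict key. The inner `[a for a in cur]` list is redundant (the comprehension could consume the cursor directly) but behavior-preserving. A sketch of the idea, with an assumed collection name and illustrative role labels:

```python
from pymongo import MongoClient

client = MongoClient("mongodb://localhost:27017")  # assumed URI
valency_pattern_id_collection = client["valdb"]["valency_pattern_ids"]  # assumed name

cur = valency_pattern_id_collection.find()
try:
    # Role lists become tuple keys, e.g. {('ACT', 'PAT'): 1, ('ACT', 'REC'): 2}
    patterns = {tuple(v_p["semantic_roles"]): v_p["_id"] for v_p in cur}
finally:
    cur.close()

pattern_id_max = len(patterns) + 1  # next free pattern id, as in write_xml
```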
@@ -995,20 +972,6 @@ def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo,
                 definition = lxml.SubElement(definitionList, 'definition')
                 definition.text = definition_text[0]
 
-            syntactic_structures = session.query(SyntacticStructure.id, SyntacticStructure.name,
-                                                 StructureComponent.id, StructureComponent.name).join(
-                LexicalUnit, LexicalUnit.syntactic_structure_id == SyntacticStructure.id) \
-                .join(StructureComponent, StructureComponent.syntactic_structure_id == SyntacticStructure.id) \
-                .filter(LexicalUnit.id == sense_id.id)
-
-            # .join(LexicalUnitLexeme, LexicalUnitLexeme.structure_component_id == StructureComponent.id) \
-
-            # syntactic_structures2 = session.query(SyntacticStructure.id, SyntacticStructure.name).join(SyntacticStructure, SyntacticStructure.id == LexicalUnit.syntactic_structure_id) \
-            #     .filter(SyntacticStructure.id == sense_id)
-
-            syntactic_structuresr = syntactic_structures.all()
-            # syntactic_structures2r = syntactic_structures2.all()
-
 
             valencyPatternList = lxml.SubElement(sense, 'valencyPatternList')
             valencyPatternList.set('system', 'JOS')
@@ -1122,7 +1085,7 @@ def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo,
             for sentence_example in headword_pattern_dict['ssj']['sentence_examples']:
                 exampleContainer = lxml.SubElement(exampleContainerList, 'exampleContainer')
                 # corpusExample = lxml.SubElement(exampleContainer, 'corpusExample')
-                exampleContainer.append(sentence_example)
+                exampleContainer.append(copy.deepcopy(sentence_example))
 
             if 'gf' in headword_pattern_dict:
                 for sentence_example in headword_pattern_dict['gf']['sentence_examples']:
@@ -1136,7 +1099,7 @@
 
 
 def init_db(db):
-    global Lexeme, LexemeFeature, SyntacticStructure, StructureComponent, Feature, LexicalUnitLexeme, LexicalUnit, LexicalUnitType, Category, Sense, Measure, LexicalUnitMeasure, Corpus, Definition, WordForm, WordFormFeature, FormRepresentation
+    global Lexeme, LexemeFeature, Feature, LexicalUnitLexeme, LexicalUnit, LexicalUnitType, Category, Sense, Measure, LexicalUnitMeasure, Corpus, Definition, WordForm, WordFormFeature, FormRepresentation
     [db_user, db_password, db_database, db_host] = db.split(':')
     Base = declarative_base()
     engine = create_engine('postgresql://' + db_user + ':' + db_password + '@' + db_host + '/' + db_database,
@@ -1149,12 +1112,6 @@ def init_db(db):
     class LexemeFeature(Base):
         __table__ = Base.metadata.tables['jedro_lexeme_feature']
 
-    class SyntacticStructure(Base):
-        __table__ = Base.metadata.tables['jedro_syntacticstructure']
-
-    class StructureComponent(Base):
-        __table__ = Base.metadata.tables['jedro_structurecomponent']
-
     class Feature(Base):
         __table__ = Base.metadata.tables['jedro_feature']
 
@@ -1405,7 +1362,8 @@ def extract_sentences(w_collection, w_a_collection, args, input_corpus, input_co
 
     requests = [UpdateOne({'_id': k}, {'$set': {'words': v}}, upsert=True)
                 for k, v in formatted_sentences.items()]
-    result = w_a_collection.bulk_write(requests)
+    if len(requests) > 0:
+        result = w_a_collection.bulk_write(requests)
 
     # force a bit of garbage collection
     # del sentence
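Two of the changes above deserve a note. The `copy` import added at the top of the file exists for the `exampleContainer.append` change: lxml *moves* an element that already has a parent, so appending the same cached `sentence_example` under several containers would silently empty all but the last one. Deep-copying before each append keeps the cached element intact. A self-contained illustration:

```python
import copy
from lxml import etree

cached = etree.Element("corpusExample")
cached.text = "example sentence"

parent_a = etree.Element("exampleContainer")
parent_b = etree.Element("exampleContainer")

# Appending `cached` to both parents directly would MOVE it from
# parent_a into parent_b, leaving parent_a empty; deep copies keep
# the cached element reusable.
parent_a.append(copy.deepcopy(cached))
parent_b.append(copy.deepcopy(cached))

assert len(parent_a) == 1 and len(parent_b) == 1
```

The guard around `bulk_write` in the final hunk is defensive work in the same spirit: PyMongo raises `InvalidOperation` when `bulk_write` is handed an empty request list, which happens whenever a batch contributes no formatted sentences.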