Small README fixes

Author: Luka
Date: 2020-11-23 11:24:50 +01:00
Parent: 2551a9c6a8
Commit: 34b776be11
2 changed files with 26 additions and 74 deletions

@@ -194,7 +194,7 @@ pip install git+https://gitea.cjvt.si/kristjan/cjvt-corpusparser.git
 make database-service
 ```
-### Setting up environment for running on proc1 - ramdisk
+### Setting up environment for running on ramdisk
 ```bash
 # create ramdisk
@@ -202,30 +202,24 @@ sudo mount -t tmpfs tmpfs /mnt/tmp
 sudo mount -o remount,size=120G,noexec,nosuid,nodev,noatime /mnt/tmp
 # change volumes to /mnt/tmp:/data/db
-vim dockerfiles/database/mongodb-stack.yml
+vim dockerfiles/database/valency-stack.yml
 # change Makefile -runStack to mkdir -p /mnt/tmp
-vim dockerfiles/database/mongodb-stack.yml
+vim dockerfiles/database/Makefile
-docker swarm init
+# run service
 make database-service
-make database-users
+# run ONLY ONCE to create users and restore database
+make database-users
+make database-restore
+# double check if it worked
 docker exec -it ef0a /bin/bash
 # following steps in docker bash:
-mongorestore --gzip --archive=dump.gz --db valdb --uri=mongodb://<REGULAR USERNAME>:<REGULAR PASSWORD>@0.0.0.0:27017
-# add privileges for user to write into other databases like extvaldb
-mongo --username <ADMIN USER> --password --authenticationDatabase admin
-use valdb
-db.grantRolesToUser(<REGULAR USER>, [{ role: "readWrite", db: "extvaldb"}])
 # check if it worked by
 mongo --username <REGULAR USER> --password --authenticationDatabase valdb
-db.getRoles()
-# make mongodb visible only privately
-docker stack rm dbstack
-cd dockerfiles/database/
-docker-compose up
 ```
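
The removed block above hand-ran the restore and the role grant that `make database-restore` now wraps. A minimal pymongo sketch for double-checking the `readWrite` grant on `extvaldb` after the restore; the credentials are the same placeholders used in the README, and the probe collection name is hypothetical:

```python
# Sketch: verify the regular user can write to extvaldb after the restore.
# <REGULAR USER> / <REGULAR PASSWORD> are placeholders, as in the README.
from pymongo import MongoClient

client = MongoClient(
    "mongodb://<REGULAR USER>:<REGULAR PASSWORD>@0.0.0.0:27017/?authSource=valdb"
)
probe = client["extvaldb"]["_probe"]  # hypothetical throwaway collection
probe.insert_one({"ok": True})        # fails without the readWrite grant
probe.drop()
print("extvaldb is writable for the regular user")
```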

@@ -1,6 +1,8 @@
 #!/usr/bin/python3
 #imports from luscenje_struktur
+import copy
 from luscenje_struktur.progress_bar import progress
 from luscenje_struktur.word import Word, WordCompressed
 from luscenje_struktur.syntactic_structure import build_structures
@@ -114,8 +116,6 @@ CASE_MAP = {
 Lexeme = None
 LexemeFeature = None
-SyntacticStructure = None
-StructureComponent = None
 Feature = None
 LexicalUnitLexeme = None
 LexicalUnit = None
@@ -191,6 +191,7 @@ def get_sentences_of_interest(headword_category, collection, w_collection, RF, m
     headword_patterns_ids = {}
     # print('tu1!')
     cur = collection.find({"headwords": headword_text})
+    # cur = collection.find({"headwords": headword_text}, no_cursor_timeout=True)
     # print('tu2!')
     frames = []
     for ent in cur:
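
The commented-out alternative above is the long-iteration variant: `no_cursor_timeout=True` stops the server from reaping the cursor mid-loop, at the cost of having to close it explicitly. A sketch of how that variant would typically be wrapped, reusing the names from the hunk:

```python
# Sketch: a no-timeout cursor must always be closed, or it lives server-side
# until the client disconnects.
cur = collection.find({"headwords": headword_text}, no_cursor_timeout=True)
try:
    for ent in cur:
        pass  # process each entry, as the loop above does
finally:
    cur.close()
```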
@@ -371,11 +372,12 @@ def get_sentences_of_interest(headword_category, collection, w_collection, RF, m
 # return sentences_of_interest
-def create_sentence_output(sentence, headword_id, corpus):
+def create_sentence_output(sentence, headword_id, corpus, sent_id):
     glue_outside = False
     headword_id = str(headword_id)
     parent_node = etree.Element('corpusExample')
     parent_node.set('corpusName', corpus)
+    parent_node.set('id', sent_id)
     # parent_node.text = 'AAA'
     # parent_node.prefix = 'BBB'
     # parent_node.tail = 'CCC'
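
The new `id` attribute works because `sent_id` is already a string (it comes from `rsplit` on the token id); lxml's `.set()` raises a `TypeError` for non-string values, which is also why `headword_id` is coerced with `str()` above. A small self-contained sketch with a hypothetical sentence id:

```python
# Sketch: attribute values passed to .set() must be str.
from lxml import etree

parent_node = etree.Element("corpusExample")
parent_node.set("corpusName", "ssj")     # corpus name, as in the caller
parent_node.set("id", "ssj500k.1001.4")  # hypothetical sentence id
print(etree.tostring(parent_node))
# b'<corpusExample corpusName="ssj" id="ssj500k.1001.4"/>'
```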
@@ -660,7 +662,9 @@ def obtain_xml_data(collection, w_a_collection, headword_text, RF, mongo, patter
         sent_id = sentence[0][0].rsplit('.', 1)[0]
         try:
-            db_sentence = next(iter(w_a_collection.find({'_id': sent_id})))['words']
+            cur = w_a_collection.find({'_id': sent_id})
+            db_sentence = next(iter(cur))['words']
+            cur.close()
         except StopIteration:
             continue
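
Binding the cursor lets it be closed instead of leaking one per sentence. For a single-document lookup, pymongo's `find_one` is an equivalent pattern with no cursor to manage at all; a sketch, not what the commit uses:

```python
# Sketch: find_one returns the matching document or None.
def fetch_words(w_a_collection, sent_id):
    doc = w_a_collection.find_one({"_id": sent_id})
    return doc["words"] if doc is not None else None
```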
@@ -703,7 +707,7 @@ def obtain_xml_data(collection, w_a_collection, headword_text, RF, mongo, patter
             # sr_data = get_SRLcontainer_data(formatted_sentences[sent_id], hw_idi, sr_data)
             if len(headword_patterns[valency_pattern_key]['sentence_examples']) + ssj_len < examples_num and valid_valency_pattern(valency_pattern_key):
                 examples_included_num += 1
-                sentence_example = create_sentence_output(db_sentence, hw_idi, corpus)
+                sentence_example = create_sentence_output(db_sentence, hw_idi, corpus, sent_id)
                 # sentence_example = create_sentence_output(formatted_sentences[sent_id], hw_idi)
                 # sentence_example = ''.join(sentence_example)
@@ -762,7 +766,9 @@ def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo,
     # with lxml.xmlfile('data/output.xml', encoding='utf-8') as xf:
     # a = [a for a in valency_pattern_id_collection.find()]
-    patterns = {tuple(v_p['semantic_roles']): v_p['_id'] for v_p in [a for a in valency_pattern_id_collection.find()]}
+    cur = valency_pattern_id_collection.find()
+    patterns = {tuple(v_p['semantic_roles']): v_p['_id'] for v_p in [a for a in cur]}
+    cur.close()
     # patterns = {}
     pattern_id_max = len(patterns) + 1
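
The same close-the-cursor fix, applied to the pattern-id lookup. The inner `[a for a in cur]` list is redundant since a pymongo cursor is directly iterable; a sketch of the equivalent without the intermediate list:

```python
# Sketch: iterate the cursor directly and close it when done.
cur = valency_pattern_id_collection.find()
try:
    patterns = {tuple(v_p["semantic_roles"]): v_p["_id"] for v_p in cur}
finally:
    cur.close()
```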
@@ -817,35 +823,6 @@ def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo,
         # res = query.one_or_none()
         query_res = query.all()
-        # query2 = session.query(Lexeme.id) \
-        #     .join(Category, Category.id == Lexeme.category_id) \
-        #     .join(LexicalUnitLexeme, LexicalUnitLexeme.lexeme_id == Lexeme.id) \
-        #     .join(LexicalUnit, LexicalUnit.id == LexicalUnitLexeme.lexical_unit_id) \
-        #     .join(LexicalUnitType, LexicalUnitType.id == LexicalUnit.type_id) \
-        #     .join(LexicalUnitMeasure, LexicalUnitMeasure.lexical_unit_id == LexicalUnit.id) \
-        #     .join(Measure, Measure.id == LexicalUnitMeasure.measure_id) \
-        #     .join(Corpus, Corpus.id == LexicalUnitMeasure.corpus_id) \
-        #     .join(LexemeFeature, LexemeFeature.lexeme_id == Lexeme.id) \
-        #     .join(Feature, Feature.id == LexemeFeature.feature_id) \
-        #     .filter(LexicalUnitType.name == 'single_lexeme_unit') \
-        #     .filter(Measure.name == 'frequency') \
-        #     .filter(Category.name == 'preposition') \
-        #     .filter(Lexeme.lemma == 'za') \
-        #     .filter(Feature.name == 'case') \
-        #     .filter(LexemeFeature.value == 'instrumental') \
-        #     .group_by(Lexeme.id)
-        # query2 = session.query(Lexeme.id) \
-        #     .join(Category, Category.id == Lexeme.category_id) \
-        #     .join(LexemeFeature, LexemeFeature.lexeme_id == Lexeme.id) \
-        #     .join(Feature, Feature.id == LexemeFeature.feature_id) \
-        #     .filter(Lexeme.lemma == 'za') \
-        #     .filter(Feature.name == 'case') \
-        #     .filter(LexemeFeature.value == 'instrumental') \
-        #     .group_by(Lexeme.id)
-        #
-        # a = query2.all()
         if len(query_res) == 1:
             (lexeme_id, lexical_unit_lexeme_id, lexical_unit_id, frequency, _, lexical_unit_type_name) = \
                 query_res[0]
@@ -995,20 +972,6 @@ def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo,
         definition = lxml.SubElement(definitionList, 'definition')
         definition.text = definition_text[0]
-        syntactic_structures = session.query(SyntacticStructure.id, SyntacticStructure.name,
-                                             StructureComponent.id, StructureComponent.name).join(
-            LexicalUnit, LexicalUnit.syntactic_structure_id == SyntacticStructure.id) \
-            .join(StructureComponent, StructureComponent.syntactic_structure_id == SyntacticStructure.id) \
-            .filter(LexicalUnit.id == sense_id.id)
-        # .join(LexicalUnitLexeme, LexicalUnitLexeme.structure_component_id == StructureComponent.id) \
-        # syntactic_structures2 = session.query(SyntacticStructure.id, SyntacticStructure.name).join(SyntacticStructure, SyntacticStructure.id == LexicalUnit.syntactic_structure_id) \
-        #     .filter(SyntacticStructure.id == sense_id)
-        syntactic_structuresr = syntactic_structures.all()
-        # syntactic_structures2r = syntactic_structures2.all()
         valencyPatternList = lxml.SubElement(sense, 'valencyPatternList')
         valencyPatternList.set('system', 'JOS')
@@ -1122,7 +1085,7 @@ def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo,
             for sentence_example in headword_pattern_dict['ssj']['sentence_examples']:
                 exampleContainer = lxml.SubElement(exampleContainerList, 'exampleContainer')
                 # corpusExample = lxml.SubElement(exampleContainer, 'corpusExample')
-                exampleContainer.append(sentence_example)
+                exampleContainer.append(copy.deepcopy(sentence_example))
             if 'gf' in headword_pattern_dict:
                 for sentence_example in headword_pattern_dict['gf']['sentence_examples']:
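
The `copy.deepcopy` matters because lxml *moves* an element that already has a parent: appending the same example element to a second container would silently detach it from the first. This is what the `import copy` added at the top of the file is for. A self-contained sketch of the behavior:

```python
# Sketch: appending an owned element moves it; deepcopy preserves the original.
import copy
from lxml import etree

example = etree.Element("corpusExample")
first = etree.Element("exampleContainer")
second = etree.Element("exampleContainer")

first.append(example)                  # example now belongs to `first`
second.append(copy.deepcopy(example))  # a copy, so `first` keeps its child
assert len(first) == 1 and len(second) == 1
```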
@@ -1136,7 +1099,7 @@ def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo,
 def init_db(db):
-    global Lexeme, LexemeFeature, SyntacticStructure, StructureComponent, Feature, LexicalUnitLexeme, LexicalUnit, LexicalUnitType, Category, Sense, Measure, LexicalUnitMeasure, Corpus, Definition, WordForm, WordFormFeature, FormRepresentation
+    global Lexeme, LexemeFeature, Feature, LexicalUnitLexeme, LexicalUnit, LexicalUnitType, Category, Sense, Measure, LexicalUnitMeasure, Corpus, Definition, WordForm, WordFormFeature, FormRepresentation
     [db_user, db_password, db_database, db_host] = db.split(':')
     Base = declarative_base()
     engine = create_engine('postgresql://' + db_user + ':' + db_password + '@' + db_host + '/' + db_database,
@@ -1149,12 +1112,6 @@ def init_db(db):
     class LexemeFeature(Base):
         __table__ = Base.metadata.tables['jedro_lexeme_feature']
-    class SyntacticStructure(Base):
-        __table__ = Base.metadata.tables['jedro_syntacticstructure']
-    class StructureComponent(Base):
-        __table__ = Base.metadata.tables['jedro_structurecomponent']
     class Feature(Base):
         __table__ = Base.metadata.tables['jedro_feature']
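
The surviving table-backed classes rely on the schema having been reflected into `Base.metadata` first, which is why the `Base.metadata.tables['jedro_...']` lookups resolve. A sketch of that setup under the same SQLAlchemy 1.x-era API the file uses, with a hypothetical DSN:

```python
# Sketch: reflect the live schema, then map classes onto existing tables.
from sqlalchemy import create_engine
from sqlalchemy.ext.declarative import declarative_base

Base = declarative_base()
engine = create_engine("postgresql://user:password@host/database")  # hypothetical
Base.metadata.reflect(bind=engine)  # loads the existing 'jedro_*' tables

class Feature(Base):
    __table__ = Base.metadata.tables["jedro_feature"]
```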
@@ -1405,6 +1362,7 @@ def extract_sentences(w_collection, w_a_collection, args, input_corpus, input_co
     requests = [UpdateOne({'_id': k}, {'$set': {'words': v}}, upsert=True) for k, v in formatted_sentences.items()]
-    result = w_a_collection.bulk_write(requests)
+    if len(requests) > 0:
+        result = w_a_collection.bulk_write(requests)
     # force a bit of garbage collection
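
The new guard is needed because pymongo's `bulk_write` raises `InvalidOperation` when handed an empty request list, which happens whenever a batch yields no formatted sentences. A sketch reusing the names from the hunk above:

```python
# Sketch: only submit the batch when there is at least one request;
# bulk_write([]) raises pymongo.errors.InvalidOperation.
from pymongo import UpdateOne

requests = [
    UpdateOne({"_id": k}, {"$set": {"words": v}}, upsert=True)
    for k, v in formatted_sentences.items()
]
if requests:  # same effect as len(requests) > 0, idiomatic emptiness test
    w_a_collection.bulk_write(requests)
```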