Small README fixes

Luka 2020-11-23 11:24:50 +01:00
parent 2551a9c6a8
commit 34b776be11
2 changed files with 26 additions and 74 deletions

@@ -194,7 +194,7 @@ pip install git+https://gitea.cjvt.si/kristjan/cjvt-corpusparser.git
make database-service
```
### Setting up environment for running on proc1 - ramdisk
### Setting up environment for running on ramdisk
```bash
# create ramdisk
@@ -202,30 +202,24 @@ sudo mount -t tmpfs tmpfs /mnt/tmp
sudo mount -o remount,size=120G,noexec,nosuid,nodev,noatime /mnt/tmp
# change volumes to /mnt/tmp:/data/db
vim dockerfiles/database/mongodb-stack.yml
vim dockerfiles/database/valency-stack.yml
# change the runStack target in the Makefile to mkdir -p /mnt/tmp
vim dockerfiles/database/mongodb-stack.yml
vim dockerfiles/database/Makefile
docker swarm init
# run service
make database-service
make database-users
# run ONLY ONCE to create users and restore database
make database-users
make database-restore
# double check if it worked
docker exec -it ef0a /bin/bash
# following steps in docker bash:
mongorestore --gzip --archive=dump.gz --db valdb --uri=mongodb://<REGULAR USERNAME>:<REGULAR PASSWORD>@0.0.0.0:27017
# grant the user privileges to write into other databases, e.g. extvaldb
mongo --username <ADMIN USER> --password --authenticationDatabase admin
use valdb
db.grantRolesToUser(<REGULAR USER>, [{ role: "readWrite", db: "extvaldb"}])
# check that it worked:
mongo --username <REGULAR USER> --password --authenticationDatabase valdb
db.getRoles()
# make mongodb reachable only from the local machine
docker stack rm dbstack
cd dockerfiles/database/
docker-compose up
```
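
Note: the grant-and-verify step above can also be scripted. A minimal pymongo sketch of the same commands (the client URL and the uppercase names are placeholders, not values from this repo):

```python
from pymongo import MongoClient

# authenticate as the admin user against the admin database (placeholder credentials)
client = MongoClient('mongodb://ADMIN_USER:ADMIN_PASSWORD@127.0.0.1:27017/?authSource=admin')

# same grant as db.grantRolesToUser(...) in the mongo shell above
client['valdb'].command(
    'grantRolesToUser', 'REGULAR_USER',
    roles=[{'role': 'readWrite', 'db': 'extvaldb'}],
)

# verify the grant (the shell snippet uses db.getRoles() for a similar check)
info = client['valdb'].command('usersInfo', 'REGULAR_USER')
print(info['users'][0]['roles'])
```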

@@ -1,6 +1,8 @@
#!/usr/bin/python3
# imports from luscenje_struktur
import copy
from luscenje_struktur.progress_bar import progress
from luscenje_struktur.word import Word, WordCompressed
from luscenje_struktur.syntactic_structure import build_structures
@@ -114,8 +116,6 @@ CASE_MAP = {
Lexeme = None
LexemeFeature = None
SyntacticStructure = None
StructureComponent = None
Feature = None
LexicalUnitLexeme = None
LexicalUnit = None
@@ -191,6 +191,7 @@ def get_sentences_of_interest(headword_category, collection, w_collection, RF, m
headword_patterns_ids = {}
# print('tu1!')
cur = collection.find({"headwords": headword_text})
# cur = collection.find({"headwords": headword_text}, no_cursor_timeout=True)
# print('tu2!')
frames = []
for ent in cur:
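
The commented-out alternative above hints at the server's idle-cursor timeout: a scan over many headwords can outlive the default 10-minute window and die with CursorNotFound. A sketch of the usual remedy, reusing the names from this hunk (a cursor opened with no_cursor_timeout=True must be closed explicitly):

```python
# keep the server-side cursor alive past the 10-minute idle timeout;
# in exchange we are responsible for closing it ourselves
cur = collection.find({"headwords": headword_text}, no_cursor_timeout=True)
try:
    for ent in cur:
        ...  # process each document as the loop body does
finally:
    cur.close()  # always release the cursor, even on an exception
```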
@@ -371,11 +372,12 @@ def get_sentences_of_interest(headword_category, collection, w_collection, RF, m
# return sentences_of_interest
def create_sentence_output(sentence, headword_id, corpus):
def create_sentence_output(sentence, headword_id, corpus, sent_id):
glue_outside = False
headword_id = str(headword_id)
parent_node = etree.Element('corpusExample')
parent_node.set('corpusName', corpus)
parent_node.set('id', sent_id)
# parent_node.text = 'AAA'
# parent_node.prefix = 'BBB'
# parent_node.tail = 'CCC'
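
The new sent_id parameter ends up as an attribute on the corpusExample node. A small sketch of the node this builds (corpus_example is a hypothetical helper name; lxml accepts only string attribute values, which is why the code casts ids with str() first):

```python
from lxml import etree

def corpus_example(corpus, sent_id):
    # build <corpusExample corpusName="..." id="..."/> as in create_sentence_output
    node = etree.Element('corpusExample')
    node.set('corpusName', corpus)
    node.set('id', str(sent_id))  # lxml raises TypeError on non-string values
    return node
```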
@@ -660,7 +662,9 @@ def obtain_xml_data(collection, w_a_collection, headword_text, RF, mongo, patter
sent_id = sentence[0][0].rsplit('.', 1)[0]
try:
db_sentence = next(iter(w_a_collection.find({'_id': sent_id})))['words']
cur = w_a_collection.find({'_id': sent_id})
db_sentence = next(iter(cur))['words']
cur.close()
except StopIteration:
continue
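
Since _id lookups match at most one document, a find_one sketch is equivalent to the cursor + next() + close() sequence above and needs no explicit cleanup (names mirror the surrounding lines; the fragment sits inside the per-sentence loop):

```python
doc = w_a_collection.find_one({'_id': sent_id})  # _id is unique: at most one match
if doc is None:
    continue  # plays the role of the StopIteration branch above
db_sentence = doc['words']
```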
@@ -703,7 +707,7 @@ def obtain_xml_data(collection, w_a_collection, headword_text, RF, mongo, patter
# sr_data = get_SRLcontainer_data(formatted_sentences[sent_id], hw_idi, sr_data)
if len(headword_patterns[valency_pattern_key]['sentence_examples']) + ssj_len < examples_num and valid_valency_pattern(valency_pattern_key):
examples_included_num += 1
sentence_example = create_sentence_output(db_sentence, hw_idi, corpus)
sentence_example = create_sentence_output(db_sentence, hw_idi, corpus, sent_id)
# sentence_example = create_sentence_output(formatted_sentences[sent_id], hw_idi)
# sentence_example = ''.join(sentence_example)
@@ -762,7 +766,9 @@ def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo,
# with lxml.xmlfile('data/output.xml', encoding='utf-8') as xf:
# a = [a for a in valency_pattern_id_collection.find()]
patterns = {tuple(v_p['semantic_roles']): v_p['_id'] for v_p in [a for a in valency_pattern_id_collection.find()]}
cur = valency_pattern_id_collection.find()
patterns = {tuple(v_p['semantic_roles']): v_p['_id'] for v_p in [a for a in cur]}
cur.close()
# patterns = {}
pattern_id_max = len(patterns) + 1
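
An equivalent sketch of the lookup-table build above: the inner [a for a in cur] list is redundant, since the dict comprehension can consume the cursor directly, and try/finally guarantees the close even if a document lacks the expected keys:

```python
cur = valency_pattern_id_collection.find()
try:
    # map each pattern's semantic-role tuple to its stored id
    patterns = {tuple(v_p['semantic_roles']): v_p['_id'] for v_p in cur}
finally:
    cur.close()  # release the server-side cursor in every case
```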
@@ -817,35 +823,6 @@ def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo,
# res = query.one_or_none()
query_res = query.all()
# query2 = session.query(Lexeme.id) \
# .join(Category, Category.id == Lexeme.category_id) \
# .join(LexicalUnitLexeme, LexicalUnitLexeme.lexeme_id == Lexeme.id) \
# .join(LexicalUnit, LexicalUnit.id == LexicalUnitLexeme.lexical_unit_id) \
# .join(LexicalUnitType, LexicalUnitType.id == LexicalUnit.type_id) \
# .join(LexicalUnitMeasure, LexicalUnitMeasure.lexical_unit_id == LexicalUnit.id) \
# .join(Measure, Measure.id == LexicalUnitMeasure.measure_id) \
# .join(Corpus, Corpus.id == LexicalUnitMeasure.corpus_id) \
# .join(LexemeFeature, LexemeFeature.lexeme_id == Lexeme.id) \
# .join(Feature, Feature.id == LexemeFeature.feature_id) \
# .filter(LexicalUnitType.name == 'single_lexeme_unit') \
# .filter(Measure.name == 'frequency') \
# .filter(Category.name == 'preposition') \
# .filter(Lexeme.lemma == 'za') \
# .filter(Feature.name == 'case') \
# .filter(LexemeFeature.value == 'instrumental') \
# .group_by(Lexeme.id)
# query2 = session.query(Lexeme.id) \
# .join(Category, Category.id == Lexeme.category_id) \
# .join(LexemeFeature, LexemeFeature.lexeme_id == Lexeme.id) \
# .join(Feature, Feature.id == LexemeFeature.feature_id) \
# .filter(Lexeme.lemma == 'za') \
# .filter(Feature.name == 'case') \
# .filter(LexemeFeature.value == 'instrumental') \
# .group_by(Lexeme.id)
#
# a = query2.all()
if len(query_res) == 1:
(lexeme_id, lexical_unit_lexeme_id, lexical_unit_id, frequency, _, lexical_unit_type_name) = \
query_res[0]
@@ -995,20 +972,6 @@ def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo,
definition = lxml.SubElement(definitionList, 'definition')
definition.text = definition_text[0]
syntactic_structures = session.query(SyntacticStructure.id, SyntacticStructure.name,
StructureComponent.id, StructureComponent.name).join(
LexicalUnit, LexicalUnit.syntactic_structure_id == SyntacticStructure.id) \
.join(StructureComponent, StructureComponent.syntactic_structure_id == SyntacticStructure.id) \
.filter(LexicalUnit.id == sense_id.id)
# .join(LexicalUnitLexeme, LexicalUnitLexeme.structure_component_id == StructureComponent.id) \
# syntactic_structures2 = session.query(SyntacticStructure.id, SyntacticStructure.name).join(SyntacticStructure, SyntacticStructure.id == LexicalUnit.syntactic_structure_id) \
# .filter(SyntacticStructure.id == sense_id)
syntactic_structuresr = syntactic_structures.all()
# syntactic_structures2r = syntactic_structures2.all()
valencyPatternList = lxml.SubElement(sense, 'valencyPatternList')
valencyPatternList.set('system', 'JOS')
@@ -1122,7 +1085,7 @@ def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo,
for sentence_example in headword_pattern_dict['ssj']['sentence_examples']:
exampleContainer = lxml.SubElement(exampleContainerList, 'exampleContainer')
# corpusExample = lxml.SubElement(exampleContainer, 'corpusExample')
exampleContainer.append(sentence_example)
exampleContainer.append(copy.deepcopy(sentence_example))
if 'gf' in headword_pattern_dict:
for sentence_example in headword_pattern_dict['gf']['sentence_examples']:
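
The switch to copy.deepcopy matters because appending an lxml element that already has a parent moves it rather than copying it, so an example shared between the ssj and gf branches would silently disappear from the first container. A self-contained demonstration:

```python
import copy
from lxml import etree

example = etree.Element('corpusExample')
example.text = 'shared sentence'

a = etree.Element('exampleContainer')
b = etree.Element('exampleContainer')

a.append(example)                 # example is now a child of a
b.append(copy.deepcopy(example))  # b receives an independent copy

print(len(a), len(b))  # 1 1 -- appending the original to b would have emptied a
```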
@@ -1136,7 +1099,7 @@ def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo,
def init_db(db):
global Lexeme, LexemeFeature, SyntacticStructure, StructureComponent, Feature, LexicalUnitLexeme, LexicalUnit, LexicalUnitType, Category, Sense, Measure, LexicalUnitMeasure, Corpus, Definition, WordForm, WordFormFeature, FormRepresentation
global Lexeme, LexemeFeature, Feature, LexicalUnitLexeme, LexicalUnit, LexicalUnitType, Category, Sense, Measure, LexicalUnitMeasure, Corpus, Definition, WordForm, WordFormFeature, FormRepresentation
[db_user, db_password, db_database, db_host] = db.split(':')
Base = declarative_base()
engine = create_engine('postgresql://' + db_user + ':' + db_password + '@' + db_host + '/' + db_database,
@@ -1149,12 +1112,6 @@ def init_db(db):
class LexemeFeature(Base):
__table__ = Base.metadata.tables['jedro_lexeme_feature']
class SyntacticStructure(Base):
__table__ = Base.metadata.tables['jedro_syntacticstructure']
class StructureComponent(Base):
__table__ = Base.metadata.tables['jedro_structurecomponent']
class Feature(Base):
__table__ = Base.metadata.tables['jedro_feature']
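
init_db maps classes onto tables taken from Base.metadata, which only works if the schema has been reflected from the live database first; that reflect call sits outside the lines shown here, so this is a condensed sketch under that assumption (the connection URL is a placeholder):

```python
from sqlalchemy import create_engine
from sqlalchemy.ext.declarative import declarative_base

engine = create_engine('postgresql://user:password@host/database')  # placeholder URL
Base = declarative_base()
Base.metadata.reflect(engine)  # load the existing table definitions from PostgreSQL

# map a class onto an already-reflected table, as init_db does
class LexemeFeature(Base):
    __table__ = Base.metadata.tables['jedro_lexeme_feature']
```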
@@ -1405,6 +1362,7 @@ def extract_sentences(w_collection, w_a_collection, args, input_corpus, input_co
requests = [UpdateOne({'_id': k}, {'$set': {'words': v}}, upsert=True) for k, v in formatted_sentences.items()]
if len(requests) > 0:
result = w_a_collection.bulk_write(requests)
# force a bit of garbage collection
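
A sketch of this flush step as a function (flush_sentences is a hypothetical name; the gc call reflects the assumed intent of the trailing comment, namely keeping large per-chunk dicts from piling up across iterations):

```python
import gc
from pymongo import UpdateOne

def flush_sentences(w_a_collection, formatted_sentences):
    # one UpdateOne per sentence id; upsert=True inserts ids that are not
    # stored yet, and bulk_write sends the whole batch in one round trip
    requests = [
        UpdateOne({'_id': k}, {'$set': {'words': v}}, upsert=True)
        for k, v in formatted_sentences.items()
    ]
    if requests:
        w_a_collection.bulk_write(requests)
    formatted_sentences.clear()  # drop references so the collector can reclaim them
    gc.collect()
```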