forked from kristjan/cjvt-valency

Small README fixes

This commit is contained in:
parent 2551a9c6a8
commit 34b776be11

README.md: 26 changed lines
````diff
@@ -194,7 +194,7 @@ pip install git+https://gitea.cjvt.si/kristjan/cjvt-corpusparser.git
 make database-service
 ```
 
-### Setting up environment for running on proc1 - ramdisk
+### Setting up environment for running on ramdisk
 
 ```bash
 # create ramdisk
````
````diff
@@ -202,30 +202,24 @@ sudo mount -t tmpfs tmpfs /mnt/tmp
 sudo mount -o remount,size=120G,noexec,nosuid,nodev,noatime /mnt/tmp
 
 # change volumes to /mnt/tmp:/data/db
-vim dockerfiles/database/mongodb-stack.yml
+vim dockerfiles/database/valency-stack.yml
 
 # change Makefile -runStack to mkdir -p /mnt/tmp
-vim dockerfiles/database/mongodb-stack.yml
+vim dockerfiles/database/Makefile
 
-docker swarm init
+# run service
 make database-service
-make database-users
 
+# run ONLY ONCE to create users and restore database
+make database-users
+make database-restore
 
+# double check if it worked
 docker exec -it ef0a /bin/bash
 
 # following steps in docker bash:
-mongorestore --gzip --archive=dump.gz --db valdb --uri=mongodb://<REGULAR USERNAME>:<REGULAR PASSWORD>@0.0.0.0:27017
-
-# add privilegies for user to write into other databases like extvaldb
-mongo --username <ADMIN USER> --password --authenticationDatabase admin
-use valdb
-db.grantRolesToUser(<REGULAR USER>, [{ role: "readWrite", db: "extvaldb"}])
 
 # check if it worked by
 mongo --username <REGULAR USER> --password --authenticationDatabase valdb
+db.getRoles()
 
-# make mongodb visible only privately
-docker stack rm dbstack
-cd dockerfiles/database/
-docker-compose up
 ```
````
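The restore can also be verified from the host rather than from inside the container. A minimal pymongo sketch, assuming the placeholder credentials above and a MongoDB exposed on localhost:27017 (both are assumptions, not values taken from this repository):

```python
# Hypothetical host-side check that the restore worked; user, password,
# host and port mirror the README placeholders and must be substituted.
from pymongo import MongoClient

client = MongoClient('mongodb://REGULAR_USER:REGULAR_PASSWORD@localhost:27017/valdb')
db = client['valdb']

# A successfully restored valdb should list its collections here.
print(db.list_collection_names())
```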
```diff
@@ -1,6 +1,8 @@
 #!/usr/bin/python3
 
 #imports from luscenje_struktur
+import copy
+
 from luscenje_struktur.progress_bar import progress
 from luscenje_struktur.word import Word, WordCompressed
 from luscenje_struktur.syntactic_structure import build_structures
```
```diff
@@ -114,8 +116,6 @@ CASE_MAP = {
 
 Lexeme = None
 LexemeFeature = None
-SyntacticStructure = None
-StructureComponent = None
 Feature = None
 LexicalUnitLexeme = None
 LexicalUnit = None
```
```diff
@@ -191,6 +191,7 @@ def get_sentences_of_interest(headword_category, collection, w_collection, RF, m
         headword_patterns_ids = {}
         # print('tu1!')
         cur = collection.find({"headwords": headword_text})
+        # cur = collection.find({"headwords": headword_text}, no_cursor_timeout=True)
         # print('tu2!')
         frames = []
         for ent in cur:
```
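The newly added commented-out line documents an alternative worth noting: by default the server reaps idle cursors after about ten minutes, which can kill long per-headword runs, and `no_cursor_timeout=True` disables that at the cost of requiring an explicit close. A sketch of how that variant could be wrapped safely (a hypothetical helper, not code from this commit):

```python
from contextlib import closing

def iter_headword_docs(collection, headword_text):
    """Iterate matching documents without the server-side idle-cursor
    timeout; closing() guarantees the cursor is released even on error."""
    with closing(collection.find({"headwords": headword_text},
                                 no_cursor_timeout=True)) as cur:
        for ent in cur:
            yield ent
```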
```diff
@@ -371,11 +372,12 @@ def get_sentences_of_interest(headword_category, collection, w_collection, RF, m
     # return sentences_of_interest


-def create_sentence_output(sentence, headword_id, corpus):
+def create_sentence_output(sentence, headword_id, corpus, sent_id):
     glue_outside = False
     headword_id = str(headword_id)
     parent_node = etree.Element('corpusExample')
     parent_node.set('corpusName', corpus)
+    parent_node.set('id', sent_id)
     # parent_node.text = 'AAA'
     # parent_node.prefix = 'BBB'
     # parent_node.tail = 'CCC'
```
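The new `sent_id` argument ends up as an `id` attribute on the `corpusExample` element, alongside `corpusName`. A self-contained sketch of the element being built, with illustrative attribute values; lxml's `set()` accepts only strings, which is also why `headword_id` goes through `str()` above:

```python
from lxml import etree

# Illustrative values; real ones come from the corpus name and sentence id.
parent_node = etree.Element('corpusExample')
parent_node.set('corpusName', 'ssj500k')
parent_node.set('id', 'ssj1.1.1')

print(etree.tostring(parent_node))
# b'<corpusExample corpusName="ssj500k" id="ssj1.1.1"/>'
```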
```diff
@@ -660,7 +662,9 @@ def obtain_xml_data(collection, w_a_collection, headword_text, RF, mongo, patter
             sent_id = sentence[0][0].rsplit('.', 1)[0]
 
             try:
-                db_sentence = next(iter(w_a_collection.find({'_id': sent_id})))['words']
+                cur = w_a_collection.find({'_id': sent_id})
+                db_sentence = next(iter(cur))['words']
+                cur.close()
             except StopIteration:
                 continue
 
```
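The rewrite binds the cursor to a name purely so it can be closed; the old one-liner leaked an open cursor on every lookup. Since only one document is wanted, `find_one` would achieve the same without any cursor to manage; a sketch of that equivalent form (hypothetical helper, not the committed code):

```python
def fetch_db_sentence(w_a_collection, sent_id):
    # find_one returns the matching document or None, and leaves no
    # cursor behind, so there is nothing to close.
    doc = w_a_collection.find_one({'_id': sent_id})
    return doc['words'] if doc is not None else None
```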
```diff
@@ -703,7 +707,7 @@ def obtain_xml_data(collection, w_a_collection, headword_text, RF, mongo, patter
             # sr_data = get_SRLcontainer_data(formatted_sentences[sent_id], hw_idi, sr_data)
             if len(headword_patterns[valency_pattern_key]['sentence_examples']) + ssj_len < examples_num and valid_valency_pattern(valency_pattern_key):
                 examples_included_num += 1
-                sentence_example = create_sentence_output(db_sentence, hw_idi, corpus)
+                sentence_example = create_sentence_output(db_sentence, hw_idi, corpus, sent_id)
                 # sentence_example = create_sentence_output(formatted_sentences[sent_id], hw_idi)
 
                 # sentence_example = ''.join(sentence_example)
```
```diff
@@ -762,7 +766,9 @@ def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo,
     # with lxml.xmlfile('data/output.xml', encoding='utf-8') as xf:
 
     # a = [a for a in valency_pattern_id_collection.find()]
-    patterns = {tuple(v_p['semantic_roles']): v_p['_id'] for v_p in [a for a in valency_pattern_id_collection.find()]}
+    cur = valency_pattern_id_collection.find()
+    patterns = {tuple(v_p['semantic_roles']): v_p['_id'] for v_p in [a for a in cur]}
+    cur.close()
     # patterns = {}
     pattern_id_max = len(patterns) + 1
 
```
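Same cursor-leak fix as in `obtain_xml_data` above. The inner `[a for a in cur]` list is redundant, though: a pymongo cursor is itself iterable, so the dict comprehension can consume it directly. A sketch of the tighter form (not the committed code):

```python
cur = valency_pattern_id_collection.find()
# The cursor is consumed lazily; no intermediate list is materialized.
patterns = {tuple(v_p['semantic_roles']): v_p['_id'] for v_p in cur}
cur.close()
```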
```diff
@@ -817,35 +823,6 @@ def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo,
             # res = query.one_or_none()
             query_res = query.all()
 
-            # query2 = session.query(Lexeme.id) \
-            #     .join(Category, Category.id == Lexeme.category_id) \
-            #     .join(LexicalUnitLexeme, LexicalUnitLexeme.lexeme_id == Lexeme.id) \
-            #     .join(LexicalUnit, LexicalUnit.id == LexicalUnitLexeme.lexical_unit_id) \
-            #     .join(LexicalUnitType, LexicalUnitType.id == LexicalUnit.type_id) \
-            #     .join(LexicalUnitMeasure, LexicalUnitMeasure.lexical_unit_id == LexicalUnit.id) \
-            #     .join(Measure, Measure.id == LexicalUnitMeasure.measure_id) \
-            #     .join(Corpus, Corpus.id == LexicalUnitMeasure.corpus_id) \
-            #     .join(LexemeFeature, LexemeFeature.lexeme_id == Lexeme.id) \
-            #     .join(Feature, Feature.id == LexemeFeature.feature_id) \
-            #     .filter(LexicalUnitType.name == 'single_lexeme_unit') \
-            #     .filter(Measure.name == 'frequency') \
-            #     .filter(Category.name == 'preposition') \
-            #     .filter(Lexeme.lemma == 'za') \
-            #     .filter(Feature.name == 'case') \
-            #     .filter(LexemeFeature.value == 'instrumental') \
-            #     .group_by(Lexeme.id)
-
-            # query2 = session.query(Lexeme.id) \
-            #     .join(Category, Category.id == Lexeme.category_id) \
-            #     .join(LexemeFeature, LexemeFeature.lexeme_id == Lexeme.id) \
-            #     .join(Feature, Feature.id == LexemeFeature.feature_id) \
-            #     .filter(Lexeme.lemma == 'za') \
-            #     .filter(Feature.name == 'case') \
-            #     .filter(LexemeFeature.value == 'instrumental') \
-            #     .group_by(Lexeme.id)
-            #
-            # a = query2.all()
-
             if len(query_res) == 1:
                 (lexeme_id, lexical_unit_lexeme_id, lexical_unit_id, frequency, _, lexical_unit_type_name) = \
                     query_res[0]
```
```diff
@@ -995,20 +972,6 @@ def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo,
                 definition = lxml.SubElement(definitionList, 'definition')
                 definition.text = definition_text[0]
 
-                syntactic_structures = session.query(SyntacticStructure.id, SyntacticStructure.name,
-                                                     StructureComponent.id, StructureComponent.name).join(
-                    LexicalUnit, LexicalUnit.syntactic_structure_id == SyntacticStructure.id) \
-                    .join(StructureComponent, StructureComponent.syntactic_structure_id == SyntacticStructure.id) \
-                    .filter(LexicalUnit.id == sense_id.id)
-
-                # .join(LexicalUnitLexeme, LexicalUnitLexeme.structure_component_id == StructureComponent.id) \
-
-                # syntactic_structures2 = session.query(SyntacticStructure.id, SyntacticStructure.name).join(SyntacticStructure, SyntacticStructure.id == LexicalUnit.syntactic_structure_id) \
-                #     .filter(SyntacticStructure.id == sense_id)
-
-                syntactic_structuresr = syntactic_structures.all()
-                # syntactic_structures2r = syntactic_structures2.all()
-
             valencyPatternList = lxml.SubElement(sense, 'valencyPatternList')
             valencyPatternList.set('system', 'JOS')
 
```
```diff
@@ -1122,7 +1085,7 @@ def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo,
                 for sentence_example in headword_pattern_dict['ssj']['sentence_examples']:
                     exampleContainer = lxml.SubElement(exampleContainerList, 'exampleContainer')
                     # corpusExample = lxml.SubElement(exampleContainer, 'corpusExample')
-                    exampleContainer.append(sentence_example)
+                    exampleContainer.append(copy.deepcopy(sentence_example))
 
             if 'gf' in headword_pattern_dict:
                 for sentence_example in headword_pattern_dict['gf']['sentence_examples']:
```
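This one-word change is what motivates the new `import copy` at the top of the file. An lxml element has exactly one parent, and `append()` moves rather than copies it, so re-appending a cached `sentence_example` under a second container would silently tear it out of the first. A self-contained demonstration of that behavior:

```python
import copy
from lxml import etree

example = etree.Element('corpusExample')
c1 = etree.Element('exampleContainer')
c2 = etree.Element('exampleContainer')

c1.append(example)       # example is now a child of c1
c2.append(example)       # append() MOVES it: c1 silently loses the child
print(len(c1), len(c2))  # 0 1

c1.append(copy.deepcopy(example))  # an independent copy stays where it is put
print(len(c1), len(c2))            # 1 1
```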
```diff
@@ -1136,7 +1099,7 @@ def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo,


 def init_db(db):
-    global Lexeme, LexemeFeature, SyntacticStructure, StructureComponent, Feature, LexicalUnitLexeme, LexicalUnit, LexicalUnitType, Category, Sense, Measure, LexicalUnitMeasure, Corpus, Definition, WordForm, WordFormFeature, FormRepresentation
+    global Lexeme, LexemeFeature, Feature, LexicalUnitLexeme, LexicalUnit, LexicalUnitType, Category, Sense, Measure, LexicalUnitMeasure, Corpus, Definition, WordForm, WordFormFeature, FormRepresentation
     [db_user, db_password, db_database, db_host] = db.split(':')
     Base = declarative_base()
     engine = create_engine('postgresql://' + db_user + ':' + db_password + '@' + db_host + '/' + db_database,
```
```diff
@@ -1149,12 +1112,6 @@ def init_db(db):
     class LexemeFeature(Base):
         __table__ = Base.metadata.tables['jedro_lexeme_feature']
 
-    class SyntacticStructure(Base):
-        __table__ = Base.metadata.tables['jedro_syntacticstructure']
-
-    class StructureComponent(Base):
-        __table__ = Base.metadata.tables['jedro_structurecomponent']
-
     class Feature(Base):
         __table__ = Base.metadata.tables['jedro_feature']
 
```
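For context on what is being deleted: `init_db` maps classes onto tables reflected from the existing Postgres schema via `__table__`, and these two classes were the now-unused mappings for `jedro_syntacticstructure` and `jedro_structurecomponent`. A self-contained sketch of that reflection pattern, using an in-memory SQLite database and a minimal stand-in for the `jedro_feature` table (SQLAlchemy 1.4+ assumed):

```python
from sqlalchemy import create_engine, text
from sqlalchemy.orm import declarative_base

engine = create_engine('sqlite:///:memory:')
with engine.begin() as conn:
    conn.execute(text('CREATE TABLE jedro_feature (id INTEGER PRIMARY KEY, name TEXT)'))

Base = declarative_base()
Base.metadata.reflect(engine)  # pull table definitions from the live database

class Feature(Base):
    # No columns declared here: the mapping comes from the reflected table.
    __table__ = Base.metadata.tables['jedro_feature']

print(Feature.__table__.columns.keys())  # ['id', 'name']
```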
```diff
@@ -1405,6 +1362,7 @@ def extract_sentences(w_collection, w_a_collection, args, input_corpus, input_co
 
        requests = [UpdateOne({'_id': k}, {'$set': {'words': v}}, upsert=True) for k, v in formatted_sentences.items()]
 
-        result = w_a_collection.bulk_write(requests)
+        if len(requests) > 0:
+            result = w_a_collection.bulk_write(requests)
 
        # force a bit of garbage collection
```
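The new guard is not cosmetic: pymongo's `bulk_write` raises `InvalidOperation` when handed an empty request list, which happens whenever a batch yields no formatted sentences. A minimal reproduction (the connection details are hypothetical; the error is raised client-side, before any server round-trip):

```python
from pymongo import MongoClient
from pymongo.errors import InvalidOperation

collection = MongoClient('mongodb://localhost:27017')['valdb']['w_a_collection']

try:
    collection.bulk_write([])   # empty batch
except InvalidOperation as exc:
    print(exc)                  # e.g. "No operations to execute"
```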