Small README fixes

This commit is contained in:
2020-11-23 11:24:50 +01:00
parent 2551a9c6a8
commit 34b776be11
2 changed files with 26 additions and 74 deletions

View File

@@ -1,6 +1,8 @@
#!/usr/bin/python3
#imports from luscenje_struktur
import copy
from luscenje_struktur.progress_bar import progress
from luscenje_struktur.word import Word, WordCompressed
from luscenje_struktur.syntactic_structure import build_structures
@@ -114,8 +116,6 @@ CASE_MAP = {
Lexeme = None
LexemeFeature = None
SyntacticStructure = None
StructureComponent = None
Feature = None
LexicalUnitLexeme = None
LexicalUnit = None
@@ -191,6 +191,7 @@ def get_sentences_of_interest(headword_category, collection, w_collection, RF, m
headword_patterns_ids = {}
# print('tu1!')
cur = collection.find({"headwords": headword_text})
# cur = collection.find({"headwords": headword_text}, no_cursor_timeout=True)
# print('tu2!')
frames = []
for ent in cur:
@@ -371,11 +372,12 @@ def get_sentences_of_interest(headword_category, collection, w_collection, RF, m
# return sentences_of_interest
def create_sentence_output(sentence, headword_id, corpus):
def create_sentence_output(sentence, headword_id, corpus, sent_id):
glue_outside = False
headword_id = str(headword_id)
parent_node = etree.Element('corpusExample')
parent_node.set('corpusName', corpus)
parent_node.set('id', sent_id)
# parent_node.text = 'AAA'
# parent_node.prefix = 'BBB'
# parent_node.tail = 'CCC'
@@ -660,7 +662,9 @@ def obtain_xml_data(collection, w_a_collection, headword_text, RF, mongo, patter
sent_id = sentence[0][0].rsplit('.', 1)[0]
try:
db_sentence = next(iter(w_a_collection.find({'_id': sent_id})))['words']
cur = w_a_collection.find({'_id': sent_id})
db_sentence = next(iter(cur))['words']
cur.close()
except StopIteration:
continue
@@ -703,7 +707,7 @@ def obtain_xml_data(collection, w_a_collection, headword_text, RF, mongo, patter
# sr_data = get_SRLcontainer_data(formatted_sentences[sent_id], hw_idi, sr_data)
if len(headword_patterns[valency_pattern_key]['sentence_examples']) + ssj_len < examples_num and valid_valency_pattern(valency_pattern_key):
examples_included_num += 1
sentence_example = create_sentence_output(db_sentence, hw_idi, corpus)
sentence_example = create_sentence_output(db_sentence, hw_idi, corpus, sent_id)
# sentence_example = create_sentence_output(formatted_sentences[sent_id], hw_idi)
# sentence_example = ''.join(sentence_example)
@@ -762,7 +766,9 @@ def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo,
# with lxml.xmlfile('data/output.xml', encoding='utf-8') as xf:
# a = [a for a in valency_pattern_id_collection.find()]
patterns = {tuple(v_p['semantic_roles']): v_p['_id'] for v_p in [a for a in valency_pattern_id_collection.find()]}
cur = valency_pattern_id_collection.find()
patterns = {tuple(v_p['semantic_roles']): v_p['_id'] for v_p in [a for a in cur]}
cur.close()
# patterns = {}
pattern_id_max = len(patterns) + 1
@@ -817,35 +823,6 @@ def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo,
# res = query.one_or_none()
query_res = query.all()
# query2 = session.query(Lexeme.id) \
# .join(Category, Category.id == Lexeme.category_id) \
# .join(LexicalUnitLexeme, LexicalUnitLexeme.lexeme_id == Lexeme.id) \
# .join(LexicalUnit, LexicalUnit.id == LexicalUnitLexeme.lexical_unit_id) \
# .join(LexicalUnitType, LexicalUnitType.id == LexicalUnit.type_id) \
# .join(LexicalUnitMeasure, LexicalUnitMeasure.lexical_unit_id == LexicalUnit.id) \
# .join(Measure, Measure.id == LexicalUnitMeasure.measure_id) \
# .join(Corpus, Corpus.id == LexicalUnitMeasure.corpus_id) \
# .join(LexemeFeature, LexemeFeature.lexeme_id == Lexeme.id) \
# .join(Feature, Feature.id == LexemeFeature.feature_id) \
# .filter(LexicalUnitType.name == 'single_lexeme_unit') \
# .filter(Measure.name == 'frequency') \
# .filter(Category.name == 'preposition') \
# .filter(Lexeme.lemma == 'za') \
# .filter(Feature.name == 'case') \
# .filter(LexemeFeature.value == 'instrumental') \
# .group_by(Lexeme.id)
# query2 = session.query(Lexeme.id) \
# .join(Category, Category.id == Lexeme.category_id) \
# .join(LexemeFeature, LexemeFeature.lexeme_id == Lexeme.id) \
# .join(Feature, Feature.id == LexemeFeature.feature_id) \
# .filter(Lexeme.lemma == 'za') \
# .filter(Feature.name == 'case') \
# .filter(LexemeFeature.value == 'instrumental') \
# .group_by(Lexeme.id)
#
# a = query2.all()
if len(query_res) == 1:
(lexeme_id, lexical_unit_lexeme_id, lexical_unit_id, frequency, _, lexical_unit_type_name) = \
query_res[0]
@@ -995,20 +972,6 @@ def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo,
definition = lxml.SubElement(definitionList, 'definition')
definition.text = definition_text[0]
syntactic_structures = session.query(SyntacticStructure.id, SyntacticStructure.name,
StructureComponent.id, StructureComponent.name).join(
LexicalUnit, LexicalUnit.syntactic_structure_id == SyntacticStructure.id) \
.join(StructureComponent, StructureComponent.syntactic_structure_id == SyntacticStructure.id) \
.filter(LexicalUnit.id == sense_id.id)
# .join(LexicalUnitLexeme, LexicalUnitLexeme.structure_component_id == StructureComponent.id) \
# syntactic_structures2 = session.query(SyntacticStructure.id, SyntacticStructure.name).join(SyntacticStructure, SyntacticStructure.id == LexicalUnit.syntactic_structure_id) \
# .filter(SyntacticStructure.id == sense_id)
syntactic_structuresr = syntactic_structures.all()
# syntactic_structures2r = syntactic_structures2.all()
valencyPatternList = lxml.SubElement(sense, 'valencyPatternList')
valencyPatternList.set('system', 'JOS')
@@ -1122,7 +1085,7 @@ def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo,
for sentence_example in headword_pattern_dict['ssj']['sentence_examples']:
exampleContainer = lxml.SubElement(exampleContainerList, 'exampleContainer')
# corpusExample = lxml.SubElement(exampleContainer, 'corpusExample')
exampleContainer.append(sentence_example)
exampleContainer.append(copy.deepcopy(sentence_example))
if 'gf' in headword_pattern_dict:
for sentence_example in headword_pattern_dict['gf']['sentence_examples']:
@@ -1136,7 +1099,7 @@ def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo,
def init_db(db):
global Lexeme, LexemeFeature, SyntacticStructure, StructureComponent, Feature, LexicalUnitLexeme, LexicalUnit, LexicalUnitType, Category, Sense, Measure, LexicalUnitMeasure, Corpus, Definition, WordForm, WordFormFeature, FormRepresentation
global Lexeme, LexemeFeature, Feature, LexicalUnitLexeme, LexicalUnit, LexicalUnitType, Category, Sense, Measure, LexicalUnitMeasure, Corpus, Definition, WordForm, WordFormFeature, FormRepresentation
[db_user, db_password, db_database, db_host] = db.split(':')
Base = declarative_base()
engine = create_engine('postgresql://' + db_user + ':' + db_password + '@' + db_host + '/' + db_database,
@@ -1149,12 +1112,6 @@ def init_db(db):
class LexemeFeature(Base):
__table__ = Base.metadata.tables['jedro_lexeme_feature']
class SyntacticStructure(Base):
__table__ = Base.metadata.tables['jedro_syntacticstructure']
class StructureComponent(Base):
__table__ = Base.metadata.tables['jedro_structurecomponent']
class Feature(Base):
__table__ = Base.metadata.tables['jedro_feature']
@@ -1405,7 +1362,8 @@ def extract_sentences(w_collection, w_a_collection, args, input_corpus, input_co
requests = [UpdateOne({'_id': k}, {'$set': {'words': v}}, upsert=True) for k, v in formatted_sentences.items()]
result = w_a_collection.bulk_write(requests)
if len(requests) > 0:
result = w_a_collection.bulk_write(requests)
# force a bit of garbage collection
# del sentence