|
|
|
@ -1,5 +1,17 @@
|
|
|
|
|
#!/usr/bin/python3
|
|
|
|
|
|
|
|
|
|
# Imports from the luscenje_struktur package
|
|
|
|
|
from luscenje_struktur.progress_bar import progress
|
|
|
|
|
from luscenje_struktur.word import Word, WordCompressed
|
|
|
|
|
from luscenje_struktur.syntactic_structure import build_structures
|
|
|
|
|
from luscenje_struktur.match_store import MatchStore
|
|
|
|
|
from luscenje_struktur.word_stats import WordStats
|
|
|
|
|
from luscenje_struktur.writer import Writer
|
|
|
|
|
from luscenje_struktur.loader import load_files, file_sentence_glue_generator
|
|
|
|
|
from luscenje_struktur.database import Database
|
|
|
|
|
from luscenje_struktur.time_info import TimeInfo
|
|
|
|
|
from luscenje_struktur.msd_translate import MSD_TRANSLATE
|
|
|
|
|
|
|
|
|
|
# make database-service
|
|
|
|
|
import gc
|
|
|
|
|
import re
|
|
|
|
@ -12,8 +24,8 @@ from tqdm import tqdm
|
|
|
|
|
import pymongo
|
|
|
|
|
# import tqdm as tqdm
|
|
|
|
|
|
|
|
|
|
sys.path.insert(1, '/home/luka/Development/srl/cjvt_valency/src/pkg/valency')
|
|
|
|
|
sys.path.insert(1, '/home/luka/Development/srl/cjvt_valency/src/pkg/cjvt-corpusparser')
|
|
|
|
|
# sys.path.insert(1, '/home/luka/Development/srl/cjvt_valency/src/pkg/valency')
|
|
|
|
|
# sys.path.insert(1, '/home/luka/Development/srl/cjvt_valency/src/pkg/cjvt-corpusparser')
|
|
|
|
|
from valency.Frame import frames_from_db_entry
|
|
|
|
|
from valency.reduce_functions import reduce_functions
|
|
|
|
|
|
|
|
|
@ -151,25 +163,25 @@ def get_sentences_of_interest(headword_category, collection, w_collection, RF, m
|
|
|
|
|
# all_sentences = set()
|
|
|
|
|
sorted(headword_category, key=lambda x: x[0])
|
|
|
|
|
# Limit on the number of sentences held in RAM at once
|
|
|
|
|
sentences_num_limit = 10000
|
|
|
|
|
sentences_num_limit = 15000
|
|
|
|
|
sentences_in_ram = 0
|
|
|
|
|
part = 0
|
|
|
|
|
start_time = time.time()
|
|
|
|
|
# part = 0
|
|
|
|
|
# start_time = time.time()
|
|
|
|
|
# first_sentence = True
|
|
|
|
|
# section_included = False
|
|
|
|
|
# last_processed_hw = 'pomeniti'
|
|
|
|
|
# last_processed_hw = 'iti'
|
|
|
|
|
# last_processed_hw = 'aktivirati'
|
|
|
|
|
last_processed_hw = 'aktivirati'
|
|
|
|
|
# last_processed_hw = 'aktivirati'
|
|
|
|
|
|
|
|
|
|
already_processed = False
|
|
|
|
|
# already_processed = False
|
|
|
|
|
for headword_id, (headword_text, category_text) in enumerate(headword_category):
|
|
|
|
|
# print(headword_text)
|
|
|
|
|
if already_processed:
|
|
|
|
|
if headword_text != last_processed_hw:
|
|
|
|
|
continue
|
|
|
|
|
else:
|
|
|
|
|
already_processed = False
|
|
|
|
|
# if already_processed:
|
|
|
|
|
# if headword_text != last_processed_hw:
|
|
|
|
|
# continue
|
|
|
|
|
# else:
|
|
|
|
|
# already_processed = False
|
|
|
|
|
# for headword_text, category_text in headword_category[15:20]:
|
|
|
|
|
# headword_text = 'zadovoljen'
|
|
|
|
|
# category_text = 'adjective'
|
|
|
|
@ -306,7 +318,7 @@ def get_sentences_of_interest(headword_category, collection, w_collection, RF, m
|
|
|
|
|
# print('HEADWORD')
|
|
|
|
|
# print(headword_text)
|
|
|
|
|
# pbar.update(1)
|
|
|
|
|
part += 1
|
|
|
|
|
# part += 1
|
|
|
|
|
#
|
|
|
|
|
# w_collection.bulk_write(
|
|
|
|
|
# array.map((val) = >
|
|
|
|
@ -724,7 +736,7 @@ def obtain_xml_data(collection, w_a_collection, headword_text, RF, mongo, patter
|
|
|
|
|
return headword_patterns, semantic_role_stats, sentence_tot, pattern_tot, pattern_id_max
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo, session, w_a_collection_ssj, w_a_collection_gigafida, valency_pattern_id_collection, corpus_name, pattern_examples_limit, ignore_gigafida):
|
|
|
|
|
def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo, session, w_a_collection_ssj, w_a_collection_gigafida, valency_pattern_id_collection, corpus_name, pattern_examples_limit, ignore_gigafida, pbar):
|
|
|
|
|
query_general = session.query(Lexeme.id, LexicalUnitLexeme.id, LexicalUnit.id, LexicalUnitMeasure.value,
|
|
|
|
|
Lexeme.dummy, LexicalUnitType.name) \
|
|
|
|
|
.join(Category, Category.id == Lexeme.category_id) \
|
|
|
|
@ -1138,6 +1150,7 @@ def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo,
|
|
|
|
|
with lxml.xmlfile(os.path.join(args.outdir, 'VS10_' + headword_text + '_' + corpus_name + '.xml'),
|
|
|
|
|
encoding='utf-8') as xf:
|
|
|
|
|
xf.write(dictionary, pretty_print=True)
|
|
|
|
|
pbar.update(1)
|
|
|
|
|
# xf.write(entry, pretty_print=True)
|
|
|
|
|
# tree.write(output_file_name, encoding='UTF-8', pretty_print=True)
|
|
|
|
|
|
|
|
|
@ -1546,7 +1559,8 @@ def main(args):
|
|
|
|
|
print('write_xml')
|
|
|
|
|
start_time = time.time()
|
|
|
|
|
# print('aa ' + 3)
|
|
|
|
|
write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo, session, w_a_collection_ssj, w_a_collection_gigafida, valency_pattern_id_collection, args.corpus_name, args.pattern_examples_limit, args.ignore_gigafida)
|
|
|
|
|
with tqdm(total=len(headword_category)) as pbar:
|
|
|
|
|
write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo, session, w_a_collection_ssj, w_a_collection_gigafida, valency_pattern_id_collection, args.corpus_name, args.pattern_examples_limit, args.ignore_gigafida, pbar)
|
|
|
|
|
print(time.time() - start_time)
|
|
|
|
|
# input_file.close()
|
|
|
|
|
session.close()
|
|
|
|
@ -1621,20 +1635,20 @@ if __name__ == '__main__':
|
|
|
|
|
args = arg_parser.parse_args()
|
|
|
|
|
logging.basicConfig(stream=sys.stderr, level=args.verbose.upper())
|
|
|
|
|
|
|
|
|
|
try:
|
|
|
|
|
sys.path.insert(1, args.structure_extraction)
|
|
|
|
|
from progress_bar import progress
|
|
|
|
|
from word import Word, WordCompressed
|
|
|
|
|
from syntactic_structure import build_structures
|
|
|
|
|
from match_store import MatchStore
|
|
|
|
|
from word_stats import WordStats
|
|
|
|
|
from writer import Writer
|
|
|
|
|
from loader import load_files, file_sentence_glue_generator
|
|
|
|
|
from database import Database
|
|
|
|
|
from time_info import TimeInfo
|
|
|
|
|
from msd_translate import MSD_TRANSLATE
|
|
|
|
|
except:
|
|
|
|
|
raise
|
|
|
|
|
# try:
|
|
|
|
|
# sys.path.insert(1, args.structure_extraction)
|
|
|
|
|
# from progress_bar import progress
|
|
|
|
|
# from word import Word, WordCompressed
|
|
|
|
|
# from syntactic_structure import build_structures
|
|
|
|
|
# from match_store import MatchStore
|
|
|
|
|
# from word_stats import WordStats
|
|
|
|
|
# from writer import Writer
|
|
|
|
|
# from loader import load_files, file_sentence_glue_generator
|
|
|
|
|
# from database import Database
|
|
|
|
|
# from time_info import TimeInfo
|
|
|
|
|
# from msd_translate import MSD_TRANSLATE
|
|
|
|
|
# except:
|
|
|
|
|
# raise
|
|
|
|
|
|
|
|
|
|
start = time.time()
|
|
|
|
|
main(args)
|
|
|
|
|