# Align the sentences reconstructed from the GigaFida token dump (giga_orig)
# with the sentence ids found in the TEI originals (giga) and pickle the
# resulting list of ids for later processing steps.

import configparser
import logging
import os
import pickle
from multiprocessing import Pool  # only needed by the commented-out splitting code below
from pathlib import Path

from parser.parser import Parser

SSJ500K_2_1 = 27829  # number of sentences in ssj500k 2.1

par = Parser()

# Read paths to the data from tools.cfg; which corpus is processed depends on
# the keys present in the [tools] section.
config = configparser.ConfigParser()
config.read("tools.cfg")
analysis = ''
if 'kres_orig' in config["tools"]:
    analysis = 'kres'
    INDIR = Path(config["tools"]["kres_orig"])
    OUTDIR = Path(config["tools"]["kres_tsv"])
elif 'giga_orig' in config["tools"]:
    analysis = 'giga'
    INDIR_GIGA = Path(config["tools"]["giga_orig"])   # token-per-line dump, sentences separated by empty lines
    INDIR_GIGA_ORIG = Path(config["tools"]["giga"])   # directory with the TEI originals
    INDIR_JOS = Path(config["tools"]["giga_jos"])
    OUTDIR = Path(config["tools"]["giga_tsv"])
    GIGA_PARTS = int(config["tools"]["giga_parts"])
    INTERNAL_DATA = config["tools"]["internal_data"]

CPU_CORES = int(config["tools"]["cpu_cores"])

LOGFILE = Path(config["tools"]["logfile"]).absolute()
LOGFILE.touch(exist_ok=True)
LOGFILE = LOGFILE.resolve()

logging.basicConfig(filename=str(LOGFILE), level=logging.INFO)
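
# Illustrative sketch of the [tools] section this script expects in tools.cfg
# for the giga configuration (key names are taken from the lookups above; the
# paths and values are placeholders, not the actual project configuration):
#
#   [tools]
#   giga_orig = /path/to/gigafida_dump.txt
#   giga = /path/to/gigafida/tei/
#   giga_jos = /path/to/gigafida_jos_dump.txt
#   giga_tsv = /path/to/output/tsv/
#   giga_parts = 100000
#   internal_data = /path/to/internal_data/
#   cpu_cores = 16
#   logfile = /path/to/tools.log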

# The rest of the script assumes the giga configuration above (INDIR_GIGA,
# INDIR_GIGA_ORIG and INTERNAL_DATA must be set).

# Collect all TEI original files and fix their order so that the enumeration
# below is deterministic.
origfiles = []
for subdir, dirs, files in os.walk(INDIR_GIGA_ORIG):
    for file in files:
        origfiles.append(Path(os.path.join(subdir, file)))
origfiles = list(enumerate(sorted(origfiles)))


def giga_orig_sentence_generator():
    """Yield one plain-text sentence at a time from the token dump in INDIR_GIGA."""
    with open(INDIR_GIGA, 'r') as gof:
        sentence_words = []
        for l_gof in gof:
            if l_gof == '\n':
                # empty line marks a sentence boundary
                yield ' '.join(sentence_words)
                sentence_words = []
            else:
                # first tab-separated column holds the token form
                sentence_words.append(l_gof.split('\t')[0])
        if sentence_words:
            # flush the last sentence if the file does not end with an empty line
            yield ' '.join(sentence_words)
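
# The generator above assumes a vert-like layout: one token per line with the
# token form in the first tab-separated column, and an empty line between
# sentences. A purely illustrative fragment (placeholder columns, not taken
# from the corpus):
#
#   Danes	<lemma>	<msd>
#   dežuje	<lemma>	<msd>
#   .	<lemma>	<msd>
#
#   Sonce	<lemma>	<msd>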

sentence_generator = giga_orig_sentence_generator()
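
# Walk the TEI originals in the same order as the token dump and align the two
# sentence streams. par.parse_tei() is expected here to return a mapping whose
# values carry at least the sentence id ('sid') and its plain text ('text'),
# which is how the loop below consumes it.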
sentence_ids = []
for file_i, orig_path in origfiles:
    split_file_sentences = par.parse_tei(orig_path)
    for v in split_file_sentences.values():
        one_file_sentence = next(sentence_generator)
        if one_file_sentence == v['text']:
            sentence_ids.append(v['sid'])
        else:
            # the two sources went out of sync; log the mismatch for inspection
            logging.error(
                'Sentence mismatch in original file #%d (sid %s):\n'
                '  dump: %s\n'
                '  TEI:  %s',
                file_i, v['sid'], one_file_sentence, v['text']
            )

# Store the aligned sentence id list for later stages (any result from a
# previous run is removed first).
if os.path.exists(os.path.join(INTERNAL_DATA, 'sentence_ids_list.pkl')):
    os.remove(os.path.join(INTERNAL_DATA, 'sentence_ids_list.pkl'))

with open(os.path.join(INTERNAL_DATA, 'sentence_ids_list.pkl'), 'wb') as output:
    pickle.dump(sentence_ids, output)
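
# A minimal sketch of how a later step could read the dumped ids back
# (the variable name is arbitrary):
#
#   with open(os.path.join(INTERNAL_DATA, 'sentence_ids_list.pkl'), 'rb') as pkl_file:
#       sentence_ids = pickle.load(pkl_file)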

# Legacy code kept for reference: splits the big token dump into GIGA_PARTS
# smaller files (splitting only on empty lines) and records which parts differ
# between the two dumps.

# def giga_orig_generator():
#     with open(INDIR_GIGA, 'r') as gof:
#         previous_new_line = False
#         for l_gof in gof:
#             if l_gof == '\n':
#                 if previous_new_line:
#                     continue
#                 previous_new_line = True
#             elif previous_new_line:
#                 previous_new_line = False
#             yield l_gof

# import time
# def handle_giga_file(ran):
#     """
#     Split the big text file into smaller files, splitting only on empty lines.
#     """
#     # with open(INDIR_GIGA, 'r') as gof:
#     #     with open(INDIR_JOS, 'r') as gjf:
#     #         for i, (l_gof, l_gjf) in enumerate(zip(gof, gjf)):
#     #             pass
#     #     num_lines = i + 1
#     # print(num_lines)
#     num_lines = 1393184026
#     # 1393184026
#     # 1393184033
#     # return
#     num_lines_per_part = num_lines / GIGA_PARTS
#     curr_part = 0
#     gof_generator = giga_orig_generator()
#
#     diff_files = set()
#     # with open(INDIR_GIGA, 'r') as gof:
#     with open(INDIR_GIGA_OLD, 'r') as gjf:
#         # sentence = {}
#         # sentence['tokens'] = []
#         # sentence['links'] = {}
#         # if os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % ran[0])):
#         #     os.remove(os.path.join(OUTDIR, 'giga%07d.tsv' % ran[0]))
#
#         # wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % ran[0]), 'a')
#
#         # for i, (l_gof, l_gjf) in enumerate(zip(gof, gjf)):
#         for i, l_gjf in enumerate(gjf):
#             l_gof = next(gof_generator)
#             if curr_part < ran[0]:
#                 if i > num_lines_per_part * (curr_part + 1) and l_gof == '\n':
#                     if curr_part < ran[0]:
#                         print(curr_part)
#                         curr_part += 1
#                         continue
#                 else:
#                     continue
#
#             l_gof_split = l_gof.split('\t')
#             l_gjf_split = l_gjf.split('\t')
#
#             # if punctuation
#             if l_gof != '\n':
#                 if l_gof_split != l_gjf_split:
#                     print(curr_part)
#                     diff_files.add(curr_part)
#                     l_gof = next(gof_generator)
#
#             # if l_gof == '\n':
#             else:
#                 # wf.flush()
#                 # if i > num_lines_per_part * (curr_part + 1) and l_gof == '\n':
#                 if i > num_lines_per_part * (curr_part + 1):
#                     curr_part += 1
#                     # if wf doesn't exist (first one)
#                     # wf.close()
#                     if curr_part >= ran[1]:
#                         break
#                     # if os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part)):
#                     #     os.remove(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part))
#
#                     # wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part), 'a')
#
#     curr_part += 1
#     return diff_files
#     # wf.close()
#
# with Pool(CPU_CORES) as p:
#     final_range = [0, 100000]
#     # final_range = [0, 150]
#     # size_per_proc = (final_range[1] - final_range[0]) / CPU_CORES
#     # # splits = [int(final_range[0] + size_per_proc) for i in range(CPU_CORES)]
#     # ranges = []
#     # ps = None
#     # for i in range(CPU_CORES):
#     #     s = int(final_range[0] + size_per_proc * i)
#     #     ns = int(final_range[0] + size_per_proc * (i + 1))
#     #     ranges.append([s, ns])
#     # # ranges = [[0, 1]]
#     # res = p.map(handle_giga_file, ranges)
#
#     res = handle_giga_file(final_range)
#     res = sorted(list(res))
#     if os.path.exists(os.path.join(INTERNAL_DATA, 'diffs_updated_gigafida.pkl')):
#         os.remove(os.path.join(INTERNAL_DATA, 'diffs_updated_gigafida.pkl'))
#     with open(os.path.join(INTERNAL_DATA, 'diffs_updated_gigafida.pkl'), 'wb') as pkl_file:
#         pickle.dump(res, pkl_file)
#     # with open(os.path.join(INTERNAL_DATA, 'diffs_updated_gigafida.pkl'), 'rb') as pkl_file:
#     #     mydict2 = pickle.load(pkl_file)
#     print('test')