# cjvt-srl-tagging/tools/find_diff_sentence_ids.py
# (192 lines, 6.6 KiB)

import pickle
from parser.parser import Parser
import os
from os.path import join, dirname
from pathlib import Path
import re
import sys
import cProfile
import configparser
import logging
from multiprocessing import Pool
SSJ500K_2_1 = 27829 # number of sentences in the ssj500k 2.1 corpus (reference constant)

# Project TEI parser used below to read the original corpus files.
par = Parser()

# path to data: all input/output locations come from the [tools] section of tools.cfg
config = configparser.ConfigParser()
config.read("tools.cfg")

analysis = ''
if 'kres_orig' in config["tools"]:
    # Kres-corpus configuration.
    analysis = 'kres'
    INDIR = Path(config["tools"]["kres_orig"])
    OUTDIR = Path(config["tools"]["kres_tsv"])
elif 'giga_orig' in config["tools"]:
    # Gigafida-corpus configuration.
    # analysis = 'gigafida'
    analysis = 'giga'
    INDIR_GIGA = Path(config["tools"]["giga_orig"])       # concatenated token file (one token per line)
    INDIR_GIGA_ORIG = Path(config["tools"]["giga"])       # directory tree of original TEI files
    INDIR_JOS = Path(config["tools"]["giga_jos"])
    OUTDIR = Path(config["tools"]["giga_tsv"])
    GIGA_PARTS = int(config["tools"]["giga_parts"])

INTERNAL_DATA = config["tools"]["internal_data"]
CPU_CORES = int(config["tools"]["cpu_cores"])

# Set up file logging; the log file is created up-front if missing.
LOGFILE = Path(config["tools"]["logfile"]).absolute()
LOGFILE.touch(exist_ok=True)
LOGFILE.resolve()
logging.basicConfig(filename=str(LOGFILE), level=logging.INFO)

# Collect every file under the original-corpus directory, sorted, paired with
# its position index: origfiles is a list of (index, Path) tuples.
# NOTE(review): INDIR_GIGA_ORIG only exists when the 'giga_orig' branch above
# was taken — this script assumes a giga configuration; verify tools.cfg.
origfiles = []
for subdir, dirs, files in os.walk(INDIR_GIGA_ORIG):
    for file in files:
        origfiles.append(Path(os.path.join(subdir, file)))
origfiles=list(enumerate(sorted(origfiles)))
def giga_orig_sentence_generator(path=None):
    """Yield one sentence string per sentence in the giga token file.

    The input file has one token per line (token text in the first
    tab-separated column); sentences are separated by blank lines.  Each
    yielded value is the sentence's tokens joined with single spaces.
    Consecutive blank lines yield empty-string sentences, matching the
    original file layout.

    Args:
        path: File to read.  Defaults to the module-level ``INDIR_GIGA``
            (backward compatible with the original zero-argument call).

    Yields:
        str: space-joined token text of each sentence, in file order.
    """
    if path is None:
        path = INDIR_GIGA
    with open(path, 'r') as gof:
        sentence_words = []
        for l_gof in gof:
            if l_gof == '\n':
                yield ' '.join(sentence_words)
                sentence_words = []
            else:
                sentence_words.append(l_gof.split('\t')[0])
        # Bug fix: the original dropped the last sentence when the file did
        # not end with a blank line; emit any remaining tokens.
        if sentence_words:
            yield ' '.join(sentence_words)
# Walk the parsed TEI files in lockstep with the flat token file: for every
# sentence parsed out of the TEI originals, pull the next sentence from the
# token stream and check the surface text matches.  Matching sentences have
# their sentence id recorded; mismatches are reported but NOT recorded, and
# the two streams may drift out of sync afterwards (no resynchronisation).
sentence_generator = giga_orig_sentence_generator()
sentence_ids = []
for origfile in origfiles:
    # origfile is an (index, Path) tuple; parse_tei maps sentence keys to
    # dicts carrying at least 'sid' (sentence id) and 'text' (surface form).
    split_file_sentences = par.parse_tei(origfile[1])
    for k, v in split_file_sentences.items():
        one_file_sentence = next(sentence_generator)
        if one_file_sentence == v['text']:
            sentence_ids.append(v['sid'])
        else:
            print('----------------')
            print('ERROR')
            print(v['sid'])
            print(one_file_sentence)
            print(v['text'])
            print(origfile[0])

# count sentences in orig (if not counted before)
# os.remove(os.path.join(INTERNAL_DATA, 'orig_counted_sentences.pkl'))
# Persist the collected sentence ids; the explicit remove is redundant with
# mode 'wb' (which truncates) but kept to preserve original behavior.
if os.path.exists(os.path.join(INTERNAL_DATA, 'sentence_ids_list.pkl')):
    os.remove(os.path.join(INTERNAL_DATA, 'sentence_ids_list.pkl'))
with open(os.path.join(INTERNAL_DATA, 'sentence_ids_list.pkl'), 'wb') as output:
    pickle.dump(sentence_ids, output)
# --- Retired implementation below (kept commented out for reference): ---
# --- splits/diffs the gigafida file into parts; superseded by the code above. ---
# def giga_orig_generator():
# with open(INDIR_GIGA, 'r') as gof:
# previous_new_line = False
# for l_gof in gof:
# if l_gof == '\n':
# if previous_new_line:
# continue
# previous_new_line = True
# elif previous_new_line:
# previous_new_line = False
# yield l_gof
# import time
# def handle_giga_file(ran):
# """
# File that splits big text file into more minor files. Only split on empty lines.
# """
# # with open(INDIR_GIGA, 'r') as gof:
# # with open(INDIR_JOS, 'r') as gjf:
# # for i, (l_gof, l_gjf) in enumerate(zip(gof, gjf)):
# # pass
# # num_lines = i + 1
# # print(num_lines)
# num_lines = 1393184026
# # 1393184026
# # 1393184033
# # return
# num_lines_per_part = num_lines / GIGA_PARTS
# curr_part = 0
# gof_generator = giga_orig_generator()
#
# diff_files = set()
# # with open(INDIR_GIGA, 'r') as gof:
# with open(INDIR_GIGA_OLD, 'r') as gjf:
# # sentence = {}
# # sentence['tokens'] = []
# # sentence['links'] = {}
# # if os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % ran[0])):
# # os.remove(os.path.join(OUTDIR, 'giga%07d.tsv' % ran[0]))
#
# # wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % ran[0]), 'a')
#
# # for i, (l_gof, l_gjf) in enumerate(zip(gof, gjf)):
# for i, l_gjf in enumerate(gjf):
# l_gof = next(gof_generator)
# if curr_part < ran[0]:
# if i > num_lines_per_part * (curr_part + 1) and l_gof == '\n' :
# if curr_part < ran[0]:
# print(curr_part)
# curr_part += 1
# continue
# else:
# continue
#
# l_gof_split = l_gof.split('\t')
# l_gjf_split = l_gjf.split('\t')
#
# # if punctuation
# if l_gof != '\n':
# if l_gof_split != l_gjf_split:
# print(curr_part)
# diff_files.add(curr_part)
# l_gof = next(gof_generator)
#
#
# # if l_gof == '\n':
# else:
# # wf.flush()
# # if i > num_lines_per_part * (curr_part + 1) and l_gof == '\n':
# if i > num_lines_per_part * (curr_part + 1):
# curr_part += 1
# # if wf doesn't exist (first one)
# # wf.close()
# if curr_part >= ran[1]:
# break
# # if os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part)):
# # os.remove(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part))
#
# # wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part), 'a')
#
# curr_part += 1
# return diff_files
# # wf.close()
#
# with Pool(CPU_CORES) as p:
# final_range = [0, 100000]
# # final_range = [0, 150]
# # size_per_proc = (final_range[1] - final_range[0]) / CPU_CORES
# # # splits = [int(final_range[0] + size_per_proc) for i in range(CPU_CORES)]
# # ranges = []
# # ps = None
# # for i in range(CPU_CORES):
# # s = int(final_range[0] + size_per_proc * i)
# # ns = int(final_range[0] + size_per_proc * (i + 1))
# # ranges.append([s, ns])
# # # ranges = [[0, 1]]
# # res = p.map(handle_giga_file, ranges)
#
# res = handle_giga_file(final_range)
# res = sorted(list(res))
# if os.path.exists(os.path.join(INTERNAL_DATA, 'diffs_updated_gigafida.pkl')):
# os.remove(os.path.join(INTERNAL_DATA, 'diffs_updated_gigafida.pkl'))
# with open(os.path.join(INTERNAL_DATA, 'diffs_updated_gigafida.pkl'), 'wb') as pkl_file:
# pickle.dump(res, pkl_file)
# # with open(os.path.join(INTERNAL_DATA, 'diffs_updated_gigafida.pkl'), 'rb') as pkl_file:
# # mydict2 = pickle.load(pkl_file)
# print('test')