# cjvt-srl-tagging/tools/find_diff_sentence_ids.py
# (192 lines, 6.6 KiB)

import pickle
from parser.parser import Parser
import os
from os.path import join, dirname
from pathlib import Path
import re
import sys
import cProfile
import configparser
import logging
from multiprocessing import Pool
SSJ500K_2_1 = 27829 # number of sentences in the ssj500k 2.1 corpus (reference constant)

# Project TEI parser used below to read the original corpus files.
par = Parser()

# path to data: all input/output locations come from the [tools] section of tools.cfg
config = configparser.ConfigParser()
config.read("tools.cfg")

analysis = ''
if 'kres_orig' in config["tools"]:
    # Kres-corpus configuration.
    analysis = 'kres'
    INDIR = Path(config["tools"]["kres_orig"])
    OUTDIR = Path(config["tools"]["kres_tsv"])
elif 'giga_orig' in config["tools"]:
    # Gigafida-corpus configuration.
    # analysis = 'gigafida'
    analysis = 'giga'
    INDIR_GIGA = Path(config["tools"]["giga_orig"])       # concatenated token file (one token per line)
    INDIR_GIGA_ORIG = Path(config["tools"]["giga"])       # directory tree of original TEI files
    INDIR_JOS = Path(config["tools"]["giga_jos"])
    OUTDIR = Path(config["tools"]["giga_tsv"])
    GIGA_PARTS = int(config["tools"]["giga_parts"])

INTERNAL_DATA = config["tools"]["internal_data"]
CPU_CORES = int(config["tools"]["cpu_cores"])

# Set up file logging; the log file is created up-front if missing.
LOGFILE = Path(config["tools"]["logfile"]).absolute()
LOGFILE.touch(exist_ok=True)
LOGFILE.resolve()
logging.basicConfig(filename=str(LOGFILE), level=logging.INFO)

# Collect every file under the original-corpus directory, sorted, paired with
# its position index: origfiles is a list of (index, Path) tuples.
# NOTE(review): INDIR_GIGA_ORIG only exists when the 'giga_orig' branch above
# was taken — this script assumes a giga configuration; verify tools.cfg.
origfiles = []
for subdir, dirs, files in os.walk(INDIR_GIGA_ORIG):
    for file in files:
        origfiles.append(Path(os.path.join(subdir, file)))
origfiles=list(enumerate(sorted(origfiles)))
def giga_orig_sentence_generator(path=None):
    """Yield one sentence string per sentence in the giga token file.

    The input file has one token per line (token text in the first
    tab-separated column); sentences are separated by blank lines.  Each
    yielded value is the sentence's tokens joined with single spaces.
    Consecutive blank lines yield empty-string sentences, matching the
    original file layout.

    Args:
        path: File to read.  Defaults to the module-level ``INDIR_GIGA``
            (backward compatible with the original zero-argument call).

    Yields:
        str: space-joined token text of each sentence, in file order.
    """
    if path is None:
        path = INDIR_GIGA
    with open(path, 'r') as gof:
        sentence_words = []
        for l_gof in gof:
            if l_gof == '\n':
                yield ' '.join(sentence_words)
                sentence_words = []
            else:
                sentence_words.append(l_gof.split('\t')[0])
        # Bug fix: the original dropped the last sentence when the file did
        # not end with a blank line; emit any remaining tokens.
        if sentence_words:
            yield ' '.join(sentence_words)
# Walk the parsed TEI files in lockstep with the flat token file: for every
# sentence parsed out of the TEI originals, pull the next sentence from the
# token stream and check the surface text matches.  Matching sentences have
# their sentence id recorded; mismatches are reported but NOT recorded, and
# the two streams may drift out of sync afterwards (no resynchronisation).
sentence_generator = giga_orig_sentence_generator()
sentence_ids = []
for origfile in origfiles:
    # origfile is an (index, Path) tuple; parse_tei maps sentence keys to
    # dicts carrying at least 'sid' (sentence id) and 'text' (surface form).
    split_file_sentences = par.parse_tei(origfile[1])
    for k, v in split_file_sentences.items():
        one_file_sentence = next(sentence_generator)
        if one_file_sentence == v['text']:
            sentence_ids.append(v['sid'])
        else:
            print('----------------')
            print('ERROR')
            print(v['sid'])
            print(one_file_sentence)
            print(v['text'])
            print(origfile[0])

# count sentences in orig (if not counted before)
# os.remove(os.path.join(INTERNAL_DATA, 'orig_counted_sentences.pkl'))
# Persist the collected sentence ids; the explicit remove is redundant with
# mode 'wb' (which truncates) but kept to preserve original behavior.
if os.path.exists(os.path.join(INTERNAL_DATA, 'sentence_ids_list.pkl')):
    os.remove(os.path.join(INTERNAL_DATA, 'sentence_ids_list.pkl'))
with open(os.path.join(INTERNAL_DATA, 'sentence_ids_list.pkl'), 'wb') as output:
    pickle.dump(sentence_ids, output)
# --- Retired implementation below (kept commented out for reference): ---
# --- splits/diffs the gigafida file into parts; superseded by the code above. ---
# def giga_orig_generator():
# with open(INDIR_GIGA, 'r') as gof:
# previous_new_line = False
# for l_gof in gof:
# if l_gof == '\n':
# if previous_new_line:
# continue
# previous_new_line = True
# elif previous_new_line:
# previous_new_line = False
# yield l_gof
# import time
# def handle_giga_file(ran):
# """
# File that splits big text file into more minor files. Only split on empty lines.
# """
# # with open(INDIR_GIGA, 'r') as gof:
# # with open(INDIR_JOS, 'r') as gjf:
# # for i, (l_gof, l_gjf) in enumerate(zip(gof, gjf)):
# # pass
# # num_lines = i + 1
# # print(num_lines)
# num_lines = 1393184026
# # 1393184026
# # 1393184033
# # return
# num_lines_per_part = num_lines / GIGA_PARTS
# curr_part = 0
# gof_generator = giga_orig_generator()
#
# diff_files = set()
# # with open(INDIR_GIGA, 'r') as gof:
# with open(INDIR_GIGA_OLD, 'r') as gjf:
# # sentence = {}
# # sentence['tokens'] = []
# # sentence['links'] = {}
# # if os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % ran[0])):
# # os.remove(os.path.join(OUTDIR, 'giga%07d.tsv' % ran[0]))
#
# # wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % ran[0]), 'a')
#
# # for i, (l_gof, l_gjf) in enumerate(zip(gof, gjf)):
# for i, l_gjf in enumerate(gjf):
# l_gof = next(gof_generator)
# if curr_part < ran[0]:
# if i > num_lines_per_part * (curr_part + 1) and l_gof == '\n' :
# if curr_part < ran[0]:
# print(curr_part)
# curr_part += 1
# continue
# else:
# continue
#
# l_gof_split = l_gof.split('\t')
# l_gjf_split = l_gjf.split('\t')
#
# # if punctuation
# if l_gof != '\n':
# if l_gof_split != l_gjf_split:
# print(curr_part)
# diff_files.add(curr_part)
# l_gof = next(gof_generator)
#
#
# # if l_gof == '\n':
# else:
# # wf.flush()
# # if i > num_lines_per_part * (curr_part + 1) and l_gof == '\n':
# if i > num_lines_per_part * (curr_part + 1):
# curr_part += 1
# # if wf doesn't exist (first one)
# # wf.close()
# if curr_part >= ran[1]:
# break
# # if os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part)):
# # os.remove(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part))
#
# # wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part), 'a')
#
# curr_part += 1
# return diff_files
# # wf.close()
#
# with Pool(CPU_CORES) as p:
# final_range = [0, 100000]
# # final_range = [0, 150]
# # size_per_proc = (final_range[1] - final_range[0]) / CPU_CORES
# # # splits = [int(final_range[0] + size_per_proc) for i in range(CPU_CORES)]
# # ranges = []
# # ps = None
# # for i in range(CPU_CORES):
# # s = int(final_range[0] + size_per_proc * i)
# # ns = int(final_range[0] + size_per_proc * (i + 1))
# # ranges.append([s, ns])
# # # ranges = [[0, 1]]
# # res = p.map(handle_giga_file, ranges)
#
# res = handle_giga_file(final_range)
# res = sorted(list(res))
# if os.path.exists(os.path.join(INTERNAL_DATA, 'diffs_updated_gigafida.pkl')):
# os.remove(os.path.join(INTERNAL_DATA, 'diffs_updated_gigafida.pkl'))
# with open(os.path.join(INTERNAL_DATA, 'diffs_updated_gigafida.pkl'), 'wb') as pkl_file:
# pickle.dump(res, pkl_file)
# # with open(os.path.join(INTERNAL_DATA, 'diffs_updated_gigafida.pkl'), 'rb') as pkl_file:
# # mydict2 = pickle.load(pkl_file)
# print('test')