import argparse import json import logging import os import re import time problematic_words = ['...', '-', '—', '"', "'"] left_word = [',', '.', '!', '?', ':', ';', ')', '„'] right_word = ['(', '”'] ok_words = [] def read_json(file): jf = open(file) svala_data = json.load(jf) jf.close() return svala_data def compare_files(corrected_file, original_file): # count_differences(corrected_file['source'], original_file['source']) target = False source = False source_modifications = 0 for corrected_source, original_source in zip(corrected_file['source'], original_file['source']): if corrected_source != original_source: source_modifications += 1 target_modifications = 0 for corrected_target, original_target in zip(corrected_file['target'], original_file['target']): if corrected_target != original_target: target_modifications += 1 if target_modifications > 0: target = True if source_modifications > 0: source = True return target, source def mine_text(cor_files): text = '' has_space = False is_problematic = False errors = [] left_asterix = 0 right_asterix = 0 for corrected_source in cor_files: word = corrected_source['text'].strip() if re.match("^[a-zA-Z0-9ČĆŽŠĐčćžšđ§]+$", word): if has_space: text += ' ' text += word has_space = True elif word in problematic_words: if has_space: text += ' ' text += word is_problematic = True has_space = True elif word in left_word: if word == '„': left_asterix += 1 text += word has_space = True elif word in right_word: if word == '”': right_asterix += 1 if has_space: text += ' ' text += word has_space = False else: if has_space: text += ' ' text += word is_problematic = True has_space = True errors.append(word) if left_asterix != right_asterix: is_problematic = True if len(text) > 0 and text[-1] == ' ': text = text[:-1] return text, is_problematic, errors def write_file(is_problematic, foldername, filename, text, is_target): if is_target: new_filename = filename[:-5] + '_target.json' else: new_filename = filename[:-5] + '_source.json' if is_problematic: folder_path = os.path.join(args.problematic_folder, foldername) file_path = os.path.join(args.problematic_folder, foldername, new_filename) else: folder_path = os.path.join(args.unproblematic_folder, foldername) file_path = os.path.join(args.unproblematic_folder, foldername, new_filename) if not os.path.exists(folder_path): os.mkdir(folder_path) with open(file_path, 'w') as wf: wf.write(text) def main(args): errors_count = 0 all_errors = set() # create mapper to corrected files corrected_files_mapper = {} for foldername in os.listdir(args.corrected_folder): orig_name = 'KUS' + foldername.split('KUS')[1] corrected_files_mapper[orig_name] = foldername for foldername in os.listdir(args.original_folder): for filename in os.listdir(os.path.join(args.original_folder, foldername)): of = os.path.join(args.original_folder, foldername, filename) if filename.endswith('_problem.json'): new_filename = filename[:-13] + '_popravljeno.json' if os.path.exists(os.path.join(args.corrected_folder, corrected_files_mapper[foldername], new_filename)): filename = new_filename cf = os.path.join(args.corrected_folder, corrected_files_mapper[foldername], filename) cor_files = read_json(cf) ori_files = read_json(of) target, source = compare_files(cor_files, ori_files) if target: text, is_problematic, errors = mine_text(cor_files['target']) write_file(is_problematic, foldername, filename, text, True) for er in errors: all_errors.add(er) errors_count += 1 if source: text, is_problematic, errors = mine_text(cor_files['source']) write_file(is_problematic, foldername, filename, text, False) for er in errors: all_errors.add(er) errors_count += 1 print(corrected_files_mapper[foldername] + '/' + filename) print(errors_count) print(all_errors) if __name__ == '__main__': parser = argparse.ArgumentParser( description='Read already processed xmls, erase entries without examples and limit gigafida examples to 1 per entry.') parser.add_argument('--unproblematic_folder', default='data/svala_generated_text/unproblematic', help='input file in (gz or xml currently). If none, then just database is loaded') parser.add_argument('--problematic_folder', default='data/svala_generated_text/problematic', help='input file in (gz or xml currently). If none, then just database is loaded') parser.add_argument('--corrected_folder', default='data/solar.svala.1.0.1.corrected', help='input file in (gz or xml currently). If none, then just database is loaded') parser.add_argument('--original_folder', default='data/solar.svala1.0.1.original', help='input file in (gz or xml currently). If none, then just database is loaded') args = parser.parse_args() start = time.time() main(args) logging.info("TIME: {}".format(time.time() - start))