You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
svala-scripts/svala_formatter/copy_svala_handchecked_file...

94 lines
3.7 KiB

import argparse
import json
import logging
import os
import shutil
import time
def read_json(file):
jf = open(file)
svala_data = json.load(jf)
jf.close()
return svala_data
def compare_files(corrected_file, original_file):
# count_differences(corrected_file['source'], original_file['source'])
target = False
source = False
source_modifications = 0
for corrected_source, original_source in zip(corrected_file['source'], original_file['source']):
if corrected_source != original_source:
source_modifications += 1
target_modifications = 0
for corrected_target, original_target in zip(corrected_file['target'], original_file['target']):
if corrected_target != original_target:
target_modifications += 1
if target_modifications > 0:
target = True
if source_modifications > 0:
source = True
return target, source
def main(args):
# create mapper to corrected files
corrected_files_mapper = {}
filename_encountered = False
for foldername in os.listdir(args.corrected_folder):
orig_name = 'KUS' + foldername.split('KUS')[1]
# if orig_name == 'KUS-G-slo-4-GO-E-2009-10105':
# filename_encountered = True
# if not filename_encountered:
# continue
corrected_files_mapper[orig_name] = foldername
filename_encountered = False
for foldername in os.listdir(args.original_folder):
# if foldername == 'KUS-G-slo-4-GO-E-2009-10105':
# filename_encountered = True
# if not filename_encountered:
# continue
for filename in os.listdir(os.path.join(args.original_folder, foldername)):
fixed = False
of = os.path.join(args.original_folder, foldername, filename)
copy_filename = filename
if filename.endswith('_problem.json'):
copy_filename = filename[:-13] + '.json'
if filename.endswith('_popravljeno.json'):
copy_filename = filename[:-13] + '.json'
cpf = os.path.join(args.copied_folder, foldername, copy_filename)
cpfol = os.path.join(args.copied_folder, foldername)
if filename.endswith('_problem.json'):
new_filename = filename[:-13] + '_popravljeno.json'
if os.path.exists(os.path.join(args.corrected_folder, corrected_files_mapper[foldername], new_filename)):
fixed = True
filename = new_filename
cf = os.path.join(args.corrected_folder, corrected_files_mapper[foldername], filename)
cor_files = read_json(cf)
ori_files = read_json(of)
target, source = compare_files(cor_files, ori_files)
if target or source or fixed:
if not os.path.exists(cpfol):
os.mkdir(cpfol)
shutil.copyfile(cf, cpf)
if __name__ == '__main__':
parser = argparse.ArgumentParser(
description='Read already processed xmls, erase entries without examples and limit gigafida examples to 1 per entry.')
parser.add_argument('--copied_folder', default='data/solar.svala.fixed.1.0.1',
help='input file in (gz or xml currently). If none, then just database is loaded')
parser.add_argument('--corrected_folder', default='data/solar.svala.1.0.1.corrected',
help='input file in (gz or xml currently). If none, then just database is loaded')
parser.add_argument('--original_folder', default='data/solar.svala1.0.1.original',
help='input file in (gz or xml currently). If none, then just database is loaded')
args = parser.parse_args()
start = time.time()
main(args)
logging.info("TIME: {}".format(time.time() - start))