You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

78 lines
2.8 KiB

import argparse
import json
import logging
import os
import shutil
import time
def read_json(file):
    """Load a JSON document from disk and return the parsed object.

    Args:
        file: Path of the JSON file to read.

    Returns:
        The Python object produced by ``json.load`` for the file content.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # The context manager closes the handle even when parsing fails, and the
    # explicit encoding keeps non-ASCII text independent of the system locale.
    with open(file, encoding='utf-8') as jf:
        return json.load(jf)
def compare_files(corrected_file, original_file):
    """Report whether the 'target' and 'source' entries of two files differ.

    Entries are compared pairwise by position with ``zip``, so only positions
    present in both sequences are inspected; surplus trailing entries in the
    longer sequence are not taken into account.

    Args:
        corrected_file: Mapping with 'source' and 'target' sequences.
        original_file: Mapping with 'source' and 'target' sequences.

    Returns:
        A ``(target, source)`` tuple of booleans; each flag is True when at
        least one compared pair of the respective sequence is unequal.
    """
    source_pairs = zip(corrected_file['source'], original_file['source'])
    source_changed = any(fixed != initial for fixed, initial in source_pairs)

    target_pairs = zip(corrected_file['target'], original_file['target'])
    target_changed = any(fixed != initial for fixed, initial in target_pairs)

    return target_changed, source_changed
def main(args):
    """Copy the files of ``args.original_folder`` into a fresh ``args.copied_folder``.

    The destination folder is removed if it already exists and then recreated.
    Every entry of the original folder is treated as a subfolder: a subfolder
    of the same name is created in the destination and each file inside it is
    copied under a shortened name. A file name is split on '_'; the copy is
    named ``<first part>_<last part>``, which drops the middle part of
    three-part names and leaves two-part names unchanged.

    Args:
        args: Object with the string attributes ``original_folder`` and
            ``copied_folder`` (e.g. an ``argparse.Namespace``).

    Raises:
        ValueError: If a file name does not consist of two or three
            '_'-separated parts.
    """
    # Start from an empty destination so nothing from an earlier run survives.
    if os.path.exists(args.copied_folder):
        shutil.rmtree(args.copied_folder)
    os.makedirs(args.copied_folder)

    for foldername in os.listdir(args.original_folder):
        source_dir = os.path.join(args.original_folder, foldername)
        target_dir = os.path.join(args.copied_folder, foldername)
        os.makedirs(target_dir)

        for filename in os.listdir(source_dir):
            parts = filename.split('_')
            # Explicit check instead of ``assert``: it also runs under ``-O``
            # and raises a real exception rather than a bare string.
            if len(parts) not in (2, 3):
                raise ValueError(
                    "Unexpected file name {!r} in {!r}: expected 2 or 3 "
                    "'_'-separated parts, got {}".format(filename, source_dir, len(parts)))

            # First and last part cover both accepted shapes: for two parts
            # this is the original name, for three parts the middle is dropped.
            copy_filename = parts[0] + '_' + parts[-1]

            shutil.copyfile(os.path.join(source_dir, filename),
                            os.path.join(target_dir, copy_filename))
if __name__ == '__main__':
    # Command-line entry point: parse the two folder options, run the copy
    # and log how long it took.
    parser = argparse.ArgumentParser(
        description='Copy every file from the subfolders of the original folder into a freshly '
                    'created copy folder, dropping the middle part of file names that consist '
                    'of three "_"-separated parts.')
    parser.add_argument('--copied_folder', default='data/svala_generated_text.formatted',
                        help='output folder; it is deleted and recreated on every run')
    parser.add_argument('--original_folder', default='data/svala_generated_text.handchecks',
                        help='input folder whose subfolders contain the files to copy')
    args = parser.parse_args()

    # The root logger defaults to WARNING, so the INFO timing record below
    # would be dropped without this configuration.
    logging.basicConfig(level=logging.INFO)

    start = time.time()
    main(args)
    logging.info("TIME: {}".format(time.time() - start))