78 lines
2.8 KiB
Python
78 lines
2.8 KiB
Python
|
import argparse
|
||
|
import json
|
||
|
import logging
|
||
|
import os
|
||
|
import shutil
|
||
|
import time
|
||
|
|
||
|
def read_json(file):
    """Load and return the JSON document stored at ``file``.

    Args:
        file: Path of the JSON file to read.

    Returns:
        The decoded JSON value, exactly as produced by ``json.load``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # The context manager closes the handle even when parsing fails; the
    # explicit encoding keeps decoding independent of the platform locale.
    with open(file, encoding='utf-8') as json_file:
        return json.load(json_file)
|
||
|
|
||
|
|
||
|
def compare_files(corrected_file, original_file):
    """Report whether the 'target' and 'source' entries of two documents differ.

    The entries are compared element by element, in order. Sequences of
    different length count as modified: an item that exists on only one
    side is a difference.

    Args:
        corrected_file: Mapping with iterable 'source' and 'target' entries.
        original_file: Mapping with iterable 'source' and 'target' entries.

    Returns:
        A ``(target, source)`` tuple of booleans (target first), each True
        when the corresponding entries differ.

    Raises:
        KeyError: If either mapping lacks a 'source' or 'target' entry.
    """
    from itertools import zip_longest

    # Unique filler for the shorter side; it never compares equal to a real
    # item, so a length mismatch is reported instead of silently truncated.
    missing = object()

    def differs(corrected_items, original_items):
        return any(
            corrected != original
            for corrected, original in zip_longest(
                corrected_items, original_items, fillvalue=missing)
        )

    source = differs(corrected_file['source'], original_file['source'])
    target = differs(corrected_file['target'], original_file['target'])

    return target, source
|
||
|
|
||
|
|
||
|
def _copied_filename(filename):
    """Return the destination name for ``filename``: its first and last '_'-separated parts."""
    parts = filename.split('_')
    # Only 'a_b' and 'a_x_b' shaped names are supported. Validate explicitly:
    # an assert would be stripped under ``python -O``.
    if len(parts) not in (2, 3):
        raise ValueError(
            "Unexpected file name {!r}: expected 2 or 3 '_'-separated parts, got {}".format(
                filename, len(parts)))
    return parts[0] + '_' + parts[-1]


def main(args):
    """Recreate ``args.copied_folder`` with renamed copies of the original files.

    ``args.copied_folder`` is deleted if it exists and created anew. Every
    entry of ``args.original_folder`` is treated as a subfolder: a subfolder
    of the same name is created in the copied folder and each file is copied
    into it. Two-part names ('a_b') are kept as they are; three-part names
    ('a_x_b') lose their middle part ('a_b').

    Args:
        args: Object with ``copied_folder`` and ``original_folder`` attributes
            (paths), e.g. an ``argparse.Namespace``.

    Raises:
        ValueError: If a file name does not split into 2 or 3 parts on '_'.
    """
    # Start from a clean output folder so stale copies never survive a rerun.
    if os.path.exists(args.copied_folder):
        shutil.rmtree(args.copied_folder)

    os.makedirs(args.copied_folder)

    for foldername in os.listdir(args.original_folder):
        os.makedirs(os.path.join(args.copied_folder, foldername))
        for filename in os.listdir(os.path.join(args.original_folder, foldername)):
            original_path = os.path.join(args.original_folder, foldername, filename)
            copied_path = os.path.join(
                args.copied_folder, foldername, _copied_filename(filename))
            shutil.copyfile(original_path, copied_path)
|
||
|
|
||
|
|
||
|
if __name__ == '__main__':
    # Command-line entry point: parse the two folder paths, run the copy and
    # log how long it took.
    parser = argparse.ArgumentParser(
        description='Copy every file from the subfolders of the original folder into a freshly '
                    'created copied folder, dropping the middle part of three-part file names.')
    parser.add_argument('--copied_folder', default='data/svala_generated_text.formatted',
                        help='output folder; it is deleted if it already exists and then recreated')
    parser.add_argument('--original_folder', default='data/svala_generated_text.handchecks',
                        help='input folder whose subfolders hold the files to copy')
    args = parser.parse_args()

    # The unconfigured root logger only emits WARNING and above, which would
    # silently drop the timing record below.
    logging.basicConfig(level=logging.INFO)

    start = time.time()
    main(args)
    logging.info("TIME: {}".format(time.time() - start))
|