I have a dataset with the following directory structure:
Dataset:
training-
-Cat
-dog
-monkey
I would like to move 10 percent of the files from each class folder of the training dataset into a validation dataset. How can I do this using Python? It should automatically create the validation directories as well, so that the result looks like this:
Dataset:
validation-
-Cat
-dog
-monkey
CodePudding user response:
You can try:
import os
# Placeholder paths: replace them with the real training / validation folders.
source = 'C:/.../training/'
destination = 'C:/.../validation/'

# Create the destination directory (and any missing parents) if it does not
# exist yet; exist_ok avoids a separate existence check.
os.makedirs(destination, exist_ok=True)

# Move every entry found directly inside `source` into `destination`.
# NOTE: this moves *all* entries returned by os.listdir (sub-folders are moved
# as a whole); it does not pick a 10 % sample per class folder.
for entry in os.listdir(source):
    os.rename(os.path.join(source, entry), os.path.join(destination, entry))
CodePudding user response:
Try this; it should help. It has only been tested on Ubuntu, not on Windows, so you may need to adapt the path handling if the path strings look different on Windows.
Tested on : Python = 3.6.13, numpy = 1.19.2
from glob import glob
import os
import numpy as np
import shutil
def _validation_subpath(src):
    """Return the part of *src* that lies below the ``training`` folder.

    The result is a relative path using "/" separators, or an empty string
    when *src* has nothing after a ``training/`` component (for example when
    *src* is the training folder itself).
    """
    # glob() joins with the native separator, so normalise to "/" before
    # searching for the "training/" marker.
    normalized = src.replace(os.sep, "/")
    _, marker, tail = normalized.rpartition("training/")
    if not marker:
        return ""
    # A trailing slash on the original src produces "training//Cat"; strip
    # the leftover "/" so os.path.join does not treat the tail as absolute.
    return tail.lstrip("/")


def copy_folder(src, dst, percent_keep=0.1):
    """Copy a random share of the files in *src* into a ``validation`` tree.

    The function recurses into every sub-folder of *src*. For each folder
    that directly contains files, ``int(len(files) * percent_keep)`` distinct
    files are chosen at random and copied to
    ``<dst>/validation/<path below "training/">``, creating the directories
    as needed. The original files are left in place.

    Args:
        src: Folder to sample from (the training folder or one of its
            sub-folders).
        dst: Folder under which the ``validation`` directory is created.
        percent_keep: Fraction of the files of each folder to copy.
    """
    entries = glob(f"{src}/*")
    folders = [entry for entry in entries if os.path.isdir(entry)]
    all_files = [entry for entry in entries if os.path.isfile(entry)]
    print(f"There are {len(folders)} folders in {src.split('training')[-1]}")
    print(f"There are {len(all_files)} files in {src.split('training')[-1]}")

    # Handle the sub-folders first.
    for folder in folders:
        copy_folder(folder, dst, percent_keep)

    if not all_files:
        return

    # Mirror the path below "training/" underneath "<dst>/validation".
    new_path = os.path.join(dst, "validation", _validation_subpath(src))
    os.makedirs(new_path, exist_ok=True)

    # Sample indices WITHOUT replacement so that no file is picked twice;
    # the size is clamped so it can never exceed the number of files.
    sample_size = max(0, min(len(all_files), int(len(all_files) * percent_keep)))
    picked = np.random.choice(len(all_files), size=sample_size, replace=False)
    keep_files = [all_files[i] for i in picked]

    print(f"Copying {len(keep_files)} random files")
    for index, file in enumerate(keep_files, start=1):
        print(f"\rCopying {index} / {len(keep_files)}", end="")
        shutil.copyfile(file, os.path.join(new_path, os.path.basename(file)))
    print("")
if __name__ == "__main__":
    # Directory one level above the training folder; copy_folder creates the
    # "validation" folder underneath it.
    dataset_root = "/home/user/Dataset/"
    # Path to the training folder itself.
    training_dir = "/home/user/Dataset/training"
    copy_folder(training_dir, dataset_root, 0.1)
If you don't want to use numpy for selecting the random files to copy to the validation folder, use the standard-library random module.
Something like:
# random.sample picks k distinct files; random.choices would sample with
# replacement and could select the same file more than once.
keep_files = random.sample(all_files, k=int(len(all_files) * percent_keep))
If you don't want to use shutil or glob, you can use the os library:
os.listdir() # instead of glob
os.rename() # instead of shutil (may behave differently, e.g. it moves the file rather than copying it; not tested)
If you don't want random samples, use:
# Deterministic alternative: take the leading share of the file list.
cutoff = int(len(all_files) * percent_keep)
keep_files = all_files[:cutoff]