I am new to Python and am working with an image dataset on which I want to perform data augmentation. The dataset has 2 classes, and I want to save the augmented images in a corresponding augmented class folder. The dataset is organized as follows:
dataset
|-- original_images
|
|-- class1
| |-- benign_image1.png
| |-- benign_image2.png
| |-- ...
|
|-- class2
|-- malignant_image1.png
|-- malignant_image2.png
|-- ...
I want to save the augmented images like this:
|-- augmented_images
|
|-- class1
| |-- augmented_img1.png
| |-- augmented_img2.png
| |-- ...
|
|-- class2
|-- augmented_img1.png
|-- augmented_img2.png
|-- ...
CodePudding user response:
The code below provides 2 functions that will do the job. The first function, make_dataframe, operates on the directory where the images are stored; in your case that would be original_images. It produces a dataframe df with columns filepaths and labels, where filepaths is the full path to an image and labels is the class label associated with that image file. The second function, make_and_store_images, takes in the dataframe created by the first function and generates and saves the augmented images in augdir. If a class in your original_images directory has N image samples and N is less than n, then n-N augmented images are created and stored for that class. If N is greater than or equal to n, then no augmented images are created for that class. This is useful if you are trying to create a balanced dataset.
import os
import pandas as pd
import shutil
import tensorflow as tf
from tqdm import tqdm
from tensorflow.keras.preprocessing.image import ImageDataGenerator
def make_dataframe(sdir):
    """Build a dataframe listing every file found in the class subdirectories of sdir.

    sdir is the directory in which the class subdirectories are stored. The
    name of each subdirectory is used as the label for every entry listed
    inside it; entries of sdir that are not directories are ignored.

    Returns a dataframe with columns filepaths, labels.
    """
    path_list = []
    label_list = []
    for class_name in sorted(os.listdir(sdir)):
        class_dir = os.path.join(sdir, class_name)
        if not os.path.isdir(class_dir):
            continue  # only subdirectories are treated as classes
        entries = sorted(os.listdir(class_dir))
        # progress bar labelled with the class name, padded to 25 characters
        progress = tqdm(entries, ncols=130, desc=f'{class_name:25s}', unit='files', colour='blue')
        for entry in progress:
            path_list.append(os.path.join(class_dir, entry))
            label_list.append(class_name)
    path_series = pd.Series(path_list, name='filepaths')
    label_series = pd.Series(label_list, name='labels')
    # place the two series side by side as the columns of the result
    return pd.concat([path_series, label_series], axis=1)
def make_and_store_images(df, augdir, n, img_size, color_mode='rgb', save_prefix='aug-', save_format='jpg'):
    """Generate augmented images for under-represented classes and save them under augdir.

    For each class label in df that has N < n rows, n - N augmented images are
    generated from that class's images and written to augdir/<label>. Classes
    with N >= n rows get an (empty) class directory but no augmented images.

    Args:
        df: dataframe with a 'filepaths' column (full path to each image) and a
            'labels' column (class label of each image).
        augdir: full path where augmented images will be stored. If it already
            exists it is DELETED first so the run starts from an empty
            directory; missing parent directories are created.
        n: target number of samples per class; only classes with fewer than n
            samples are augmented.
        img_size: tuple (height, width) giving the size of the augmented images.
        color_mode: color mode passed to the generator, 'rgb' by default.
        save_prefix: prefix used for the saved augmented image file names,
            'aug-' by default.
        save_format: format the augmented images are saved in, 'jpg' by default.

    See the ImageDataGenerator documentation at
    https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/image/ImageDataGenerator
    for details on the generator arguments.
    """
    df = df.copy()
    if os.path.isdir(augdir):  # start with an empty directory
        shutil.rmtree(augdir)
    os.makedirs(augdir)  # also creates any missing parent directories
    labels = df['labels'].unique()
    for label in labels:
        # make class directories within the aug directory
        os.mkdir(os.path.join(augdir, label))
    # create and store the augmented images
    total = 0
    # in ImageDataGenerator select the types of augmentation you desire; below are some examples
    gen = ImageDataGenerator(horizontal_flip=True, rotation_range=20, width_shift_range=.2,
                             height_shift_range=.2, zoom_range=.2)
    groups = df.groupby('labels')  # group by class
    for label in labels:  # for every class
        classdir = os.path.join(augdir, label)
        group = groups.get_group(label)  # a dataframe holding only rows with the specified label
        sample_count = len(group)  # how many samples there are in this class
        if sample_count < n:  # the class has fewer than the target number of images
            aug_img_count = 0
            delta = n - sample_count  # number of augmented images to create
            msg = '{0:40s} for class {1:^30s} creating {2:^5s} augmented images'.format(' ', label, str(delta))
            print(msg, '\r', end='')  # prints over on the same line
            # save_to_dir makes the generator write each produced image to classdir
            aug_gen = gen.flow_from_dataframe(group, x_col='filepaths', y_col=None, target_size=img_size,
                                              class_mode=None, batch_size=1, shuffle=False,
                                              save_to_dir=classdir, save_prefix=save_prefix,
                                              color_mode=color_mode, save_format=save_format)
            while aug_img_count < delta:
                images = next(aug_gen)
                # accumulate: plain assignment here would never reach delta and loop forever
                aug_img_count += len(images)
            total += aug_img_count  # running total across all classes
    print('Total Augmented images created= ', total)
Below is an example of use:
# Example: index the original images, then write augmented images per class.
sdir = r'C:\Temp\original_images'  # directory holding the class subdirectories
augdir = r'c:\temp\aug'  # where augmented images are stored; created if it does not exist
n = 150  # a class with N < n samples in sdir gets n-N augmented images in augdir
img_size = (224, 224)  # (height, width) of the augmented images
df = make_dataframe(sdir)
print(df.head())
print('length of dataframe is ', len(df))
make_and_store_images(df=df, augdir=augdir, n=n, img_size=img_size,
                      color_mode='rgb', save_prefix='aug-', save_format='jpg')