I have a dataset made up of 12 CSV files, each belonging to a different person. I've used a neural network to model each file, and now I want to use the leave-one-out method and leave one file out for testing. How could I do this in Python?
Here is my code for one file (in this code the data from a single file is split into train and test sets):
from keras.models import Sequential
from keras.layers import Dense
from sklearn.metrics import accuracy_score
from keras import layers
from sklearn.preprocessing import RobustScaler
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
def get_dataset():
    data = pd.read_csv("file1.csv")
    X = data.iloc[0:, 0:19]
    y = data.iloc[0:, 19:]
    scale_column = ['f1','f2','...','f19']
    scaler = RobustScaler()
    scaler = scaler.fit(X[scale_column])
    X.loc[:, scale_column] = scaler.transform(X[scale_column].to_numpy())
    print(X)
    return X, y
# get the model
def get_model(n_inputs, n_outputs):
    model = Sequential()
    model.add(Dense(20, input_dim=n_inputs, kernel_initializer='he_uniform', activation='relu'))
    model.add(layers.Dense(16, activation='relu'))
    model.add(layers.Dense(16, activation='relu'))
    model.add(Dense(n_outputs, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam')
    return model
def evaluate_model(X, y):
    avg_accs = []
    all_ypred = []
    all_test = []
    TPlist = []
    TNlist = []
    FPlist = []
    FNlist = []
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1)
    # layer sizes taken from the data: number of feature columns and number of label columns
    n_inputs, n_outputs = X.shape[1], y.shape[1]
    model = get_model(n_inputs, n_outputs)
    model.fit(X_train, y_train, verbose=0, epochs=100)
    y_pred = model.predict(X_test)
    y_pred = np.where(y_pred < 0.3, 0, 1)
    test = y_test.to_numpy()
    avg = []
    TPs = []
    TNs = []
    FPs = []
    FNs = []
    for i in range(len(y_pred)):
        counter = 0
        cnt_fps = 0
        cnt_fns = 0
        predic = y_pred[i]
        real = test[i]
        lst = np.equal(real, predic)
        counter = np.count_nonzero(lst)
        #print(counter)
        avg.append(counter / 5)  # fraction of the 5 labels predicted correctly
        precision = real * predic
        TPs.append(np.count_nonzero(precision))
        tmp1 = np.logical_and(predic == 1, real == 0)
        FPs.append(np.count_nonzero(tmp1))
        tmp2 = np.logical_and(predic == 0, real == 1)
        FNs.append(np.count_nonzero(tmp2))
        tmp3 = np.logical_and(predic == 0, real == 0)
        TNs.append(np.count_nonzero(tmp3))
    arr = np.array(avg)
    Accuracy = np.mean(avg)
    #avg_accs.append(Accuracy)
    #a=np.mean(avg_accs)
    print("final------------->", Accuracy)
    Specificity = np.sum(TNs) / (np.sum(TNs) + np.sum(FPs))
    Sensitivity = np.sum(TPs) / (np.sum(TPs) + np.sum(FNs))
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    # Compute micro-average ROC curve and ROC area
    fpr["micro"], tpr["micro"], _ = roc_curve(test.ravel(), y_pred.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
    plt.figure()
    lw = 2
    plt.plot(fpr["micro"], tpr["micro"], color='darkorange',
             lw=lw, label='micro-average ROC curve (area = %0.2f)' % roc_auc["micro"])
    plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    #plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")
    plt.show()
X, y = get_dataset()
evaluate_model(X, y)
CodePudding user response:
You could try something like this.
import glob
import random

def get_dataset():
    csv_list = glob.glob("path_to_csvs/*.csv")
    csv_test = csv_list.pop(random.randint(0, len(csv_list) - 1))  # remove one random file from the list and use it as the test set
    data_test = pd.read_csv(csv_test)
    data_train = pd.concat([pd.read_csv(f) for f in csv_list])     # train on all remaining files
    .
    .
    .
    return X, y
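If you want a full leave-one-out run (each of the 12 files used once as the test file) instead of a single random hold-out, you could wrap the same idea in a loop over the files. A rough sketch along those lines, reusing get_model from the question and assuming every CSV has the same 19 feature columns followed by the label columns (the path is a placeholder, and the RobustScaler step is left out for brevity; if you use it, fit it on the training files only):
import glob
import numpy as np
import pandas as pd

def split_xy(df):
    # first 19 columns are features, the remaining columns are labels (adjust to your data)
    return df.iloc[:, 0:19], df.iloc[:, 19:]

csv_list = sorted(glob.glob("path_to_csvs/*.csv"))
accuracies = []
for test_file in csv_list:
    train_files = [f for f in csv_list if f != test_file]
    data_train = pd.concat([pd.read_csv(f) for f in train_files], ignore_index=True)
    data_test = pd.read_csv(test_file)
    X_train, y_train = split_xy(data_train)
    X_test, y_test = split_xy(data_test)
    model = get_model(X_train.shape[1], y_train.shape[1])  # model builder from the question
    model.fit(X_train, y_train, epochs=100, verbose=0)
    y_pred = np.where(model.predict(X_test) < 0.3, 0, 1)
    acc = np.mean(np.equal(y_test.to_numpy(), y_pred))
    accuracies.append(acc)
    print(test_file, acc)
print("mean leave-one-file-out accuracy:", np.mean(accuracies))
Each iteration trains a fresh model on 11 files and reports per-label accuracy on the file that was left out.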
CodePudding user response:
It is possible to do it this way: build one dataset per file, then shuffle the file indices (or manage them by index) so that one file is left out of training.
[ Sample ]:
import matplotlib.pyplot as plt
import os
import tensorflow as tf
import tensorflow_io as tfio
import pandas as pd
from sklearn.preprocessing import RobustScaler
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""
: Variables
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""
list_label = [ ]
list_Image = [ ]
n_books = 12
start = 1
limit = n_books
PATH = os.path.join('F:\\datasets\\downloads\\Actors\\train\\Pikaploy', '*.tif')
PATH_2 = os.path.join('F:\\datasets\\downloads\\Actors\\train\\Candidt Kibt', '*.tif')
files = tf.data.Dataset.list_files(PATH)
files_2 = tf.data.Dataset.list_files(PATH_2)
list_file = []
list_file_actual = []
list_label = []
list_label_actual = [ 'Pikaploy', 'Pikaploy', 'Pikaploy', 'Pikaploy', 'Pikaploy', 'Candidt Kibt', 'Candidt Kibt', 'Candidt Kibt', 'Candidt Kibt', 'Candidt Kibt' ]
for file in files.take(15):
    image = tf.io.read_file( file )
    image = tfio.experimental.image.decode_tiff(image, index=0)
    list_file_actual.append(image)
    image = tf.image.resize(image, [32,32], method='nearest')
    list_file.append(image)
    list_label.append(1)

for file in files_2.take(18):
    image = tf.io.read_file( file )
    image = tfio.experimental.image.decode_tiff(image, index=0)
    list_file_actual.append(image)
    image = tf.image.resize(image, [32,32], method='nearest')
    list_file.append(image)
    list_label.append(9)
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""
: Callback
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""
class custom_callback(tf.keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs={}):
        if( logs['accuracy'] >= 0.97 ):
            self.model.stop_training = True

custom_callback = custom_callback()
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""
: Functions
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""
def get_dataset():
    datasets = [ ]
    scale_column = ['Image']
    scale_column_label = ['Label']
    for i in range( n_books ) :
        # reset per book so each dataset holds exactly one file's samples
        list_label = [ ]
        list_Image = [ ]
        variables_1 = pd.read_excel('F:\\temp\\Python\\excel\\Book ' + str( i + 1 ) + '.xlsx', index_col=None, header=[0])
        for j in range( variables_1[scale_column].to_numpy().shape[0] ) :
            image = tf.io.read_file( variables_1[scale_column].to_numpy()[j][0] )
            image = tfio.experimental.image.decode_tiff(image, index=0)
            image = tf.image.resize(image, [32,32], method='nearest')
            label = variables_1[scale_column_label].to_numpy()[j][0]
            list_Image.append( image )
            list_label.append( label )
        dataset_1 = tf.data.Dataset.from_tensor_slices((tf.constant(tf.cast(list_Image, dtype=tf.int64), shape=(len( list_Image ), 1, 32, 32, 4), dtype=tf.int64), tf.constant(list_label, shape=(len( list_label ), 1, 1), dtype=tf.int64)))
        datasets.append( dataset_1 )
    return datasets
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""
: Model Initialize
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""
model = tf.keras.models.Sequential([
    tf.keras.layers.InputLayer(input_shape=( 32, 32, 4 )),
    tf.keras.layers.Normalization(mean=3., variance=2.),
    tf.keras.layers.Normalization(mean=4., variance=6.),
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Reshape((256, 32 * 32)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(196, return_sequences=True, return_state=False)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(196)),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(192, activation='relu'),
    tf.keras.layers.Dense(2),
])
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""
: Optimizer
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""
optimizer = tf.keras.optimizers.Nadam(
    learning_rate=0.000001, beta_1=0.9, beta_2=0.999, epsilon=1e-07,
    name='Nadam'
)
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""
: Loss Fn
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""
lossfn = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=False,
    reduction=tf.keras.losses.Reduction.AUTO,
    name='sparse_categorical_crossentropy'
)
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""
: Model Summary
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""
model.compile(optimizer=optimizer, loss=lossfn, metrics=['accuracy'] )
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""
: Training
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""
range_value = tf.range(start, limit, delta=1, dtype=tf.int32, name='range')
shuffle = tf.random.shuffle( range_value, seed=10, name='shuffle' )
datasets = get_dataset()
print( shuffle )
for i in range( int( n_books - 1 ) ) :
    history = model.fit( datasets[shuffle[i]], batch_size=100, epochs=50, callbacks=[custom_callback] )
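Since tf.range(start, limit) above begins at 1, index 0 never appears in the shuffled training indices, so datasets[0] is the file left out of training. A short follow-up (just a sketch, reusing model and datasets from above) could then evaluate on that held-out file:
# evaluate on the file that was never used during training
held_out = datasets[0]
loss, acc = model.evaluate(held_out)
print('held-out file -> loss: %.4f, accuracy: %.4f' % (loss, acc))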
CodePudding user response:
I haven't used TensorFlow, but in Python, when I want to find the optimum k for a k-NN classifier in sklearn using the leave-one-out method, I use the following:
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier

def single_case_classifier(training_df, target_group, ix, k):
    '''Returns the target_group for the omitted index ix in the training_df using k-NN classifier'''
    # Create a classifier instance to do k-nearest neighbours
    myClassifier = KNeighborsClassifier(n_neighbors = k,
                                        metric = 'euclidean',
                                        weights = 'uniform')
    # Apply the classifier to all data points except index ix
    myClassifier.fit(training_df.drop(ix, axis='index'),
                     target_group.drop(ix))
    # Return the class predicted by the trained classifier
    # Need to predict on list of training_df.loc[ix] as predict
    # expects list/array
    return myClassifier.predict([training_df.loc[ix]])[0]
Then import your data and separate the training columns from the group column, for example:
training_data_df = data_df[['#training_columns']]
group_values = data_df['#group_column']
And finally, to find the best k value, we count how many data points are classified correctly for each k value and select the k value with the highest count. If two k values are tied for the highest correct count, I choose the smaller of the two:
for k in range(1,8):
    print('{}\t{}'.format(k,
          list([single_case_classifier(training_data_df,
                                        group_values,
                                        i,
                                        k)
                for i in training_data_df.index] == group_values).count(True)))
Since you have the data in different files, this may work if you can combine the data into one dataframe. If your data is not set up like that, then I hope this gives some idea of how a leave-one-out method is implemented in Python. Best of luck.
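If you do combine the twelve files into one dataframe, sklearn can also do the leave-one-file-out splitting for you with LeaveOneGroupOut, using the source file (or person id) as the group label. A minimal sketch under those assumptions (the path and the 19-feature/label column layout are placeholders taken from the question, not from your data):
import glob
import numpy as np
import pandas as pd
from sklearn.model_selection import LeaveOneGroupOut

frames = []
for path in sorted(glob.glob("path_to_csvs/*.csv")):   # placeholder path
    df = pd.read_csv(path)
    df["person"] = path                                 # group id = source file
    frames.append(df)
data = pd.concat(frames, ignore_index=True)

X = data.iloc[:, 0:19].to_numpy()        # feature columns (adjust to your data)
y = data.iloc[:, 19:-1].to_numpy()       # label columns, excluding the added "person" column
groups = data["person"].to_numpy()

logo = LeaveOneGroupOut()
for train_idx, test_idx in logo.split(X, y, groups=groups):
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]
    # fit any classifier here; each iteration holds out exactly one person's file
    print("held out:", groups[test_idx][0], "| train rows:", len(train_idx), "| test rows:", len(test_idx))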