A few days ago, I asked this question in the community and received the answer quickly from our fellows. The solution is great on Valgrind as well.
And based on his/her suggestion, I wrote the following function for split_dataset
as a part of the decision tree algorithm. Below is the code I wrote,
#include <stdio.h>
#include <stdlib.h>
void printArray(double array[], unsigned int size)
{
for (unsigned int i = 0; i < size; i) {
printf("%.3f ", array[i]);
}
printf("\n");
}
typedef struct
{
size_t length;
double **designMatrix_Y;
} DecisionTreeData;
DecisionTreeData *split_dataset(int index, //var index
double value, //best cutoff
int row, //nrows of design matrix, number of variables
double **designMatrix_Y) //design matrix (X) and response Y
{
// Buffers to hold rows of data as we are distributing rows based on the split.
double **leftDesignMatrix_Y = calloc(row, sizeof *leftDesignMatrix_Y);
double **rightDesignMatrix_Y = calloc(row, sizeof *rightDesignMatrix_Y);
size_t left_count = 0;
size_t right_count = 0;
for (size_t i = 0; i < row; i)
{
if (designMatrix_Y[index][i] <= value)
{
// Copy the row into the left half
leftDesignMatrix_Y[left_count] = designMatrix_Y[i];
left_count ;
}
else
{
// Copy the row into the right half
rightDesignMatrix_Y[right_count] = designMatrix_Y[i];
right_count ;
}
//realloc the memory for exact space
leftDesignMatrix_Y = realloc(leftDesignMatrix_Y, left_count * sizeof(double *));
rightDesignMatrix_Y = realloc(rightDesignMatrix_Y, right_count * sizeof(double *));
}
DecisionTreeData *data_split = malloc(sizeof(DecisionTreeData) * 2);
data_split[0] = (DecisionTreeData){left_count, leftDesignMatrix_Y};
data_split[1] = (DecisionTreeData){right_count, rightDesignMatrix_Y};
return data_split;
}
double *alloc_dvector(unsigned int length)
{
return malloc((length * (sizeof(double))));
}
void dealloc_dvector(double *array)
{
free((char *) array);
}
double **alloc_2dArray(unsigned int row, unsigned int col)
{
double **Array = (double **) malloc((size_t) ((row) * (sizeof(double *))));
for (unsigned int i = 0; i < row; i ) {
Array[i] = alloc_dvector(col);
}
return Array;
}
void dealloc_2dArray(double **Array, unsigned int row, unsigned int col)
{
for (unsigned int i = 0; i < row; i )
{
dealloc_dvector(Array[i]);
}
free((char *) Array);
}
int main()
{
double **designMatrix_Y = alloc_2dArray(4,4);
for(int i = 0; i<4; i )
{
for(int j= 0; j<4; j )
{
designMatrix_Y[i][j] = i*j;
}
}
DecisionTreeData *dataSplits = split_dataset(2,
2,
4,
designMatrix_Y);
//rename for convenience
size_t leftSize = (dataSplits[0]).length;
double **designMatrix_Y_L = (dataSplits[0]).designMatrix_Y;
double **designMatrix_Y_R = (dataSplits[1]).designMatrix_Y;
for(int i = 0; i<leftSize; i)
printArray(designMatrix_Y_L[i],4);
free(designMatrix_Y_L);
free(designMatrix_Y_R);
free(dataSplits);
dealloc_2dArray(designMatrix_Y,4,4);
return 0;
}
It works fine in my mac mini M1(doesn't support Valgrind, couldn't see the log) but crashed(segmentation fault) on Linux. Also, I have attached the error message from Valgrind with the code above.
==4777== LEAK SUMMARY:
==4777== definitely lost: 0 bytes in 0 blocks
==4777== indirectly lost: 0 bytes in 0 blocks
==4777== possibly lost: 0 bytes in 0 blocks
==4777== still reachable: 224 bytes in 8 blocks
==4777== suppressed: 0 bytes in 0 blocks
==4777==
==4777== ERROR SUMMARY: 5 errors from 4 contexts (suppressed: 0 from 0)
==4777==
==4777== 1 errors in context 1 of 4:
==4777== Invalid read of size 8
==4777== at 0x4006C7: printArray (test.c:7)
==4777== by 0x400A5E: main (test.c:117)
==4777== Address 0x0 is not stack'd, malloc'd or (recently) free'd
==4777==
==4777==
==4777== 1 errors in context 2 of 4:
==4777== Use of uninitialised value of size 8
==4777== at 0x4006C7: printArray (test.c:7)
==4777== by 0x400A5E: main (test.c:117)
==4777==
==4777==
==4777== 1 errors in context 3 of 4:
==4777== Invalid write of size 8
==4777== at 0x4007B4: split_dataset (test.c:36)
==4777== by 0x400A06: main (test.c:106)
==4777== Address 0x52052e8 is 0 bytes after a block of size 8 alloc'd
==4777== at 0x4C2C291: realloc (vg_replace_malloc.c:834)
==4777== by 0x400809: split_dataset (test.c:49)
==4777== by 0x400A06: main (test.c:106)
==4777==
==4777==
==4777== 2 errors in context 4 of 4:
==4777== Invalid write of size 8
==4777== at 0x4007E7: split_dataset (test.c:43)
==4777== by 0x400A06: main (test.c:106)
==4777== Address 0x5205380 is 0 bytes after a block of size 0 alloc'd
==4777== at 0x4C29EBD: malloc (vg_replace_malloc.c:306)
==4777== by 0x4C2C210: realloc (vg_replace_malloc.c:834)
==4777== by 0x400828: split_dataset (test.c:50)
==4777== by 0x400A06: main (test.c:106)
==4777==
==4777== ERROR SUMMARY: 5 errors from 4 contexts (suppressed: 0 from 0)
(END)
With the error code, I realized the error may arise from printArray()
. So, I ran it one more time without the for loop of printArray()
. The log from Valgrind changed as follows.
==4021== HEAP SUMMARY:
==4021== in use at exit: 0 bytes in 0 blocks
==4021== total heap usage: 15 allocs, 15 frees, 336 bytes allocated
==4021==
==4021== All heap blocks were freed -- no leaks are possible
==4021==
==4021== ERROR SUMMARY: 3 errors from 2 contexts (suppressed: 0 from 0)
==4021==
==4021== 1 errors in context 1 of 2:
==4021== Invalid write of size 8
==4021== at 0x4007B4: split_dataset (test.c:36)
==4021== by 0x400A06: main (test.c:106)
==4021== Address 0x52052e8 is 0 bytes after a block of size 8 alloc'd
==4021== at 0x4C2C291: realloc (vg_replace_malloc.c:834)
==4021== by 0x400809: split_dataset (test.c:49)
==4021== by 0x400A06: main (test.c:106)
==4021==
==4021==
==4021== 2 errors in context 2 of 2:
==4021== Invalid write of size 8
==4021== at 0x4007E7: split_dataset (test.c:43)
==4021== by 0x400A06: main (test.c:106)
==4021== Address 0x5205380 is 0 bytes after a block of size 0 alloc'd
==4021== at 0x4C29EBD: malloc (vg_replace_malloc.c:306)
==4021== by 0x4C2C210: realloc (vg_replace_malloc.c:834)
==4021== by 0x400828: split_dataset (test.c:50)
==4021== by 0x400A06: main (test.c:106)
==4021==
==4021== ERROR SUMMARY: 3 errors from 2 contexts (suppressed: 0 from 0)
(END)
So, can anyone help with this issue? Any suggestion? Thanks for your time in advance for my tedious description. Let me know if anything unclear.
gcc version 9.3.0
CodePudding user response:
The two reallocs don't belong where they are inside that for-loop. The should be after the for-loop when the final sizes of the resulting left/ride splits are firm. By resizing them on each iteration you're fixing them to their current size, which means, the next iteration will guarantee to be out-of-range by one slot (no matter which side is the target). An address sanitizer report will confirm this.
Both sides were intentionally over allocated to hold row
potential pointers. Those only need adjustment trimming once the dust settles after the split iterations are done.
This (marked HERE) below:
for (size_t i = 0; i < row; i)
{
if (designMatrix_Y[index][i] <= value)
{
// Copy the row into the left half
leftDesignMatrix_Y[left_count] = designMatrix_Y[i];
left_count ;
}
else
{
// Copy the row into the right half
rightDesignMatrix_Y[right_count] = designMatrix_Y[i];
right_count ;
}
// HERE HERE HERE HERE HERE
//realloc the memory for exact space
leftDesignMatrix_Y = realloc(leftDesignMatrix_Y, left_count * sizeof(double *));
rightDesignMatrix_Y = realloc(rightDesignMatrix_Y, right_count * sizeof(double *));
}
should be like this:
for (size_t i = 0; i < row; i)
{
if (designMatrix_Y[index][i] <= value)
{
// Copy the row into the left half
leftDesignMatrix_Y[left_count] = designMatrix_Y[i];
left_count ;
}
else
{
// Copy the row into the right half
rightDesignMatrix_Y[right_count] = designMatrix_Y[i];
right_count ;
}
}
//realloc the memory for exact space
leftDesignMatrix_Y = realloc(leftDesignMatrix_Y, left_count * sizeof(double *));
rightDesignMatrix_Y = realloc(rightDesignMatrix_Y, right_count * sizeof(double *));