I'm trying to code binary decision tree. The below code is a working example for data splitting as a part of the whole algorithm although it has some error detected by Valgrind.
As you may already know, the tree is composed of nodes. In every node, it carries all the qualified observations(rows) by the splitting condition. Continue on, the splitting relies on those rows as well.
And the best split condition can be found from a greedy search through all the variables and values, which means this data splitting step could execute hundreds of times to find the optimal splitting(evaluate by Gini index in classification problem). I thought it could be expensive when using memcpy
. The worst part is I have to free the memory for the copied rows after each splitting attempt.
So, I was thinking if there is a way to avoid using memcpy
to copy the entire dataset literally, instead, just reference the qualified rows of the original dataset in each splitting. In this way, no excessive memcpy
and free
are needed.
Question: To be specific for the below example, how to use one pointer to record the address of the qualified rows so that no memcpy
is needed and allow (easy or handy) access to qualified rows when performing a greedy search for optimal splitting.
Let me know if the question is not clear to you. Thanks so much for your time.
Any help and suggestion are welcome.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
void printArray(double array[], unsigned int size) {
for (unsigned int i = 0; i < size; i) {
printf("%.3f ", array[i]);
}
printf("\n");
}
double **alloc_2dArray(unsigned int row, unsigned int col)
{
double **Array = (double **) malloc(row * sizeof(double *));
for (unsigned int i = 0; i < row; i ) {
Array[i] = (double *) malloc(col * sizeof(double));
}
return Array;
}
void dealloc_2dArray(double **Array, unsigned int row, unsigned int col)
{
for (unsigned int i = 0; i < row; i )
{
free(Array[i]);
}
free(Array);
}
int main(){
double **data = alloc_2dArray(4,4);
double delta[4] = {1,0,1,0};
double **dataL = alloc_2dArray(4,4);
double **dataR = alloc_2dArray(4,4);
int i,j;
int k=0;
//data initialization
for(i=0; i<4; i ){
for(j=0; j<4; j ){
data[i][j] = k ;
}
}
int l=0, r=0;
for(i=0; i<4; i ){
if((int)delta[i] ==0){
memcpy(dataL[l ], data[i], 4*sizeof(double));
}else{
memcpy(dataR[r ], data[i], 4*sizeof(double));
}
}
for(i = l; i<4; i ){
free(dataL[i]);
}
printf("l and r are %d %d\n", l, r);
dataL = realloc(dataL, l*sizeof(double*));
for(i = r; i<4; i ){
free(dataR[i]);
}
dataR = realloc(dataR, r*sizeof(double*));
for(i=0; i<4; i ){
if(dataL[i] != NULL){
printArray(dataL[i],4);
}
if(dataR[i] != NULL){
printArray(dataR[i],4);
}
}
dealloc_2dArray(data, 4,4);
dealloc_2dArray(dataL, l, 4);
dealloc_2dArray(dataR, r, 4);
return 0;
}
CodePudding user response:
I think you're asking whether you can just retain the original data row pointers in the split matrices. The answer is yes, you can. Ownership comes into play, however.
For the example below (which has no memcpy
operations) both the dataL
and dataR
pointer beds are direct-allocated with no actual rows of data assigned to them. The rows are "borrowed" from the original data matrix.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
void printArray(double const array[], unsigned int size)
{
for (unsigned int i = 0; i < size; i)
{
printf("%-7.3f", array[i]);
}
printf("\n");
}
double **alloc_2dArray(unsigned int row, unsigned int col)
{
double **Array = malloc(row * sizeof(double *));
for (unsigned int i = 0; i < row; i )
{
Array[i] = (double *)malloc(col * sizeof(double));
}
return Array;
}
void dealloc_2dArray(double **Array, unsigned int row)
{
for (unsigned int i = 0; i < row; i )
{
free(Array[i]);
}
free(Array);
}
int main()
{
double **data = alloc_2dArray(4,4);
double delta[4] = {1, 0, 1, 0};
double **dataL = calloc(4, sizeof *dataL);
double **dataR = calloc(4, sizeof *dataR);
unsigned int i, j;
double k = 0;
// data initialization
for (i = 0; i < 4; i )
{
for (j = 0; j < 4; j )
{
data[i][j] = k;
k = 1;
}
}
printf("data[]\n");
for (i=0; i<4; i)
printArray(data[i], 4);
fputc('\n', stdout);
unsigned int l = 0, r = 0;
for (i = 0; i < 4; i )
{
if ((int)delta[i] == 0)
{
dataL[l ] = data[i];
}
else
{
dataR[r ] = data[i];
}
}
printf("l=%u r=%u\n", l, r);
dataL = realloc(dataL, l * sizeof(double *));
dataR = realloc(dataR, r * sizeof(double *));
printf("dataL[]\n");
for (i=0; i<l; i)
printArray(dataL[i], 4);
fputc('\n', stdout);
printf("dataR[]\n");
for (i=0; i<r; i)
printArray(dataR[i], 4);
fputc('\n', stdout);
free(dataL);
free(dataR);
dealloc_2dArray(data,4);
return 0;
}
Output
data[]
0.000 1.000 2.000 3.000
4.000 5.000 6.000 7.000
8.000 9.000 10.000 11.000
12.000 13.000 14.000 15.000
l=2 r=2
dataL[]
4.000 5.000 6.000 7.000
12.000 13.000 14.000 15.000
dataR[]
0.000 1.000 2.000 3.000
8.000 9.000 10.000 11.000
Regarding actual ownership of those row pointers. Only you know what is appropriate there. If the original data
matrix is to continue to own them, you must take care to ensure that data
and its rows are not destroyed while dataL
and dataR
are still in use. Likewise, if dataR
and dataL
are to take ownership, then you should not free those rows when freeing data
(but still free the base pointer bed of data
), and have dataL
and dataR
free'ing be responsible for freeing not only their base pointer beds, but the rows as well.
Anyway, there you go.