I need to calculate and compare execution time of multiplication of 2 matrices in 3 different sizes (100 * 100 , 1000 * 1000 and 10000 * 10000) in C programming language. I wrote the following simple code to do that for 1000 * 1000 and I got the execution time
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
int main()
{
int r1 = 1000, c1 = 1000, r2 = 1000, c2 = 1000, i, j, k;
// Dynamic allocation.
double(*a)[r1][c1] = malloc(sizeof *a);
double(*b)[r2][c2] = malloc(sizeof *b);
double(*result)[r1][c2] = malloc(sizeof *result);
// Storing elements of first matrix.
for (i = 0; i < r1; i)
{
for (j = 0; j < c1; j)
{
(*a)[i][j] = rand() / RAND_MAX;
}
}
// Storing elements of second matrix.
for (i = 0; i < r2; i)
{
for (j = 0; j < c2; j)
{
(*b)[i][j] = rand() / RAND_MAX;
}
}
// Initializing all elements of result matrix to 0
for (i = 0; i < r1; i)
{
for (j = 0; j < c2; j)
{
(*result)[i][j] = 0;
}
}
clock_t begin1 = clock();
// Multiplying matrices a and b and
// storing result in result matrix
for (i = 0; i < r1; i)
for (j = 0; j < c2; j)
for (k = 0; k < c1; k)
{
(*result)[i][j] = (*a)[i][k] * (*b)[k][j];
}
clock_t end1 = clock();
double time_taken = (double)(end1 - begin1) / CLOCKS_PER_SEC;
printf("\n function took %f seconds to execute \n", time_taken);
return 0;
}
And now I want to repeat this part for two other sizes and get the result like this at the end of my program with one run:
the execution time for 100 * 100 is 1 second
the execution time for 1000 * 1000 is 2 seconds
the execution time for 10000 * 10000 is 3 seconds
What is the best solution for that? When I repeat this part for 10000 * 10000 after 1000 * 1000 I got the segmentation fault error.
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
int main()
{
int r1 = 1000, c1 = 1000, r2 = 1000, c2 = 1000, i, j, k;
// Dynamic allocation.
double(*a)[r1][c1] = malloc(sizeof *a);
double(*b)[r2][c2] = malloc(sizeof *b);
double(*result)[r1][c2] = malloc(sizeof *result);
// Storing elements of first matrix.
for (i = 0; i < r1; i)
{
for (j = 0; j < c1; j)
{
(*a)[i][j] = rand() / RAND_MAX;
}
}
// Storing elements of second matrix.
for (i = 0; i < r2; i)
{
for (j = 0; j < c2; j)
{
(*b)[i][j] = rand() / RAND_MAX;
}
}
// Initializing all elements of result matrix to 0
for (i = 0; i < r1; i)
{
for (j = 0; j < c2; j)
{
(*result)[i][j] = 0;
}
}
clock_t begin1 = clock();
// Multiplying matrices a and b and
// storing result in result matrix
for (i = 0; i < r1; i)
for (j = 0; j < c2; j)
for (k = 0; k < c1; k)
{
(*result)[i][j] = (*a)[i][k] * (*b)[k][j];
}
clock_t end1 = clock();
double time_taken = (double)(end1 - begin1) / CLOCKS_PER_SEC;
printf("\n \nfunction took %f seconds to execute \n",
time_taken);
free(a);
free(b);
free(result);
r1 = 10000, c1 = 10000, r2 = 10000, c2 = 10000;
printf("\n run second one for %d \n",r1);
// Storing elements of first matrix.
for (i = 0; i < r1; i)
{
for (j = 0; j < c1; j)
{
(*a)[i][j] = rand() / RAND_MAX;
}
}
// Storing elements of second matrix.
for (i = 0; i < r2; i)
{
for (j = 0; j < c2; j)
{
(*b)[i][j] = rand() / RAND_MAX;
}
}
// Initializing all elements of result matrix to 0
for (i = 0; i < r1; i)
{
for (j = 0; j < c2; j)
{
(*result)[i][j] = 0;
}
}
begin1 = clock();
// Multiplying matrices a and b and
// storing result in result matrix
for (i = 0; i < r1; i)
for (j = 0; j < c2; j)
for (k = 0; k < c1; k)
{
(*result)[i][j] = (*a)[i][k] * (*b)[k][j];
}
end1 = clock();
time_taken = (double)(end1 - begin1) / CLOCKS_PER_SEC;
printf("\n second function took %f seconds to execute \n",
time_taken);
free(a);
free(b);
free(result);
return 0;
}
CodePudding user response:
A simplified version of your program:
...
int main()
{
int r1 = 1000, c1 = 1000, r2 = 1000, c2 = 1000, i, j, k;
// Dynamic allocation.
double(*a)[r1][c1] = malloc(sizeof *a);
double(*b)[r2][c2] = malloc(sizeof *b);
double(*result)[r1][c2] = malloc(sizeof *result);
...
free(a);
free(b);
free(result);
r1 = 10000, c1 = 10000, r2 = 10000, c2 = 10000;
for (i = 0; i < r1; i)
for (j = 0; j < c1; j)
(*a)[i][j] = rand() /RAND_MAX; // KABOOM !
...
}
A quick but crucial information about about VLA arrays. Name "variable" in "variable-length-array" means that the size is stored in a variable, not that the size is variable. This variable is hidden and can be only read via sizeof
operator.
The size of array is bound to it's type, not to its value. Therefore the dimensions of VLA type (and object) cannot change, no matter if the object is dynamic or automatic.
The line:
double(*a)[r1][c1] = malloc(sizeof *a);
it interpreted as:
typedef double __hidden_type[r1][c1];
__hidden_type* a = malloc(sizeof *a);
... changes of r1 or c1 do not affect sizeof(__hidden_type)
The sizes are bound to the types when the types are defined. After that the types are immutable.
Therefore changing the r1
does not change the size of *a
. You need to create a new a
(or rather its type) and allocate memory for this new *a
.
I suggest moving the whole test to a function that takes r1
, r2
, c1
and c2
as parameters. The arrays would be local to the function.
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
void bench(int r1, int c1, int r2, int c2) {
int i, j, k;
// Dynamic allocation.
double(*a)[r1][c1] = malloc(sizeof *a);
double(*b)[r2][c2] = malloc(sizeof *b);
double(*result)[r1][c2] = malloc(sizeof *result);
// Storing elements of first matrix.
for (i = 0; i < r1; i)
{
for (j = 0; j < c1; j)
{
(*a)[i][j] = rand() /RAND_MAX;
}
}
// Storing elements of second matrix.
for (i = 0; i < r2; i)
{
for (j = 0; j < c2; j)
{
(*b)[i][j] = rand()/ RAND_MAX;
}
}
// Initializing all elements of result matrix to 0
for (i = 0; i < r1; i)
{
for (j = 0; j < c2; j)
{
(*result)[i][j] = 0;
}
}
clock_t begin1 = clock();
// Multiplying matrices a and b and
// storing result in result matrix
for (i = 0; i < r1; i)
for (j = 0; j < c2; j)
for (k = 0; k < c1; k)
{
(*result)[i][j] = (*a)[i][k] * (*b)[k][j];
}
clock_t end1 = clock();
double time_taken = (double)(end1 - begin1) / CLOCKS_PER_SEC;
printf("\n \nfunction took %f seconds to execute \n", time_taken);
free(a);
free(b);
free(result);
}
int main()
{
bench(1000, 1000, 1000, 1000);
bench(2000, 2000, 2000, 2000);
}
I've reduced the size from 10000 to 2000 to get results in reasonable time. On my machine I got:
function took 1.966788 seconds to execute
function took 37.370633 seconds to execute
Note that the function is very cache unfriendly.
for (i = 0; i < r1; i)
for (j = 0; j < c2; j)
for (k = 0; k < c1; k)
(*result)[i][j] = (*a)[i][k] * (*b)[k][j];
On every iteration of k
you get a cache miss when accessing (*b)[k][j]
. Try swapping the j
and k
loops:
for (i = 0; i < r1; i)
for (k = 0; k < c1; k)
for (j = 0; j < c2; j)
(*result)[i][j] = (*a)[i][k] * (*b)[k][j];
Now when increasing j
then (*result)[i][j]
and (*b)[k][j]
are likely in cache.
On my machine this trivial change gave 10x speedup:
function took 0.319594 seconds to execute
function took 3.829459 seconds to execute
CodePudding user response:
There are multiple problems in your code:
- you free the matrices and preform a new benchmark, storing data to invalid pointers... this has undefined behavior, in your case a segmentation fault.
- the allocation code is specific for the initial matrix size, you cannot reallocate the matrices for a different size in the
main()
function. You should move the code to a separate function taking the matrix sizes as arguments and call this function multiple times. - the initialization values
rand() / RAND_MAX
are almost always zero because integer arithmetics is used for this division. You should use(*a)[i][j] = rand() / (double)RAND_MAX;
Here is a modified version (similar to tstanisl's):
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
void test(int r1, int c1, int r2, int c2) {
int i, j, k;
// Dynamic allocation.
double(*a)[r1][c1] = malloc(sizeof *a);
double(*b)[r2][c2] = malloc(sizeof *b);
double(*result)[r1][c2] = malloc(sizeof *result);
// Storing elements of first matrix.
for (i = 0; i < r1; i) {
for (j = 0; j < c1; j) {
(*a)[i][j] = rand() / (double)RAND_MAX;
}
}
// Storing elements of second matrix.
for (i = 0; i < r2; i) {
for (j = 0; j < c2; j) {
(*b)[i][j] = rand() / (double)RAND_MAX;
}
}
// Initializing all elements of result matrix to 0
for (i = 0; i < r1; i) {
for (j = 0; j < c2; j) {
(*result)[i][j] = 0;
}
}
clock_t begin1 = clock();
// Multiplying matrices a and b and
// storing result in result matrix
// using cache friendly index order
for (i = 0; i < r1; i) {
for (k = 0; k < c1; k) {
for (j = 0; j < c2; j) {
(*result)[i][j] = (*a)[i][k] * (*b)[k][j];
}
}
}
clock_t end1 = clock();
double time_taken = (double)(end1 - begin1) / CLOCKS_PER_SEC;
printf("M(%d,%d) x M(%d,%d) took %f seconds to execute\n",
r1, c1, r2, c2, time_taken);
free(a);
free(b);
free(result);
}
int main() {
test(100, 100, 100, 100);
test(1000, 1000, 1000, 1000);
test(2000, 2000, 2000, 2000);
test(3000, 3000, 3000, 3000);
test(4000, 4000, 4000, 4000);
return 0;
}
Output:
M(100,100) x M(100,100) took 0.000347 seconds to execute
M(1000,1000) x M(1000,1000) took 0.616177 seconds to execute
M(2000,2000) x M(2000,2000) took 5.017987 seconds to execute
M(3000,3000) x M(3000,3000) took 17.703356 seconds to execute
M(4000,4000) x M(4000,4000) took 43.825951 seconds to execute
The time complexity of this simplistic implementation is O(N3), which is consistent with the above timings. Given enough RAM (2.4 GB), multiplying matrices with 10000 rows and columns would take a bit more than 10 minutes.
CodePudding user response:
And now I want to repeat this part for two other sizes and get the result like this at the end of my program with one run: the execution time for 100 * 100 is 1 second the execution time for 1000 * 1000 is 2 seconds the execution time for 10000 * 10000 is 3 seconds
I simply cannot believe that you have multiplied two 10,000 x 10,000 matrices in 3 seconds. What computer are you running that experiment in? Not, with that algorithm and using only one core. Probably you are optimizing your compilation (with default flag -O2
and the whole algorithm has been evicted from the compiler output, as you don't use the matrix after the calculation, so it is innecessary to lose time in a loop for nothing) first, compile your matrix, or print after the computation one element of the result matrix, so the compiler cannot evict the calculation. But don't say your algorithm is multiplying two matrices 10,000 rows and columns in 3 sec.
Allocating a matrix of 10,000x10,000 double
quantities, requires a lot of memory. It's 100,000,000 of double
entries, wich gets over 800Mb in a single malloc It's very possible that your laptop can handle this once.... but don't do many of these allocations, as you will probably will be over the limits of malloc(3)
(or your system).
More when you need at least two of these allocations, or even more, as you said you want to repeat the calculations.
Have you trying to scale yor problem to 100,000 by 100,000 matrices?
When you repeat, it's not warranted that there's no fragmentation in the heap maintained by malloc()
, so as you are requesting one gigabyte of continuous memory per matrix, it is probable that malloc(3)
runs out of memory in the last mallocs. I'd suggest you to do the three tests in different programs (separate), and don't start any other program (like the browser or your favourite desktop environment while running a matrix multiplication involving 200,000,000 numbers.
Anyway, your program probably start doing swapping if you don't fine control your execution environment, just trashing all the efficiency measurements you are trying to do.
Another thing is that you can probably could incurr in a process limit (if your administrator has established a maximum core memory limmit for your process) and you don't check the result from malloc(3)
for proper allocation.
Note:
I have not been able to multiply the two 10,000 by 10,000 matrices in my machine (Pentium Core duo with 6Gb RAM) because the program starts swaping and becomes incredible slow. BTW, be careful, and don't compile your program with optimization (even -O1 makes the matrix multiplication to be eliminated completely from the code, as there's no use of the product, so the complete algorithm is eliminated from the output code by the optimization code in the compiler) I'll leave it running and update my answer if I get some result.
(edit)
$ matrix_mult
function took 9364.303443 seconds to execute
$
It took 2h 36m 6.303443s to execute in my system. This is more approximate to the complexity of the problem (as your approximation is barely linear) you need to compile it without optimization or at least print one element of the resultant matrix, if you want the product being calculated.
Below is the code I'm testing, if you want to test it (I have modified it a bit to make it more readable, to use better timestamps, and to avoid the C initialization, as it can be done on the fly (read the comments in the code):
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#define N 10000
int main()
{
// Dynamic allocation.
#define A (*a)
#define B (*b)
#define R (*result)
double A[N][N] = malloc(sizeof *a);
double B[N][N] = malloc(sizeof *b);
double R[N][N] = malloc(sizeof *result);
printf("Initializing both matrices\n");
// Storing elements of first (and second) matrices.
for (int row = 0; row < N; row) {
for (int col = 0; col < N; col) {
A[row][col] = rand() / (double)RAND_MAX; // matrix A
B[row][col] = rand() / (double)RAND_MAX; // matrix B
}
}
// Storing elements of second matrix. (done above)
// Initializing all elements of result matrix to 0
// (not needed, see below)
printf("Starting multiplication\n");
struct timespec start_ts; // start timestamp (secs & nsecs).
int res = clock_gettime(
CLOCK_PROCESS_CPUTIME_ID,
&start_ts); // cpu time only.
if (res < 0) {
perror("clock_gettime");
exit(EXIT_FAILURE);
}
// Multiplying matrices a and b and
// storing result in result matrix
for (int row = 0; row < N; row) {
for (int col = 0; col < N; col) {
double aux = 0.0;
for (int k = 0; k < N; k) {
aux = A[row][k] * B[k][col];
}
// why to involve calculating the address of the affected
// cell at every inner loop iteration???
R[row][col] = aux;
}
}
struct timespec end_ts;
res = clock_gettime(
CLOCK_PROCESS_CPUTIME_ID,
&end_ts);
if (res < 0) {
perror("clock_gettime");
exit(EXIT_FAILURE);
}
bool carry = start_ts.tv_nsec > end_ts.tv_nsec;
struct timespec diff_time;
diff_time.tv_sec = end_ts.tv_sec - start_ts.tv_sec;
diff_time.tv_nsec = end_ts.tv_nsec - start_ts.tv_nsec;
if (carry) {
diff_time.tv_sec--;
diff_time.tv_nsec = 1000000000;
}
printf("\n function took %ld.ld seconds to execute \n",
diff_time.tv_sec, diff_time.tv_nsec / 1000);
return 0;
}