This function f finds the common elements of two arrays and returns them in a result array. I am using four loops to accomplish this task, which I feel is not the best use of loops. Another problem is: how can I determine the size of the returned array so that my loop stays within bounds?
here is the code
#include <stdio.h>
#include <stdlib.h>
int *f(int first[], int second[], int size_first, int size_second);
// Demo driver: intersects two small arrays with f() and prints the result.
int main(void) {
    int arr1[] = {1, 8, 3, 2, 6};
    int arr2[] = {2, 6, 1};
    int size1 = sizeof(arr1) / sizeof(arr1[0]);
    int size2 = sizeof(arr2) / sizeof(arr2[0]);
    // f() does not report the length of its result; for these particular
    // inputs the common values are 1, 2 and 6, hence three elements.
    const int common_count = 3;

    int *intersection = f(arr1, arr2, size1, size2);
    if (intersection == NULL) {
        return EXIT_FAILURE;
    }

    for (int i = 0; i < common_count; i++) {
        printf("%d ", intersection[i]);
    }

    // the array returned by f() is heap-allocated and owned by the caller
    free(intersection);
    return 0;
}
// function to find common elements in 2 arrays
// Find the elements common to two arrays.
//
// Every pair (first[i], second[j]) that compares equal contributes one
// entry to the result, so duplicated values may appear more than once.
// The entries are ordered by their position in `first`.
//
// Returns a heap-allocated array that the caller must free(), or NULL if
// the allocation fails. The length of the result is not reported; it is
// the number of equal pairs described above.
int *f(int first[], int second[], int size_first, int size_second) {
    // first pass: count the equal pairs so the result can be sized exactly
    int count = 0;
    for (int i = 0; i < size_first; i++) {
        for (int j = 0; j < size_second; j++) {
            if (first[i] == second[j]) {
                count++;
            }
        }
    }

    // malloc(0) may legally return NULL, which would be indistinguishable
    // from an allocation failure, so always request at least one slot
    size_t slots = count > 0 ? (size_t)count : 1;
    int *common_elements = malloc(slots * sizeof *common_elements);
    if (common_elements == NULL) {
        return NULL;
    }

    // second pass: store the common elements in the new memory location
    int k = 0;
    for (int i = 0; i < size_first; i++) {
        for (int j = 0; j < size_second; j++) {
            if (first[i] == second[j]) {
                common_elements[k] = first[i];
                k++;
            }
        }
    }

    // ownership passes to the caller; freeing here would hand back a
    // dangling pointer
    return common_elements;
}
CodePudding user response:
If you are allowed to waste some memory, note that the intersection cannot have cardinality larger than the number of elements in the smaller set. Therefore, you can allocate more memory than you might need and avoid having to count first and allocate later.
Or, you can realloc as you go.
In general, you need a good data structure for checking set membership more quickly than scanning an entire array although for small sizes which fit in various caches, the linear scan will not perform too shabbily either.
For larger sets, however, you'll want to load the larger of the sets into an AVL tree or Scapegoat tree.
For really large data sets, you'll need to look into Bloom filters and related data structures depending on the use case.
I am including below the most naive improvement in your code which still has the nested loop and wastes memory up to the size of the smaller set to avoid counting common elements first.
#include <stdlib.h>
#include <stdint.h>
#include <stdio.h>
// TODO: What about duplicates in smaller set?
/*
 * Compute the intersection of two int arrays with a nested linear scan.
 *
 * The smaller array is scanned element by element; each element that also
 * occurs in the larger array is appended to the result once (duplicates
 * inside the smaller array are therefore kept).
 *
 * first, second            input arrays (not modified)
 * size_first, size_second  their element counts
 * n                        out: number of elements stored in the result
 *
 * Returns a heap-allocated array with room for every element of the
 * smaller input; the caller must free() it. Terminates the process with
 * EXIT_FAILURE if the allocation fails.
 */
int *
int_set_intersection(
    const int *first,
    const int *second,
    const size_t size_first,
    const size_t size_second,
    size_t *n
)
{
    size_t K = 0; // number of common elements
    const int is_first_smaller = (size_first < size_second);

    // Done this way so I can declare variables as consts
    const int *set_smaller = is_first_smaller ? first : second;
    const int *set_larger = is_first_smaller ? second : first;
    const size_t size_smaller = is_first_smaller ? size_first : size_second;
    const size_t size_larger = is_first_smaller ? size_second : size_first;

    // malloc(0) may return NULL without being out of memory, so an empty
    // input must not be routed into the failure branch below
    const size_t slots = size_smaller ? size_smaller : 1;
    int *common = malloc(slots * sizeof(*common));

    if (!common) {
        fprintf(stderr, "Failed to allocate memory for %zu ints\n", size_smaller);
        perror("Cannot allocate memory for common elements");
        exit(EXIT_FAILURE);
    }

    for (size_t i = 0; i < size_smaller; ++i) {
        for (size_t j = 0; j < size_larger; ++j) {
            if (set_smaller[i] == set_larger[j]) {
                common[K] = set_smaller[i];
                ++K;
                break; // one hit in the larger set is enough
            }
        }
    }

    *n = K;

    return common;
}
/*
 * Print a set as "{ a, b, c }" followed by a newline.
 *
 * set  elements to print
 * n    number of elements; 0 prints "{ }"
 * f    destination stream, or NULL for stdout
 */
void
int_set_print(const int *set, size_t n, FILE *f)
{
    FILE *out = f ? f : stdout;

    fputs("{ ", out);

    // Comparing i + 1 against n (rather than i against n - 1) keeps the
    // loop correct for n == 0, where n - 1 would wrap around.
    for (size_t i = 0; i < n; ++i) {
        fprintf(out, "%d%s", set[i], (i + 1 < n) ? ", " : " ");
    }

    fputs("}\n", out);
}
/* Demo driver: intersect two small arrays and print the result to stdout. */
int
main(void) {
    int arr1[] = {1, 8, 3, 2, 6};
    int arr2[] = {2, 5, 1};
    size_t n = 0;

    /* Not const-qualified: the buffer is handed to free() below, and
     * free() takes a pointer to non-const. */
    int *intersection = int_set_intersection(
        arr1,
        arr2,
        sizeof(arr1)/sizeof(arr1[0]),
        sizeof(arr2)/sizeof(arr2[0]),
        &n
    );

    int_set_print(intersection, n, NULL);

    free(intersection); // not really needed, but good hygiene

    return 0;
}
CodePudding user response:
For larger arrays, one option is to sort the contents first to make it easier to check for common elements, as shown in the code below. If the original array contents cannot be changed, first copy them into dynamically allocated memory. Dynamically allocated memory is also needed to hold the list of common elements, but that can use the same storage as one of the copies.
OP's original function returns a pointer to dynamically allocated memory containing the array of common elements, but does not indicate the length of the array. I added a parameter to return the length.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
int *f(int first[], int second[], int size_first, int size_second, int *size_common);
// Demo driver: intersects two small arrays with f() and prints the result.
int main(void) {
    int arr1[] = {1, 8, 3, 2, 6};
    int arr2[] = {2, 6, 1};
    int size1 = sizeof(arr1) / sizeof(arr1[0]);
    int size2 = sizeof(arr2) / sizeof(arr2[0]);
    int size_common;

    int *intersection = f(arr1, arr2, size1, size2, &size_common);

    // f() reports failure through a negative size_common
    if (size_common < 0) {
        fputs("f: failed to compute the common elements\n", stderr);
        return EXIT_FAILURE;
    }

    for (int i = 0; i < size_common; i++) {
        printf("%d ", intersection[i]);
    }

    // free(NULL) is a no-op, so this is also fine for an empty result
    free(intersection);
    return 0;
}
// qsort-style comparator for ints: yields -1, 0 or 1 when the first value
// is respectively less than, equal to, or greater than the second.
static int cmp_int(const void *ap, const void *bp) {
    const int *lhs = ap;
    const int *rhs = bp;

    if (*lhs < *rhs) {
        return -1;
    }
    if (*lhs > *rhs) {
        return 1;
    }
    return 0;
}
// function to find common elements in 2 arrays
// function to find common elements in 2 arrays
//
// Both inputs are copied and the copies sorted, so the caller's arrays are
// left untouched. The sorted copies are then walked in parallel; each pair
// of equal elements contributes one entry to the result, which is therefore
// in ascending order.
//
// On return *size_common holds the number of common elements, or -1 on
// error (negative size or allocation failure). The function returns a
// heap-allocated array the caller must free(), or NULL when there are no
// common elements or an error occurred.
int *f(int first[], int second[], int size_first, int size_second,
       int *size_common) {
    if (size_first < 0 || size_second < 0) {
        *size_common = -1; // use -1 to report error
        return NULL;
    }
    if (size_first == 0 || size_second == 0) {
        // Nothing can be common; returning here also avoids malloc(0),
        // whose NULL result would be mistaken for an allocation error.
        *size_common = 0;
        return NULL;
    }

    // allocate memory for local copies of the arrays
    int *copy1 = malloc((size_t)size_first * sizeof *copy1);
    int *copy2 = malloc((size_t)size_second * sizeof *copy2);
    if (!copy1 || !copy2) {
        // allocation error
        free(copy1);
        free(copy2);
        *size_common = -1; // use -1 to report error
        return NULL;
    }

    // copy the arrays
    memcpy(copy1, first, (size_t)size_first * sizeof *copy1);
    memcpy(copy2, second, (size_t)size_second * sizeof *copy2);

    // sort the copies in ascending order
    qsort(copy1, (size_t)size_first, sizeof *copy1, cmp_int);
    qsort(copy2, (size_t)size_second, sizeof *copy2, cmp_int);

    // find common elements
    int idx1 = 0;
    int idx2 = 0;
    int count = 0;
    while (idx1 < size_first && idx2 < size_second) {
        if (copy1[idx1] < copy2[idx2]) {
            idx1++;
        } else if (copy1[idx1] > copy2[idx2]) {
            idx2++;
        } else {
            // common element found!
            // use copy1[] to store common elements; count never exceeds
            // idx1, so this only overwrites entries already consumed
            copy1[count] = copy1[idx1];
            count++;
            idx1++;
            idx2++;
        }
    }

    // common elements are in copy1[].
    // finished with copy2, so free it.
    free(copy2);

    if (count == 0) {
        // no common elements
        free(copy1);  // free the memory
        copy1 = NULL; // and make the function return NULL
    } else {
        // try to reduce memory for common elements
        int *shrunk = realloc(copy1, (size_t)count * sizeof *copy1);
        if (shrunk) {
            // reallocation successful
            copy1 = shrunk;
        } // else, never mind, copy1 is still valid
    }

    // return the common elements
    *size_common = count;
    return copy1;
}
CodePudding user response:
If your arrays are of comparable elements (you use integers, which are comparable), the best way in my opinion is to sort both arrays and traverse both in parallel, comparing the current element on each side. If one element is lower, advance its pointer, leaving the other waiting. If there's a match (they are equal), you can mark it (more on this later) and advance both pointers, until you reach the end of one array (the shortest). You will get the marks on the matching positions, but if you reorder the array, exchanging the found element with the first of the yet-unmatched elements, you will have all matching elements in the first positions of both arrays, letting you return only the number of matches from the function, with the matches themselves in the first positions of both arrays.
The complexity of this algorithm should be O(n*log(n)) (because of the quicksorts) if you use quicksort, plus O(n) (which doesn't affect the final O) for the matching, so O(n*log(n)) should be the big O complexity, as a general case. Below is a sample code, with a run:
comp.c
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
/* Element count of an array object. Only meaningful for true arrays: for a
 * pointer (including an array function parameter) sizeof(arr) is the size
 * of the pointer, not of the array. */
#define N(arr) (sizeof(arr)/sizeof((arr)[0]))
/* Exchange the two ints pointed to; a no-op when both pointers are equal. */
void swap(int *ref_a, int *ref_b)
{
    if (ref_a != ref_b) {
        const int held = *ref_b;
        *ref_b = *ref_a;
        *ref_a = held;
    }
}
/*
 * qsort-style comparator for ints.
 *
 * Returns a negative, zero or positive value when *_a is respectively less
 * than, equal to, or greater than *_b. The result is built from two
 * comparisons rather than a subtraction, because `*a - *b` overflows
 * (undefined behaviour) when the operands are far apart, e.g. INT_MIN and 1.
 */
int int_cmp(const void *_a, const void *_b)
{
    const int lhs = *(const int *)_a;
    const int rhs = *(const int *)_b;

    return (lhs > rhs) - (lhs < rhs);
}
/*
 * Print a printf-style formatted label followed by the array contents as
 * "[e0, e1, ...]" and a newline, all on stdout.
 *
 * v, v_sz  array to print and its element count (nothing between the
 *          label and "]" is printed when v_sz <= 0)
 * fmt      printf format for the label; its arguments follow
 */
void print(int v[], int v_sz, const char *fmt, ...)
{
    va_list p;
    va_start(p, fmt);
    vprintf(fmt, p);
    va_end(p);

    /* the opening bracket doubles as the separator before element 0 */
    const char *sep = "[";
    for (int i = 0; i < v_sz; i++) {
        printf("%s%d", sep, v[i]);
        sep = ", ";
    }
    printf("]\n");
}
/*
 * Find the elements common to a[] and b[] by sorting both arrays in place
 * and walking them in parallel.
 *
 * Each matched pair is swapped to the front of its array, so on return the
 * first `result` positions of both a[] and b[] hold the matched values.
 * Progress is traced on stdout through print().
 *
 * Returns the number of matches.
 */
int find_matches(int a[], int b[], int a_sz, int b_sz)
{
    print(a, a_sz, "a(unsorted)");
    print(b, b_sz, "b(unsorted)");
    qsort(a, a_sz, sizeof a[0], int_cmp);
    qsort(b, b_sz, sizeof b[0], int_cmp);
    print(a, a_sz, "a(sorted)");
    print(b, b_sz, "b(sorted)");

    int i = 0; /* number of matches so far == next free front slot */
    for (int i_a = 0, i_b = 0; i_a < a_sz && i_b < b_sz;) {
        if (a[i_a] < b[i_b]) {
            i_a++;
            continue;
        } else if (a[i_a] > b[i_b]) {
            i_b++;
            continue;
        }
        /* a[i_a] == b[i_b] */
        swap(&a[i_a], &a[i]);
        swap(&b[i_b], &b[i]);
        print(a, a_sz, "after #%d, a:", i);
        print(b, b_sz, "after #%d, b:", i);
        i_a++;
        i_b++;
        i++;
    }
    return i;
}
/* Demo driver: run find_matches() on two sample arrays and list the matches. */
int main()
{
    int arr1[] = {1, 8, 3, 2, 6, 7};
    int arr2[] = {2, 6, 1, 7, 4, 1, 9, 6};
    int size1 = N(arr1);
    int size2 = N(arr2);

    int match = find_matches(arr1, arr2, size1, size2);

    /* the matched values were moved to the front of arr1 */
    for (int i = 0; i < match; i++) {
        printf("Match #%d: %d\n", i, arr1[i]);
    }
}
It will produce:
$ comp
a(unsorted)[1, 8, 3, 2, 6, 7]
b(unsorted)[2, 6, 1, 7, 4, 1, 9, 6]
a(sorted)[1, 2, 3, 6, 7, 8]
b(sorted)[1, 1, 2, 4, 6, 6, 7, 9]
after #0, a:[1, 2, 3, 6, 7, 8]
after #0, b:[1, 1, 2, 4, 6, 6, 7, 9]
after #1, a:[1, 2, 3, 6, 7, 8]
after #1, b:[1, 2, 1, 4, 6, 6, 7, 9]
after #2, a:[1, 2, 6, 3, 7, 8]
after #2, b:[1, 2, 6, 4, 1, 6, 7, 9]
after #3, a:[1, 2, 6, 7, 3, 8]
after #3, b:[1, 2, 6, 7, 1, 6, 4, 9]
Match #0: 1
Match #1: 2
Match #2: 6
Match #3: 7
$ _
A good interface is to switch in both algorithms the matched elements with the first of the non-yet-matched elements in both arrays, so in this way you can return an integer (the one you use to know the start of the non-yet-matched elements) that tells you the number of matched elements, and you will get them from any of the two arrays.
If the elements are not comparable, and they can only be compared for equality, then you have to compare each element with every other for a match, take them off the arrays (this can be done by swapping them with the first of the not-yet-matched elements and advancing the pointers), and start again with the reduced versions of them. One way of doing this is, when you find a match, to exchange them with the first, second, third elements of each array, and use a variation of the above algorithm (you reorder as you match). In this case you compare at first n*m (but not all), when you get a match, (n-1)*(m-1), ... and so on until the last comparison, in which you fail all comparisons, up to (m-k)*(n-k). This is, on average, m*n/2 + (m-1)*(n-1)/2 + ... + (m-k)*(n-k), something in the range of m(m-1)*n(n-1)/k^2, which is O(m^2*n^2):
comp2.c
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
/* Element count of an array object. Only meaningful for true arrays: for a
 * pointer (including an array function parameter) sizeof(arr) is the size
 * of the pointer, not of the array. */
#define N(arr) (sizeof(arr)/sizeof((arr)[0]))
/* Swap the ints behind the two pointers; does nothing if they alias. */
void swap(int *ref_a, int *ref_b)
{
    int *const lhs = ref_a;
    int *const rhs = ref_b;

    if (lhs != rhs) {
        const int saved = *lhs;
        *lhs = *rhs;
        *rhs = saved;
    }
}
/*
 * qsort-style comparator for ints.
 *
 * Returns a negative, zero or positive value when *_a is respectively less
 * than, equal to, or greater than *_b. Two comparisons are used instead of
 * `*a - *b`, which overflows (undefined behaviour) for operands that are
 * far apart, e.g. INT_MIN and 1.
 */
int int_cmp(const void *_a, const void *_b)
{
    const int lhs = *(const int *)_a;
    const int rhs = *(const int *)_b;

    return (lhs > rhs) - (lhs < rhs);
}
/*
 * Print a printf-style formatted label followed by the array contents as
 * "[e0, e1, ...]" and a newline, all on stdout.
 *
 * v, v_sz  array to print and its element count (nothing between the
 *          label and "]" is printed when v_sz <= 0)
 * fmt      printf format for the label; its arguments follow
 */
void print(int v[], int v_sz, const char *fmt, ...)
{
    va_list p;
    va_start(p, fmt);
    vprintf(fmt, p);
    va_end(p);

    /* the opening bracket doubles as the separator before element 0 */
    const char *sep = "[";
    for (int i = 0; i < v_sz; i++) {
        printf("%s%d", sep, v[i]);
        sep = ", ";
    }
    printf("]\n");
}
/*
 * Find the elements common to a[] and b[] using equality tests only (no
 * ordering is required of the elements).
 *
 * Positions [0, i) of both arrays hold the pairs matched so far. Each pass
 * searches the unmatched tails a[i..] and b[i..] for an equal pair, swaps
 * it into slot i of both arrays, and restarts the search on the shorter
 * tails. The function ends when a full pass finds no pair.
 *
 * Returns the number of matches; the matched values are in the first
 * `result` positions of both arrays.
 */
int find_matches(int a[], int b[], int a_sz, int b_sz)
{
    print(a, a_sz, "a(unsorted)");
    print(b, b_sz, "b(unsorted)");

    int i = 0;
loop:
    /* i_a and i_b are offsets into the unmatched tails starting at i */
    for (int i_a = 0; i_a + i < a_sz; i_a++) {
        for (int i_b = 0; i_b + i < b_sz; i_b++) {
            /* we can only compare for equality */
            if (a[i + i_a] == b[i + i_b]) {
                swap(&a[i + i_a], &a[i]);
                swap(&b[i + i_b], &b[i]);
                i++;
                goto loop;
            }
        }
    }
    print(a, a_sz, "a(final)");
    print(b, b_sz, "b(final)");
    return i;
}
/* Demo driver: run find_matches() on two sample arrays and list the matches. */
int main()
{
    int arr1[] = {1, 8, 3, 2, 6, 7};
    int arr2[] = {2, 6, 1, 7, 4, 1, 9, 6};
    int size1 = N(arr1);
    int size2 = N(arr2);

    int match = find_matches(arr1, arr2, size1, size2);

    /* the matched values were moved to the front of arr1 */
    for (int i = 0; i < match; i++) {
        printf("Match #%d: %d\n", i, arr1[i]);
    }
}
which produces, when running, the following output:
$ comp2
a(unsorted)[1, 8, 3, 2, 6, 7]
b(unsorted)[2, 6, 1, 7, 4, 1, 9, 6]
a(final)[1, 2, 6, 7, 3, 8]
b(final)[1, 2, 6, 7, 4, 1, 9, 6]
Match #0: 1
Match #1: 2
Match #2: 6
Match #3: 7
$ _
You can reorder the values; it makes no difference. In this case you have a two-level for loop, mixed with a third level that goes back to the beginning and starts the loop again. The loop is guaranteed to finish because, each time you go back to the top, you have increased i, which means the nested for loops will be shorter each time. We can rewrite the find_matches routine in this case by adjusting the array start points, in this manner:
comp3.c
/* ... as before */
/*
 * Find the elements common to a[] and b[] using equality tests only.
 *
 * Each time an equal pair is found it is swapped to the front of the
 * current windows, the windows are printed, and then both windows are
 * shrunk by one from the front (a and b advance, a_sz and b_sz decrease)
 * before the search restarts. The function ends when a full pass over the
 * remaining windows finds no pair.
 *
 * Returns the number of matches. Because `a` and `b` are local copies of
 * the caller's pointers, the matched values end up in the first `result`
 * positions of the caller's arrays.
 */
int find_matches(int a[], int b[], int a_sz, int b_sz)
{
    print(a, a_sz, "a(unsorted)");
    print(b, b_sz, "b(unsorted)");

    int i = 0;
loop:
    for (int i_a = 0; i_a < a_sz; i_a++) {
        for (int i_b = 0; i_b < b_sz; i_b++) {
            /* we can only compare for equality */
            if (a[i_a] == b[i_b]) {
                swap(&a[i_a], &a[0]);
                swap(&b[i_b], &b[0]);
                i++;
                /* print the windows including the new match, then drop
                 * the matched front element from each */
                print(a, a_sz, "a(after match)");
                a++;
                a_sz--;
                print(b, b_sz, "b(after match)");
                b++;
                b_sz--;
                goto loop;
            }
        }
    }
    /* only the unmatched remainders are left in the windows */
    print(a, a_sz, "a(final)");
    print(b, b_sz, "b(final)");
    return i;
}
/* ... as before */
that will produce this result (I changed the initial sort order to see how it affects the final result):
$ comp3
a(unsorted): [7, 8, 2, 3, 6, 1]
b(unsorted): [2, 6, 1, 7, 4, 1, 9, 6]
a(after match): [7, 8, 2, 3, 6, 1]
b(after match): [7, 6, 1, 2, 4, 1, 9, 6]
a(after match): [2, 8, 3, 6, 1]
b(after match): [2, 1, 6, 4, 1, 9, 6]
a(after match): [6, 3, 8, 1]
b(after match): [6, 1, 4, 1, 9, 6]
a(after match): [1, 8, 3]
b(after match): [1, 4, 1, 9, 6]
a(final): [8, 3]
b(final): [4, 1, 9, 6]
Match #0: 7
Match #1: 2
Match #2: 6
Match #3: 1
$ _