#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <dirent.h>
#include <pthread.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <string.h>
void* copyFile(void* arg) {
char *filename = ((char*)arg);
char *destname = strcat(filename, "copy");
int in = 0;
in = open(filename, O_RDONLY);
int out = 0;
out = open(destname, O_RDWR | O_CREAT | O_TRUNC, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
struct stat statbuf;
fstat(in, &statbuf);
off_t insize = statbuf.st_size;
lseek(out, insize - 1, SEEK_SET);
ssize_t wrote = write(out, "", 1);
char* inmap = mmap(0, insize, PROT_READ, MAP_FILE | MAP_PRIVATE, in, 0);
char* outmap = mmap(0, insize, PROT_READ | PROT_WRITE, MAP_FILE | MAP_SHARED, out, 0);
memcpy(outmap, inmap, insize);
close(out);
}
int main(int argc, char *argv[]) {
DIR* d;
struct dirent* e;
d = opendir(argv[1]);
int num_entries = 0;
int i = 0;
pthread_t threads[50];
while((e = readdir(d)) != NULL) {
char *filename = e->d_name;
printf("%s\n", filename);
pthread_create(&threads[i], NULL, copyFile, filename);
i ;
}
for(int j = 0; j < i; j ) {
pthread_join(threads[j], NULL);
}
}
Hello, I am attempting to use mmap and memcpy to copy an entire directory of files using multithreading. I've got the code to copy the files in the directory, however it is forcing all COPIED files to be 1 Byte in size. I'm curious as to why this is happening and what can be done to resolve this bug.
I am very much a novice at C so please forgive my obvious semantic errors. I appreciate the assistance!
CodePudding user response:
There are a number of errors.
In
main
,filename
will point to the same address for all files. We must provide a unique address/copy. One way is to usestrdup
. And, we should add afree
call at the bottom ofcopyFile
to release the storage allocated by thestrdup
.We must skip over
.
and..
when copying. Otherwise, the code segfaults.Using the same directory for input files and the copy can cause the equivalent of an infinite loop. That is, given a file
xyz
that is copied toxyzcopy
, thereaddir
may be able to seexyzcopy
as input and try to createxyzcopycopy
. Better to use a separate output directory. That would require a major rewrite. But, a simpler fix is to skip any file that containscopy
.The program will [try to] copy directory entries even if they are not ordinary files (e.g. it would try to do
open
on a directory). We should check the file type and skip non-file types.In
copyFile
, even with thestrdup
inmain
, the stringfilename
does not have enough space to allow for thestrcat
of"copy"
. We need a separate buffer for the output filename.The
lseek/write
adds a superfluous binary zero at the end of the output file. This is to extend the output file. Better to useftruncate
for that.That's because intermixing
write
withmmap
can be problematic and is usually not the best practice. In your use case, it may be okay (because thewrite
is done before themmap
), but if we go to the trouble to usemmap
, it's better to just use themmap
buffer pointer andmemcpy
.copyFile
is missing areturn
. And,main
is also missing areturn
There is no
close
forin
incopyFile
.There is no error checking on
open
,mmap
,opendir
calls.There should be
munmap
calls before theclose
calls.
Bugs not fixed:
The program is limited in how many entries the directory can have.
With
pthread_t threads[50];
we can only have 50 entries in the directory. There is no check for overflow of this array.It makes no practical sense to do all files in parallel. This does not scale.
If we had (e.g.) 1,000,000 entries in the directory, we could run out of resources. We could run out of unused process slots and
pthread_create
would start to fail after a system wide limit is reached. We could run out of file descriptors and theopen
calls would begin to fail after approximately 1024 open files.After a set number of threads is created (e.g. in practice, 4-5), the copying process would actually be slower because the system would spend most of its time switching between the threads rather than doing useful work.
For small files, the overhead of thread creation/teardown becomes significant relative to the running time of the thread.
It would be better to have a limited "pool" of "worker" threads, and intersperse the pthread_create
calls with pthread_join
calls, reclaiming/reusing the now unused/free "slot" when a thread completes.
A more efficient way would be to create the pool of threads (via pthread_create
calls once at the start. Each thread would have a mailbox/queue of work to do. main
could enqueue the entries to the threads, adding a new entry when it sees a thread become idle. The pthread_join
calls could be done once at the end
Below is the corrected code.
I used preprocessor conditionals to denote old vs. new code (e.g.):
#if 0
// old code
#else
// new code
#endif
#if 1
// new code
#endif
Anyway, here is the corrected code. I've annotated the bugs/fixes with comments:
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <dirent.h>
#include <pthread.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <string.h>
void *
copyFile(void *arg)
{
// NOTE/BUG: no cast is needed from a void *
#if 0
char *filename = ((char *) arg);
#else
char *filename = arg;
#endif
// NOTE/BUG: there is _not_ enough space in filename to add "copy" -- this is
// UB (undefined behavior)
#if 0
char *destname = strcat(filename, "copy");
#else
char destname[1024];
strcpy(destname,filename);
strcat(destname,"copy");
#endif
int in = 0;
in = open(filename, O_RDONLY);
// NOTE/FIX: check for error
#if 1
if (in < 0) {
perror(filename);
exit(1);
}
#endif
int out = 0;
out = open(destname, O_RDWR | O_CREAT | O_TRUNC,
S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
// NOTE/FIX: check for error
#if 1
if (out < 0) {
perror(destname);
exit(1);
}
#endif
struct stat statbuf;
fstat(in, &statbuf);
off_t insize = statbuf.st_size;
// NOTE/BUG: this will add a binary zero at the end of the output file
#if 0
lseek(out, insize - 1, SEEK_SET);
ssize_t wrote = write(out, "", 1);
#else
ftruncate(out,insize);
#endif
char *inmap = mmap(0, insize, PROT_READ, MAP_FILE | MAP_PRIVATE, in, 0);
// NOTE/FIX: check for error
#if 1
if (inmap == MAP_FAILED) {
perror("mmap/in");
exit(1);
}
#endif
char *outmap = mmap(0, insize, PROT_READ | PROT_WRITE,
MAP_FILE | MAP_SHARED, out, 0);
// NOTE/FIX: check for error
#if 1
if (inmap == MAP_FAILED) {
perror("mmap/in");
exit(1);
}
#endif
memcpy(outmap, inmap, insize);
// NOTE/FIX: we should unamp the output area
#if 1
munmap(outmap,insize);
#endif
close(out);
// NOTE/FIX: we should unmap and close the input descriptor
#if 1
munmap(inmap,insize);
close(in);
#endif
// NOTE/FIX: free the string allocated by strdup in main
#if 1
free(filename);
#endif
// NOTE/FIX: needs return value -- would be flagged by compiler if compiled
// with -Wall
#if 1
return (void *) 0;
#endif
}
int
main(int argc, char *argv[])
{
DIR *d;
struct dirent *e;
// NOTE/BUG: no checks for missing argument or bad directory
#if 0
d = opendir(argv[1]);
#else
if (argv[1] == NULL) {
fprintf(stderr,"missing argument\n");
exit(1);
}
d = opendir(argv[1]);
if (d == NULL) {
perror(argv[1]);
exit(1);
}
#endif
// NOTE/BUG: unused variable
#if 0
int num_entries = 0;
#endif
int i = 0;
pthread_t threads[50];
while ((e = readdir(d)) != NULL) {
// NOTE/FIX: must skip "." and ".."
#if 1
if (strcmp(e->d_name,".") == 0)
continue;
if (strcmp(e->d_name,"..") == 0)
continue;
#endif
// NOTE/FIX: only copy simple files (i.e. do _not_ copy subdirectories, etc.)
#if 1
if (e->d_type != DT_REG)
continue;
#endif
// NOTE/FIX: skip "copy" files
#if 1
if (strstr(e->d_name,"copy") != NULL)
continue;
#endif
// NOTE/BUG: this will present the _same_ memory address to all threads so there
// is a "race condition"
#if 0
char *filename = e->d_name;
#else
char *filename = strdup(e->d_name);
#endif
printf("%s\n", filename);
pthread_create(&threads[i], NULL, copyFile, filename);
i ;
}
for (int j = 0; j < i; j ) {
pthread_join(threads[j], NULL);
}
// NOTE/FIX: needs return
#if 1
return 0;
#endif
}