Remove the duplicate files created when I download the same file, in C - CodePudding

When we download the same file from the Internet more than once, the copies get a suffix in the filename such as (2), (3), ...

I want to remove these files with C. First of all, I want to find the files and print their names. I wrote the code below, but it doesn't work.

// Question's attempt: for every non-hidden entry in ~/Downloads, print the
// other entries whose names contain that entry's name as a substring.
// The directory is re-opened and re-scanned once per entry.
int main(){

        const char *path;
        DIR *dir;
        struct dirent* entry;
        if((path=getenv("HOME"))==NULL){//get HOME path
                // HOME is unset: fall back to the password database entry
                path = getpwuid(getuid())->pw_dir;
        }
        const char *downloads = "/Downloads";
        // NOTE(review): path points at the string handed back by
        // getenv/getpwuid, so strcat writes past storage this program
        // did not allocate
        strcat(path,downloads); //make ~/Downloads
        if(chdir(path)!=0){
                perror("chdir()");
                return -1;
        }
        if((dir=opendir(path))==NULL){ //open directory
                perror("open");
                return 1;
        }
        // outer scan: entry is the candidate "original" name
        while((entry=readdir(dir))!=NULL){
                struct dirent *cmpentry;
                DIR *cmpdir;
                // second, independent stream over the same directory
                // (never closed in this version)
                if((cmpdir=opendir(path))==NULL){
                        perror("opendir");
                        return -1;
                }


                // inner scan: cmpentry is compared against entry
                while((cmpentry=readdir(cmpdir))!=NULL){
                        // skip names starting with '.' and the entry itself
                        if((entry->d_name[0]!='.')&&strcmp(entry->d_name,cmpentry->d_name)!=0){
                                // substring test: matches anywhere in the name
                                char *ptr=strstr(cmpentry->d_name,entry->d_name);
                                if(ptr!=NULL)
                                        printf("%s\n",cmpentry->d_name);
                          
                                }
                        }
                }
        }

How can I fix it?

CodePudding user response：

readdir() does not return entries in sorted order the way ls does; it returns them in the order they are stored in the directory. Here is a variant of your program that compiles and runs, but it still does not do what you want. Please correct it yourself.

#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <sys/types.h>
#include <dirent.h>
#include <errno.h>
#include <unistd.h>
#include <pwd.h>

// For every non-hidden entry in ~/Downloads, print each other entry whose
// name contains that entry's name as a substring.
//
// Output (all on stdout):
//   "<entry> <other>"  trace line for every pair that is compared
//   "<other>"          when <other> contains <entry> as a substring
//   "<errno> <text>"   when chdir/opendir fails
//
// RETURNS: 0 on success, -1 if the directory cannot be located/entered or
// re-opened, 1 if the first opendir fails.
int main(){
    int m,len;
    char path[256];
    const char *home;
    DIR *dir,*cmpdir;
    struct dirent *entry,*cmpentry;

    // get HOME path; getenv returns NULL when HOME is unset, so it must be
    // checked before the string is used
    home = getenv("HOME");
    if(home==NULL){
        struct passwd *pw = getpwuid(getuid());
        if(pw==NULL){
            fprintf(stdout,"cannot determine home directory\n");
            return -1;
        }
        home = pw->pw_dir;
    }

    // make ~/Downloads (bounded: a long HOME must not overflow path)
    len = snprintf(path,sizeof(path),"%s%s",home,"/Downloads");
    if(len<0 || (size_t)len>=sizeof(path)){
        fprintf(stdout,"home directory path too long\n");
        return -1;
    }

    errno=0;
    if(chdir(path)!=0){
        m = errno;fprintf(stdout,"%d %s\n",m,strerror(m));
        return -1;
    }

    errno=0;
    if((dir=opendir(path))==NULL){ //open directory
        m = errno;fprintf(stdout,"%d %s\n",m,strerror(m));
        return 1;
    }

    while((entry=readdir(dir))!=NULL){
        // names starting with '.' (".", ".." and hidden files) never match
        if(entry->d_name[0]=='.')
            continue;

        // independent stream for the inner scan; entry stays valid because
        // it belongs to the outer stream
        errno=0;
        if((cmpdir=opendir(path))==NULL){
            m = errno;fprintf(stdout,"%d %s\n",m,strerror(m));
            closedir(dir);
            return -1;
        }

        while((cmpentry=readdir(cmpdir))!=NULL){
            // skip the entry itself
            if(strcmp(entry->d_name,cmpentry->d_name)==0)
                continue;

            fprintf(stdout,"%s %s\n",entry->d_name,cmpentry->d_name);fflush(stdout);
            if(strstr(cmpentry->d_name,entry->d_name)!=NULL)
                fprintf(stdout,"%s\n",cmpentry->d_name);
        }

        // one handle per outer entry: release it before the next one
        closedir(cmpdir);
    }

    closedir(dir);
    return 0;
}

CodePudding user response：

A number of issues ...

path does not have enough space for the strcat, so you have UB (undefined behavior)
No need to use chdir
No closedir calls, so for a large directory, you'll run out of file descriptors.
No skip of . and .. entries
Using strcmp and strstr is not sufficient. Duplicates and/or misses.
Opening the same directory repeatedly is slow/wasteful. Better read the directory once and save the entries in an array.

Some fixes:

Capture the data in an array
Use an auxiliary struct (e.g. struct root below) that splits up the filenames into component parts (e.g. foo(1).pdf --> foo, (1), and .pdf)
Added comparison of lengths and file contents

Here is the original code, annotated with the bugs:

// Original program from the question, reproduced unchanged and annotated.
// Each NOTE/BUG comment marks a defect on the statement that follows it.
int
main()
{

    const char *path;
    DIR *dir;
    struct dirent *entry;

    // get HOME path
    if ((path = getenv("HOME")) == NULL) {
        path = getpwuid(getuid())->pw_dir;
    }
    const char *downloads = "/Downloads";

    // make ~/Downloads
// NOTE/BUG: not enough space in path
// NOTE/BUG: path is a const
    strcat(path, downloads);
// NOTE/BUG: no need to chdir as opendir is enough
    if (chdir(path) != 0) {
        perror("chdir()");
        return -1;
    }

    // open directory
// NOTE/BUG: no closedir for this
    if ((dir = opendir(path)) == NULL) {
        perror("open");
        return 1;
    }

    while ((entry = readdir(dir)) != NULL) {
// NOTE/BUG: no check for "." or ".."
        struct dirent *cmpentry;
        DIR *cmpdir;

// NOTE/BUG: no closedir for this
        if ((cmpdir = opendir(path)) == NULL) {
            perror("opendir");
            return -1;
        }

        while ((cmpentry = readdir(cmpdir)) != NULL) {
// NOTE/BUG: strcmp sense is inverted
// NOTE/BUG: strcmp wrong
            if ((entry->d_name[0] != '.') &&
                strcmp(entry->d_name, cmpentry->d_name) != 0) {
                // substring match anywhere in the name: "a" matches "banana"
                char *ptr = strstr(cmpentry->d_name, entry->d_name);

                if (ptr != NULL)
                    printf("%s\n", cmpentry->d_name);
            }
        }
    }
}

In the above code, I've used cpp conditionals to denote old vs. new code:

#if 0
// old code
#else
// new code
#endif

#if 1
// new code
#endif

Note: this can be cleaned up by running the file through unifdef -k

Here is the refactored code. It is annotated:

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <fcntl.h>
#include <pwd.h>
#include <string.h>
#include <dirent.h>
#include <ctype.h>
#include <sys/stat.h>

// dbgprt -- printf-style debug trace to stderr
// Compiled in only when DEBUG is defined; otherwise it expands to an empty
// statement and its arguments are NOT evaluated.
// Uses the standard C99 variadic form (... / __VA_ARGS__) instead of the
// GNU-only named form so it builds with any conforming compiler.
#ifdef DEBUG
#define dbgprt(...) \
    fprintf(stderr,__VA_ARGS__)
#else
#define dbgprt(...) \
    do { } while (0)
#endif

// filename parsing control
// One element per directory entry: the raw dirent plus the pieces the name
// is split into (e.g. "foo(1).pdf" --> core "foo", paren flag, suffix ".pdf")
struct root {
    struct dirent root_ent;             // raw directory entry
    off_t root_size;                    // file size
    int root_paren;                     // 1=has "(1)"
    int root_dup;                       // 1=is a duplicate
    char *root_suf;                     // suffix/extension (e.g. ".pdf"), NULL if none
    char root_core[256];                // root/core/base name
};

// rootshow -- show root struct contents
// root: parsed entry to dump
// who:  caller-supplied tag appended to the trace line
// Output goes through dbgprt, so nothing is printed unless the file is built
// with DEBUG defined.
void
rootshow(const struct root *root,const char *who)
{

    // root_suf is NULL for names without an extension; handing NULL to %s is
    // undefined behavior, so print an explicit placeholder instead
    dbgprt("rootshow: d_name='%s' root_dup=%d root_paren=%d root_core='%s' root_suf='%s' (from %s)\n",
        root->root_ent.d_name,
        root->root_dup,root->root_paren,
        root->root_core,
        (root->root_suf != NULL) ? root->root_suf : "(null)",
        who);
}

// rootof -- split up filenames into components
// root: entry to fill in (completely overwritten)
// ent:  directory entry as returned by readdir
// size: file size to record in root_size
//
// "foo(1).pdf" yields root_core="foo", root_paren=1, root_suf=".pdf".
// A name qualifies for root_paren only if the part before the extension ends
// in "(" + one or more digits + ")".
//
// root_suf is a private heap copy of the extension (NULL when the name has no
// '.'). It cannot point into ent, because readdir's storage may be reused by
// the next readdir and is released by closedir, and it cannot point into
// *root, because dirload's array may be moved by realloc. It is never freed
// here; the caller owns it. Allocation failure prints a message and exits.
void
rootof(struct root *root,struct dirent *ent,off_t size)
{
    char tail[256];

    memset(root,0,sizeof(*root));

    do {
        // get directory entry
        root->root_ent = *ent;

        // remember the file size
        root->root_size = size;

        // get a scratch copy of the filename
        snprintf(tail,sizeof(tail),"%s",ent->d_name);

        // remember and strip the extension
        char *dot = strrchr(tail,'.');
        if (dot != NULL) {
            size_t suflen = strlen(dot) + 1;

            root->root_suf = malloc(suflen);
            if (root->root_suf == NULL) {
                perror("malloc");
                exit(1);
            }
            memcpy(root->root_suf,dot,suflen);

            *dot = 0;
        }

        // get root/base (e.g. "foo.pdf" --> "foo")
        strcpy(root->root_core,tail);

        // a name such as ".bashrc" leaves nothing before the extension
        size_t len = strlen(tail);
        if (len == 0)
            break;

        // rightmost part of file must be "(1)"
        if (tail[len - 1] != ')')
            break;

        // walk left from ")" over the digits; pos ends on the first digit
        // (or stays on ")" when there are none)
        size_t pos = len - 1;
        size_t digits = 0;
        while ((pos > 0) && isdigit((unsigned char) tail[pos - 1])) {
            --pos;
            ++digits;
        }

        // ran off the front (e.g. "12)") -- there is no "("
        if (pos == 0)
            break;

        // we got something like "X)" (i.e. _not_ "(1)")
        if (tail[pos - 1] != '(')
            break;

        // we got something like "()"
        if (digits == 0)
            break;

        // strip "(1)"
        tail[pos - 1] = 0;

        root->root_paren = 1;
        strcpy(root->root_core,tail);
    } while (0);

#if DEBUG
    rootshow(root,"rootof");
#endif
}

// fullpath -- get full path (e.g. dir/tail)
// path: output buffer; the caller must provide room for
//       strlen(dir) + 1 + strlen(tail) + 1 bytes (no bound is checked here)
// dir:  directory part
// tail: filename part
void
fullpath(char *path,const char *dir,const char *tail)
{
    size_t dirlen = strlen(dir);
    size_t taillen = strlen(tail);
    char *out = path;

    // directory part
    memcpy(out,dir,dirlen);
    out += dirlen;

    // separator
    *out++ = '/';

    // filename part, including its terminating NUL
    memcpy(out,tail,taillen + 1);
}

// dirload -- load up directory into list
// path:   directory to scan
// countp: receives the number of entries in the returned array
// RETURNS: heap array of parsed entries (NULL when no entry qualifies);
//          the caller owns it and releases it with free
// Any failure (opendir, over-long path, stat, realloc) prints a diagnostic
// and exits with status 1.
struct root *
dirload(const char *path,int *countp)
{
    char file[1024];
    struct root *list = NULL;
    int count = 0;
    int cap = 0;

    // open directory
    DIR *dirp = opendir(path);
    if (dirp == NULL) {
        perror("open");
        exit(1);
    }

    while (1) {
        struct dirent *ent = readdir(dirp);
        if (ent == NULL)
            break;

        // skip over "." and ".."
        const char *tail = ent->d_name;
        if ((strcmp(tail,".") == 0) || (strcmp(tail,"..") == 0))
            continue;

        // optional -- only ordinary files
        // NOTE(review): a filesystem that reports DT_UNKNOWN for every entry
        // would have all of its files skipped here -- confirm for the target
#if 1
        if (ent->d_type != DT_REG)
            continue;
#endif

        // enlarge array in steps of 10; keep the old pointer until realloc
        // is known to have succeeded
        if (count >= cap) {
            cap += 10;
            struct root *grown = realloc(list,sizeof(*list) * (size_t) cap);
            if (grown == NULL) {
                perror("realloc");
                exit(1);
            }
            list = grown;
        }

        // fullpath does no bounds checking, so make sure "path/name" fits
        if (strlen(path) + 1 + strlen(tail) >= sizeof(file)) {
            fprintf(stderr,"%s/%s: path too long\n",path,tail);
            exit(1);
        }

        // get file size
        struct stat st;
        fullpath(file,path,tail);
        if (stat(file,&st) < 0) {
            perror(file);
            exit(1);
        }

        // parse the filename
        rootof(&list[count],ent,st.st_size);
        ++count;
    }

    closedir(dirp);

    // return count to caller
    *countp = count;

    return list;
}

// readfull -- read len bytes, retrying on partial reads
// RETURNS: number of bytes read (less than len only when EOF was reached),
//          or -1 on a read error
static ssize_t
readfull(int fd,char *buf,size_t len)
{
    size_t got = 0;

    while (got < len) {
        ssize_t xlen = read(fd,&buf[got],len - got);
        if (xlen < 0)
            return -1;
        if (xlen == 0)
            break;
        got += (size_t) xlen;
    }

    return (ssize_t) got;
}

// filematch -- compare the file contents
// dir: directory that holds both files
// lhs: first entry (root_size and root_ent.d_name are used)
// rhs: second entry
// RETURNS: 1=match, 0=mismatch
// Files whose recorded sizes differ are a mismatch without being opened.
// A file that turns out to be shorter than its recorded size (it changed
// after the directory was loaded) is also a mismatch. An open or read error
// prints a diagnostic and exits with status 1.
int
filematch(const char *dir,const struct root *lhs,const struct root *rhs)
{
    char lhsfile[1024];
    char lhsbuf[4096];

    char rhsfile[1024];
    char rhsbuf[4096];

    // file sizes must match
    if (lhs->root_size != rhs->root_size)
        return 0;

    // open the LHS file
    fullpath(lhsfile,dir,lhs->root_ent.d_name);
    int fdlhs = open(lhsfile,O_RDONLY);
    if (fdlhs < 0) {
        perror(lhsfile);
        exit(1);
    }

    // open the RHS file
    fullpath(rhsfile,dir,rhs->root_ent.d_name);
    int fdrhs = open(rhsfile,O_RDONLY);
    if (fdrhs < 0) {
        perror(rhsfile);
        exit(1);
    }

    int match = 1;
    off_t resid = lhs->root_size;

    while (resid > 0) {
        // next chunk: a full buffer, or whatever is left
        size_t want = sizeof(lhsbuf);
        if (resid < (off_t) want)
            want = (size_t) resid;

        // get LHS chunk
        ssize_t lhslen = readfull(fdlhs,lhsbuf,want);
        if (lhslen < 0) {
            perror(lhsfile);
            exit(1);
        }

        // get RHS chunk
        ssize_t rhslen = readfull(fdrhs,rhsbuf,want);
        if (rhslen < 0) {
            perror(rhsfile);
            exit(1);
        }

        // early EOF: the file no longer has its recorded size
        if (((size_t) lhslen != want) || ((size_t) rhslen != want)) {
            match = 0;
            break;
        }

        // they must match
        if (memcmp(lhsbuf,rhsbuf,want) != 0) {
            match = 0;
            break;
        }

        resid -= (off_t) want;
    }

    close(fdlhs);
    close(fdrhs);

    return match;
}

// main -- report files named "name(N).ext" that duplicate "name.ext"
// usage: prog [directory]      (default directory: $HOME/Downloads)
// For every entry without a "(N)" part, every other entry with the same core
// name, the same extension, a "(N)" part and identical contents is printed as
// "<copy> is dup of <original>" and flagged in root_dup.
// RETURNS: 0 on success, 1 if the directory name cannot be built
int
main(int argc,char **argv)
{
    char path[1024];
    int len;

    // skip over program name
    --argc;
    ++argv;

    // find the directory
    if (argc > 0) {
        // explicit directory from the command line (bounded copy)
        len = snprintf(path,sizeof(path),"%s",*argv);
    }
    else {
        // get HOME path
        const char *home = getenv("HOME");
        if (home == NULL) {
            struct passwd *pw = getpwuid(getuid());
            if (pw == NULL) {
                fprintf(stderr,"unable to determine home directory\n");
                return 1;
            }
            home = pw->pw_dir;
        }

        // make ~/Downloads
        len = snprintf(path,sizeof(path),"%s/%s",home,"Downloads");
    }

    if ((len < 0) || ((size_t) len >= sizeof(path))) {
        fprintf(stderr,"directory path too long\n");
        return 1;
    }

#if DEBUG
    setlinebuf(stdout);
    setlinebuf(stderr);
#endif

    int count = 0;
    struct root *list = dirload(path,&count);

    for (int lhsidx = 0;  lhsidx < count;  ++lhsidx) {
        struct root *lhs = &list[lhsidx];

        // must _not_ have "(1)"
        if (lhs->root_paren)
            continue;

        rootshow(lhs,"LHS");

        for (int rhsidx = 0;  rhsidx < count;  ++rhsidx) {
            // skip over the same entry
            if (rhsidx == lhsidx)
                continue;

            struct root *rhs = &list[rhsidx];

            rootshow(rhs,"RHS");

            // file types must match
            if (rhs->root_ent.d_type != lhs->root_ent.d_type)
                continue;

            // must have "(1)"
            if (! rhs->root_paren)
                continue;

            // suffix must match
            // both entries must have [or _not_ have] a suffix
            if (lhs->root_suf != NULL) {
                if (rhs->root_suf == NULL)
                    continue;
                if (strcmp(lhs->root_suf,rhs->root_suf) != 0)
                    continue;
            }
            else {
                if (rhs->root_suf != NULL)
                    continue;
            }

            // core must match
            if (strcmp(lhs->root_core,rhs->root_core) != 0)
                continue;

            // contents must match
            if (! filematch(path,lhs,rhs))
                continue;

            printf("%s is dup of %s\n",
                rhs->root_ent.d_name,lhs->root_ent.d_name);

            // mark it as a removable duplicate
            rhs->root_dup = 1;
        }
    }

    // release the array returned by dirload
    free(list);

    return 0;
}

Here is a test perl script:

#!/usr/bin/perl
# dotest -- test program
# usage: dotest [program]
#   program is the duplicate finder to run, named relative to the current
#   directory (default "duptest")
# The script builds up a set of files in /tmp/testdir one file at a time and
# runs the program on that directory after each addition.

# run all test cases with the command line arguments, then exit with success
master(@ARGV);
exit(0);

# master -- master control
# arguments: optional name of the program under test (default "duptest"),
#   taken relative to the current working directory ($ENV{PWD})
# sets the globals $xfile, $pwd and $tstdir that dotest relies on, then runs
# the test cases in order
sub master
{
    my(@argv) = @_;

    # program under test, as a path below the current directory
    $xfile = shift(@argv);
    $xfile = "duptest"
        unless (defined($xfile));
    $pwd = $ENV{PWD};
    $xfile = join("/",$pwd,$xfile);

    # scratch directory that dotest rebuilds for every case
    $tstdir = "/tmp/testdir";

    # each case adds one more file (name, contents) to the test set
    my(@cases) = (
        ["abc","xyz"],

        ["abc.pdf","jkl"],
        ["abc(1).pdf","jkl"],

        ["abc(2)","xyz"],
        ["abc(3)","xx"],
        ["abc(3)","xzy"],

        ["def","blah"],
        ["def(3)","blah"],
        ["def.pdf","blah"],
    );

    foreach my $case (@cases) {
        dotest(@$case);
    }
}

# dotest -- add one file to the test set, rebuild the directory, run the program
# arguments: $file = name of the new file, $body = its contents
# The (name, contents) pair is appended to the global @allfiles, the test
# directory $tstdir is recreated with every file collected so far, and the
# program $xfile is run on it. Each line the program prints is echoed with an
# "XDUP" prefix.
# Dies if a file cannot be written, if the program cannot be started, or if
# the program fails -- a non-zero exit status or death by a signal.
sub dotest
{
    my($file,$body) = @_;

    printf("\n");
    printf("%s\n","-" x 80);

    # start from an empty directory (list form: no shell is involved)
    system("rm","-fr",$tstdir);
    system("mkdir","-p",$tstdir);

    push(@allfiles,[$file,$body]);

    ###@rfiles = shuffle(@allfiles);
    @rfiles = @allfiles;

    # (re)create every file collected so far
    foreach my $pair (@rfiles) {
        my($tail,$text) = @$pair;
        printf("dotest: FILE %s '%s'\n",$tail,$text);

        my $dst = sprintf("%s/%s",$tstdir,$tail);

        open(my $xfdst,">",$dst) or
            die("dotest: unable to open '$dst' -- $!\n");
        print($xfdst $text);
        close($xfdst);
    }

    # run the program on the directory and capture its output; the list form
    # keeps a path that contains spaces in one piece
    open(my $xfsrc,"-|",$xfile,$tstdir) or
        die("dotest: unable to run '$xfile' -- $!\n");
    @xfiles = <$xfsrc>;
    close($xfsrc);

    # $? holds the exit status in its high byte and the terminating signal in
    # its low bits, so the whole value is tested: a crash must not pass
    $code = $?;
    die("dotest: program aborted\n")
        if ($code != 0);

    foreach my $line (@xfiles) {
        chomp($line);
        printf("dotest: XDUP %s\n",$line);
    }
}

Here is the output of the test program:


--------------------------------------------------------------------------------
dotest: FILE abc 'xyz'

--------------------------------------------------------------------------------
dotest: FILE abc 'xyz'
dotest: FILE abc.pdf 'jkl'

--------------------------------------------------------------------------------
dotest: FILE abc 'xyz'
dotest: FILE abc.pdf 'jkl'
dotest: FILE abc(1).pdf 'jkl'
dotest: XDUP abc(1).pdf is dup of abc.pdf

--------------------------------------------------------------------------------
dotest: FILE abc 'xyz'
dotest: FILE abc.pdf 'jkl'
dotest: FILE abc(1).pdf 'jkl'
dotest: FILE abc(2) 'xyz'
dotest: XDUP abc(1).pdf is dup of abc.pdf
dotest: XDUP abc(2) is dup of abc

--------------------------------------------------------------------------------
dotest: FILE abc 'xyz'
dotest: FILE abc.pdf 'jkl'
dotest: FILE abc(1).pdf 'jkl'
dotest: FILE abc(2) 'xyz'
dotest: FILE abc(3) 'xx'
dotest: XDUP abc(1).pdf is dup of abc.pdf
dotest: XDUP abc(2) is dup of abc

--------------------------------------------------------------------------------
dotest: FILE abc 'xyz'
dotest: FILE abc.pdf 'jkl'
dotest: FILE abc(1).pdf 'jkl'
dotest: FILE abc(2) 'xyz'
dotest: FILE abc(3) 'xx'
dotest: FILE abc(3) 'xzy'
dotest: XDUP abc(1).pdf is dup of abc.pdf
dotest: XDUP abc(2) is dup of abc

--------------------------------------------------------------------------------
dotest: FILE abc 'xyz'
dotest: FILE abc.pdf 'jkl'
dotest: FILE abc(1).pdf 'jkl'
dotest: FILE abc(2) 'xyz'
dotest: FILE abc(3) 'xx'
dotest: FILE abc(3) 'xzy'
dotest: FILE def 'blah'
dotest: XDUP abc(1).pdf is dup of abc.pdf
dotest: XDUP abc(2) is dup of abc

--------------------------------------------------------------------------------
dotest: FILE abc 'xyz'
dotest: FILE abc.pdf 'jkl'
dotest: FILE abc(1).pdf 'jkl'
dotest: FILE abc(2) 'xyz'
dotest: FILE abc(3) 'xx'
dotest: FILE abc(3) 'xzy'
dotest: FILE def 'blah'
dotest: FILE def(3) 'blah'
dotest: XDUP def(3) is dup of def
dotest: XDUP abc(1).pdf is dup of abc.pdf
dotest: XDUP abc(2) is dup of abc

--------------------------------------------------------------------------------
dotest: FILE abc 'xyz'
dotest: FILE abc.pdf 'jkl'
dotest: FILE abc(1).pdf 'jkl'
dotest: FILE abc(2) 'xyz'
dotest: FILE abc(3) 'xx'
dotest: FILE abc(3) 'xzy'
dotest: FILE def 'blah'
dotest: FILE def(3) 'blah'
dotest: FILE def.pdf 'blah'
dotest: XDUP def(3) is dup of def
dotest: XDUP abc(1).pdf is dup of abc.pdf
dotest: XDUP abc(2) is dup of abc