Home > Back-end >  Opening file with specific format on Linux systems without iteration
Opening file with specific format on Linux systems without iteration

Time:05-09

I have a directory with many thousands of files, all of which have the general format of ***-***.txt with the exception of one file with the format ***.txt, which I am guaranteed to have exactly one of & whose exact name I have no way of knowing.

I would like to know if it is possible to open that singular file without iterating over all of the directory, using only C or Linux functions and without invoking system().

CodePudding user response:

As noted, there is no way to avoid iterating over filenames. The good news is that it really doesn’t matter. Even with thousands of entries (wut?!) it should not take much time, so if you only need to do it once, you are pretty much good to go.

C++

// Life is easy in C++ (C++17 or later, thanks to <filesystem>)

#include <filesystem>
#include <iostream>
#include <string>
#include <system_error>

namespace fs = std::filesystem;

/// @brief Find the first entry in `directory` whose stem contains no '-'.
///
/// The directory is scanned in whatever order the OS reports its entries; the
/// scan stops at the first match.
///
/// @param directory  Directory to scan (taken by reference to avoid a copy).
/// @return The matching entry's path, or an empty path when nothing matches
///         or the directory cannot be opened/read.
fs::path find_magic_file( const fs::path & directory )
{
  // Use the error_code overloads so that a missing or unreadable directory
  // yields "not found" (an empty path) rather than an uncaught
  // fs::filesystem_error -- this mirrors the C version, which returns NULL.
  std::error_code ec;
  for (fs::directory_iterator it( directory, ec ), end;  !ec && it != end;  it.increment( ec ))
    if (it->path().stem().string().find( '-' ) == std::string::npos)
      return it->path();
  return {};
}

/// @brief Print the dash-less file found in each directory named on the
///        command line (one path per line); directories without a match
///        produce no output.
///
/// @param num_directories  Argument count (argc); index 0 is the program name.
/// @param directory_names  Argument vector (argv).
int main( int num_directories, char ** directory_names )
{
  // Start at 1 to skip the program name, and advance with `+= 1`: a plain
  // `n = 1` here would reset the index every pass and never terminate.
  for (int n = 1;  n < num_directories;  n += 1)
  {
    const auto filepath = find_magic_file( directory_names[n] );
    if (!filepath.empty())
      std::cout << filepath.string() << "\n";
  }
}

C

// C requires you to resort to the OS facilities directly
// This example handles Windows and Linux/POSIX systems

#ifdef _WIN32
  #define _CRT_SECURE_NO_WARNINGS
#endif

#include <iso646.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#ifdef _WIN32
  #include <windows.h>
  
  char * find_magic_file( const char * directory )
  {
    WIN32_FIND_DATA entry;

    char * dirname = malloc( strlen( directory )   5 );
    if (!dirname) return NULL;

    HANDLE h = FindFirstFile( strcat( strcpy( dirname, directory ), "/*.*" ), &entry );
    if (h == INVALID_HANDLE_VALUE) return NULL;
    
    while ((strcmp( entry.cFileName, "."  ) == 0) 
        or (strcmp( entry.cFileName, ".." ) == 0)
        or (strchr( entry.cFileName, '-' )))
      if (!FindNextFile( h, &entry )) 
        break;
      
    FindClose( h );
    free( dirname );
    if (strchr( entry.cFileName, '-' )) return NULL;
    
    char * filepath = calloc( strlen( directory )   1   strlen( entry.cFileName )   1, 1 );
    if (filepath) strcat( strcat( strcpy( filepath, directory ), "/" ), entry.cFileName );
    return filepath;
  }
  
#else
  #include <dirent.h>
  #include <sys/types.h>

  /*
   * Find the first entry in `directory` whose name contains no '-'
   * (ignoring the "." and ".." pseudo-entries).
   *
   * Entries are examined in the order readdir() reports them; the scan stops
   * at the first match.
   *
   * Returns a malloc()ed string "<directory>/<name>" that the caller must
   * free(), or NULL when nothing matches, the directory cannot be opened,
   * or memory allocation fails.
   */
  char * find_magic_file( const char * directory )
  {
    char * filepath = NULL;
    DIR * dir = opendir( directory );
    if (!dir) return NULL;

    struct dirent * entry;
    while ((entry = readdir( dir )))
    {
      if (strchr( entry->d_name, '-' )
          or (strcmp( entry->d_name, "."  ) == 0)
          or (strcmp( entry->d_name, ".." ) == 0))
        continue;

      /* directory + '/' + name + NUL.
         (The malloc cast is redundant in C; it keeps this valid as C++ too.) */
      const size_t size = strlen( directory ) + 1 + strlen( entry->d_name ) + 1;
      filepath = (char *) malloc( size );
      if (filepath) snprintf( filepath, size, "%s/%s", directory, entry->d_name );
      break;
    }

    closedir( dir );
    return filepath;
  }
#endif

/*
 * Print the dash-less file found in each directory named on the command
 * line (one path per line); directories without a match produce no output.
 *
 * num_directories / directory_names are the usual argc / argv.
 */
int main( int num_directories, char ** directory_names )
{
  /* Start at 1 to skip the program name, and advance with `+= 1`: a plain
     `n = 1` here would reset the index every pass and never terminate. */
  for (int n = 1;  n < num_directories;  n += 1)
  {
    char * filepath = find_magic_file( directory_names[n] );
    if (filepath)
    {
      puts( filepath );
      free( filepath );  /* find_magic_file() hands over ownership */
    }
  }
  return 0;
}

You will notice that the only real difference in main() between the two languages is the type of object returned by find_magic_file().

CodePudding user response:

On Linux, you can use the POSIX.1 scandir() function, with a filter function that rejects file names containing dashes.

This way, the scanning/iteration is done within the C library, in a manner that is suitable to the operating system at hand, and should not get confused if e.g. files are renamed during the scanning. In cases where the dashless file is renamed to a dashed one, and another dashed one to a dashless name, exactly during the directory scan, it is possible to not find either name. (And, if they are done in the reverse order, it is possible to see both names.)

Here is an example program, with full error checking:

#define _POSIX_C_SOURCE 200809L
#include <stdlib.h>
#include <dirent.h>
#include <stdio.h>
#include <string.h>
#include <errno.h>

/* Directory-entry selector: returns 1 to keep an entry, 0 to drop it.
 * An entry is dropped when its name starts with '.' ("hidden" files,
 * which also covers "." and "..") or contains a dash anywhere. */
static int dashless_filter(const struct dirent *ent)
{
    const char *name = ent->d_name;

    return !(name[0] == '.' || strchr(name, '-') != NULL);
}

/* Return a strdup()ed copy of the name (not the full path) of the single
 * entry in dirpath accepted by dashless_filter(); the caller free()s it.
 *
 * On failure returns NULL with errno set:
 *   ENOTDIR  dirpath is NULL or empty
 *   ENOENT   no entry passed the filter
 *   EMFILE   more than one entry passed the filter
 *   ENOMEM   the name could not be duplicated
 *   (other)  whatever scandir() reported
 * On success errno is set to 0.
 */
char *dashless(const char *dirpath)
{
    struct dirent **entries = NULL;
    char           *name = NULL;
    int             count, saved_errno;

    if (dirpath == NULL || *dirpath == '\0') {
        errno = ENOTDIR;
        return NULL;
    }

    count = scandir(dirpath, &entries, dashless_filter, NULL);

    if (count == -1) {
        /* errno was set by scandir() */
        saved_errno = errno;
    } else if (count < 1 || !entries || !entries[0] || entries[0]->d_name[0] == '\0') {
        /* No matching files found */
        saved_errno = ENOENT;
    } else if (count > 1) {
        /* More than one matching file found */
        saved_errno = EMFILE;
    } else {
        name = strdup(entries[0]->d_name);
        saved_errno = name ? 0 : ENOMEM;
    }

    /* Release every entry scandir() returned, then the array itself.
     * count is -1 on scandir() failure, so the loop is skipped then. */
    for (int i = count - 1; i >= 0; i--)
        free(entries[i]);
    free(entries);

    /* Restore the outcome last: free() may have disturbed errno. */
    errno = saved_errno;
    return name;
}

/* For each DIRECTORY argument, print "DIRECTORY: NAME" for its single
 * dash-less file, or a diagnostic on stderr when dashless() fails.
 *
 * Returns EXIT_FAILURE if any directory failed, EXIT_SUCCESS otherwise
 * (the usage text, shown for no arguments or -h/--help, also exits
 * with EXIT_SUCCESS).
 */
int main(int argc, char *argv[])
{
    int  status = EXIT_SUCCESS;
    int  arg;

    if (argc < 2 || !strcmp(argv[1], "-h") || !strcmp(argv[1], "--help")) {
        const char *arg0 = (argc > 0 && argv && argv[0] && argv[0][0]) ? argv[0] : "(this)";
        fprintf(stderr, "\n");
        fprintf(stderr, "Usage: %s [ -h | --help ]\n", arg0);
        fprintf(stderr, "       %s DIRECTORY [ DIRECTORY ... ]\n", arg0);
        fprintf(stderr, "\n");
        fprintf(stderr, "This program reports if each specified directory contains\n");
        fprintf(stderr, "a single file without a dash '-' in its name.\n");
        fprintf(stderr, "\n");
        return EXIT_SUCCESS;
    }

    /* The increment must be `arg++`; without it the loop header is not
     * valid C. */
    for (arg = 1; arg < argc; arg++) {
        char *filepath = dashless(argv[arg]);
        if (filepath) {
            printf("%s: %s\n", argv[arg], filepath);
            free(filepath);
        } else {
            /* Capture errno before any stdio call can overwrite it. */
            const int err = errno;
            if (err == EMFILE)
                fprintf(stderr, "%s: Multiple dashless files exist.\n", argv[arg]);
            else
                fprintf(stderr, "%s: %s.\n", argv[arg], strerror(err));
            status = EXIT_FAILURE;
        }
    }

    return status;
}

Note that it explicitly checks for a single file name.

It would be better if the application or script that generates the files also created a hard link (or symlink) with a fixed, known name pointing to the one dashless file, whatever that might be. That way, one could always open the file through the fixed link name, and there would be no race window during which either none or both dashless files would be discovered if the dashless file is renamed.

  • Related