Home > Net >  Windows 10: fopen, fread, fgets and mixed EOL characters in large text file
Windows 10: fopen, fread, fgets and mixed EOL characters in large text file

Time:09-22

I am developing a program for computational material scientists:

https://atomes.ipcms.fr/

Atomes can import large text files that contains atomic coordinates, to do that I read the file as a whole using fread, then split the text buffer over the CPU cores using OpenMP.

Works wonder on Linux, and Windows, until someone came along with an issue I wasn't expecting. A file with mixed, and not regularly placed, EOL symbols (\n and \r).

I found a way to overcome the problem on Windows, and Windows only, and I would really appreciate your advise/comment on what I did to know if I used a proper correction.

Before that solution I tried to add option(s) to the fopen function like -t or -b but it had no effect.

Finally I noticed that there was no problem if I change compilation options and use the fgets function to read the data from the file, only in that case and for large files the treatment of the data is more complicated, so far I no way // using OpenMP, and takes more time.

Here is the code I wrote to read the file:

int open_coord_file (gchar * filename)
{
  int res;
#ifdef OPENMP
  // In that case I read the file in a single buffer, then work on that buffer
  struct stat status;
  res = stat (filename, & status);
  if (res == -1)
  {
    // Basic function to store information on the reading process
    add_reader_info ("Error - cannot get file statistics !");
    return 1;
  }
  int fsize = status.st_size;
#endif
  coordf = fopen (filename, "r");
  if (! coordf)
  {
    add_reader_info ("Error - cannot open coordinates file !");
    return 1;
  }
  int i, j, k;
#ifdef OPENMP
  gchar * coord_content = g_malloc0(fsize*sizeof*coord_content);
  // Using fread to read the entire file
  fread (coord_content, fsize, 1, coordf);
  fclose (coordf);
  int linecount = 0;
  // Evaluating the number of lines in the file:
  for (j=0; j<fsize; j  ) if (coord_content[j] == '\n') linecount   ;
#ifdef G_OS_WIN32
  // What happen in Windows is that some '\r' symbols were found
  // and not on all lines, so I decided to check for \r symbols: 
  int neolr = 0;
  for (j=0; j<fsize; j  ) if (coord_content[j] == '\r') neolr   ;
  // And mofidy the number of lines accordingly
  linecount -= neolr;
#endif
  coord_line = g_malloc0 (linecount*sizeof*coord_line);
  coord_line[0] = & coord_content[0];
  i = 1;
  int nfsize = fsize;
#ifdef G_OS_WIN32
  // Now deleting the corresponding EOL symbols in the text buffer
  // This is only required for Windows, and I am not sure that it is 
  // the proper way to do thing, any though on the matter would be appreciated.
  for (j=0; j<fsize; j  )
  {
    if (coord_content[j] == '\n')
    {
      coord_content[j] = '\0';
    }
    else if (coord_content[j] == '\r')
    {
      for (k=j; k<fsize-1; k  )
      {
        coord_content[k] = coord_content[k 1];
      }
      nfsize --;
    }
  }
#endif
  // And referencing properly the lines to work on the buffer:
  for (j=0; j<nfsize; j  )
  {
    if (coord_content[j] == '\0')
    {
      if (i < linecount)
      {
        coord_line[i] = & coord_content[j 1];
        i  ;
      }
    }
  }
#else
  // On the other side if turn down OpenMP, then I use the fgets function
  // to read the data from the text file, then there no problem what so ever
  // with the EOL symbols and everything work smoothly. 
  // The fopen options being the same I am somewhat confused by this result. 
  gchar * buf = g_malloc0(LINE_SIZE*sizeof*buf);
  struct line_node
  {
    gchar * line;//[LINE_SIZE];
    struct line_node * next;
    struct line_node * prev;
  };
  struct line_node * head = NULL;
  struct line_node * tail = NULL;
  i = 0;
  while (fgets(buf, LINE_SIZE, coordf))
  {
    if (head == NULL)
    {
      head = g_malloc0 (sizeof*head);
      tail = g_malloc0 (sizeof*tail);
      tail = head;
    }
    else
    {
      tail -> next = g_malloc0 (sizeof*tail -> next);
      tail = tail -> next;
    }
    tail -> line = g_strdup_printf ("%s", buf);
    tail -> line = substitute_string (tail -> line, "\n", "\0");
    i   ;
  }
  g_free (buf);
  fclose (coordf);
#endif 

// And then latter in the code I process the data
// providing i the number of lines as an input value.

return read_xyz_file (i); 

Any advise would be really appreciated.

[EDIT]

I found a way arround my freadissue using a temporary buffer with fgets so I could get my data // with OpenMP easily again:

coord_line = g_malloc0 (i*sizeof*coord_line);
tail = head;
j = 0;
while (tail)
{
  coord_line[j] = & tail -> line[0];
  j   ;
  tail = tail -> next;
}

Now everything is fine, though I still have no clues why I am having issues with fread

[/EDIT]

CodePudding user response:

SO is not a free code writing service. However, you've shown real effort trying to work this out for yourself. Point-by-point correction would take forever, so here is a "code dump" that should be easy to follow and does (I hope) what you've tried hard to achieve.

This is in "plain, ordinary" C, without the "gXXXX" functions shown in your code. This opens & loads an entire (presumed-to-be) text file, squishes out CR if present, then slices the lines assigning pointers into a growing array of pointers to each line. (Empty lines will be assigned a pointer, too) Some printf lines report on some statistics of the process.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// Load file contents to alloc'd memory, return pointer to buffer (to be free'd!)
char *loadFile( char *fname ) {
    FILE *fp;
    if( ( fp = fopen( fname, "rb" ) ) == NULL )
        fprintf( stderr, "Cannot open '%s'\n", fname ), exit(1);

    fseek( fp, 0, SEEK_END );
    size_t size = ftell( fp );
    fseek( fp, 0, SEEK_SET );

    char *buf;
    if( ( buf = (char*)malloc( size   1) ) == NULL )
        fprintf( stderr, "Malloc() failed\n" ), exit(1);

    if( fread( buf, sizeof *buf, size, fp ) != size )
        fprintf( stderr, "Read incomplete\n" ), exit(1);

    fclose( fp );

    *(buf   size) = '\0'; // set xtra byte allocated to NULL (allows str functions to work)
    return buf;
}

int main() {
    char *fname = "FOO.BAR"; // To be defined...

    char *fCont = loadFile( fname ), *d, *s;

    // crush out '\r', if any
    for( d = fCont, s = fCont; (*d = *s) != '\0'; s   )
        d  = *d != '\r';
    fprintf( stderr, "Orig %ld. Without CR %ld\n", s - fCont, d - fCont );

    char **arr = NULL;
    int lcnt = 0;
    for( char *t = fCont; ( t = strtok( t, "\n" ) ) != NULL; t = NULL ) {
        char **tmp = (char**)realloc( arr, (lcnt 1) * sizeof *tmp );
        if( tmp == NULL )
            fprintf( stderr, "realloc() failed\n" ), exit(1);
        arr = tmp;
        arr[lcnt  ] = t;
    }
    fprintf( stderr, "%ld lines loaded\n", lcnt );

    // "demo" the first 5 lines
    for( int i = 0; i < 5 && i < lcnt; i   )
        fprintf( stderr, "%d - '%s'\n", i 1, arr[i] );

    /* process from arr[0] to arr[lcnt-1] */

    free( arr );
    free( fCont );

    return 0;
}

Hope this helps. Ball's in your court now...

CodePudding user response:

This is solved using the b option with the fopen function:

coordf = fopen (filename, "rb");

After that fread behaves properly.

Note that in my first attempts I likely used the following and wrong order of parameters:

coordf = fopen (filename, "br");

That does not work.

  • Related