Home > Enterprise >  Reading a CSV file in C and dealing with commas in the data
Reading a CSV file in C and dealing with commas in the data

Time:11-02

I have written some code to read a CSV file in C. The file contains data about games, and I am supposed to read it, sort it by score, and print the top 10 rated games. The code is as follows:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#define tablesize 18626

/* One row of the games CSV; every column is kept as NUL-terminated text. */
typedef struct
{
    char title[200];        /* column 0 of the CSV row */
    char platform[20];      /* column 1 of the CSV row */
    char Score[20];         /* column 2; stored as text and compared by sort() */
    char release_year[20];  /* column 3 of the CSV row */
} dict;

/*
 * Prints the first 100 entries of `values`, one per line, to stdout.
 *
 * values: array of game records. The function takes no element count, so
 *         the caller must supply an array holding at least 100 entries.
 */
void printValues(dict *values)
{
    for (int i = 0; i < 100; i++)
    {
        printf("title->%s,platform->%s,Score->%s,release->%s\n", values[i].title, values[i].platform, values[i].Score, values[i].release_year);
    }
}

void sort(dict *values)
{
    for (int i = 0; i < tablesize; i  )
    {
        for (int j = i   1; j < tablesize; j  )
        {
            int a = *values[i].Score - '0';
            int b = *values[j].Score - '0';
            // printf("%d %d\n",values[i].Score,values[j].Score);
            if (a < b)
            {
                dict temp = values[i];
                values[i] = values[j];
                values[j] = temp;
            }
        }
    }
}

int main()
{
    FILE *fp = fopen("t4_ign.csv", "r");
    if (!fp)
    {
        printf("Error");
        return 0;
    }
    char buff[1024];
    int row = 0, column = 0;
    int count = 0;
    dict *values = NULL;
    int i = 0;
    while (fgets(buff, 1024, fp))
    {
        column = 0;
        row  ;
        count  ;
        values = realloc(values, sizeof(dict) * count);
        if (NULL == values)
        {
            perror("realloc");
            break;
        }
        if (row == 1)
        {
            continue;
        }
        char *field = strtok(buff, ",");
        while (field)
        {
            if (column == 0)
            {
                strcpy(values[i].title, field);
            }
            if (column == 1)
            {
                strcpy(values[i].platform, field);
            }
            if (column == 2)
            {
                strcpy(values[i].Score, field);
            }
            if (column == 3)
            {
                strcpy(values[i].release_year, field);
            }
            field = strtok(NULL, ",");
            column  ;
        }
        i  ;
    }
    fclose(fp);
    printf("File loaded!\n", fp);
    sort(values);
    printValues(values);
    free(values);
    return 0;
}

The problem I am facing is that the title field in the CSV file can contain commas. The parser treats the text on either side of those commas as separate columns, so the data is loaded into the struct incorrectly.

Here are two example lines of the input file. Quotes are used when the title contains commas.

"The Chronicles of Narnia: The Lion, The Witch and The Wardrobe",PlayStation 2,8,2005  
The Chronicles of Narnia: Prince Caspian,Wireless,5,2008

Any suggestions? Thanks in advance.

CodePudding user response:

Since quotes are used for the title field when it contains commas, I suggest you check to see if the " has been used. If so, use that delimiter for the first item.

char *field;
if(buff[0] == '"') {
    field = strtok(buff, "\"");
}
else {
    field = strtok(buff, ",");
}

The first one will leave a comma as the first character of the next field, but the next strtok will filter that off, since it does not allow "empty" fields.

CodePudding user response:

The function strtok does not suit your needs, because it considers the quotation marks as characters like any other. Therefore, when strtok sees a comma, it won't care whether the comma is inside quotation marks or not.

Also, as someone else pointed out in the comments section, another problem with strtok is that it skips empty fields.

Therefore, I do not recommend using strtok for what you want to do.

In order to solve your problem, I recommend that you write your own function that does something very similar to strtok and strsep, but if the first non-whitespace character is a quotation mark, it considers the next quotation mark as the delimiter instead of the next comma. In the code below, I named this function my_strsep.

Here is an example:

#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>

#define NUM_LINES 2

//This function is equivalent to the POSIX function "strsep", except
//that it always uses "," as a delimiter, unless the first
//non-whitespace character is a quotation mark, in which case it
//skips the quotation mark and uses the next quotation mark as a
//delimiter, also consuming the next comma.
//
//stringp: address of a pointer to the not yet parsed part of the
//         line. The line is modified in place. On return, the
//         pointer refers to the start of the next field, or is NULL
//         if the line has been fully consumed.
//
//Returns a pointer to the null-terminated field, or NULL if *stringp
//was already NULL.
char *my_strsep( char **restrict stringp )
{
    char *p;
    char *start;
    char delimiter = ',';

    //do nothing if *stringp is NULL
    if ( *stringp == NULL )
        return NULL;

    p = *stringp;

    //skip all whitespace characters
    while ( isspace( (unsigned char)*p ) )
        p++;

    //determine whether this field uses quotation marks
    if ( *p == '"' )
    {
        //set delimiter to quotation mark instead of comma
        delimiter = '\"';

        //skip the first quotation mark
        p++;
    }

    //remember the start of the string
    start = p;

    while ( *p != delimiter )
    {
        if ( *p == '\0' )
        {
            if ( delimiter == '\"' )
            {
                fprintf( stderr,
                    "Warning: Encountered end of string before the "
                    "second quotation mark!\n"
                );
            }

            //pass information back to calling function
            *stringp = NULL;
            return start;
        }

        p++;
    }

    //overwrite the delimiter with a null character
    *p = '\0';

    //go past the delimiter
    p++;

    //skip the comma too, if quotation marks are being used
    if ( delimiter == '\"' )
    {
        //skip all whitespace characters
        while ( isspace( (unsigned char)*p ) )
            p++;

        if ( *p == ',' )
        {
            //skip the comma
            p++;
        }
        else if ( *p == '\0' )
        {
            //the quoted field was the last one on the line, so
            //report the end of the line now instead of returning
            //a spurious empty field on the next call
            *stringp = NULL;
            return start;
        }
    }

    //pass information back to calling function
    *stringp = p;
    return start;
}

//Demonstrates my_strsep by splitting two sample CSV lines, one with a
//quoted title that contains commas and one without quotation marks,
//and printing every field that was found.
int main( void )
{
    char lines[NUM_LINES][200] = {
        "\"The Chronicles of Narnia: The Lion, The Witch and The Wardrobe\",PlayStation 2,8,2005",
        "The Chronicles of Narnia: Prince Caspian,Wireless,5,2008"
    };

    for ( int i = 0; i < NUM_LINES; i++ )
    {
        char *p, *q;

        printf( "Processing line #%d:\n", i + 1 );

        //my_strsep modifies the line in place and advances p
        p = lines[i];

        while ( ( q = my_strsep( &p ) ) != NULL )
        {
            printf( "Found field: %s\n", q );
        }

        printf( "\n" );
    }
}

This program has the following output:

Processing line #1:
Found field: The Chronicles of Narnia: The Lion, The Witch and The Wardrobe
Found field: PlayStation 2
Found field: 8
Found field: 2005

Processing line #2:
Found field: The Chronicles of Narnia: Prince Caspian
Found field: Wireless
Found field: 5
Found field: 2008

As you can see, the function my_strsep can handle fields both with and without quotation marks.

  • Related