C/C90/Counting words in large text file-CodePudding

I have a text file which consists of about 30000 words. My goal is to count the actual number of the words (keep in mind that multiple punctuation marks and consecutive spaces are included, as well as words connected with - (for example three-legged), so counting just the spaces isn't correct).

I have managed to count the total characters but I am struggling with the words. Any help?

#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define SIZE 50

char *getfile(void);
void stats(char *filename);

/*
 * Program entry point: ask the user for a file name, print the statistics
 * for that file, and release the name buffer that getfile() allocated.
 *
 * Returns 0 on normal completion, EXIT_FAILURE if no file name buffer
 * could be obtained.
 */
int main(void) {
    char *file;

    file = getfile();
    if (file == NULL) {
        return EXIT_FAILURE;
    }

    stats(file);

    /* getfile() hands ownership of the malloc'ed buffer to the caller. */
    free(file);
    return 0;
}

/*
 * Prompt for a file name on standard input and report whether a file of
 * that name can be opened for reading.
 *
 * The name is read as one whole line (so it may contain spaces) into a
 * freshly allocated buffer of SIZE bytes; input longer than SIZE - 1
 * characters is truncated.  If nothing can be read, the returned name is
 * the empty string.
 *
 * Returns the allocated, NUL-terminated file name whether or not the file
 * exists; the caller owns it and must free() it.  Returns NULL only if the
 * buffer cannot be allocated.
 */
char *getfile(void) {
    char *filename;
    FILE *fp;

    filename = malloc(SIZE);
    if (filename == NULL) {
        fprintf(stderr, "Out of memory.\n");
        return NULL;
    }

    printf("Enter the name of the text file: ");
    fflush(stdout); /* make sure the prompt is visible before blocking */

    /* Bounded read: never writes more than SIZE bytes into the buffer. */
    if (fgets(filename, SIZE, stdin) == NULL) {
        filename[0] = '\0';
    }
    /* fgets keeps the line terminator; cut the name at the first CR or LF. */
    filename[strcspn(filename, "\r\n")] = '\0';

    fp = fopen(filename, "r");
    printf("\n");

    if (fp == NULL) {
        printf("The entered file does not exist.");
        printf("\n");
    } else {
        printf("The file exists.");
        fclose(fp);
    }

    return filename;
}

/*
 * Print character and word statistics for the text file named `filename`.
 *
 * Reported values:
 *   - total characters read from the file,
 *   - total characters excluding ' ' (space) characters,
 *   - total words.
 *
 * A word starts at a letter or digit and continues through letters, digits,
 * hyphens and apostrophes, so "three-legged" and "don't" are one word each.
 * Any other character (white-space, punctuation) ends the current word, so
 * runs of spaces or punctuation marks never produce extra words.
 * Classification uses isalnum(), i.e. the current C locale; in the default
 * "C" locale only ASCII letters and digits count as word characters.
 *
 * If `filename` is NULL or the file cannot be opened, a diagnostic is
 * printed and nothing is counted.
 */
void stats(char *filename) {
    long total = 0, spaces = 0, words = 0;
    int in_word = 0; /* state: is the previous character part of a word? */
    int c;
    FILE *fp;

    if (filename == NULL) {
        fprintf(stderr, "No file name given.\n");
        return;
    }

    fp = fopen(filename, "r");
    if (fp == NULL) {
        perror(filename);
        return;
    }

    while ((c = fgetc(fp)) != EOF) {
        total++;

        if (c == ' ') {
            spaces++;
        }

        if (isalnum(c)) {
            /* Separator-to-word transition: this is what gets counted. */
            if (!in_word) {
                in_word = 1;
                words++;
            }
        } else if (c != '-' && c != '\'') {
            /* '-' and '\'' neither start nor end a word; all else ends it. */
            in_word = 0;
        }
    }

    if (ferror(fp)) {
        perror(filename);
    }

    printf("\nTotal characters in file: %ld", total);
    printf("\nTotal characters (excluding spaces) in file: %ld", total - spaces);
    printf("\nTotal words in file: %ld\n", words);

    fclose(fp);
}

CodePudding user response：

You should make a list of all the characters that can separate words, and then count each run of consecutive separator characters as a single word boundary.

CodePudding user response：

The reason you are having trouble is that you have no state, that is, no context classifying what came before. You can use other methods to break the file into words, but a state machine is simple and fast. As suggested in the comments and by other answers, you need two states: the previous character was white-space, or the previous character was part of a word. It's rather like a one-bit derivative: the rising edge, a transition from white-space to word, is the thing you count.

Stripping off most of the extraneous stuff, this might be how you do a state machine.

#include <stdio.h>

/*
 * Minimal `wc`-style counter.  Reads all of standard input in buffered
 * chunks and prints the number of characters (excluding newlines), lines
 * and words.
 *
 * Words are counted with a two-state machine: every transition from
 * white-space (WHITE) to a non-white-space character (WORD) is one word.
 *
 * Returns 0 on success, 1 if reading standard input fails.
 */
int main(void) {
    unsigned char buf[16384 /*50*/]; /* 50 is small. */
    enum { WHITE, WORD } state = WHITE;
    size_t cnt = 0, lines = 0, words = 0, nread, i;
    do { /* Fill `buf`. */
        nread = fread(buf, 1, sizeof buf, stdin);
        if(ferror(stdin)) { perror("wc"); return 1; }
        cnt += nread; /* Accumulate across chunks. */
        for(i = 0; i < nread; i++) { /* Char-by-char in `buf`. */
            unsigned char c = buf[i];
            /* https://en.cppreference.com/w/cpp/string/byte/isspace */
            switch(c) {
            case '\n':
                lines++; /* Fall-through. Doesn't handle CRs properly. */
                /* FALLTHROUGH */
            case '\0': case ' ': case '\f': case '\r': case '\t': case '\v':
                state = WHITE;
                break;
            default:
                if(state == WORD) break; /* Still inside the same word. */
                state = WORD; /* Rising edge: a new word starts here. */
                words++;
                break;
            }
        }
    } while(nread == sizeof buf); /* A short read means end of input. */
    printf("Total characters in file: %lu\n", (unsigned long)(cnt - lines));
    printf("Total lines in file: %lu\n", (unsigned long)lines);
    printf("Total words in file: %lu\n", (unsigned long)words);
    return 0;
}

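For brevity, I off-loaded some of the work onto the hosted environment: the file is redirected to standard input (./wc < file.txt) instead of being opened by name, and the input is read through a buffer.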