Home > Mobile >  C - scanning only words between "--"s from txt file without regex
C - scanning only words between "--"s from txt file without regex

Time:10-16

For example, I have lots of text files and they're like

Tags: --Maths--, --Physics--, --Programming--, ...

Some other text about subject and --another tag-- inside this text..

Every word between "--"s is a tag. Each of the text files has different tags, and some of them share the same tags. The number of tags in each file can be more or fewer than 3. I need to scan the words between the "--"s, count how often each one occurs, and then print the counts to the screen.

Example output that I need:

Tag Name    - Number
Maths       - 4 times
Physics     - 6 times
Programming - 2 times

I've tried this code to get tag names:

FILE *srcFile;

    for (int i=0; i<count; i  ) {
        srcFile = fopen(pathArr[i], "r"); // pathArr has all paths of my txt files to search
        
        char tagArr[100][100]; // array to store all tags
        char tagName[100];
        int arrIndex = 0;

        fscanf(srcFile, "--%s--", tagName);
        strcpy(tagArr[arrIndex  ], tagName);

        fclose(srcFile);
    }

but it doesn't work because there's more than one tag in each file and this only gets one of them, I guess?

I don't know whether I could do this with regex, but I'm not allowed to use it anyway. Thanks for any help.

CodePudding user response:

Here's an extremely naive approach that should get you started:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>

/* One node of the binary tree, keyed by tag name, that push() builds. */
struct tag {
    const char *name;   /* tag text; compared with strcmp() in push() */
    int count;          /* occurrence counter; set to 1 when the node is created */
    struct tag *p[2];   /* children: p[0] for names that compare lower, p[1] for higher */
};

/*
 * Read the next `--tag--` from standard input into buf.
 *
 * buf: start of the destination buffer.
 * end: one past the last writable byte of that buffer.
 *
 * Returns 0 after storing a NUL-terminated tag in buf, or EOF if the
 * input ends before a complete tag has been read.  If the tag does not
 * fit in the buffer, prints a message to stderr and exits with status 1.
 */
int
get_tag(char *buf, const char *end)
{
    int c, prev = 0;

    /* Discard data until `--` */
    while( (c = getchar()) != EOF ){
        if( c == '-' && prev == '-' ){
            break;
        }
        prev = c;
    }
    if( c == EOF ){
        return EOF;
    }

    /* Read in a tag.  TODO: handle whitespace better */
    prev = 0;
    while( (c = getchar()) != EOF && buf < end ){
        if( c == '-' && prev == '-' ){
            /*
             * The first dash of the closing `--` was already stored;
             * overwrite it with the terminator.
             */
            assert(buf[-1] == '-');
            buf[-1] = '\0';
            return 0;
        }
        /* Store the character and advance the write position. */
        prev = *buf++ = c;
    }
    if( c == EOF ){
        return EOF;
    }

    /* The loop can only stop here because the buffer is full. */
    fputs("Absurdly long tag found! Aborting!\n", stderr);
    exit(1);
}


/*
 * Record one occurrence of the tag name n in the tree rooted at *tp.
 *
 * If a node with the same name already exists its count is incremented;
 * otherwise a new node holding a private copy of n is linked in with a
 * count of 1.  Names that compare lower (strcmp) go under p[0], higher
 * ones under p[1].  On allocation failure the function reports the error
 * with perror() and exits with status 1.
 */
void
push(struct tag **tp, const char *n)
{
    struct tag *t;
    char *copy;
    size_t len;

    /* Descend until the name is found or an empty link is reached. */
    while( (t = *tp) != NULL ){
        int cmp = strcmp(n, t->name);
        if( cmp == 0 ){
            t->count += 1;
            return;
        }
        tp = &t->p[cmp > 0];
    }

    t = malloc(sizeof *t);
    if( t == NULL ){
        perror("malloc");
        exit(1);
    }

    /* Copy the name: the caller's buffer is not owned by the tree. */
    len = strlen(n) + 1;
    copy = malloc(len);
    if( copy == NULL ){
        perror("malloc");
        exit(1);
    }
    memcpy(copy, n, len);

    t->name = copy;
    t->count = 1;
    t->p[0] = t->p[1] = NULL;
    *tp = t;
}


/*
 * Print one "name - count" line for every node of the tree rooted at t.
 * For each node the p[0] subtree is printed first, then the node itself,
 * then the p[1] subtree.
 */
void
walk(struct tag *t)
{
    /* Recurse into the p[0] side; follow the p[1] side by looping. */
    for( ; t != NULL; t = t->p[1] ){
        walk(t->p[0]);
        printf("%s - %d\n", t->name, t->count);
    }
}


/*
 * Read tags from standard input until get_tag() reports EOF, tally them
 * with push(), then print a header line followed by the tally via walk().
 */
int
main(void)
{
    char buf[1024];
    struct tag *tags = NULL;

    /* The second argument is one past the end of buf, bounding get_tag()'s writes. */
    while( get_tag(buf, buf + sizeof buf) != EOF ){
        push(&tags, buf);
    }

    printf("%s - %s\n", "Tag Name", "Number");
    walk(tags);

    return 0;
}

This does not handle invalid input at all well. (For example, a reasonable person would take --Maths-- and -- Maths-- (with whitespace) to be the same token, and the program should probably discard unmatched -- rather than incorporating an enclosing newline into the tag name, and strings like ------- should be handled somewhat cleanly.) Dealing with such details is left as an exercise for the reader.

  •  Tags:  
  • c
  • Related