For example, I have lots of text files and they're like
Tags: --Maths--, --Physics--, --Programming--, ...
Some other text about subject and --another tag-- inside this text..
Every word between a pair of "--" markers is a tag. Each text file has its own set of tags, and some tags appear in more than one file. The number of tags in a file can be more or fewer than 3. I need to scan the words between the "--" markers, count how many times each tag occurs, and then print the counts to the screen.
Example output that I need:
Tag Name - Number
Maths - 4 times
Physics - 6 times
Programming - 2 times
I've tried this code to get tag names:
FILE *srcFile;
for (int i=0; i<count; i ) {
srcFile = fopen(pathArr[i], "r"); // pathArr has all paths of my txt files to search
char tagArr[100][100]; // array to store all tags
char tagName[100];
int arrIndex = 0;
fscanf(srcFile, "--%s--", tagName);
strcpy(tagArr[arrIndex ], tagName);
fclose(srcFile);
}
but it doesn't work because there's more than one tag in each file and this only gets one of them, I guess?
I don't know whether this can be done with regex, but I'm not allowed to use it anyway. Thanks for any help.
CodePudding user response:
Here's an extremely naive approach that should get you started:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
/* One node of a binary search tree of tag names, keyed by strcmp() order. */
struct tag {
	const char *name;   /* tag text (a private copy made when the node is created) */
	int count;          /* number of times this tag has been pushed */
	struct tag *p[2];   /* children: p[0] holds smaller names, p[1] larger ones */
};
/*
 * Read the next `--tag--` from stdin into the buffer [buf, end).
 *
 * Everything up to and including the opening `--` is discarded; the
 * characters up to the closing `--` are stored in buf as a NUL-terminated
 * string.  Whitespace inside the markers is kept verbatim.
 *
 * Returns 0 when a complete tag was stored, or EOF when input ends before
 * a complete tag is seen.  If the tag does not fit in the buffer the
 * program prints a message to stderr and exits with status 1.
 */
int
get_tag(char *buf, const char *end)
{
	int c, prev = 0;

	/* Discard data until `--` */
	while( (c = getchar()) != EOF ){
		if( c == '-' && prev == '-' ){
			break;
		}
		prev = c;
	}

	/* Reset so the opening `--` cannot double as half of the closing one. */
	prev = 0;

	/* Read in a tag. TODO: handle whitespace better */
	if( c != EOF ){
		while( (c = getchar()) != EOF && buf < end ){
			if( c == '-' && prev == '-' ){
				/*
				 * prev is only ever '-' after a store, so at
				 * least one byte precedes buf here: the first
				 * dash of the closing marker.  Overwrite it
				 * with the terminator.
				 */
				assert(buf[-1] == '-');
				buf[-1] = '\0';
				break;
			}
			/* Store the character AND advance the write position. */
			*buf++ = (char)c;
			prev = c;
		}
	}

	if( c == EOF ){
		return EOF;
	}
	if( buf == end ){
		fputs("Absurdly long tag found! Aborting!\n", stderr);
		exit(1);
	}
	return 0;
}
/*
 * Record one occurrence of tag name `n` in the tree rooted at *tp.
 *
 * If a node with that name already exists its count is incremented;
 * otherwise a new node holding a private copy of `n` is linked in with a
 * count of 1.  *tp is updated when the tree was empty.  On allocation
 * failure an error is reported with perror() and the program exits with
 * status 1.
 */
void
push(struct tag **tp, const char *n)
{
	struct tag *t;

	/* Descend until we find the name or reach the empty slot it belongs in. */
	while( (t = *tp) != NULL ){
		int cmp = strcmp(n, t->name);
		if( cmp == 0 ){
			t->count += 1;
			return;
		}
		/* cmp > 0 selects p[1] (larger names), otherwise p[0]. */
		tp = &t->p[cmp > 0];
	}

	t = malloc(sizeof *t);
	if( t == NULL ){
		perror("malloc");
		exit(1);
	}
	t->name = strdup(n);
	if( t->name == NULL ){
		perror("strdup");
		exit(1);
	}
	t->count = 1;
	t->p[0] = t->p[1] = NULL;
	*tp = t;
}
/*
 * Print every node of the tree rooted at `t` as "name - count", one per
 * line, visiting the p[0] subtree before the node and the p[1] subtree
 * after it.  A NULL tree prints nothing.
 */
void
walk(struct tag *t)
{
	/* Recurse only to the left; follow the right-hand chain in the loop. */
	for( ; t != NULL; t = t->p[1] ){
		walk(t->p[0]);
		printf("%s - %d\n", t->name, t->count);
	}
}
/*
 * Read tags from stdin until EOF, count each distinct tag, then print a
 * header line followed by one "name - count" line per tag.
 */
int
main(void)
{
	char buf[1024];
	struct tag *tags = NULL;

	/* `buf + sizeof buf` is the one-past-the-end pointer get_tag expects. */
	while( get_tag(buf, buf + sizeof buf) != EOF ){
		push(&tags, buf);
	}
	printf("%s - %s\n", "Tag Name", "Number");
	walk(tags);
	return 0;
}
This does not handle invalid input at all well. (For example, a reasonable person would take --Maths--
and -- Maths--
(with whitespace) to be the same token, and the program should probably discard unmatched --
rather than incorporating an enclosing newline into the tag name, and strings like -------
should be handled somewhat cleanly.) Dealing with such details is left as an exercise for the reader.