Home > Back-end >  Capture groups of regex pattern in C using POSIX regex functions
Capture groups of regex pattern in C using POSIX regex functions

Time:11-01

source: https://www.postgresql.org/docs/current/functions-matching.html#FUNCTIONS-POSIX-REGEXP

I'm trying to imitate the following 2 SQL queries in C. The first works; the second failed:

SELECT regexp_match('hello world test', 'world.{3}');   
SELECT regexp_match('foobarbequebaz', '(bar)(beque)');     

#include<regex.h>
#include<stdio.h>
#include<string.h>
#include<stdlib.h>
#define MAX_MATCHES 1024
/*
 * Compile pattern1 as a POSIX extended regex, run it against sz1, then print
 * the offsets of matches[1] and the text copied out of matches[0..2].
 *
 * Exits with EXIT_FAILURE if the pattern does not compile or regexec()
 * reports an error other than REG_NOMATCH; otherwise exits with EXIT_SUCCESS.
 *
 * NOTE(review): this is the program as asked about, with only the pointer
 * arithmetic in the memcpy() calls repaired.  Two problems remain on purpose:
 *   - memcpy() does not NUL-terminate buff0/buff/buff1, so printing them
 *     with "%s" reads indeterminate bytes;
 *   - pattern1 has no parenthesised groups, so matches[1] and matches[2]
 *     are unused slots (POSIX fills them with -1) and must not be used as
 *     offsets.
 */
int main(void)
{
    regex_t regex;
    int reti;
    char msgbuf[100];
    char buff0[20];
    char buff[20];
    char buff1[20];
    char *sz1 = "hello world test";
    //char *sz2= "foobarbequebaz";
    char *pattern1 = "world.{3}";
    //char *pattern2 = "(bar)(beque)";
    regmatch_t matches[MAX_MATCHES];

    /* Compile regular expression */
    reti = regcomp(&regex,pattern1,REG_EXTENDED);
    if(reti){
        fprintf(stderr,"could not compile\n");
        exit(EXIT_FAILURE);
    }

    reti = regexec(&regex,sz1,MAX_MATCHES,matches,0);
    if(!reti){
        /* matches[0] is the whole match; matches[1] is the first group. */
        printf("szso=%d\n",matches[1].rm_so);
        printf("szeo=%d\n",matches[1].rm_eo);
        /* Source pointer is the subject string advanced by the start offset. */
        memcpy(buff0,sz1 + matches[0].rm_so,matches[0].rm_eo-matches[0].rm_so);
        memcpy(buff,sz1 + matches[1].rm_so,matches[1].rm_eo-matches[1].rm_so);
        memcpy(buff1,sz1 + matches[2].rm_so,matches[2].rm_eo-matches[2].rm_so);
        printf("group0: %s\n",buff0);
        printf("group1: %s\n",buff);
        printf("group2: %s\n",buff1);
    }
    else if(reti == REG_NOMATCH){
        puts("No match");
    }
    else{
        /* Translate the regexec() error code into readable text. */
        regerror(reti,&regex,msgbuf,sizeof(msgbuf));
        fprintf(stderr,"Regex match failed: %s\n",msgbuf);
        exit(EXIT_FAILURE);
    }
    regfree(&regex);
    exit(EXIT_SUCCESS);
}

output

szso=3
szeo=11
group1: barbeque

I expect two groups, so group1 should only return bar.


Update to the question:

  • Matching pattern2 against sz2 now behaves as expected.
  • However, if only one part of the pattern matches, then matches[0] should be the same as matches[1].
  • So in this new context, should I expect group0 is the same as group1?

CodePudding user response:

Read the man page. matches[0] contains the whole match, matches[1] the first parenthesized group, matches[2] the second, and so on.

CodePudding user response:

Note that memcpy() gives you a byte array, not a string. You can't reliably use buff in the printf() statement without some extra work. You could use:

/* Print capture group i straight from the subject string: the precision in
 * "%*.*s" limits output to len bytes, so no copy or NUL terminator is needed. */
int i = 0;
int len = (int)(matches[i].rm_eo - matches[i].rm_so);
/* Start at the group's offset; printing from sz2 itself would show the
 * first len bytes of the subject rather than the matched text. */
printf("group1 [%*.*s]\n", len, len, sz2 + matches[i].rm_so);

or you could use:

/* The precision (second len) caps output at len bytes, so buff does not need a NUL terminator. */
printf("group1 [%*.*s]\n", len, len, buff);

to print only the relevant portion of the data. And note the comments from nwellnhof in their answer — which is a reason why I added the variable i into the mix: you can iterate over the matches.

This code works for me:

#include <regex.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define MAX_MATCHES 10

/*
 * Compile pattern1 as a POSIX extended regex, match it against sz1, and for
 * the whole match (index 0) plus every parenthesised sub-expression print the
 * start/end offsets and the matched text.
 *
 * Returns EXIT_SUCCESS on a match or on REG_NOMATCH; exits with EXIT_FAILURE
 * if the pattern does not compile or regexec() reports another error.
 */
int main(void)
{
    regex_t regex;
    int reti;
    char msgbuf[100];
    const char *sz1 = "hello world test";
    const char *pattern1 = "world.{3}";
    regmatch_t matches[MAX_MATCHES];

    /* Compile regular expression */
    reti = regcomp(&regex, pattern1, REG_EXTENDED);
    if (reti)
    {
        fprintf(stderr, "could not compile\n");
        exit(EXIT_FAILURE);
    }
    /* re_nsub is the number of parenthesised sub-expressions in the pattern. */
    printf("Nsubs = %zu\n", regex.re_nsub);

    reti = regexec(&regex, sz1, MAX_MATCHES, matches, 0);
    if (!reti)
    {
        /* Index 0 is the whole match; 1..re_nsub are the groups.  Never walk
         * past the slots that regexec() was actually given. */
        for (size_t i = 0; i <= regex.re_nsub && i < MAX_MATCHES; i++)
        {
            /* Zero-filled on every iteration, so the copy below is always
             * NUL-terminated as long as it stays shorter than the buffer. */
            char buff[20] = "";

            /* regoff_t is int on some systems and wider on others; cast so
             * the argument always matches %lld. */
            printf("szso[%zu]=%lld\n", i, (long long)matches[i].rm_so);
            printf("szeo[%zu]=%lld\n", i, (long long)matches[i].rm_eo);

            /* A group that did not take part in the match has offsets of -1
             * and must not be used for pointer arithmetic. */
            if (matches[i].rm_so >= 0 && matches[i].rm_eo >= matches[i].rm_so)
            {
                size_t len = (size_t)(matches[i].rm_eo - matches[i].rm_so);

                /* Truncate rather than overflow buff; keep one byte for NUL. */
                if (len > sizeof(buff) - 1)
                {
                    len = sizeof(buff) - 1;
                }
                memcpy(buff, sz1 + matches[i].rm_so, len);
            }
            printf("group[%zu]: [%s]\n", i, buff);
        }
    }
    else if (reti == REG_NOMATCH)
    {
        puts("No match");
    }
    else
    {
        /* Translate the regexec() error code into readable text. */
        regerror(reti, &regex, msgbuf, sizeof(msgbuf));
        fprintf(stderr, "Regex match failed: %s\n", msgbuf);
        exit(EXIT_FAILURE);
    }
    regfree(&regex);

    return(EXIT_SUCCESS);
}

It produces:

Nsubs = 0
szso[0]=6
szeo[0]=14
group[0]: [world te]

Because there are no grouping parentheses in the regular expression, there are no subgroups — the only information you get is about the whole of the matched text.

If you used pattern2 from the question as the regex, with the two parenthesized sub-expressions, you would get more data in the matches array when the data matches.

By zapping the entire buff array with nulls on each iteration, there's no problem with the copying not being null-terminated on this sample data. You'd be wise to ensure that it doesn't overflow in real-life code.

If only one of the two parts of the regex matches (in pattern2 in the question), you will get an error back from regexec() and may not rely on anything in the matches array.

  •  Tags:  
  • c
  • Related