source: https://www.postgresql.org/docs/current/functions-matching.html#FUNCTIONS-POSIX-REGEXP
I'm trying to imitate the following 2 SQL queries in C. The first works; the second failed:
SELECT regexp_match('hello world test', 'world.{3}');
SELECT regexp_match('foobarbequebaz', '(bar)(beque)');
#include<regex.h>
#include<stdio.h>
#include<string.h>
#include<stdlib.h>
#define MAX_MATCHES 1024
/*
 * Copy the text of one regexec() match out of `subject` into `dst` as a
 * NUL-terminated string.
 *
 * regexec() reports an unused or non-participating group with
 * rm_so == -1; in that case `dst` becomes the empty string. A match longer
 * than the buffer is truncated to dstsz - 1 bytes.
 */
static void copy_match(char *dst, size_t dstsz, const char *subject,
                       const regmatch_t *m)
{
    size_t len = 0;

    if (dstsz == 0) {
        return;
    }
    if (m->rm_so >= 0 && m->rm_eo >= m->rm_so) {
        len = (size_t)(m->rm_eo - m->rm_so);
        if (len > dstsz - 1) {
            len = dstsz - 1;
        }
        memcpy(dst, subject + m->rm_so, len);
    }
    /* memcpy() copies raw bytes only; terminate so %s can print it. */
    dst[len] = '\0';
}

/*
 * Compile a POSIX extended regular expression, match it against a fixed
 * subject string, and print the whole match (group 0) and the first two
 * parenthesized groups (groups 1 and 2).
 *
 * Exits with EXIT_FAILURE if the pattern does not compile or regexec()
 * reports an error other than REG_NOMATCH.
 */
int main(void)
{
    regex_t regex;
    int reti;
    char msgbuf[100];
    char buff0[20];
    char buff[20];
    char buff1[20];
    const char *sz1 = "hello world test";
    //char *sz2= "foobarbequebaz";
    const char *pattern1 = "world.{3}";
    //char *pattern2 = "(bar)(beque)";
    regmatch_t matches[MAX_MATCHES];

    /* Compile regular expression */
    reti = regcomp(&regex, pattern1, REG_EXTENDED);
    if (reti) {
        fprintf(stderr, "could not compile\n");
        exit(EXIT_FAILURE);
    }

    reti = regexec(&regex, sz1, MAX_MATCHES, matches, 0);
    if (!reti) {
        /* regoff_t may be wider than int; cast to match the %d specifier. */
        printf("szso=%d\n", (int)matches[1].rm_so);
        printf("szeo=%d\n", (int)matches[1].rm_eo);

        /* matches[0] is the whole match; matches[1], matches[2] are the groups. */
        copy_match(buff0, sizeof(buff0), sz1, &matches[0]);
        copy_match(buff, sizeof(buff), sz1, &matches[1]);
        copy_match(buff1, sizeof(buff1), sz1, &matches[2]);

        printf("group0: %s\n", buff0);
        printf("group1: %s\n", buff);
        printf("group2: %s\n", buff1);
    }
    else if (reti == REG_NOMATCH) {
        puts("No match");
    }
    else {
        regerror(reti, &regex, msgbuf, sizeof(msgbuf));
        fprintf(stderr, "Regex match failed: %s\n", msgbuf);
        regfree(&regex);
        exit(EXIT_FAILURE);
    }

    regfree(&regex);
    exit(EXIT_SUCCESS);
}
output
szso=3
szeo=11
group1: barbeque
I expected two groups, so group1 should only return "bar".
Update to the question:
Matching pattern2 against sz2 behaves as expected.
- However, if only one part of the pattern matches, then matches[0] should be the same as matches[1].
- So in this new context, should I expect group0 to be the same as group1?
CodePudding user response:
Read the man page. matches[0]
contains the whole match, matches[1]
the first parenthesized group, matches[2]
the second, and so on.
CodePudding user response:
Note that memcpy()
gives you a byte array, not a string. You can't reliably use buff
in the printf()
statement without some extra work. You could use:
/* Index of the match to print: 0 is the whole match, 1..n are the groups. */
int i = 0;
/* regoff_t may be wider than int, and the '*' arguments of printf must be int. */
int len = (int)(matches[i].rm_eo - matches[i].rm_so);
/* Start at the match offset so the matched text is printed, not the start of sz2. */
printf("group1 [%*.*s]\n", len, len, sz2 + matches[i].rm_so);
or you could use:
/* The precision is len, so at most len bytes of buff are read and a
   terminating NUL is not required. */
printf("group1 [%*.*s]\n", len, len, buff);
to print only the relevant portion of the data. And note the comments from nwellnhof in their answer — which is a reason why I added the variable i
into the mix: you can iterate over the matches.
This code works for me:
#include <regex.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define MAX_MATCHES 10
/*
 * Compile a POSIX extended regular expression, match it against a fixed
 * subject string, and print the offsets and text of the whole match and of
 * every parenthesized sub-expression (regex.re_nsub of them).
 *
 * Returns EXIT_SUCCESS on a match or on REG_NOMATCH; exits with EXIT_FAILURE
 * if the pattern does not compile or regexec() reports another error.
 */
int main(void)
{
    regex_t regex;
    int reti;
    char msgbuf[100];
    const char *sz1 = "hello world test";
    const char *pattern1 = "world.{3}";
    regmatch_t matches[MAX_MATCHES];

    /* Compile regular expression */
    reti = regcomp(&regex, pattern1, REG_EXTENDED);
    if (reti)
    {
        fprintf(stderr, "could not compile\n");
        exit(EXIT_FAILURE);
    }
    printf("Nsubs = %zu\n", regex.re_nsub);

    reti = regexec(&regex, sz1, MAX_MATCHES, matches, 0);
    if (!reti)
    {
        /* Entry 0 is the whole match; entries 1..re_nsub are the groups.
           Never index past the array handed to regexec(). */
        for (size_t i = 0; i <= regex.re_nsub && i < MAX_MATCHES; i++)
        {
            /* Zero-filled on every iteration, so the copy below is always
               NUL-terminated as long as it stays shorter than the buffer. */
            char buff[20] = "";

            /* regoff_t is not necessarily long long; cast to match %lld. */
            printf("szso[%zu]=%lld\n", i, (long long)matches[i].rm_so);
            printf("szeo[%zu]=%lld\n", i, (long long)matches[i].rm_eo);

            /* A group that did not take part in the match has rm_so == -1. */
            if (matches[i].rm_so >= 0 && matches[i].rm_eo >= matches[i].rm_so)
            {
                size_t len = (size_t)(matches[i].rm_eo - matches[i].rm_so);

                /* Truncate rather than overflow buff; keep room for the NUL. */
                if (len > sizeof(buff) - 1)
                {
                    len = sizeof(buff) - 1;
                }
                memcpy(buff, sz1 + matches[i].rm_so, len);
            }
            printf("group[%zu]: [%s]\n", i, buff);
        }
    }
    else if (reti == REG_NOMATCH)
    {
        puts("No match");
    }
    else
    {
        regerror(reti, &regex, msgbuf, sizeof(msgbuf));
        fprintf(stderr, "Regex match failed: %s\n", msgbuf);
        regfree(&regex);
        exit(EXIT_FAILURE);
    }

    regfree(&regex);
    return(EXIT_SUCCESS);
}
It produces:
Nsubs = 0
szso[0]=6
szeo[0]=14
group[0]: [world te]
Because there are no grouping parentheses in the regular expression, there are no subgroups — the only information you get is about the whole of the matched text.
If you used pattern2
from the question as the regex, with the two parenthesized sub-expressions, you would get more data in the matches
array when the data matches.
By zapping the entire buff
array with nulls on each iteration, there's no problem with the copying not being null-terminated on this sample data. You'd be wise to ensure that it doesn't overflow in real-life code.
If only one of the two parts of the regex matches (in pattern2
in the question), you will get an error back from regexec()
and may not rely on anything in the matches
array.