Home > Software engineering >  Exact regexp match in awk
Exact regexp match in awk

Time:10-11

I'm doing an exercise on this output:

; stdio.h
int fclose(file);
int feof(file);
int ferror(file);
; stdlib.h
int atexit(addr);
void exit(int);
void free(addr);
long random();
; unistd.h
void _exit(int);
int access(string,int);
uint alarm(uint);
int chdir(string);
int chown(string,int,int);
int close(int);

I have to list all the functions in unistd.h that return an int and accept at least one parameter of type int.

This is what i did:

sed -n '/^; unistd.h/,$p' testo.txt | awk '$0~/^int/ && $2~/int/ {print $0} '

But it's not exactly right. Because if the function alarm returned an int, but accepted a uint, it would have been selected by my awk, because of $2~/int/.

I want it to match exactly that pattern. I could use ==, but in this example it would be too complex. I could use grep -w (which matches whole words only), but I still couldn't use it here.

Any solutions?

CodePudding user response:

Since the most recent versions of gcc declare all functions in unistd.h as extern, you will need to account for extern at the start of the line. You can do that with sed, requiring a return type of int and at least one parameter of type int, with the following:

sed -n '/^[^(]*int[^(]*([^)]*int[^)]*)/p' /usr/include/unistd.h

Where the REGEX is:

  • ^ match at the beginning of the line,
  • [^(]* zero or more characters not '('
  • int - followed by int
  • followed by [^(]* zero or more characters not '('
  • and an actual '('
  • followed by [^)]* zero or more characters not ')'
  • int - followed by int
  • followed by [^)]* zero or more characters not ')'
  • followed by an actual ')'

Which is a long way of finding int before '(' and int after '(' and before ')'.

Example Use/Output

Note: long int and unsigned int are also matched as int types, but you could add an exclusion for long or unsigned before int if needed (that is left to you).

$ sed -n '/^[^(]*int[^(]*([^)]*int[^)]*)/p' /usr/include/unistd.h
extern int access (const char *__name, int __type) __THROW __nonnull ((1));
extern int euidaccess (const char *__name, int __type)
extern int eaccess (const char *__name, int __type)
extern int faccessat (int __fd, const char *__file, int __type, int __flag)
extern int close (int __fd);
extern int pipe (int __pipedes[2]) __THROW __wur;
extern int pipe2 (int __pipedes[2], int __flags) __THROW __wur;
extern unsigned int alarm (unsigned int __seconds) __THROW;
extern unsigned int sleep (unsigned int __seconds);
extern int fchown (int __fd, __uid_t __owner, __gid_t __group) __THROW __wur;
extern int fchdir (int __fd) __THROW __wur;
extern int dup (int __fd) __THROW __wur;
extern int dup2 (int __fd, int __fd2) __THROW;
extern int dup3 (int __fd, int __fd2, int __flags) __THROW;
extern int fexecve (int __fd, char *const __argv[], char *const __envp[])
extern int nice (int __inc) __THROW __wur;
extern long int pathconf (const char *__path, int __name)
extern long int fpathconf (int __fd, int __name) __THROW;
extern long int sysconf (int __name) __THROW;
extern int getgroups (int __size, __gid_t __list[]) __THROW __wur
extern int ttyname_r (int __fd, char *__buf, size_t __buflen)
extern int isatty (int __fd) __THROW;
extern int unlinkat (int __fd, const char *__name, int __flag)
extern int tcsetpgrp (int __fd, __pid_t __pgrp_id) __THROW;
extern int sethostid (long int __id) __THROW __wur;
extern int daemon (int __nochdir, int __noclose) __THROW __wur;
extern int fsync (int __fd);
extern int syncfs (int __fd) __THROW;
extern int ftruncate (int __fd, __off_t __length) __THROW __wur;
extern int __REDIRECT_NTH (ftruncate, (int __fd, __off64_t __length),
extern int ftruncate64 (int __fd, __off64_t __length) __THROW __wur;
extern long int syscall (long int __sysno, ...) __THROW;
extern int lockf (int __fd, int __cmd, __off_t __len) __wur;
extern int __REDIRECT (lockf, (int __fd, int __cmd, __off64_t __len),
extern int lockf64 (int __fd, int __cmd, __off64_t __len) __wur;
extern int fdatasync (int __fildes);

CodePudding user response:

Your $0~/^int/ expression should be:

$1 ~ /^int$/

so that int64_t as the return type is not matched. Your second expression should be:

$2 ~ /[(,]int[,)]/

That matches any of: "(int)", "(int,", ",int," and ",int)".

In general, don't pipe sed into awk because awk can do it too. However, in this case, awk's range operator, when used like this:

awk '/; unistd.h/,/^;/ { print NR ":", $0 }'

only matches the unistd.h line(s) because the end of range pattern also matches lines that match the start of range pattern. Perl has the .. operator which behaves like that and the ... operator which behaves like sed. So, it seems in this example, sed is necessary. Searching for the next line starting with a semicolon means you can add more headers after the data from unistd.h and still get the right subset of the data analyzed. It stops at EOF if no more lines start with a semicolon, of course.

That leads to:

sed -n -e '/^; unistd.h/,/^;/p' testo.txt |
awk '$1 ~ /^int$/ && $2 ~ /[(,]int[,)]/ { print $0 }'

You could use $1 == "int" instead of a regex.

This analysis does depend on the file format being rigorously as shown — two fields separated by white space, with no extraneous white space.

CodePudding user response:

Using any awk:

$ cat tst.awk
sub(/^; */,"") { file = $0; next }
(file == "unistd.h") && /^int .*[(,]int[,)]/

$ awk -f tst.awk file
int access(string,int);
int chown(string,int,int);
int close(int);

or if you prefer:

$ cat tst.awk
BEGIN {
    FS = "[() ;,]+"
}
sub(/^; */,"") {
    file = $0
    next
}
(file == "unistd.h") && ($1 == "int") {
    for ( i=3; i<NF; i++ ) {
        if ( $i == "int" ) {
            print
            next
        }
    }
}

$ awk -f tst.awk file
int access(string,int);
int chown(string,int,int);
int close(int);

CodePudding user response:

Consider:

sed -n '/^; unistd.h/,/^;/{/int [^(]\+.*[^a-z]int[^a-z]/p}'
  •  Tags:  
  • cawk
  • Related