I have a Python script that captures curl requests:
import re
import json
# Sample shell snippets: plain downloads, piped installers and API-style
# calls (explicit method, headers, credentials or payload) mixed together.
content = """
curl -o output.txt http://example.com
curl https://httpstat.us/400 -f
curl http://executable.sh | bash
curl ftp://executable.sh | sudo bash
curl www.helloworld.com > test.file
curl -X 'GET' 'http://localhost:8000' -H 'accept: application/json'
curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | bash
curl -X 'GET' 'http://localhost:8000' -H 'application/json'
curl -X 'GET' "http://localhost:8000" -H 'application/json'
RUN curl --user "APITest:API.User" https://secure.example.com/api/REST/1.0/data/contacts?count=2
curl --header "Content-Type: application/json" -d '{"emailAddress":"[email protected]"}' https://secure.example.com/api/REST/1.0/data/contact
curl -X GET -H "Authorization: Bearer {ACCESS_TOKEN}" "https://api.server.io/posts"
curl --user "<companyName>:<userName>" --request GET https://secure.p0<podNumber>.eloqua.com/api/<apiType>/<apiVersion>/<endpoint>
curl --user "APITest:API.User" --header "Content-Type: application/json" --request POST --data '{"emailAddress":"[email protected]"}' https://secure.example.com/api/REST/1.0/data/contact
curl --user "APITest:API.User" --header "Content-Type: application/json" --request PUT --data '{"id":"1","emailAddress":"[email protected]","businessPhone":"555-555-5555"}' https://secure.example.com/api/REST/1.0/data/contact/1
"""
# Every `\S` carries a `+` quantifier: a URL is a run of non-space characters.
# With a bare `\S ` (one non-space character followed by a literal space) no
# URL in the sample can match and findall() returns an empty list.
#
# Groups: 1 = the matched command, 2 = options before the URL (if any),
# 3 = optional token directly in front of the URL, 4 = the URL (for ftp: also
# the remainder of the line), 5 = text following an ftp: URL.
#
# This pattern matches *every* curl invocation that contains a URL, so it does
# not yet separate API calls from plain downloads.
curl_extractor_regex = re.compile(r'(curl (-.*)?(\S+)?(https?:\S+|www\.\S+|ftp:\S+(.*)))')
# Because the pattern has capture groups, findall() returns one tuple of
# groups per match rather than the matched text alone.
data = curl_extractor_regex.findall(content)
print(json.dumps(data, indent=4))
Is there a good/reliable way to identify instances of curl that are just calling an API?
Expected Result :
curl -X 'GET' 'http://localhost:8000' -H 'accept: application/json'
curl -X 'GET' 'http://localhost:8000' -H 'application/json'
curl -X 'GET' "http://localhost:8000" -H 'application/json'
curl --user "APITest:API.User" https://secure.example.com/api/REST/1.0/data/contacts?count=2
curl --header "Content-Type: application/json" -d '{"emailAddress":"[email protected]"}' https://secure.example.com/api/REST/1.0/data/contact
curl -X GET -H "Authorization: Bearer {ACCESS_TOKEN}" "https://api.server.io/posts"
curl --user "<companyName>:<userName>" --request GET https://secure.p0<podNumber>.eloqua.com/api/<apiType>/<apiVersion>/<endpoint>
curl --user "APITest:API.User" --header "Content-Type: application/json" --request POST --data '{"emailAddress":"[email protected]"}' https://secure.example.com/api/REST/1.0/data/contact
curl --user "APITest:API.User" --header "Content-Type: application/json" --request PUT --data '{"id":"1","emailAddress":"[email protected]","businessPhone":"555-555-5555"}' https://secure.example.com/api/REST/1.0/data/contact/1
Note: The content in the Python script above is just an example set of curl requests. The regex should find any curl request that performs an API call. The reason for using a regex is to find a pattern that covers all kinds of API requests, rather than one tied to a specific URL, request method, or request header.
https://regex101.com/r/MCGpMp/1
CodePudding user response:
You cannot validate a URL with a regex; a regex can only match a pattern. I assume that `curl` followed by `-X`, `--user` or `--header` is what marks a valid API request.
import re
# Sample input. Each command must stay on one physical line, because the
# matching below is done line by line and `.*` never crosses a newline.
content = """
curl -o output.txt http://example.com
curl https://httpstat.us/400 -f
curl http://executable.sh | bash
curl ftp://executable.sh | sudo bash
curl www.helloworld.com > test.file
curl -X 'GET' 'http://localhost:8000' -H 'accept: application/json'
curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | bash
curl -X 'GET' 'http://localhost:8000' -H 'application/json'
curl -X 'GET' "http://localhost:8000" -H 'application/json'
RUN curl --user "APITest:API.User" https://secure.example.com/api/REST/1.0/data/contacts?count=2
curl --header "Content-Type: application/json" -d '{"emailAddress":"[email protected]"}' https://secure.example.com/api/REST/1.0/data/contact
curl -X GET -H "Authorization: Bearer {ACCESS_TOKEN}" "https://api.server.io/posts"
curl --user "<companyName>:<userName>" --request GET https://secure.p0<podNumber>.eloqua.com/api/<apiType>/<apiVersion>/<endpoint>
curl --user "APITest:API.User" --header "Content-Type: application/json" --request POST --data '{"emailAddress":"[email protected]"}' https://secure.example.com/api/REST/1.0/data/contact
curl --user "APITest:API.User" --header "Content-Type: application/json" --request PUT --data '{"id":"1","emailAddress":"[email protected]","businessPhone":"555-555-5555"}' https://secure.example.com/api/REST/1.0/data/contact/1
"""
content_split = content.split('\n')
# A command is kept when `curl` is immediately followed by -X, --user or
# --header; the trailing `.*` then takes the rest of that line.
regex = r'(curl)\s(-X|--user|--header).*'
url_lst = []
for line in content_split:
    if line:  # skip the empty strings produced by the leading/trailing newline
        for found in re.finditer(regex, line):
            # group(0) is the full matched text; it starts at "curl", so any
            # prefix on the line (e.g. "RUN ") is not included.
            url_lst.append(found.group(0))
print(url_lst)
>>>["curl -X 'GET' 'http://localhost:8000' -H 'accept: application/json'", "curl -X 'GET' 'http://localhost:8000' -H 'application/json'", 'curl -X \'GET\' "http://localhost:8000" -H \'application/json\'', 'curl --user "APITest:API.User" https://secure.example.com/api/REST/1.0/data/contacts?count=2', 'curl --header "Content-Type: application/json" -d \'{"emailAddress":"[email protected]"}\' https://secure.example.com/api/REST/1.0/data/contact', 'curl -X GET -H "Authorization: Bearer {ACCESS_TOKEN}" "https://api.server.io/posts"', 'curl --user "<companyName>:<userName>" --request GET https://secure.p0<podNumber>.eloqua.com/api/<apiType>/<apiVersion>/<endpoint>', 'curl --user "APITest:API.User" --header "Content-Type: application/json" --request POST --data \'{"emailAddress":"[email protected]"}\' https://secure.example.com/api/REST/1.0/data/contact', 'curl --user "APITest:API.User" --header "Content-Type: application/json" --request PUT --data \'{"id":"1","emailAddress":"[email protected]","businessPhone":"555-555-5555"}\' https://secure.example.com/api/REST/1.0/data/contact/1']
CodePudding user response:
If all the examples to match are on a single line, you can use `re.findall` and match `curl` followed by `-X`, `--header` or `--user`:
\bcurl\s.*(?:-X|--(?:header|user)).*
See a regex demo and a Python demo
If there should be another part present like for example a certain protocol, you can use a positive lookahead assertion (and extend it as needed):
\bcurl\s(?=.*(?:ht|f)tps?://).*(?:-X|--(?:header|user)).*
Explanation
- `\bcurl\s` Match the word `curl` followed by a whitespace char
- `(?=.*(?:ht|f)tps?://)` Positive lookahead, assert a protocol like http or ftp is present in the line
- `.*` Match the whole line
- `(?:` Non capture group for the alternatives
  - `-X` Match literally
  - `|` Or
  - `--(?:header|user)` Match either `--header` or `--user`
- `)` Close the non capture group
- `.*` Match the rest of the line
Example
import re
import json
# Shell snippets used to exercise the pattern: downloads and piped installers
# alongside commands that pass -X, --header or --user.
content = """
curl -o output.txt http://example.com
curl https://httpstat.us/400 -f
curl http://executable.sh | bash
curl ftp://executable.sh | sudo bash
curl www.helloworld.com > test.file
curl -X 'GET' 'http://localhost:8000' -H 'accept: application/json'
curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | bash
curl -X 'GET' 'http://localhost:8000' -H 'application/json'
curl -X 'GET' "http://localhost:8000" -H 'application/json'
RUN curl --user "APITest:API.User" https://secure.example.com/api/REST/1.0/data/contacts?count=2
curl --header "Content-Type: application/json" -d '{"emailAddress":"[email protected]"}' https://secure.example.com/api/REST/1.0/data/contact
curl -X GET -H "Authorization: Bearer {ACCESS_TOKEN}" "https://api.server.io/posts"
curl --user "<companyName>:<userName>" --request GET https://secure.p0<podNumber>.eloqua.com/api/<apiType>/<apiVersion>/<endpoint>
curl --user "APITest:API.User" --header "Content-Type: application/json" --request POST --data '{"emailAddress":"[email protected]"}' https://secure.example.com/api/REST/1.0/data/contact
curl --user "APITest:API.User" --header "Content-Type: application/json" --request PUT --data '{"id":"1","emailAddress":"[email protected]","businessPhone":"555-555-5555"}' https://secure.example.com/api/REST/1.0/data/contact/1
"""
# `curl` as a whole word, then somewhere later on the same line either -X or
# a --header / --user option; the final `.*` extends the match to end of line.
curl_extractor_regex = re.compile(r'\bcurl\s.*(?:-X|--(?:header|user)\b).*')
# The pattern has no capture groups, so collecting each match's full text
# yields a flat list of matched command strings.
data = [hit.group(0) for hit in curl_extractor_regex.finditer(content)]
print(json.dumps(data, indent=4))
Output
[
"curl -X 'GET' 'http://localhost:8000' -H 'accept: application/json'",
"curl -X 'GET' 'http://localhost:8000' -H 'application/json'",
"curl -X 'GET' \"http://localhost:8000\" -H 'application/json'",
"curl --user \"APITest:API.User\" https://secure.example.com/api/REST/1.0/data/contacts?count=2",
"curl --header \"Content-Type: application/json\" -d '{\"emailAddress\":\"[email protected]\"}' https://secure.example.com/api/REST/1.0/data/contact",
"curl -X GET -H \"Authorization: Bearer {ACCESS_TOKEN}\" \"https://api.server.io/posts\"",
"curl --user \"<companyName>:<userName>\" --request GET https://secure.p0<podNumber>.eloqua.com/api/<apiType>/<apiVersion>/<endpoint>",
"curl --user \"APITest:API.User\" --header \"Content-Type: application/json\" --request POST --data '{\"emailAddress\":\"[email protected]\"}' https://secure.example.com/api/REST/1.0/data/contact",
"curl --user \"APITest:API.User\" --header \"Content-Type: application/json\" --request PUT --data '{\"id\":\"1\",\"emailAddress\":\"[email protected]\",\"businessPhone\":\"555-555-5555\"}' https://secure.example.com/api/REST/1.0/data/contact/1"
]