I am trying to scrape this webpage daily with Python for a school project:
I am trying to mimic that same POST request in Python so that I can get the .txt file that the request would generate.
from urllib import request, parse
# Form fields for the report-download POST request.
data_dict = {
'Data':'Stamp_1',
'Title':'Retired Offset Credits',
'Exclude':',rhid,ftType,Other Attributes here,Make Public,ahid,',
'Columns':'all,Account Holder,Quantity of Offset Credits,FacilityName,Email,Status Effective',
'Masks':'|||||MM/DD/YYYY',
'ClassMasks':',,#.0,,,',
'Headings':',,,Project Name,,',
'FormatType':'txt'
}
# URL-encode the form fields and convert the result to bytes.
data = parse.urlencode(data_dict).encode()
# NOTE(review): the raw dict `data_dict` is passed as the body here, not the
# encoded bytes stored in `data` on the previous line.
req = request.Request('https://thereserve2.apx.com/myModule/include/rptdownload.asp', data=data_dict)
# NOTE(review): urlretrieve is given the Request object rather than a URL
# string - presumably the source of the reported TypeError; confirm against
# the urllib.request documentation.
resp = request.urlretrieve(req, 'download.txt')
This isn't working: I'm getting "TypeError: expected string or bytes-like object". I feel like I'm getting close, but I just can't seem to translate the POST request into the file download or table pull that I want. Any help would be greatly appreciated.
CodePudding user response:
The request also needs the session cookie to work:
import requests
from io import StringIO
import pandas as pd
# Form fields sent in the body of the report-download POST request.
# Fields with no value are sent as empty strings.
data = {
    'myFilter': '',
    'Data': 'Stamp_0',
    'Title': 'Retired Offset Credits',
    'Exclude': ',rhid,ftType,Other Attributes here,Make Public,ahid,',
    'Columns': 'all,Account Holder,Quantity of Offset Credits,FacilityName,Email,Status Effective',
    'Masks': '|||||MM/DD/YYYY',
    'ClassMasks': ',,#.0,,,',
    'Headings': ',,,Project Name,,',
    'Parameters': '',
    'ParametersOriginal': '',
    'SortORder': '',
    'FormatType': 'txt',
    'ReplaceExpression': '',
    'ReplaceValue': '',
}

# Session cookie sent with the request.
# NOTE(review): this is a hard-coded session id and presumably expires;
# replace it with a fresh value if the server stops returning the report.
cookies = {
    'ASPSESSIONIDCGTRQSDS': 'DFDMDAFDFEPACLKJAAPHHBDH',
}

# Get the file
response = requests.post(
    'https://thereserve2.apx.com/myModule/include/rptdownload.asp',
    cookies=cookies,
    data=data,
    timeout=60,  # do not let a scheduled run hang forever on a stalled server
)
# Fail loudly on an HTTP error instead of parsing/saving an error page.
response.raise_for_status()

# Look at the file
# The payload is comma-separated; malformed rows are reported as warnings
# rather than aborting the parse.
df = pd.read_table(StringIO(response.text), sep=',', on_bad_lines='warn')
print(df.head())

# Write the file
# Save the raw response bytes exactly as received.
with open('download.txt', 'wb') as f:
    f.write(response.content)
Output:
Vintage Offset Credit Serial Numbers Quantity of Offset Credits Status Effective Project ID Project Name Project Type Protocol Version Project Site Location Project Site State Project Site Country Additional Certification(s) CORSIA Eligible Account Holder Retirement Reason Retirement Reason Details Unnamed: 16
0 2021 CAR-1-US-888-4-666-TX-2021-6665-1 to 17444 17444 12/09/2021 CAR888 Angelina County Landfill Landfill Gas Capture/Combustion Version 3.0 Lufkin TEXAS US NaN No Element Markets Emissions, LLC On Behalf of Third Party NaN NaN
1 2021 CAR-1-US-1247-37-234-MT-2021-6653-1 to 110 110 04/20/2022 CAR1247 Bluesource - Carroll Avoided Grassland Convers... Avoided Grassland Conversion Version 1.0 Valley County, MT MONTANA US NaN No Cool Effect Environmental Benefit NaN NaN
2 2021 CAR-1-MX-1282-42-938-PU-2021-6736-1 to 1604 1604 02/17/2022 CAR1282 Captura de carbono en San Rafael Ixtapalucan Forestry - MX Version 1.5 San Rafael Ixtapalucan PUEBLA MX NaN No Cultivo Land PBC On Behalf of Third Party Meta / Facebook Sustainability Goals NaN
3 2021 CAR-1-MX-1282-42-938-PU-2021-6734-1 to 5 5 02/17/2022 CAR1282 Captura de carbono en San Rafael Ixtapalucan Forestry - MX Version 1.5 San Rafael Ixtapalucan PUEBLA MX NaN No Cultivo Land PBC On Behalf of Third Party Meta / Facebook Sustainability Goals NaN
4 2021 CAR-1-MX-1415-42-938-OA-2021-6719-1 to 213 213 12/06/2021 CAR1415 Carbono, Agua y Biodiversidad Indígena Capulálpam Forestry - MX Version 2.0 Capulálpam de Méndez, Oaxaca OAXACA MX NaN No Cool Effect Environmental Benefit NaN NaN