I've been trying to fetch the first provider name from a table on a webpage. Because the table's data is spread across several pages, I created a loop within the script to fetch the first provider name from each of the following pages. I can't figure out what exactly I'm doing wrong: I always get the first provider name of the table on the first page, never the ones from the next pages.
If you wish to know how to populate result manually from that page:
- Go to this site website
- Choose `US` from the dropdown right next to `Country`
- Hit the `Search` button
I used the same logic in Python and it worked. The only difference is that in the Python script I created a session and reused that same session in every iteration of the loop.
This is how my current approach looks like:
/**
 * Requests the CARF advanced provider search for country "US" and logs the
 * first provider name found in the results table of each response.
 *
 * Flow: one GET to read the page's hidden form fields, then five POSTs. The
 * first POST submits the search form; each later POST re-submits the hidden
 * fields taken from the previous response together with a pager
 * `__EVENTTARGET`.
 *
 * NOTE(review): no cookie from the initial GET is forwarded to the POST
 * requests, so the server presumably treats every POST as a fresh session and
 * keeps answering with the first page - confirm against the site.
 *
 * Relies on the Apps Script `UrlFetchApp` service and the `Cheerio` library.
 *
 * @returns {void} Nothing; the names are written with `console.log`.
 */
function fetchInformation() {
  const url = 'http://carf.org/advancedProviderSearch.aspx';
  const pageCount = 5;
  const requestHeaders = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36'
  };

  // Copies the hidden state inputs of a loaded page and the fixed search
  // criteria into a fresh form payload.
  const readFormFields = function (doc) {
    const fields = {};
    fields['__VIEWSTATE'] = doc('[id="__VIEWSTATE"]').first().attr('value');
    fields['EktronClientManager'] = doc('[id="EktronClientManager"]').first().attr('value');
    fields['__VIEWSTATEGENERATOR'] = doc('[id="__VIEWSTATEGENERATOR"]').first().attr('value');
    fields['__EVENTVALIDATION'] = doc('[id="__EVENTVALIDATION"]').first().attr('value');
    fields['ctl00$usrWebsiteSearch$txtWebsiteSearch'] = 'Search CARF.org';
    fields['ctl00$body$ddlCity'] = '0';
    fields['ctl00$body$ddlZipcode'] = '0';
    fields['ctl00$body$ddlCountry'] = 'US';
    fields['ctl00$body$ddlProgram'] = '';
    fields['ctl00$body$ddlProgramFocus'] = '';
    fields['ctl00$body$ddlPopulation'] = '';
    return fields;
  };

  const initialResponse = UrlFetchApp.fetch(url, {
    'method': 'GET',
    'muteHttpExceptions': true,
    'headers': requestHeaders
  });

  // First POST: submit the search form itself (button value, empty event target).
  let payload = readFormFields(Cheerio.load(initialResponse.getContentText()));
  payload['__EVENTTARGET'] = '';
  payload['ctl00$body$btnAdvSearch'] = 'Search';

  // A counted loop replaces the original `while` whose counter was never
  // incremented and therefore never terminated.
  for (let page = 0; page < pageCount; page++) {
    const pageResponse = UrlFetchApp.fetch(url, {
      'method': 'POST',
      'payload': payload,
      'muteHttpExceptions': true,
      'headers': requestHeaders
    });
    const doc = Cheerio.load(pageResponse.getContentText());
    console.log(doc('table#ctl00_body_gvProviderAdvSearch tr td > a').first().text());

    // Later POSTs: reuse the fresh hidden fields and trigger the pager control
    // instead of the search button (presumably the "next page" link - confirm).
    payload = readFormFields(doc);
    payload['__EVENTTARGET'] = 'ctl00$body$gvProviderAdvSearch$ctl13$ctl01';
  }
}
How to grab the first provider name from a table of each page spread across next pages?
CodePudding user response:
Modification points:
- When I saw your script, I thought that in your situation, the cookie might be required to be included.
- At the current stage, unfortunately, the `User-Agent` header cannot be changed with UrlFetchApp.
When these points are reflected in your script, it becomes as follows.
Modified script:
/**
 * Walks the first five result pages of the CARF advanced provider search for
 * country "US" and logs the first provider name found on each page.
 *
 * Flow: one GET to read the page's hidden form fields and the cookies the
 * server sets, then five POSTs that send those cookies back. The first POST
 * submits the search form; each later POST re-submits the hidden fields taken
 * from the previous response together with a pager `__EVENTTARGET`.
 *
 * Relies on the Apps Script `UrlFetchApp` service and the `Cheerio` library.
 *
 * @returns {string[]} The first provider name of each requested page, in
 *     request order (each name is also written with `console.log`).
 */
function fetchInformation() {
  const url = 'http://carf.org/advancedProviderSearch.aspx';
  const pageCount = 5;
  // Presumably the pager's "next page" control - confirm against the page markup.
  const pagerTarget = 'ctl00$body$gvProviderAdvSearch$ctl13$ctl01';
  const nameSelector = 'table#ctl00_body_gvProviderAdvSearch tr td > a';
  const hiddenFieldIds = [
    '__VIEWSTATE',
    'EktronClientManager',
    '__VIEWSTATEGENERATOR',
    '__EVENTVALIDATION',
  ];

  // Builds a form payload from a loaded page's hidden inputs, the fixed search
  // criteria, and any request-specific fields.
  const buildPayload = ($, extraFields) => {
    const payload = {};
    hiddenFieldIds.forEach((id) => {
      payload[id] = $('[id="' + id + '"]').first().attr('value');
    });
    payload['ctl00$usrWebsiteSearch$txtWebsiteSearch'] = 'Search CARF.org';
    payload['ctl00$body$ddlCity'] = '0';
    payload['ctl00$body$ddlZipcode'] = '0';
    payload['ctl00$body$ddlCountry'] = 'US';
    payload['ctl00$body$ddlProgram'] = '';
    payload['ctl00$body$ddlProgramFocus'] = '';
    payload['ctl00$body$ddlPopulation'] = '';
    return Object.assign(payload, extraFields);
  };

  // Turns a Set-Cookie header value into a Cookie request header value. The
  // value is a string when one cookie is set and an array when several are,
  // so both shapes are accepted; only each cookie's leading name=value pair is
  // kept (attributes such as path or HttpOnly are dropped).
  const toCookieHeader = (setCookie) => {
    if (setCookie == null) {
      return '';
    }
    const cookies = Array.isArray(setCookie) ? setCookie : [setCookie];
    return cookies
      .map((entry) => String(entry).split(';')[0].trim())
      .filter((pair) => pair !== '')
      .join('; ');
  };

  const initialResponse = UrlFetchApp.fetch(url, {
    method: 'GET',
    muteHttpExceptions: true,
  });

  // Send the cookies back on every POST so all requests share one session.
  const cookie = toCookieHeader(initialResponse.getAllHeaders()['Set-Cookie']);
  const headers = cookie === '' ? {} : { Cookie: cookie };

  // First POST: submit the search form itself (button value, empty event target).
  let payload = buildPayload(Cheerio.load(initialResponse.getContentText()), {
    '__EVENTTARGET': '',
    'ctl00$body$btnAdvSearch': 'Search',
  });

  const names = [];
  // A counted loop replaces the original `while` whose counter was never
  // incremented and therefore never terminated.
  for (let page = 0; page < pageCount; page++) {
    const pageResponse = UrlFetchApp.fetch(url, {
      method: 'POST',
      payload,
      muteHttpExceptions: true,
      headers,
    });
    const $ = Cheerio.load(pageResponse.getContentText());
    const name = $(nameSelector).first().text();
    console.log(name);
    names.push(name);

    // Later POSTs: reuse the fresh hidden fields and trigger the pager control
    // instead of the search button.
    payload = buildPayload($, { '__EVENTTARGET': pagerTarget });
  }
  return names;
}
Result:
When the above-modified script is run, the following result can be seen in the log.
Greenleaf Job Training Services Inc
06Maren
10th District Substance Abuse Program dba New Beginnings, C.A.S.A.
12 Queen Street
12th Street House