I need some help to extract some data from a website. Unfortunately requires logins so no point in sharing the link. Hopefully, for someone with HTML understanding will be straight forward to help me.
I'm using the following code to extract Profiles for the web-page (see image below).
Sub testWebScap()
' Code uses the links from the webpage, create sheets with the customer name and paste all Profile details
Dim oHttp As Object, Html As HTMLDocument
Dim HTMLDoc As New MSHTML.HTMLDocument
Dim HTMLClCol As MSHTML.IHTMLElementCollection
Dim HTMLCl As MSHTML.IHTMLElement
Dim i As Integer, j As Integer
Dim contentPage As String
Set Html = New HTMLDocument
Set oHttp = CreateObject("MSXML2.XMLHTTP")
j = 2
' ClientList sheet contains the webpage link for each customer.
For i = 2 To ThisWorkbook.Sheets("ClientList").Range("A65000").End(xlUp).Row
' Loop through each customer profile webpage
Sheets.Add(After:=Sheets(Sheets.Count)).Name = ThisWorkbook.Sheets("ClientList").Range("A" & i)
contentPage = "https://my.powerdiary.com" & ThisWorkbook.Sheets("ClientList").Range("B" & i)
j = 2
With oHttp
.Open "GET", contentPage, False
.send
sResp = .responseText
End With
HTMLDoc.body.innerHTML = oHttp.responseText
Set HTMLClCol = HTMLDoc.getElementsByClassName("form-control-label col-lg-4 col-12")
' Loop through Label-elements From Salutation to Country but skipping the Middle Name and Telephone Home. Labels copied in Column A
For Each HTMLCl In HTMLClCol
ActiveSheet.Range("A" & j).Value = HTMLCl.innerHTML
j = j 1
Next HTMLCl
Set HTMLClCol = HTMLDoc.getElementsByClassName("form-control-label col-xl-4 col-12")
' Loop through Label-elements From Status to NDIS but skipping the Medicare Ref, Month and Year. Labels copied in Column A
For Each HTMLCl In HTMLClCol
ActiveSheet.Range("A" & j).Value = HTMLCl.innerHTML
j = j 1
Next HTMLCl
Set HTMLClCol = HTMLDoc.getElementsByClassName("form-control")
j = 2
' Loop through All values (including those correspond to the missing labels). Values copied in Column B
For Each HTMLCl In HTMLClCol
ActiveSheet.Range("B" & j).Value = HTMLCl.getAttribute("value")
j = j 1
Next HTMLCl
Next i
End Sub
The code works partially as:
- I'm missing some label-elements (check the Data Sheet image). For instance can't get the middle name label (the missing one are in red font on the image).
- The value-elements are shifted by one row (I guess the code picks up an empty one at the start but can't figure why)
As I can't share with you the webpage, I have added the HTML part that corresponds to the Profile Data.
<form action="/Profile/33" data-prompt-unsaved="" method="post"> <div class="panel-body p-4">
<div class="row mt10 mb20">
<div class="col-xl-6 col-12 pr-md-4">
<div class="form-group row mb20">
<label class="form-control-label col-lg-4 col-12" for="NameView_Salutation">Salutation</label>
<div class="col-sm-12 col-lg-8">
<select class="form-control" id="NameView_Salutation" name="NameView.Salutation"><option value="">No Salutation</option>
<option value="Mr">Mr</option>
<option value="Ms">Ms</option>
<option value="Mrs">Mrs</option>
<option value="Miss">Miss</option>
<option value="Mx">Mx</option>
<option selected="selected" value="Master">Master</option>
<option value="Dr">Dr</option>
<option value="Prof">Prof</option>
</select>
</div>
</div>
<div class="form-group row mb20">
<label class="form-control-label col-lg-4 col-12" for="NameView_ClientFirstName">Name</label>
<div class="col-lg-4 col-6">
<input class="form-control medicare-first-name" data-val="true" data-val-length="The field First Name must be a string with a maximum length of 100." data-val-length-max="100" data-val-required="The First Name field is required." id="NameView_ClientFirstName" name="NameView.ClientFirstName" placeholder="First Name" type="text" value="TestFName" />
<span class="field-validation-valid" data-valmsg-for="NameView.ClientFirstName" data-valmsg-replace="true"></span>
</div>
<label class="sr-only" for="NameView_ClientMiddleName">Middle Name</label>
<div class="col-lg-4 col-6">
<input class="form-control" data-val="true" data-val-length="The field ClientMiddleName must be a string with a maximum length of 80." data-val-length-max="80" id="NameView_ClientMiddleName" name="NameView.ClientMiddleName" placeholder="Middle Name" type="text" value="TestMName" />
</div>
</div>
<div class="form-group row mb20">
<label class="form-control-label col-lg-4 col-12" for="NameView_ClientLastName">Last Name</label>
<div class="col-sm-12 col-lg-8">
<input class="form-control medicare-last-name" data-val="true" data-val-length="The field ClientLastName must be a string with a maximum length of 80." data-val-length-max="80" id="NameView_ClientLastName" name="NameView.ClientLastName" placeholder="Last Name" type="text" value="TestLName" />
</div>
</div>
<div class="form-group row mb20">
<label class="form-control-label col-lg-4 col-12" for="NameView_PreferredClientName">Preferred Name</label>
<div class="col-sm-12 col-lg-8">
<input class="form-control" data-val="true" data-val-length="The field PreferredClientName must be a string with a maximum length of 200." data-val-length-max="200" id="NameView_PreferredClientName" name="NameView.PreferredClientName" type="text" value="TestPName" />
</div>
</div>
<div class="form-group row mb20" id="clientDateofBirthBlock">
<label class="form-control-label col-lg-4 col-12" for="BirthView_DateOfBirth">Date of Birth</label>
<div class="col-lg-8 col-sm-12 col-12">
<div class="input-group date bs-datepicker">
<input CurrentMenuItem="ClientDetails" autocomplete="off" class="form-control form-control medicare-date-of-birth" data-val="true" data-val-date="The field DateOfBirth must be a date." id="BirthView_DateOfBirth" name="BirthView.DateOfBirth" style="min-width: 93px" type="text" value="03/11/2021" />
<div class="input-group-append input-group-addon">
<span class="input-group-text input-group-addon-gray"><i class="far fa-calendar-alt"></i></span>
</div>
</div>
<span class="field-validation-valid" data-valmsg-for="BirthView.DateOfBirth" data-valmsg-replace="true"></span>
<span class="fsize11 d-inline-block" id="clientAge">
(Age: <span id="ageYears">0</span> yrs, <span id="ageMonths">0</span> mths)
</span>
</div>
</div>
<div class="form-group row mb20">
<label class="form-control-label col-lg-4 col-12" for="BirthView_Gender">Gender</label>
<div class="col-lg-8 col-12">
<select class="form-control" id="BirthView_Gender" name="BirthView.Gender"><option value="">Select...</option>
<option value="0">Female</option>
<option selected="selected" value="1">Male</option>
<option value="2">Other</option>
</select>
</div>
</div>
<script>
//Update client's age
var $clientDob = $('#BirthView_DateOfBirth');
$clientDob.change(function() {
var date = $(this).val();
if (!date) {
$('#clientAge').removeClass("d-inline-block").addClass("d-none");
$('#clientDateofBirthBlock').addClass("mb20").removeClass("mb10");
return;
}
$('#clientAge').removeClass("d-none").addClass("d-inline-block");
$('#clientDateofBirthBlock').removeClass("mb20").addClass("mb10");
ajaxHelper.post('/Client/GetAge', { dob: date }, $clientDob, {
onJsonSuccess: function(response) {
if (response) {
var age = response.data;
$('#ageYears').html(age.years);
$('#ageMonths').html(age.months);
$('#clientAge').show();
}
}
});
});
</script>
<div class="form-group row mb20 mobile-number-group">
<label class="form-control-label col-lg-4 col-12" for="PhoneView_ClientMobile">Mobile Number</label>
<div class="col-sm-12 col-lg-8">
<input class="form-control" data-val="true" data-val-length="The mobile number must be less than 25 characters" data-val-length-max="25" data-val-mobilenumber="" data-val-phonenumberbasic="" id="PhoneView_ClientMobile" name="PhoneView.ClientMobile" type="text" value="123456789" />
<span class="field-validation-valid" data-valmsg-for="PhoneView.ClientMobile" data-valmsg-replace="true"></span>
</div>
</div>
<div class="form-group row mb10">
<label class="form-control-label col-lg-4 col-12" for="PhoneView_WorkPhone">Telephone</label>
<div class="col-lg-4 col-6">
<input class="form-control" data-val="true" data-val-landlineormobilenumber="" data-val-length="The work phone number must be less than 25 characters" data-val-length-max="25" data-val-phonenumberbasic="" id="PhoneView_WorkPhone" name="PhoneView.WorkPhone" placeholder="Work Phone" type="text" value="987654321" />
<span class="fsize11">Work</span>
<span class="field-validation-valid" data-valmsg-for="PhoneView.WorkPhone" data-valmsg-replace="true"></span>
</div>
<div class="col-lg-4 col-6">
<label class="sr-only" for="PhoneView_HomePhone">Home Phone</label>
<input class="form-control" data-val="true" data-val-landlineormobilenumber="" data-val-length="The home phone number must be less than 25 characters" data-val-length-max="25" data-val-phonenumberbasic="" id="PhoneView_HomePhone" name="PhoneView.HomePhone" placeholder="Home Phone" type="text" value="0147258369" />
<span class="fsize11 ">Home</span>
<span class="field-validation-valid" data-valmsg-for="PhoneView.HomePhone" data-valmsg-replace="true"></span>
</div>
</div>
<div class="form-group row mb20">
<label class="form-control-label col-lg-4 col-12" for="PhoneView_ClientEmail">Email</label>
<div class="col-sm-12 col-lg-8">
<input class="form-control" id="PhoneView_ClientEmail" name="PhoneView.ClientEmail" title="Filter by any part of the client's email address." type="text" value="[email protected]" />
<span class="field-validation-valid" data-valmsg-for="PhoneView.ClientEmail" data-valmsg-replace="true"></span>
</div>
<script>
$(function (){
let emailControl = $('#PhoneView_ClientEmail');
emailControl.on('blur', function (){
let trimmed = emailControl.val().trim();
emailControl.val(trimmed);
});
});
</script>
</div>
<div class="form-group row mb20">
<label class="form-control-label col-lg-4 col-12" for="AddressView_Address">Address</label>
<div class="col-sm-12 col-lg-8">
<textarea class="form-control" data-val="true" data-val-maxlength="The field Address must be a string or array type with a maximum length of '255'." data-val-maxlength-max="255" id="AddressView_Address" name="AddressView.Address" rows="2">
1 Address Road</textarea>
</div>
</div>
<div class="form-group row mb20">
<label class="form-control-label col-lg-4 col-12" for="AddressView_Suburb">City</label>
<div class="col-sm-12 col-lg-8">
<input class="form-control" data-val="true" data-val-maxlength="The field Suburb must be a string or array type with a maximum length of '50'." data-val-maxlength-max="50" id="AddressView_Suburb" name="AddressView.Suburb" placeholder="Suburb/Town" type="text" value="TestSuburb" />
</div>
</div>
<div class="form-group row mb20">
<label class="form-control-label col-lg-4 col-12" for="AddressView_Region">State</label>
<div class="col-lg-4 col-6">
<input class="form-control" data-val="true" data-val-maxlength="The field Region must be a string or array type with a maximum length of '50'." data-val-maxlength-max="50" id="AddressView_Region" name="AddressView.Region" placeholder="State" type="text" value="VIC" />
</div>
<div class="col-lg-4 col-6">
<label class="sr-only" for="AddressView_Postcode">Postcode</label>
<input class="form-control" data-val="true" data-val-maxlength="The field Postcode must be a string or array type with a maximum length of '8'." data-val-maxlength-max="8" id="AddressView_Postcode" name="AddressView.Postcode" placeholder="Postcode" type="text" value="1234" />
</div>
</div>
<div class="form-group row mb20">
<label class="form-control-label col-lg-4 col-12">Country</label>
<div class="col-sm-12 col-lg-8">
<select class="form-control" data-val="true" data-val-maxlength="The field Country must be a string or array type with a maximum length of '100'." data-val-maxlength-max="100" id="AddressView_Country" name="AddressView.Country"><option value="">Select a Country...</option>
<option value="AF">Afghanistan</option>
<option selected="selected" value="AU">Australia</option>
<option value="ZW">Zimbabwe</option>
</select>
</div>
</div>
</div>
<div class="col-xl-6 col-12 mb20 pl-md-4">
<div class="form-group row mb20">
<label class="form-control-label col-lg-4 col-12" for="ClientStatusId">Status</label>
<div class="col-sm-12 col-lg-8">
<select class="form-control" data-val="true" data-val-number="The field ClientStatusId must be a number." data-val-required="The ClientStatusId field is required." id="ClientStatusId" name="ClientStatusId"><option value="0"></option>
<option selected="selected" value="1">Active</option>
<option value="4">Closed</option>
<option value="2">Follow Up Required</option>
<option value="3">On Hold</option>
</select>
</div>
</div>
<div class="form-group row mb20">
<label class="form-control-label col-lg-4 col-12" for="ClientTypeId">Client Type</label>
<div class="col-sm-12 col-lg-8">
<select class="form-control" data-val="true" data-val-number="The field ClientTypeId must be a number." id="ClientTypeId" name="ClientTypeId"><option value=""></option>
<option value="53372">Example Client Type 1</option>
<option value="53371">Example Client Type 2</option>
</select>
</div>
</div>
<div class="form-group row mb20">
<label class="form-control-label col-lg-4 col-12" for="FileUnder">File Under</label>
<div class="col-sm-12 col-lg-8">
<input class="form-control" data-val="true" data-val-maxlength="The field FileUnder must be a string or array type with a maximum length of '80'." data-val-maxlength-max="80" id="FileUnder" name="FileUnder" type="text" value="" />
</div>
</div>
<div class="form-group row mb20">
<label class="form-control-label col-lg-4 col-12" for="ClientCompanyNameView_CompanyName">Business Name</label>
<div class="col-sm-12 col-lg-8">
<input class="form-control" data-val="true" data-val-length="The field Business Name must be a string with a maximum length of 100." data-val-length-max="100" id="ClientCompanyNameView_CompanyName" name="ClientCompanyNameView.CompanyName" type="text" value="TestBName" />
<span class="field-validation-valid" data-valmsg-for="ClientCompanyNameView.CompanyName" data-valmsg-replace="true"></span>
</div>
</div>
<div class="form-group row mb20">
<label class="form-control-label col-lg-4 col-12" for="Occupation">Occupation</label>
<div class="col-sm-12 col-lg-8">
<input class="form-control" data-val="true" data-val-maxlength="Maximum 100 characters allowed." data-val-maxlength-max="100" id="Occupation" name="Occupation" type="text" value="TestOcc" />
</div>
</div>
<div class="form-group row mb20">
<label class="form-control-label col-lg-4 col-12" for="HowHeardId">How Client Heard About Us</label>
<div class="col-sm-12 col-lg-8">
<select class="form-control" data-val="true" data-val-number="The field HowHeardId must be a number." id="HowHeardId" name="HowHeardId"><option value=""></option>
<option selected="selected" value="1">Internet</option>
<option value="2">Referral</option>
</select>
</div>
</div>
<div class="form-group row mb25">
<label class="form-control-label col-lg-4 col-12" for="DateAdded">Date Added</label>
<div class="col-sm-12 col-lg-8">
6/11/2021
</div>
</div>
<input class="medicare-client-id" data-val="true" data-val-number="The field Id must be a number." data-val-required="The Id field is required." id="ClientBase_Id" name="ClientBase.Id" type="hidden" value="9083410" />
<div><input data-val="true" data-val-number="The field Id must be a number." data-val-required="The Id field is required." id="ClientInsurerCardViews_0__Id" name="ClientInsurerCardViews[0].Id" type="hidden" value="4836399" /></div>
<div class="form-group row mb5">
<label class="form-control-label col-xl-4 col-12" for="ClientInsurerCardViews_0__Number">Medicare</label>
<div class="col-xl-8 col-12">
<div class="row">
<div class="col-xl-7 col-sm-8 col-12">
<div class="row">
<div class="col-9">
<input class="form-control" id="ClientInsurerCardViews_0__Number" name="ClientInsurerCardViews[0].Number" placeholder="Number" style="min-width:107px;" type="text" value="0123456789" />
<span class="fsize11">Medicare Number </span>
</div>
<div class="col-3 pl0">
<label class="sr-only" for="ClientInsurerCardViews_0__Irn">Reference</label>
<input class="form-control small-number pr0" id="ClientInsurerCardViews_0__Irn" name="ClientInsurerCardViews[0].Irn" placeholder="IRN" style="margin-left: 10px;" type="text" value="1" />
<span class="fsize11" style="margin-left: 10px;">Ref</span>
</div>
</div>
</div>
<div class="col-xl-5 col-sm-4 col-12">
<div class="row">
<div class="col-6">
<label class="sr-only" for="ClientInsurerCardViews_0__Month">Month</label>
<input class="form-control small-number" data-val="true" data-val-number="The field Month must be a number." id="ClientInsurerCardViews_0__Month" name="ClientInsurerCardViews[0].Month" placeholder="MM" type="text" value="1" />
<span class="fsize11">Month</span>
</div>
<div class="col-6 pl0">
<label class="sr-only" for="ClientInsurerCardViews_0__Year">Year</label>
<input class="form-control small-number" data-val="true" data-val-number="The field Year must be a number." id="ClientInsurerCardViews_0__Year" name="ClientInsurerCardViews[0].Year" placeholder="YYYY" type="text" value="2025" />
<span class="fsize11">Year</span>
</div>
</div>
</div>
</div>
</div>
</div>
<div class="form-group row mb20">
<div class="col text-right">
<div data-toggle="tooltip" title="" data-original-title="">
<button class="btn btn-success btn-xs shadow validate-medicare-button" type="button" id="validate-medicare-button-id-0">
<i classname="fa fa-check fa-margin"></i> Validate Medicare Data
</button>
</div>
</div>
</div>
<script type="text/javascript">
$(function () {
$("#validate-medicare-button-id-0").click(function (e) {
e.preventDefault();
var $this = $(this);
$this.setDisabled(true);
// medicare-client-id is declared outside
var clientId = $(".medicare-client-id").val();
var number = $("#ClientInsurerCardViews_0__Number").val();
var irn = $("#ClientInsurerCardViews_0__Irn").val();
var dob = $(".medicare-date-of-birth").val();
var lastName = $(".medicare-last-name").val();
var firstName = $(".medicare-first-name").val();
PD.Client.Medicare.validateMedicareNumber(clientId,
number,
irn,
dob,
firstName,
lastName,
function() { $this.setDisabled(false); });
});
$("#ClientInsurerCardViews_0__Month").rules("add", { digits: true, range: [1, 12] });
var currentYear = (new Date()).getFullYear();
$("#ClientInsurerCardViews_0__Year").rules("add", { digits: true, range: [currentYear - 10, currentYear 20] });
$("#ClientInsurerCardViews_0__Irn").rules("add", { digits: true, range: [1, 99] });
$("#ClientInsurerCardViews_0__Number").rules("add", { digits: true, minlength: 10, maxlength: 10 });
});
</script>
<div><input data-val="true" data-val-number="The field Id must be a number." data-val-required="The Id field is required." id="ClientInsurerCardViews_1__Id" name="ClientInsurerCardViews[1].Id" type="hidden" value="4836404" /></div>
<div class="form-group row mb5">
<label class="form-control-label col-xl-4 col-12" for="ClientInsurerCardViews_1__Number">NDIS</label>
<div class="col-xl-8 col-12">
<div class="row">
<div class="col-xl-7 col-sm-8 col-12">
<div class="row">
<div class="col-9">
<input class="form-control" id="ClientInsurerCardViews_1__Number" name="ClientInsurerCardViews[1].Number" placeholder="Number" style="min-width:107px;" type="text" value="231353151351" />
<span class="fsize11">Number</span>
</div>
</div>
</div>
</div>
</div>
</div>
<input data-val="true" data-val-number="The field ClientNumber must be a number." data-val-required="The ClientNumber field is required." id="clientNumber" name="clientNumber" type="hidden" value="33" />
</div>
</div>
<div class="row">
<div class="col-12">
<div class="form-group row">
<label class="form-control-label col-12" for="ClientNotes">Additional Comments</label>
<div class="col-12">
<textarea class="form-control" data-val="true" data-val-maxlength="The field ClientNotes must be a string or array type with a maximum length of '60000'." data-val-maxlength-max="60000" id="ClientNotes" name="ClientNotes" rows="7">
</textarea>
</div>
</div>
</div>
</div>
<div class="row">
<div class="col-12 mt-4">
<button type="submit" id="btnSubmit" class="float-right btn btn-lg btn-green save validate-form action-form"><i class="fa fa-check fa-margin"></i>SAVE CHANGES</button>
</div>
</div>
</div>
</form></div>
<iframe name="sif1" sandbox="allow-forms allow-modals allow-scripts" frameborder="0"></iframe>
CodePudding user response:
Try
Option Explicit
Sub testWebScrape()
' read html from file
Dim fso As Object, ts As Object, s As String
Set fso = CreateObject("Scripting.Filesystemobject")
Set ts = fso.openTextFile("C:\temp\so\1.html")
s = ts.readall
ts.Close
Dim HTMLClCol As MSHTML.IHTMLElementCollection
Dim HTMLCl As MSHTML.IHTMLElement
Dim j As Integer, sLabel As String
Dim HTMLDoc As New MSHTML.HTMLDocument
Dim e1 As MSHTML.IHTMLElement, e2 As MSHTML.IHTMLElement, e3 As MSHTML.IHTMLElement
HTMLDoc.body.innerHTML = s
' results
j = 1
Sheet1.Cells.Clear
Sheet1.Columns("A:B").NumberFormat = "@" ' text
Set HTMLClCol = HTMLDoc.getElementsByTagName("div")
For Each HTMLCl In HTMLClCol
For Each e1 In HTMLCl.Children
If e1.className Like "form-group row mb[125]*" Then
For Each e2 In e1.all
If e2.tagName = "LABEL" Then
j = j 1
Sheet1.Cells(j, 1) = e2.innerText
sLabel = e2.innerText
ElseIf e2.tagName = "OPTION" And e2.getAttribute("Selected") = "selected" Then
Sheet1.Cells(j, 2) = e2.innerText
ElseIf e2.className Like "form-control*" Then
Sheet1.Cells(j, 2) = e2.getAttribute("value")
ElseIf e1.className = "form-group row mb25" Then
Sheet1.Cells(j, 2) = Format(CDate(e2.innerText), "dd mmm yyyy")
End If
Next
End If
Next
Next
MsgBox "Done"
End Sub