Home > front end >  Excel VBA - Web Scraping MSXML2.XMLHTTP Library
Excel VBA - Web Scraping MSXML2.XMLHTTP Library

Time:11-07

I need some help to extract some data from a website. Unfortunately requires logins so no point in sharing the link. Hopefully, for someone with HTML understanding will be straight forward to help me.

I'm using the following code to extract Profiles for the web-page (see image below). Profile Webpage Screenshot

Sub testWebScap()

    ' Code uses the links from the webpage, create sheets with the customer name and paste all Profile details

    Dim oHttp As Object, Html As HTMLDocument
    Dim HTMLDoc As New MSHTML.HTMLDocument
    Dim HTMLClCol As MSHTML.IHTMLElementCollection
    Dim HTMLCl As MSHTML.IHTMLElement
    Dim i As Integer, j As Integer
    Dim contentPage As String
    
    Set Html = New HTMLDocument
    Set oHttp = CreateObject("MSXML2.XMLHTTP")
    
    j = 2

    ' ClientList sheet contains the webpage link for each customer.
    For i = 2 To ThisWorkbook.Sheets("ClientList").Range("A65000").End(xlUp).Row
        ' Loop through each customer profile webpage

        Sheets.Add(After:=Sheets(Sheets.Count)).Name = ThisWorkbook.Sheets("ClientList").Range("A" & i)
        contentPage = "https://my.powerdiary.com" & ThisWorkbook.Sheets("ClientList").Range("B" & i)
        j = 2
        
        With oHttp
            .Open "GET", contentPage, False
            .send
            sResp = .responseText
            
        End With
        
        HTMLDoc.body.innerHTML = oHttp.responseText
        Set HTMLClCol = HTMLDoc.getElementsByClassName("form-control-label col-lg-4 col-12")

        ' Loop through Label-elements From Salutation to Country but skipping the Middle Name and Telephone Home. Labels copied in Column A
        For Each HTMLCl In HTMLClCol
            ActiveSheet.Range("A" & j).Value = HTMLCl.innerHTML
            j = j   1
        Next HTMLCl
        
        Set HTMLClCol = HTMLDoc.getElementsByClassName("form-control-label col-xl-4 col-12")

        ' Loop through Label-elements From Status to NDIS but skipping the Medicare Ref, Month and Year. Labels copied in Column A
        For Each HTMLCl In HTMLClCol
            ActiveSheet.Range("A" & j).Value = HTMLCl.innerHTML
            j = j   1
        Next HTMLCl
        
        Set HTMLClCol = HTMLDoc.getElementsByClassName("form-control")
        j = 2
        
        ' Loop through All values (including those correspond to the missing labels). Values copied in Column B
        For Each HTMLCl In HTMLClCol
            ActiveSheet.Range("B" & j).Value = HTMLCl.getAttribute("value")
            j = j   1
        Next HTMLCl
    Next i
    
End Sub

The code works partially as:

  • I'm missing some label-elements (check the Data Sheet image). For instance can't get the middle name label (the missing one are in red font on the image).
  • The value-elements are shifted by one row (I guess the code picks up an empty one at the start but can't figure why)

Sheet with copied data. L-side after the code, R-side after someone help me ;-)

As I can't share with you the webpage, I have added the HTML part that corresponds to the Profile Data.

<form action="/Profile/33" data-prompt-unsaved="" method="post">            <div class="panel-body p-4">
                <div class="row mt10 mb20">
                    <div class="col-xl-6 col-12 pr-md-4">


<div class="form-group row mb20">
    <label class="form-control-label col-lg-4 col-12" for="NameView_Salutation">Salutation</label>
    <div class="col-sm-12 col-lg-8">
        <select class="form-control" id="NameView_Salutation" name="NameView.Salutation"><option value="">No Salutation</option>
<option value="Mr">Mr</option>
<option value="Ms">Ms</option>
<option value="Mrs">Mrs</option>
<option value="Miss">Miss</option>
<option value="Mx">Mx</option>
<option selected="selected" value="Master">Master</option>
<option value="Dr">Dr</option>
<option value="Prof">Prof</option>
</select>
    </div>
</div>

<div class="form-group row mb20">
    <label class="form-control-label col-lg-4 col-12" for="NameView_ClientFirstName">Name</label>
    <div class="col-lg-4 col-6">
        
    <input class="form-control medicare-first-name" data-val="true" data-val-length="The field First Name must be a string with a maximum length of 100." data-val-length-max="100" data-val-required="The First Name field is required." id="NameView_ClientFirstName" name="NameView.ClientFirstName" placeholder="First Name" type="text" value="TestFName" />

        <span class="field-validation-valid" data-valmsg-for="NameView.ClientFirstName" data-valmsg-replace="true"></span>
    </div>

        <label class="sr-only" for="NameView_ClientMiddleName">Middle Name</label>
        <div class="col-lg-4 col-6">
            <input class="form-control" data-val="true" data-val-length="The field ClientMiddleName must be a string with a maximum length of 80." data-val-length-max="80" id="NameView_ClientMiddleName" name="NameView.ClientMiddleName" placeholder="Middle Name" type="text" value="TestMName" />
        </div>
</div>

<div class="form-group row mb20">
    <label class="form-control-label col-lg-4 col-12" for="NameView_ClientLastName">Last Name</label>
    <div class="col-sm-12 col-lg-8">
        <input class="form-control medicare-last-name" data-val="true" data-val-length="The field ClientLastName must be a string with a maximum length of 80." data-val-length-max="80" id="NameView_ClientLastName" name="NameView.ClientLastName" placeholder="Last Name" type="text" value="TestLName" />
    </div>
</div>

    <div class="form-group row mb20">
        <label class="form-control-label col-lg-4 col-12" for="NameView_PreferredClientName">Preferred Name</label>
        <div class="col-sm-12 col-lg-8">
            <input class="form-control" data-val="true" data-val-length="The field PreferredClientName must be a string with a maximum length of 200." data-val-length-max="200" id="NameView_PreferredClientName" name="NameView.PreferredClientName" type="text" value="TestPName" />
        </div>
    </div>


<div class="form-group row mb20" id="clientDateofBirthBlock">
    <label class="form-control-label col-lg-4 col-12" for="BirthView_DateOfBirth">Date of Birth</label>
    <div class="col-lg-8 col-sm-12 col-12">    
        

<div class="input-group date bs-datepicker">
    <input CurrentMenuItem="ClientDetails" autocomplete="off" class="form-control  form-control medicare-date-of-birth" data-val="true" data-val-date="The field DateOfBirth must be a date." id="BirthView_DateOfBirth" name="BirthView.DateOfBirth" style="min-width: 93px" type="text" value="03/11/2021" />

    
    <div class="input-group-append input-group-addon">
        <span class="input-group-text input-group-addon-gray"><i class="far fa-calendar-alt"></i></span>
    </div>
</div>

        <span class="field-validation-valid" data-valmsg-for="BirthView.DateOfBirth" data-valmsg-replace="true"></span>
            


        <span class="fsize11 d-inline-block" id="clientAge">
            (Age: <span id="ageYears">0</span> yrs, <span id="ageMonths">0</span> mths)
        </span>
    </div>
</div>

<div class="form-group row mb20">
    <label class="form-control-label col-lg-4 col-12" for="BirthView_Gender">Gender</label>
    <div class="col-lg-8 col-12">
        <select class="form-control" id="BirthView_Gender" name="BirthView.Gender"><option value="">Select...</option>
<option value="0">Female</option>
<option selected="selected" value="1">Male</option>
<option value="2">Other</option>
</select>
    </div>
</div>

<script>
    //Update client's age
    var $clientDob = $('#BirthView_DateOfBirth');
    $clientDob.change(function() {
        var date = $(this).val();
        if (!date) {
            $('#clientAge').removeClass("d-inline-block").addClass("d-none");
            $('#clientDateofBirthBlock').addClass("mb20").removeClass("mb10");
            return;
        }

        $('#clientAge').removeClass("d-none").addClass("d-inline-block");
        $('#clientDateofBirthBlock').removeClass("mb20").addClass("mb10");

        ajaxHelper.post('/Client/GetAge', { dob: date }, $clientDob, {
            onJsonSuccess: function(response) {
                if (response) {
                    var age = response.data;
                    $('#ageYears').html(age.years);
                    $('#ageMonths').html(age.months);
                    $('#clientAge').show();
                }
            }
        });
    });
</script>
                        <div class="form-group row mb20 mobile-number-group">
    <label class="form-control-label col-lg-4 col-12" for="PhoneView_ClientMobile">Mobile Number</label>
    <div class="col-sm-12 col-lg-8">
        <input class="form-control" data-val="true" data-val-length="The mobile number must be less than 25 characters" data-val-length-max="25" data-val-mobilenumber="" data-val-phonenumberbasic="" id="PhoneView_ClientMobile" name="PhoneView.ClientMobile" type="text" value="123456789" />                                                    
        <span class="field-validation-valid" data-valmsg-for="PhoneView.ClientMobile" data-valmsg-replace="true"></span>
    </div>
</div>
<div class="form-group row mb10">
    <label class="form-control-label col-lg-4 col-12" for="PhoneView_WorkPhone">Telephone</label>
    <div class="col-lg-4 col-6">
        <input class="form-control" data-val="true" data-val-landlineormobilenumber="" data-val-length="The work phone number must be less than 25 characters" data-val-length-max="25" data-val-phonenumberbasic="" id="PhoneView_WorkPhone" name="PhoneView.WorkPhone" placeholder="Work Phone" type="text" value="987654321" />
        <span class="fsize11">Work</span>
        <span class="field-validation-valid" data-valmsg-for="PhoneView.WorkPhone" data-valmsg-replace="true"></span>
    </div>
    <div class="col-lg-4 col-6">
        <label class="sr-only" for="PhoneView_HomePhone">Home Phone</label>
        <input class="form-control" data-val="true" data-val-landlineormobilenumber="" data-val-length="The home phone number must be less than 25 characters" data-val-length-max="25" data-val-phonenumberbasic="" id="PhoneView_HomePhone" name="PhoneView.HomePhone" placeholder="Home Phone" type="text" value="0147258369" />
        <span class="fsize11 ">Home</span>
        <span class="field-validation-valid" data-valmsg-for="PhoneView.HomePhone" data-valmsg-replace="true"></span>
    </div>                                                
</div>
<div class="form-group row mb20">
    <label class="form-control-label col-lg-4 col-12" for="PhoneView_ClientEmail">Email</label>
    <div class="col-sm-12 col-lg-8">
        <input class="form-control" id="PhoneView_ClientEmail" name="PhoneView.ClientEmail" title="Filter by any part of the client&#39;s email address." type="text" value="[email protected]" />
        <span class="field-validation-valid" data-valmsg-for="PhoneView.ClientEmail" data-valmsg-replace="true"></span>
    </div>
    <script>
        $(function (){
            let emailControl = $('#PhoneView_ClientEmail');
            emailControl.on('blur', function (){
                let trimmed = emailControl.val().trim();
                emailControl.val(trimmed);
            });
        });
    </script>
</div>
                        

<div class="form-group row mb20">
    <label class="form-control-label col-lg-4 col-12" for="AddressView_Address">Address</label>
    <div class="col-sm-12 col-lg-8">
        <textarea class="form-control" data-val="true" data-val-maxlength="The field Address must be a string or array type with a maximum length of &#39;255&#39;." data-val-maxlength-max="255" id="AddressView_Address" name="AddressView.Address" rows="2">
1 Address Road</textarea>
    </div>
</div>
<div class="form-group row mb20">
    <label class="form-control-label col-lg-4 col-12" for="AddressView_Suburb">City</label>
    <div class="col-sm-12 col-lg-8">
        <input class="form-control" data-val="true" data-val-maxlength="The field Suburb must be a string or array type with a maximum length of &#39;50&#39;." data-val-maxlength-max="50" id="AddressView_Suburb" name="AddressView.Suburb" placeholder="Suburb/Town" type="text" value="TestSuburb" />
    </div>
</div>
<div class="form-group row mb20">
    <label class="form-control-label col-lg-4 col-12" for="AddressView_Region">State</label>
    <div class="col-lg-4 col-6">
        <input class="form-control" data-val="true" data-val-maxlength="The field Region must be a string or array type with a maximum length of &#39;50&#39;." data-val-maxlength-max="50" id="AddressView_Region" name="AddressView.Region" placeholder="State" type="text" value="VIC" />
    </div>
    <div class="col-lg-4 col-6">
        <label class="sr-only" for="AddressView_Postcode">Postcode</label>
        <input class="form-control" data-val="true" data-val-maxlength="The field Postcode must be a string or array type with a maximum length of &#39;8&#39;." data-val-maxlength-max="8" id="AddressView_Postcode" name="AddressView.Postcode" placeholder="Postcode" type="text" value="1234" />
    </div>
</div>
<div class="form-group row mb20">
    <label class="form-control-label col-lg-4 col-12">Country</label>
    <div class="col-sm-12 col-lg-8">
        <select class="form-control" data-val="true" data-val-maxlength="The field Country must be a string or array type with a maximum length of &#39;100&#39;." data-val-maxlength-max="100" id="AddressView_Country" name="AddressView.Country"><option value="">Select a Country...</option>
<option value="AF">Afghanistan</option>

<option selected="selected" value="AU">Australia</option>

<option value="ZW">Zimbabwe</option>
</select>
    </div>
</div>
                    </div>
                    <div class="col-xl-6 col-12 mb20 pl-md-4">
                        <div class="form-group row mb20">
                            <label class="form-control-label col-lg-4 col-12" for="ClientStatusId">Status</label>
                            <div class="col-sm-12 col-lg-8">
                                <select class="form-control" data-val="true" data-val-number="The field ClientStatusId must be a number." data-val-required="The ClientStatusId field is required." id="ClientStatusId" name="ClientStatusId"><option value="0"></option>
<option selected="selected" value="1">Active</option>
<option value="4">Closed</option>
<option value="2">Follow Up Required</option>
<option value="3">On Hold</option>
</select>
                            </div>
                        </div>
                            <div class="form-group row mb20">
                                <label class="form-control-label col-lg-4 col-12" for="ClientTypeId">Client Type</label>
                                <div class="col-sm-12 col-lg-8">
                                    <select class="form-control" data-val="true" data-val-number="The field ClientTypeId must be a number." id="ClientTypeId" name="ClientTypeId"><option value=""></option>
<option value="53372">Example Client Type 1</option>
<option value="53371">Example Client Type 2</option>
</select>
                                </div>
                            </div>
                                                    <div class="form-group row mb20">
                                <label class="form-control-label col-lg-4 col-12" for="FileUnder">File Under</label>
                                <div class="col-sm-12 col-lg-8">
                                    <input class="form-control" data-val="true" data-val-maxlength="The field FileUnder must be a string or array type with a maximum length of &#39;80&#39;." data-val-maxlength-max="80" id="FileUnder" name="FileUnder" type="text" value="" />
                                </div>
                            </div>

<div class="form-group row mb20">
    <label class="form-control-label col-lg-4 col-12" for="ClientCompanyNameView_CompanyName">Business Name</label>
    <div class="col-sm-12 col-lg-8">
        <input class="form-control" data-val="true" data-val-length="The field Business Name must be a string with a maximum length of 100." data-val-length-max="100" id="ClientCompanyNameView_CompanyName" name="ClientCompanyNameView.CompanyName" type="text" value="TestBName" />
        <span class="field-validation-valid" data-valmsg-for="ClientCompanyNameView.CompanyName" data-valmsg-replace="true"></span>
    </div>
</div>
                            <div class="form-group row mb20">
                                <label class="form-control-label col-lg-4 col-12" for="Occupation">Occupation</label>
                                <div class="col-sm-12 col-lg-8">
                                    <input class="form-control" data-val="true" data-val-maxlength="Maximum 100 characters allowed." data-val-maxlength-max="100" id="Occupation" name="Occupation" type="text" value="TestOcc" />
                                </div>
                            </div>
                            <div class="form-group row mb20">
                                <label class="form-control-label col-lg-4 col-12" for="HowHeardId">How Client Heard About Us</label>
                                <div class="col-sm-12 col-lg-8">
                                    <select class="form-control" data-val="true" data-val-number="The field HowHeardId must be a number." id="HowHeardId" name="HowHeardId"><option value=""></option>
<option selected="selected" value="1">Internet</option>
<option value="2">Referral</option>
</select>
                                </div>
                            </div>

                        <div class="form-group row mb25">
                            <label class="form-control-label col-lg-4 col-12" for="DateAdded">Date Added</label>
                            <div class="col-sm-12 col-lg-8">
                                6/11/2021
                            </div>
                        </div>

<input class="medicare-client-id" data-val="true" data-val-number="The field Id must be a number." data-val-required="The Id field is required." id="ClientBase_Id" name="ClientBase.Id" type="hidden" value="9083410" />
    <div><input data-val="true" data-val-number="The field Id must be a number." data-val-required="The Id field is required." id="ClientInsurerCardViews_0__Id" name="ClientInsurerCardViews[0].Id" type="hidden" value="4836399" /></div>
<div class="form-group row mb5">
    
    <label class="form-control-label col-xl-4 col-12" for="ClientInsurerCardViews_0__Number">Medicare</label>
    <div class="col-xl-8 col-12">
        <div class="row">
            <div class="col-xl-7 col-sm-8 col-12">
                <div class="row">
                    <div class="col-9">
                        <input class="form-control" id="ClientInsurerCardViews_0__Number" name="ClientInsurerCardViews[0].Number" placeholder="Number" style="min-width:107px;" type="text" value="0123456789" />

                        <span class="fsize11">Medicare Number </span>
                    </div>
                    <div class="col-3 pl0">
                        <label class="sr-only" for="ClientInsurerCardViews_0__Irn">Reference</label>
                        <input class="form-control small-number pr0" id="ClientInsurerCardViews_0__Irn" name="ClientInsurerCardViews[0].Irn" placeholder="IRN" style="margin-left: 10px;" type="text" value="1" />
                        <span class="fsize11" style="margin-left: 10px;">Ref</span>
                    </div>
                </div>
            </div>
            <div class="col-xl-5 col-sm-4 col-12">
                <div class="row">
                    <div class="col-6">
                        <label class="sr-only" for="ClientInsurerCardViews_0__Month">Month</label>
                        <input class="form-control small-number" data-val="true" data-val-number="The field Month must be a number." id="ClientInsurerCardViews_0__Month" name="ClientInsurerCardViews[0].Month" placeholder="MM" type="text" value="1" />
                        <span class="fsize11">Month</span>
                    </div>
                    <div class="col-6 pl0">
                        <label class="sr-only" for="ClientInsurerCardViews_0__Year">Year</label>
                        <input class="form-control small-number" data-val="true" data-val-number="The field Year must be a number." id="ClientInsurerCardViews_0__Year" name="ClientInsurerCardViews[0].Year" placeholder="YYYY" type="text" value="2025" />
                        <span class="fsize11">Year</span>
                    </div>
                </div>
            </div>
        </div>
    </div>
</div>
<div class="form-group row mb20">
    <div class="col text-right">
        <div data-toggle="tooltip" title="" data-original-title="">
            <button class="btn btn-success btn-xs shadow validate-medicare-button" type="button" id="validate-medicare-button-id-0">
<i classname="fa fa-check fa-margin"></i> Validate Medicare Data
</button>

</div>

</div>
</div>
            <script type="text/javascript">
            $(function () {

                $("#validate-medicare-button-id-0").click(function (e) {

                    e.preventDefault();
                    var $this = $(this);
                    $this.setDisabled(true);

                    // medicare-client-id is declared outside
                    var clientId = $(".medicare-client-id").val();
                    var number = $("#ClientInsurerCardViews_0__Number").val();
                    var irn = $("#ClientInsurerCardViews_0__Irn").val();
                    var dob = $(".medicare-date-of-birth").val();
                    var lastName = $(".medicare-last-name").val();
                    var firstName = $(".medicare-first-name").val();


                    PD.Client.Medicare.validateMedicareNumber(clientId,
                        number,
                        irn,
                        dob,
                        firstName,
                        lastName,
                        function() { $this.setDisabled(false); });
                });

                $("#ClientInsurerCardViews_0__Month").rules("add", { digits: true, range: [1, 12] });

                var currentYear = (new Date()).getFullYear();

                $("#ClientInsurerCardViews_0__Year").rules("add", { digits: true, range: [currentYear - 10, currentYear   20] });

                $("#ClientInsurerCardViews_0__Irn").rules("add", { digits: true, range: [1, 99] });

                $("#ClientInsurerCardViews_0__Number").rules("add", { digits: true, minlength: 10, maxlength: 10 });
            });
            </script>
    <div><input data-val="true" data-val-number="The field Id must be a number." data-val-required="The Id field is required." id="ClientInsurerCardViews_1__Id" name="ClientInsurerCardViews[1].Id" type="hidden" value="4836404" /></div>
        <div class="form-group row mb5">
            <label class="form-control-label col-xl-4 col-12" for="ClientInsurerCardViews_1__Number">NDIS</label>
            <div class="col-xl-8 col-12">
                <div class="row">
                    <div class="col-xl-7 col-sm-8 col-12">
                        <div class="row">
                            <div class="col-9">
                                <input class="form-control" id="ClientInsurerCardViews_1__Number" name="ClientInsurerCardViews[1].Number" placeholder="Number" style="min-width:107px;" type="text" value="231353151351" />
                                <span class="fsize11">Number</span>
                            </div>
                        </div>
                    </div>
                </div>
            </div>
        </div>



                        <input data-val="true" data-val-number="The field ClientNumber must be a number." data-val-required="The ClientNumber field is required." id="clientNumber" name="clientNumber" type="hidden" value="33" />
                    </div>
                </div>
                <div class="row">
                    <div class="col-12">
                        <div class="form-group row">
                            <label class="form-control-label col-12" for="ClientNotes">Additional Comments</label>
                            <div class="col-12">
                                <textarea class="form-control" data-val="true" data-val-maxlength="The field ClientNotes must be a string or array type with a maximum length of &#39;60000&#39;." data-val-maxlength-max="60000" id="ClientNotes" name="ClientNotes" rows="7">
</textarea>
                            </div>
                        </div>
                    </div>
                </div>

                <div class="row">
                    <div class="col-12 mt-4">
                        <button type="submit" id="btnSubmit" class="float-right btn btn-lg btn-green save validate-form action-form"><i class="fa fa-check fa-margin"></i>SAVE CHANGES</button>
                    </div>
                </div>

            </div>
</form></div>
<iframe name="sif1" sandbox="allow-forms allow-modals allow-scripts" frameborder="0"></iframe>

CodePudding user response:

Try

Option Explicit

Sub testWebScrape()

    ' read html from file
    Dim fso As Object, ts As Object, s As String
    Set fso = CreateObject("Scripting.Filesystemobject")
    Set ts = fso.openTextFile("C:\temp\so\1.html")
    s = ts.readall
    ts.Close
    
    
    Dim HTMLClCol As MSHTML.IHTMLElementCollection
    Dim HTMLCl As MSHTML.IHTMLElement
    Dim j As Integer, sLabel As String
    
    Dim HTMLDoc As New MSHTML.HTMLDocument
    Dim e1 As MSHTML.IHTMLElement, e2 As MSHTML.IHTMLElement, e3 As MSHTML.IHTMLElement
    HTMLDoc.body.innerHTML = s
    
    ' results
    j = 1
    Sheet1.Cells.Clear
    Sheet1.Columns("A:B").NumberFormat = "@" ' text
    
    Set HTMLClCol = HTMLDoc.getElementsByTagName("div")
    For Each HTMLCl In HTMLClCol
        For Each e1 In HTMLCl.Children
            If e1.className Like "form-group row mb[125]*" Then
                For Each e2 In e1.all
                
                    If e2.tagName = "LABEL" Then
                        j = j   1
                        Sheet1.Cells(j, 1) = e2.innerText
                        sLabel = e2.innerText
                         
                    ElseIf e2.tagName = "OPTION" And e2.getAttribute("Selected") = "selected" Then
                        Sheet1.Cells(j, 2) = e2.innerText
                    ElseIf e2.className Like "form-control*" Then
                        Sheet1.Cells(j, 2) = e2.getAttribute("value")
                    ElseIf e1.className = "form-group row mb25" Then
                        Sheet1.Cells(j, 2) = Format(CDate(e2.innerText), "dd mmm yyyy")
                    End If
                    
                Next
            End If
        Next
    Next
    MsgBox "Done"
End Sub
  • Related