I have bee trying to extract the Business Name
from the Website and i am receiving an error that is
and this line causing an error For iCnt = 0 To .getElementsByTagName("h2").Length - 1
I am new and learning all this stuff. I need to extract all the details available in the Website like"
Business Name
Address
Telephone
Fax
Email
Website
in a sequence so all the details can be pasted into Excel file. I would appreciate your help thanks.
Option Explicit
Const sSiteName = "https://www.thoroughexamination.org/postcode-search/nationwide?page=1"
Private Sub getHTMLContents()
' Create Internet Explorer object.
Dim IE As Object
Set IE = CreateObject("InternetExplorer.Application")
IE.Visible = False ' Keep this hidden.
IE.Navigate sSiteName
' Wait till IE is fully loaded.
While IE.ReadyState <> 4
DoEvents
Wend
Dim oHDoc As HTMLDocument ' Create document object.
Set oHDoc = IE.Document
Dim oHEle As HTMLUListElement ' Create HTML element (<ul>) object.
Set oHEle = oHDoc.getElementById("search-item-header") ' Get the element reference using its ID.
Dim iCnt As Integer
' Loop through elements inside the <ul> element and find <h1>, which has the texts we want.
With oHEle
For iCnt = 0 To .getElementsByTagName("h2").Length - 1
Debug.Print .getElementsByTagName("h2").Item(iCnt).getElementsByTagName("a").Item(0).innerHTML
Next iCnt
End With
' Clean up.
IE.Quit
Set IE = Nothing
Set oHEle = Nothing
Set oHDoc = Nothing
End Sub
No Response from 2nd Code:
Sub TutorailsPoint()
Const URL = "https://www.thoroughexamination.org/postcode-search/nationwide?page=1"
Dim http As New MSXML2.XMLHTTP60, html As New HTMLDocument
Dim topics As Object, posts As Object, topic As Object
Dim x As Long
x = 1
http.Open "GET", URL, False
http.send
html.body.innerHTML = http.responseText
Set topics = html.getElementsByClassName("search-item-header")
For Each posts In topics
For Each topic In posts.getElementsByTagName("h2")
Cells(x, 1) = topic.innerText
x = x 1
Next topic
Next posts
End Sub
CodePudding user response:
You can do what you want with xhr if you use an UserAgent. Here is a code for all pages and all datasets per row with the present fields for all pages of your posted url.
Sub TutorailsPoint()
Dim doc As Object
Dim url As String
Dim page As Long
Dim hits As Long
Dim maxPage As Long
Dim maxPageKnown As Boolean
Dim currRow As Long
Dim nodeAllGroups As Object
Dim nodeOneGroup As Object
Dim nodeContactData As Object
Dim nodeWebSite As Object
Dim telephone As Boolean
page = 1
maxPage = 1
currRow = 2
Set doc = CreateObject("htmlFile")
With CreateObject("MSXML2.ServerXMLHTTP.6.0")
'Call all pages
Do
url = "https://www.thoroughexamination.org/postcode-search/nationwide?page=" & page
.Open "GET", url, False
.setRequestHeader "User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:91.0) Gecko/20100101 Firefox/91.0"
.send
If .Status = 200 Then
doc.body.innerHTML = .responseText
'How many pages to call
If Not maxPageKnown Then
hits = CLng(doc.getElementsByClassName("summary")(0).getElementsByTagName("b")(1).innertext)
maxPage = hits / 20
If hits Mod 20 <> 0 Then
maxPage = maxPage 1
End If
maxPageKnown = True
'Debug.Print maxPage
End If
Set nodeAllGroups = doc.getElementsByClassName("group")
For Each nodeOneGroup In nodeAllGroups
'Business name
Cells(currRow, 1) = nodeOneGroup.getElementsByTagName("h2")(0).innertext
'Address
Cells(currRow, 2) = nodeOneGroup.getElementsByTagName("p")(0).innertext
'Contact block
Set nodeContactData = nodeOneGroup.getElementsByClassName("depot")
If nodeContactData.Length <> 0 Then
'Telephone
If InStr(1, nodeContactData(0).innertext, "tel:") > 0 Then
Cells(currRow, 3).NumberFormat = "@"
Cells(currRow, 3) = Trim(nodeContactData(0).getElementsByTagName("strong")(0).innertext)
telephone = True
End If
'Fax
If InStr(1, nodeContactData(0).innertext, "fax:") > 0 Then
Cells(currRow, 4).NumberFormat = "@"
If telephone Then
Cells(currRow, 4) = Trim(Replace(nodeContactData(0).getElementsByTagName("p")(0).FirstChild.NextSibling.NextSibling.NextSibling.NodeValue, "fax:", ""))
Else
Cells(currRow, 4) = Trim(Replace(nodeContactData(0).getElementsByTagName("p")(0).FirstChild.NodeValue, "fax:", "")) 'not sure, not seen no telephone
End If
End If
'Email
If InStr(1, nodeContactData(0).innertext, "email:") > 0 Then
Cells(currRow, 5) = Trim(nodeContactData(0).getElementsByTagName("a")(0).innertext)
End If
'website
Set nodeWebSite = nodeContactData(0).getElementsByClassName("website")
If nodeWebSite.Length > 0 Then
Cells(currRow, 6) = Trim(nodeWebSite(0).innertext)
End If
End If
telephone = False
currRow = currRow 1
Next nodeOneGroup
page = page 1
Else
Cells(currRow, 1) = "Page not loaded. HTTP status " & .Status
Cells(currRow, 2) = url
currRow = currRow 1
End If
Loop While page <= maxPage
End With
End Sub
CodePudding user response:
Internet Explorer was dropped by MS so it's not a good idea to use it. From now on, in VBA, my best option is to use Selenium (Selenium Type Library) to scrape the WWW. To get started with Selenium the right way you have to take care of the following:
Update the related .Net Framework
Install Selenium Basic app
Download and install the version of Chromedriver.exe (see the latest version here) exactly compatible with the version of Google Chrome (which must be installed on the machine). Extract 'Chromedriver.exe' and put it at the same folder where Selenium Basic was installed (I've installed Selenium Basic here: 'C:\Program Files\SeleniumBasic')
Add the reference in the VBE to the ActiveX library: 'Selenium Type Library'
Put the code in a standard module on the VBE:
Sub fnGetDataFromWWW()
Dim oWD As WebDriver
Dim post As Selenium.WebElement
Dim groups As Selenium.WebElements
Dim strText As String
Dim intItem As Integer
Set oWD = New Selenium.WebDriver
oWD.Start "chrome"
DoEvents
oWD.Get "https://www.thoroughexamination.org/postcode-search/nationwide?page=1"
DoEvents
Set groups = oWD.FindElementsByClass("group")
For Each post In groups
strText = post.Attribute("outerText")
For intItem = 0 To UBound(Split(strText, Chr(10)))
If Trim(Split(strText, Chr(10))(intItem)) <> "" Then
Debug.Print Split(strText, Chr(10))(intItem)
End If
Next
Next post
End Sub