Good day -
I am reading the list of cars from Craigslist Miami using VBA. The code is working fine with printing the links for each vehicle, however, I need to extract the price and print it on the adjacent cell. I can get the class name, but I am unable to read the result. Please see code below. It s working fine except for the price:
Sub GetvehPosts()
Dim link As HTMLLinkElement
Dim blog As HTMLLinkElement
Dim price As HTMLLinkElement
Dim IE As Object
Dim html As HTMLDocument
Dim URL As String
Dim URLParameter As String
Dim page As Long, counter As Long
'Dim http As Object
Dim links As Object
Dim blogpost As Object
Dim priceonly As Object
Dim StartCell As Range
Dim increment As Integer
Dim htmlele1 As HTMLLinkElement
' This is the first cell that a blog post hyperlink is created in
Set StartCell = Range("A15")
URL = "https://miami.craigslist.org/search/cta"
Set IE = CreateObject("InternetExplorer.Application")
Application.ScreenUpdating = True
' CHnage this to False if you want to hide IE
IE.Visible = True
counter = 0
'Set the number of pages of the website to go through in the browser
For page = 0 To 480 Step 120 'increment by 120 - total 4 pages
'Debug.Print page
If page > 1 Then URLParameter = "?s=" & page
IE.navigate URL & URLParameter
'Wait for the browser to load the page
Do Until IE.readyState = 4
DoEvents
Loop
Set html = IE.document
Set links = html.getElementsByTagName("h3")
For Each link In links
If InStr(LCase(link.outerHTML), "result-heading") Then
Set blogpost = link.getElementsByTagName("a")
Set priceonly = link.getElementsByClassName("result-price")
For Each blog In blogpost
StartCell.Offset(counter, 0).Hyperlinks.Add _
Anchor:=StartCell.Offset(counter, 0), Address:=blog, _
TextToDisplay:=link.innerText
Next blog
For Each price In priceonly
If price.className = "result-price" Then
Debug.Print price.innerText
End If
Next price
counter = counter 1
End If
Next link
Next page
IE.Quit
Set IE = Nothing
End Sub
CodePudding user response:
Using InternetExplorer is usually a pain. (Selenium works fairly well with excel and has querySelector type functionality)
But sticking with InternetExplorer for now...
Try to work with IHTMLElementCollection(s)
instead of IHTMLElement(s)
because the collections still have the getElement(s)..
methods.
I've never managed to make getElementsByClassName
work properly. The markup of the data your scraping is likely to be fixed, so working with child indexes is fairly reliable.
I had a go at trying to extract the link text and price for each item on your target and this appears to work for me.
Sub getPosts()
Dim Browser As InternetExplorer
Dim HTML As IHTMLDocument
Dim Results As IHTMLElementCollection
Dim Result As IHTMLElementCollection
Dim URL As String
Dim URLParameter As String
Dim Index As Integer
URL = "https://miami.craigslist.org/search/cta"
URLParameter = ""
Set Browser = New InternetExplorer
Browser.Visible = True
Browser.navigate URL & URLParameter
Do Until Browser.readyState = 4
DoEvents
Loop
Set HTML = Browser.document
Set Results = HTML.getElementsByClassName("result-row")
For Index = 0 To Results.Length - 1
Debug.Print "LinkText:" & Results(Index).getElementsByTagName("h3")(0).getElementsByTagName("a")(0).innerText
Debug.Print "Price:" & Results(Index).getElementsByTagName("span")(0).innerText
Next Index
Browser.Quit
Set Browser = Nothing
End Sub
update
the above code only looks at the first page, and outputs the link text and price of each post to the debug window. (it was an example of how to access the elements you were looking for)
ive added your page navigation code back in and made it output to the worksheet instead of the debug window.
Sub getPosts()
Dim Browser As InternetExplorer
Dim HTML As IHTMLDocument
Dim Results As IHTMLElementCollection
Dim Result As IHTMLElementCollection
Dim URL As String
Dim URLParameter As String
Dim Index As Integer
Dim Counter As Integer: Counter = 0
Dim Page As Integer
Dim Link As IHTMLElement
Dim Price As IHTMLElement
Dim StartCell As Range: Set StartCell = Range("Sheet1!A15") ' adjust sheet name
URL = "https://miami.craigslist.org/search/cta"
URLParameter = ""
Set Browser = New InternetExplorer
Browser.Visible = True
For Page = 0 To 480 Step 120
If Page > 1 Then
URLParameter = "?s=" & Page
End If
Browser.navigate URL & URLParameter
Do Until Browser.readyState = 4
DoEvents
Loop
Set HTML = Browser.document
Set Results = HTML.getElementsByClassName("result-row")
For Index = 0 To Results.Length - 1
Set Link = Results(Index).getElementsByTagName("h3")(0).getElementsByTagName("a")(0)
StartCell.Offset(Counter, 0).Hyperlinks.Add StartCell.Offset(Counter, 0), Link.href, , , Link.innerText
Set Price = Results(Index).getElementsByClassName("result-price")(0)
StartCell.Offset(Counter, 1).Value = Price.innerText
Counter = Counter 1
Next Index
Next Page
Browser.Quit
Set Browser = Nothing
End Sub