The following code retrieves 'dossier Urls' for substances in Column A by scraping the ECHA website. I'm trying to error handle cases where a substance Url cannot be found.
I can't quite see why the following code fails. I have highlighted the Problematic line with a comment. this is highlighted in debugging as an Object Required error but I can't see where I'm going wrong.
Sub PopulateExposures()
Dim url, rw As Range
Set rw = Sheets("data").Range("A2:E2") 'first row with inputs
Do While Application.CountA(rw) > 0
url = SubstanceUrl(rw.Cells(1).Value, rw.Cells(2).Value) 'get the URL
rw.Cells(5).Resize(1, 3).Value = ExposureData(url) 'get exposure data (as array) and add to row
Set rw = rw.Offset(1, 0) 'next substance
Loop
End Sub
Public Function SubstanceUrl(SubstanceName, CASNumber) As String
Const url = "https://echa.europa.eu/information-on-chemicals/registered-substances?" & _
"p_p_id=dissregisteredsubstances_WAR_dissregsubsportlet&p_p_lifecycle=1&" & _
"p_p_state=normal&p_p_mode=view&" & _
"__dissregisteredsubstances_WAR_dissregsubsportlet_javax.portlet.action=dissRegisteredSubstancesAction"
Dim oHTML, oHttp, MyDict, payload, DictKey, sep
Set oHTML = New HTMLDocument
Set oHttp = CreateObject("MSXML2.XMLHTTP")
Set MyDict = CreateObject("Scripting.Dictionary")
MyDict("_dissregisteredsubstances_WAR_dissregsubsportlet_disreg_name") = SubstanceName
MyDict("_dissregisteredsubstances_WAR_dissregsubsportlet_disreg_cas-number") = CASNumber
MyDict("_disssimplesearchhomepage_WAR_disssearchportlet_disclaimer") = "true"
MyDict("_disssimplesearchhomepage_WAR_disssearchportlet_disclaimerCheckbox") = "on"
payload = ""
For Each DictKey In MyDict
payload = payload & sep & DictKey & "=" & WorksheetFunction.EncodeURL(MyDict(DictKey))
sep = "&"
Next DictKey
With oHttp
.Open "POST", url, False
.setRequestHeader "User-Agent", "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36"
.setRequestHeader "Content-type", "application/x-www-form-urlencoded"
.send payload
oHTML.body.innerHTML = .responseText
End With
'PROBLEMATIC CODE
If oHTML.querySelector(".details").getAttribute("href") Is Error Then
SubstanceUrl = "-"
Else
'Sometimes output changes despite same input
SubstanceUrl = oHTML.querySelector(".details").getAttribute("href")
End If
Debug.Print SubstanceUrl
End Function
Function ExposureData(urlToGet)
Dim XMLReq As New MSXML2.XMLHTTP60
Dim HTMLDoc As HTMLDocument, dds
Dim Route(1 To 3) As String, Results(1 To 3) As String, c, Info, Data
Route(1) = "sGeneralPopulationHazardViaInhalationRoute"
Route(2) = "sGeneralPopulationHazardViaDermalRoute"
Route(3) = "sGeneralPopulationHazardViaOralRoute"
XMLReq.Open "Get", urlToGet & "/7/1", False
XMLReq.send
If XMLReq.Status <> 200 Then
Results(1) = "Problem" & vbNewLine & XMLReq.Status & " - " & XMLReq.statusText
Else
Set HTMLDoc = New HTMLDocument
HTMLDoc.body.innerHTML = XMLReq.responseText
For c = 1 To UBound(Route, 1)
Set Info = HTMLDoc.getElementById(Route(c))
If Not Info Is Nothing Then
Set Info = Info.NextSibling.NextSibling.NextSibling
Set dds = Info.getElementsByTagName("dd")
If dds.Length > 1 Then
Results(c) = dds(1).innerText
Else
Results(c) = "hazard unknown"
End If
Else
Results(c) = "no info"
End If
Next c
End If
ExposureData = Results
End Function
For this code to run values must be present in column A. Acetone and Benzene can be used respectively to test 2 rows. To test out the error handling Enter something made up like Benzenjaj.
I think this is a quick fix. Just can't see it.
Update:
Tetsing on made up substance name:
First 2 results are found as normal but the made-up chemical causes the following error:
Code:
Sub PopulateExposures() Dim url, rw As Range
Set rw = Sheets("data").Range("A2:E2") 'first row with inputs Do While Application.CountA(rw) > 0 url = SubstanceUrl(rw.Cells(1).Value, rw.Cells(2).Value) 'get the URL rw.Cells(5).Resize(1, 3).Value = ExposureData(url) 'get exposure data (as array) and add to row Set rw = rw.Offset(1, 0) 'next substance Loop
End Sub
Public Function SubstanceUrl(SubstanceName, CASNumber) As String
Const url = "https://echa.europa.eu/information-on-chemicals/registered-substances?" & _ "p_p_id=dissregisteredsubstances_WAR_dissregsubsportlet&p_p_lifecycle=1&" & _ "p_p_state=normal&p_p_mode=view&" & _ "__dissregisteredsubstances_WAR_dissregsubsportlet_javax.portlet.action=dissRegisteredSubstancesAction" Dim oHTML, oHttp, MyDict, payload, DictKey, sep Set oHTML = New HTMLDocument Set oHttp = CreateObject("MSXML2.XMLHTTP") Set MyDict = CreateObject("Scripting.Dictionary") MyDict("_dissregisteredsubstances_WAR_dissregsubsportlet_disreg_name") = SubstanceName MyDict("_dissregisteredsubstances_WAR_dissregsubsportlet_disreg_cas-number") = CASNumber MyDict("_disssimplesearchhomepage_WAR_disssearchportlet_disclaimer") = "true" MyDict("_disssimplesearchhomepage_WAR_disssearchportlet_disclaimerCheckbox") = "on" payload = "" For Each DictKey In MyDict payload = payload & sep & DictKey & "=" & WorksheetFunction.EncodeURL(MyDict(DictKey)) sep = "&" Next DictKey With oHttp .Open "POST", url, False .setRequestHeader "User-Agent", "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36" .setRequestHeader "Content-type", "application/x-www-form-urlencoded" .send payload oHTML.body.innerHTML = .responseText End With On Error Resume Next 'ignore error on following line SubstanceUrl = oHTML.querySelector(".details").getAttribute("href") On Error GoTo 0 'stop ignoring errors If Len(SubstanceUrl) = 0 Then SubstanceUrl = "<no URL>"
End Function
Function ExposureData(urlToGet)
Dim XMLReq As New MSXML2.XMLHTTP60 Dim HTMLDoc As HTMLDocument, dds Dim Route(1 To 3) As String, Results(1 To 3) As String, c, Info, Data Route(1) = "sGeneralPopulationHazardViaInhalationRoute" Route(2) = "sGeneralPopulationHazardViaDermalRoute" Route(3) = "sGeneralPopulationHazardViaOralRoute" XMLReq.Open "Get", urlToGet & "/7/1", False XMLReq.send If XMLReq.Status <> 200 Then Results(1) = "Problem" & vbNewLine & XMLReq.Status & " - " & XMLReq.statusText Else Set HTMLDoc = New HTMLDocument HTMLDoc.body.innerHTML = XMLReq.responseText For c = 1 To UBound(Route, 1) Set Info = HTMLDoc.getElementById(Route(c)) If Not Info Is Nothing Then Set Info = Info.NextSibling.NextSibling.NextSibling Set dds = Info.getElementsByTagName("dd") If dds.Length > 1 Then Results(c) = dds(1).innerText Else Results(c) = "hazard unknown" End If Else Results(c) = "no info" End If Next c End If ExposureData = Results
End Function
CodePudding user response:
You can just ignore any error:
With oHttp
.Open "POST", url, False
.setRequestHeader "User-Agent", "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36"
.setRequestHeader "Content-type", "application/x-www-form-urlencoded"
.send payload
oHTML.body.innerHTML = .responseText
End With
On Error Resume Next 'ignore error on following line
SubstanceUrl = oHTML.querySelector(".details").getAttribute("href")
On Error Goto 0 'stop ignoring errors
If Len(SubstanceUrl) = 0 Then SubstanceUrl = "<no URL>"
CodePudding user response:
You can test the .Length of querySelectorAll when looking for the particular registered dossier url. You need to amend your code elsewhere to handle the "-" return url. I prefer Tim's solution in terms of ignoring error however for the first bit.
Option Explicit
Sub PopulateExposures()
Dim url, rw As Range
Set rw = Sheets("data").Range("A2:E2") 'first row with inputs
Do While Application.CountA(rw) > 0
url = SubstanceUrl(rw.Cells(1).Value, rw.Cells(2).Value) 'get the URL
If Left$(url, 5) = "https" Then
rw.Cells(5).Resize(1, 3).Value = ExposureData(url) 'get exposure data (as array) and add to row
Else
rw.Cells(5).Resize(1, 3).Value = url
End If
Set rw = rw.Offset(1, 0) 'next substance
Loop
End Sub
Public Function SubstanceUrl(SubstanceName, CASNumber) As String
Const url = "https://echa.europa.eu/information-on-chemicals/registered-substances?" & _
"p_p_id=dissregisteredsubstances_WAR_dissregsubsportlet&p_p_lifecycle=1&" & _
"p_p_state=normal&p_p_mode=view&" & _
"__dissregisteredsubstances_WAR_dissregsubsportlet_javax.portlet.action=dissRegisteredSubstancesAction"
Dim oHTML, oHttp, MyDict, payload, DictKey, sep
Set oHTML = New HTMLDocument
Set oHttp = CreateObject("MSXML2.XMLHTTP")
Set MyDict = CreateObject("Scripting.Dictionary")
MyDict("_dissregisteredsubstances_WAR_dissregsubsportlet_disreg_name") = SubstanceName
MyDict("_dissregisteredsubstances_WAR_dissregsubsportlet_disreg_cas-number") = CASNumber
MyDict("_disssimplesearchhomepage_WAR_disssearchportlet_disclaimer") = "true"
MyDict("_disssimplesearchhomepage_WAR_disssearchportlet_disclaimerCheckbox") = "on"
payload = ""
For Each DictKey In MyDict
payload = payload & sep & DictKey & "=" & WorksheetFunction.EncodeURL(MyDict(DictKey))
sep = "&"
Next DictKey
With oHttp
.Open "POST", url, False
.setRequestHeader "User-Agent", "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36"
.setRequestHeader "Content-type", "application/x-www-form-urlencoded"
.send payload
oHTML.body.innerHTML = .responseText
End With
If oHTML.querySelectorAll("[href*=registered-dossier]").Length = 0 Then
SubstanceUrl = "-"
Else
'Sometimes output changes despite same input
SubstanceUrl = oHTML.querySelector(".details")
End If
Debug.Print SubstanceUrl
End Function
Function ExposureData(urlToGet)
Dim XMLReq As New MSXML2.XMLHTTP60
Dim HTMLDoc As HTMLDocument, dds
Dim Route(1 To 3) As String, Results(1 To 3) As String, c, Info, Data
Route(1) = "sGeneralPopulationHazardViaInhalationRoute"
Route(2) = "sGeneralPopulationHazardViaDermalRoute"
Route(3) = "sGeneralPopulationHazardViaOralRoute"
XMLReq.Open "Get", urlToGet & "/7/1", False
XMLReq.send
If XMLReq.Status <> 200 Then
Results(1) = "Problem" & vbNewLine & XMLReq.Status & " - " & XMLReq.statusText
Else
Set HTMLDoc = New HTMLDocument
HTMLDoc.body.innerHTML = XMLReq.responseText
For c = 1 To UBound(Route, 1)
Set Info = HTMLDoc.getElementById(Route(c))
If Not Info Is Nothing Then
Set Info = Info.NextSibling.NextSibling.NextSibling
Set dds = Info.getElementsByTagName("dd")
If dds.Length > 1 Then
Results(c) = dds(1).innerText
Else
Results(c) = "hazard unknown"
End If
Else
Results(c) = "no info"
End If
Next c
End If
ExposureData = Results
End Function