I'm revisiting web scraping to try and develop a tool that can pull data from Databases.
Here I am using a substance dossier found at:
So for this Example, the entire desired output would be across 3 columns (not important it's in 3 columns right now though just wanting to pull the data):
Workers - Hazard via inhalation route, DNEL (Derived No Effect Level), 238 mg/m³
Workers - Hazard via dermal route, DNEL (Derived No Effect Level), 84 mg/kg bw/day
General Population - Hazard via inhalation route, DNEL (Derived No Effect Level), 70 mg/m³
General Population - Hazard via dermal route, DNEL (Derived No Effect Level), 51 mg/kg bw/day
General Population - Hazard via oral route, DNEL (Derived No Effect Level), 24 mg/kg bw/day
The issue I have is that I am using the class element "HorDL" to get this information but unfortunately this class is not limited to just the section highlighted in blue for each route. So ("HorDL")(0) works find but ("HorDL")(1) pulls the information immediately below for the same route.
For this reason, I suspect that using this class element to pull the information is not the best way to do this however I cant think of any other way to do this.
I already have a way to pull the relevant Dossier, so If this works It would make for a Neat tool that Just pulled the relevant Information. I have considered Pulling all the information and then applying a filter in excel but I don't think this is a particularly elegant solution.
Much appreciated for any responses.
CodePudding user response:
This assumes that you only want DNEL with keywords Workers
and General Population
in the title and among them, exclude DNEL with Hazard for the eyes
Note: You should declare all your variables, insert Option Explicit
at the top of your module to help you enforce it.
Option Explicit
Public Sub GetContents()
Const DNELTitle As Long = 1
Const DNELAssessment As Long = 2
Const DNELValue As Long = 3
Const resultFirstCell As String = "A1" 'Change the first cell address to insert the result accordingly
Dim ws As Worksheet
Set ws = ThisWorkbook.Worksheets("Sheet1") 'Change worksheet name accordingly
'Start ECHA Search via XML HTTP Request
Dim XMLReq As New MSXML2.XMLHTTP60
Dim HTMLDoc As New MSHTML.HTMLDocument
XMLReq.Open "Get", "https://echa.europa.eu/registration-dossier/-/registered-dossier/16016/7/1", False
XMLReq.send
If XMLReq.Status = 200 Then
HTMLDoc.body.innerHTML = XMLReq.responseText
'==== Loop through each anchors and get the relevant ID for interested DNEL
Dim anchors As Object
Set anchors = HTMLDoc.getElementById("SectionAnchors")
Set anchors = anchors.getElementsByTagName("a")
Dim anchorsColl As Collection
Set anchorsColl = New Collection
Dim i As Long
For i = 0 To anchors.Length - 1
Dim anchorText As String
anchorText = anchors(i).innerText
If InStr(anchorText, "Workers - ") <> 0 Or _
InStr(anchorText, "General Population - ") <> 0 Then
If InStr(anchorText, "Additional Information") = 0 And _
InStr(anchorText, "Hazard for the eyes") = 0 Then
anchorsColl.Add Replace(anchors(i).href, "about:blank#", vbNullString)
End If
End If
Next i
'====
If anchorsColl.Count <> 0 Then
Dim outputArr() As String
ReDim outputArr(1 To anchorsColl.Count, 1 To 3) As String
For i = 1 To anchorsColl.Count
Dim anchorEle As Object
Set anchorEle = HTMLDoc.getElementById(anchorsColl(i))
outputArr(i, DNELTitle) = anchorEle.innerText
'Loop through the anchor's sibling until it finds the DL tag to extract the values
Do While anchorEle.nodeName <> "DL"
Set anchorEle = anchorEle.NextSibling
Loop
'Assumes that the assessment conclusion is in the first DD tag
'Assumes that the value is in the second DD tag
outputArr(i, DNELAssessment) = anchorEle.getElementsByTagName("dd")(0).innerText
outputArr(i, DNELValue) = anchorEle.getElementsByTagName("dd")(1).innerText
Next i
'Write the extraction result to the worksheet starting from A1
ws.Range(resultFirstCell).Resize(UBound(outputArr, 1), 3).Value = outputArr
Else
Debug.Print "No DNEL found."
End If
Set ws = Nothing
Set HTMLDoc = Nothing
Else
MsgBox "Problem" & vbNewLine & XMLReq.Status & " - " & XMLReq.statusText
End If
Set XMLReq = Nothing
End Sub
CodePudding user response:
My own answer so Far.
Next I will have this loop through a list of Values to return the DNEL for each. Also need to include some sort of error handling.
Sub GetData()
'Start ECHA Search via XML HTTP Request
Dim XMLReq As New MSXML2.XMLHTTP60
Dim HTMLDoc As New MSHTML.HTMLDocument
XMLReq.Open "Get", "https://echa.europa.eu/registration-dossier/-/registered-dossier/16016/7/1", False
XMLReq.send
If XMLReq.Status <> 200 Then
MsgBox "Problem" & vbNewLine & XMLReq.Status & " - " & XMLReq.statusText
Exit Sub
End If
HTMLDoc.body.innerHTML = XMLReq.responseText
'Retrieve Data for General population
'Defines class element for each route
Dim Route(1 To 3) As String
Route(1) = "sGeneralPopulationHazardViaInhalationRoute"
Route(2) = "sGeneralPopulationHazardViaDermalRoute"
Route(3) = "sGeneralPopulationHazardViaOralRoute"
'Loops through each element
r = 4
c = 6
Dim i As Long
For i = 1 To UBound(Route, 1)
Set Info = HTMLDoc.getElementById(Route(i))
Debug.Print Info.innerText
Set Info = HTMLDoc.getElementById(Route(i)).NextSibling.NextSibling.NextSibling
Set Data = Info.getElementsByTagName("dd")(0)
Debug.Print Data.innerText
Set Data = Info.getElementsByTagName("dd")(1)
Debug.Print Data.innerText
Cells(r, c) = Data.innerText
c = c 1
Next i
r = r 1
End Sub