Home > Enterprise >  Vba getting rid of table while web scraping
Vba getting rid of table while web scraping

Time:11-10

Can anyone help for a step further?

What I did here, I took from the website given inside code block the tablename by getElementById, tagName although there is only class of div "data". Then I will put all of the data - only currency rates and date to the excel cells on a worksheet. But It gives me also the calendar days, I want to get rid of calendar days which is shown in Debug.Print mode. But cannot find the right tag name of calendar for excluding it from the code. For now, I only need,assistance for getting rid of calendar days; the code is below

Sub gettingTablesfromCBR()
Dim IE As New SHDocVw.InternetExplorer
Dim HTMLDOC As MSHTML.HTMLDocument
Dim HTMLTABLES As MSHTML.IHTMLElementCollection
Dim HTMLTABLE As MSHTML.IHTMLElement
Dim HTMLDIV As MSHTML.IHTMLElement
Dim TableSection As MSHTML.IHTMLElement
Dim TableRow As MSHTML.IHTMLElement
Dim TableCell As MSHTML.IHTMLElement

IE.Visible = True
IE.navigate "https://www.cbr.ru/eng/currency_base/dynamics/?UniDbQuery.Posted=True&UniDbQuery.so=1&UniDbQuery.mode=1&UniDbQuery.date_req1=&UniDbQuery.date_req2=&UniDbQuery.VAL_NM_RQ=R01100&UniDbQuery.From=07.08.2021&UniDbQuery.To=05.11.2021"

    Do While IE.ReadyState <> READYSTATE_COMPLETE
    Loop
   
Set HTMLDOC = IE.Document
Set HTMLDIV = HTMLDOC.getElementById("data")
Set HTMLTABLES = HTMLDOC.getElementsByTagName("table")
Dim RowText As String
For Each HTMLTABLE In HTMLTABLES
    
    For Each TableSection In HTMLTABLE.Children

    
            For Each TableRow In TableSection.Children
                RowText = ""
                For Each TableCell In TableRow.Children
                    RowText = RowText & vbTab & TableCell.innerText
                Next TableCell
                Debug.Print , , RowText
            Next TableRow
    
    Next TableSection
    
Next HTMLTABLE

End Sub

CodePudding user response:

Updated: You can add a counter to your innermost For loop that is reset to 1 before that loop starts. Then you can test this counter to see if it's the first cell of the row and skip it.

Example of this with some small tweaks to your code to just target the table (that has a classname of data):

Sub gettingTablesfromCBR()
Dim IE As New SHDocVw.InternetExplorer
Dim HTMLDOC As MSHTML.HTMLDocument
Dim HTMLTABLES As MSHTML.IHTMLElementCollection
Dim HTMLTABLE As MSHTML.IHTMLElement
Dim HTMLDIV As MSHTML.IHTMLElement
Dim TableSection As MSHTML.IHTMLElement
Dim TableRow As MSHTML.IHTMLElement
Dim TableCell As MSHTML.IHTMLElement
Dim Unit As String
Dim Rate As String
Dim TableCellCount As Integer

IE.Visible = True
IE.navigate "https://www.cbr.ru/eng/currency_base/dynamics/?UniDbQuery.Posted=True&UniDbQuery.so=1&UniDbQuery.mode=1&UniDbQuery.date_req1=&UniDbQuery.date_req2=&UniDbQuery.VAL_NM_RQ=R01100&UniDbQuery.From=07.08.2021&UniDbQuery.To=05.11.2021"

    Do While IE.ReadyState <> READYSTATE_COMPLETE
    Loop
   
Set HTMLDOC = IE.Document
Set HTMLTABLES = HTMLDOC.getElementsByClassName("data")
For Each HTMLTABLE In HTMLTABLES 'Table
    For Each TableSection In HTMLTABLE.Children 'Body
            For Each TableRow In TableSection.Children
                RowText = ""
                TableCellCount = 1
                For Each TableCell In TableRow.Children
                    If TableCellCount = 2 Then Unit = TableCell.innerText
                    If TableCellCount = 3 Then Rate = TableCell.innerText
                    TableCellCount = TableCellCount   1
                Next TableCell
                Debug.Print Unit, Rate
            Next TableRow
    Next TableSection
Next


End Sub

CodePudding user response:

With help of @JNevill, I solved and improved the code. Now if anyone interested can easily put the numbers as a date then the result of rates will be put in a table form in xl sheet.

Sub gettingTablesfromCBR2()
Dim IE As New SHDocVw.InternetExplorer
Dim HTMLDOC As MSHTML.HTMLDocument
Dim HTMLTABLES As MSHTML.IHTMLElementCollection
Dim HTMLTABLE As MSHTML.IHTMLElement
Dim HTMLDIV As MSHTML.IHTMLElement
Dim TableSection As MSHTML.IHTMLElement
Dim TableRow As MSHTML.IHTMLElement
Dim TableCell As MSHTML.IHTMLElement
Dim Unit As String
Dim Rate, DateR As String
Dim TableCellCount, RowNum As Integer
Dim inputdate_from, inputdate_to, inputcurr_val As Variant

inputdate_from = ThisWorkbook.Worksheets("PrimoPagina").Range("A3").Value
inputdate_to = ThisWorkbook.Worksheets("PrimoPagina").Range("E3").Value
inputcurr_val = ThisWorkbook.Worksheets("PrimoPagina").Range("B1").Value
If inputcurr_val = "BLG" Then
inputcurr_val = "R01100"
Else
inputcurr_val = "R01239" 'EUR
End If

IE.Visible = True
IE.navigate "https://www.cbr.ru/eng/currency_base/dynamics/?UniDbQuery.Posted=True&UniDbQuery.so=1&UniDbQuery.mode=1&UniDbQuery.date_req1=&UniDbQuery.date_req2=&UniDbQuery.VAL_NM_RQ=" & inputcurr_val & "&UniDbQuery.From=" & inputdate_from & "&UniDbQuery.To=" & inputdate_to


    Do While IE.ReadyState <> READYSTATE_COMPLETE
    Loop
   
Set HTMLDOC = IE.Document
Set HTMLTABLES = HTMLDOC.getElementsByClassName("data")
For Each HTMLTABLE In HTMLTABLES 'Table
    For Each TableSection In HTMLTABLE.Children 'Body
            For Each TableRow In TableSection.Children
                RowText = ""
                TableCellCount = 1
                For Each TableCell In TableRow.Children
                    If TableCellCount = 1 Then DateR = TableCell.innerText
                    If TableCellCount = 2 Then Unit = TableCell.innerText
                    If TableCellCount = 3 Then Rate = TableCell.innerText
                    TableCellCount = TableCellCount   1
                Next TableCell
                Debug.Print Unit, Rate
                
                RowNum = RowNum   1
                ThisWorkbook.Worksheets("Results").Cells(RowNum, 3).Value = Unit
                ThisWorkbook.Worksheets("Results").Cells(RowNum, 2).Value = Rate
               ThisWorkbook.Worksheets("Results").Cells(RowNum, 1).Value = DateR
                
            Next TableRow
    Next TableSection
Next


End Sub
  • Related