Home > Net >  Beautifulsoup convert multi-row table into list of dictionaries
BeautifulSoup: convert a multi-row table into a list of dictionaries

Time:08-25

I have an HTML table that I need to convert into a list of dictionaries. The table has very few attributes to select on, but I still need to be able to parse it correctly. Here's the HTML:

<div class="table-container"><table><thead><tr><th  colspan="100%"><div >Runtime end of support dates</div></th></tr>
      <tr>
        <th>Name</th>
        <th>Identifier</th>
        <th>Operating system</th>
        <th>Deprecation Phase 1</th>
        <th>Deprecation Phase 2</th>
      </tr>
    </thead>
      <tr>
        <td>
          <p>.NET Core 3.1</p>
        </td>
        <td>
          <p><code >dotnetcore3.1</code></p>
        </td>
        <td>
          <p>Amazon Linux 2</p>
        </td>
        <td>
          <p>Jan 20, 2023</p>
        </td>
        <td>
          <p>Feb 20, 2023</p>
        </td>
      </tr>
      <tr>
        <td>
          <p>Node.js 12</p>
        </td>
        <td>
          <p><code >nodejs12.x</code></p>
        </td>
        <td>
          <p>Amazon Linux 2</p>
        </td>
        <td>
          <p>Nov 14, 2022</p>
        </td>
        <td>
          <p>Dec 14, 2022</p>
        </td>
      </tr> 
      <tr>
        <td>
          <p>.NET Core 2.1</p>
        </td>
        <td>
          <p><code >dotnetcore2.1</code></p>
        </td>
        <td>
          <p>Amazon Linux</p>
        </td>
        <td>
          <p>Jan 5, 2022</p>
        </td>
        <td>
          <p>Apr 13, 2022</p>
        </td>
      </tr>
      <tr>
        <td>
          <p>Python 3.6</p>
        </td>
        <td>
          <p><code >python3.6</code></p>
        </td>
        <td>
          <p>Amazon Linux</p>
        </td>
        <td>
          <p>July 18, 2022</p>
        </td>
        <td>
          <p>Aug 29, 2022</p>
        </td>
      </tr>
      <tr>
        <td>
          <p>Python 2.7</p>
        </td>
        <td>
          <p><code >python2.7</code></p>
        </td>
        <td>
          <p>Amazon Linux</p>
        </td>
        <td>
          <p>July 15, 2021</p>
        </td>
        <td>
          <p>May 30, 2022</p>
        </td>
      </tr>
      <tr>
        <td>
          <p>Ruby 2.5</p>
        </td>
        <td>
          <p><code >ruby2.5</code></p>
        </td>
        <td>
          <p>Amazon Linux</p>
        </td>
        <td>
          <p>July 30, 2021</p>
        </td>
        <td>
          <p>March 31, 2022</p>
        </td>
      </tr>
      <tr>
        <td>
          <p>Node.js 10.x</p>
        </td>
        <td>
          <p><code >nodejs10.x</code></p>
        </td>
        <td>
          <p>Amazon Linux 2</p>
        </td>
        <td>
          <p>July 30, 2021</p>
        </td>
        <td>
          <p>Feb 14, 2022</p>
        </td>
      </tr>
      <tr>
        <td>
          <p>Node.js 8.10</p>
        </td>
        <td>
          <p><code >nodejs8.10</code></p>
        </td>
        <td>
          <p>Amazon Linux</p>
        </td>
        <td>
          <p> </p>
        </td>
        <td>
          <p>March 6, 2020</p>
        </td>
      </tr>
      <tr>
        <td>
          <p>Node.js 6.10</p>
        </td>
        <td>
          <p><code >nodejs6.10</code></p>
        </td>
        <td>
          <p>Amazon Linux</p>
        </td>
        <td>
          <p> </p>
        </td>
        <td>
          <p>August 12, 2019</p>
        </td>
      </tr> 
      <tr>
        <td>
          <p>Node.js 4.3 edge</p>
        </td>
        <td>
          <p><code >nodejs4.3-edge</code></p>
        </td>
        <td>
          <p>Amazon Linux</p>
        </td>
        <td>
          <p> </p>
        </td>
        <td>
          <p>April 30, 2019</p>
        </td>
      </tr>
      <tr> 
        <td>
          <p>Node.js 4.3</p>
        </td>
        <td>
          <p><code >nodejs4.3</code></p>
        </td>
        <td>
          <p>Amazon Linux</p>
        </td>
        <td>
          <p> </p>
        </td>
        <td>
          <p>March 6, 2020</p>
        </td>
      </tr>
      <tr>
        <td>
          <p>Node.js 0.10</p>
        </td>
        <td>
          <p><code >nodejs</code></p>
        </td>
        <td>
          <p>Amazon Linux</p>
        </td>
        <td>
          <p> </p>
        </td>
        <td>
          <p>October 31, 2016</p>
        </td>
      </tr>
      <tr>
        <td>
          <p>.NET Core 2.0</p>
        </td>
        <td>
          <p><code >dotnetcore2.0</code></p>
        </td>
        <td>
          <p>Amazon Linux</p>
        </td>
        <td>
          <p> </p>
        </td>
        <td>
          <p>May 30, 2019</p>
        </td>
      </tr>
      <tr>
        <td>
          <p>.NET Core 1.0</p>
        </td>
        <td>
          <p><code >dotnetcore1.0</code></p>
        </td>
        <td>
          <p>Amazon Linux</p>
        </td>
        <td>
          <p> </p>
        </td>
        <td>
          <p>July 30, 2019</p>
        </td>
      </tr>
      </table></div>

This is what I've got so far:

from bs4 import BeautifulSoup
import pprint

# NOTE: `soup` is assumed to be a BeautifulSoup document already parsed from
# the HTML above; it is not created in this snippet.
headers = []
tables = soup.find_all("div", class_="table-container")
for table in tables:
    # Skip any container that does not hold the table with this title.
    if not table.find_all(string="Runtime end of support dates"):
        continue
    rows = table.find_all("tr")
    # Header rows: rows with at least one <th> matched by the class_=None
    # filter; every <th> in such a row contributes its string to `headers`.
    for row in rows:
        if row.find_all("th", class_=None):
            headers.extend(th.string for th in row.find_all("th"))
    # Dump the <td> cells of every row (rows without <td> print an empty list).
    for row in rows:
        pprint.pprint(row.find_all("td"))

CodePudding user response:

Try:

from bs4 import BeautifulSoup

# Placeholder: paste the HTML table from the question here.
html_doc = """... your HTML from the question ..."""

soup = BeautifulSoup(html_doc, "html.parser")

# Column names come from the <th> cells. The first <th> is the table title
# ("Runtime end of support dates", spanning all columns), so skip it.
header = [th.get_text(strip=True) for th in soup.select("th")[1:]]

# Only rows that contain <td> cells are data rows; pair each cell's stripped
# text with the column name at the same position.
out = []
for row in soup.select("tr:has(td)"):
    tds = [td.get_text(strip=True) for td in row.select("td")]
    out.append(dict(zip(header, tds)))

print(out)

Prints:

[
    {
        "Name": ".NET Core 3.1",
        "Identifier": "dotnetcore3.1",
        "Operating system": "Amazon Linux 2",
        "Deprecation Phase 1": "Jan 20, 2023",
        "Deprecation Phase 2": "Feb 20, 2023",
    },
    {
        "Name": "Node.js 12",
        "Identifier": "nodejs12.x",
        "Operating system": "Amazon Linux 2",
        "Deprecation Phase 1": "Nov 14, 2022",
        "Deprecation Phase 2": "Dec 14, 2022",
    },
    {
        "Name": ".NET Core 2.1",
        "Identifier": "dotnetcore2.1",
        "Operating system": "Amazon Linux",
        "Deprecation Phase 1": "Jan 5, 2022",
        "Deprecation Phase 2": "Apr 13, 2022",
    },
    {
        "Name": "Python 3.6",
        "Identifier": "python3.6",
        "Operating system": "Amazon Linux",
        "Deprecation Phase 1": "July 18, 2022",
        "Deprecation Phase 2": "Aug 29, 2022",
    },
    {
        "Name": "Python 2.7",
        "Identifier": "python2.7",
        "Operating system": "Amazon Linux",
        "Deprecation Phase 1": "July 15, 2021",
        "Deprecation Phase 2": "May 30, 2022",
    },
    {
        "Name": "Ruby 2.5",
        "Identifier": "ruby2.5",
        "Operating system": "Amazon Linux",
        "Deprecation Phase 1": "July 30, 2021",
        "Deprecation Phase 2": "March 31, 2022",
    },
    {
        "Name": "Node.js 10.x",
        "Identifier": "nodejs10.x",
        "Operating system": "Amazon Linux 2",
        "Deprecation Phase 1": "July 30, 2021",
        "Deprecation Phase 2": "Feb 14, 2022",
    },
    {
        "Name": "Node.js 8.10",
        "Identifier": "nodejs8.10",
        "Operating system": "Amazon Linux",
        "Deprecation Phase 1": "",
        "Deprecation Phase 2": "March 6, 2020",
    },
    {
        "Name": "Node.js 6.10",
        "Identifier": "nodejs6.10",
        "Operating system": "Amazon Linux",
        "Deprecation Phase 1": "",
        "Deprecation Phase 2": "August 12, 2019",
    },
    {
        "Name": "Node.js 4.3 edge",
        "Identifier": "nodejs4.3-edge",
        "Operating system": "Amazon Linux",
        "Deprecation Phase 1": "",
        "Deprecation Phase 2": "April 30, 2019",
    },
    {
        "Name": "Node.js 4.3",
        "Identifier": "nodejs4.3",
        "Operating system": "Amazon Linux",
        "Deprecation Phase 1": "",
        "Deprecation Phase 2": "March 6, 2020",
    },
    {
        "Name": "Node.js 0.10",
        "Identifier": "nodejs",
        "Operating system": "Amazon Linux",
        "Deprecation Phase 1": "",
        "Deprecation Phase 2": "October 31, 2016",
    },
    {
        "Name": ".NET Core 2.0",
        "Identifier": "dotnetcore2.0",
        "Operating system": "Amazon Linux",
        "Deprecation Phase 1": "",
        "Deprecation Phase 2": "May 30, 2019",
    },
    {
        "Name": ".NET Core 1.0",
        "Identifier": "dotnetcore1.0",
        "Operating system": "Amazon Linux",
        "Deprecation Phase 1": "",
        "Deprecation Phase 2": "July 30, 2019",
    },
]
  • Related