I have an HTML table that I need converted into a list of dictionaries. There are minimal attributes on the table but I need to be able to parse it correctly. Here's the text:
<div ><table><thead><tr><th colspan="100%"><div >Runtime end of support dates</div></th></tr>
<tr>
<th>Name</th>
<th>Identifier</th>
<th>Operating system</th>
<th>Deprecation Phase 1</th>
<th>Deprecation Phase 2</th>
</tr>
</thead>
<tr>
<td>
<p>.NET Core 3.1</p>
</td>
<td>
<p><code >dotnetcore3.1</code></p>
</td>
<td>
<p>Amazon Linux 2</p>
</td>
<td>
<p>Jan 20, 2023</p>
</td>
<td>
<p>Feb 20, 2023</p>
</td>
</tr>
<tr>
<td>
<p>Node.js 12</p>
</td>
<td>
<p><code >nodejs12.x</code></p>
</td>
<td>
<p>Amazon Linux 2</p>
</td>
<td>
<p>Nov 14, 2022</p>
</td>
<td>
<p>Dec 14, 2022</p>
</td>
</tr>
<tr>
<td>
<p>.NET Core 2.1</p>
</td>
<td>
<p><code >dotnetcore2.1</code></p>
</td>
<td>
<p>Amazon Linux</p>
</td>
<td>
<p>Jan 5, 2022</p>
</td>
<td>
<p>Apr 13, 2022</p>
</td>
</tr>
<tr>
<td>
<p>Python 3.6</p>
</td>
<td>
<p><code >python3.6</code></p>
</td>
<td>
<p>Amazon Linux</p>
</td>
<td>
<p>July 18, 2022</p>
</td>
<td>
<p>Aug 29, 2022</p>
</td>
</tr>
<tr>
<td>
<p>Python 2.7</p>
</td>
<td>
<p><code >python2.7</code></p>
</td>
<td>
<p>Amazon Linux</p>
</td>
<td>
<p>July 15, 2021</p>
</td>
<td>
<p>May 30, 2022</p>
</td>
</tr>
<tr>
<td>
<p>Ruby 2.5</p>
</td>
<td>
<p><code >ruby2.5</code></p>
</td>
<td>
<p>Amazon Linux</p>
</td>
<td>
<p>July 30, 2021</p>
</td>
<td>
<p>March 31, 2022</p>
</td>
</tr>
<tr>
<td>
<p>Node.js 10.x</p>
</td>
<td>
<p><code >nodejs10.x</code></p>
</td>
<td>
<p>Amazon Linux 2</p>
</td>
<td>
<p>July 30, 2021</p>
</td>
<td>
<p>Feb 14, 2022</p>
</td>
</tr>
<tr>
<td>
<p>Node.js 8.10</p>
</td>
<td>
<p><code >nodejs8.10</code></p>
</td>
<td>
<p>Amazon Linux</p>
</td>
<td>
<p> </p>
</td>
<td>
<p>March 6, 2020</p>
</td>
</tr>
<tr>
<td>
<p>Node.js 6.10</p>
</td>
<td>
<p><code >nodejs6.10</code></p>
</td>
<td>
<p>Amazon Linux</p>
</td>
<td>
<p> </p>
</td>
<td>
<p>August 12, 2019</p>
</td>
</tr>
<tr>
<td>
<p>Node.js 4.3 edge</p>
</td>
<td>
<p><code >nodejs4.3-edge</code></p>
</td>
<td>
<p>Amazon Linux</p>
</td>
<td>
<p> </p>
</td>
<td>
<p>April 30, 2019</p>
</td>
</tr>
<tr>
<td>
<p>Node.js 4.3</p>
</td>
<td>
<p><code >nodejs4.3</code></p>
</td>
<td>
<p>Amazon Linux</p>
</td>
<td>
<p> </p>
</td>
<td>
<p>March 6, 2020</p>
</td>
</tr>
<tr>
<td>
<p>Node.js 0.10</p>
</td>
<td>
<p><code >nodejs</code></p>
</td>
<td>
<p>Amazon Linux</p>
</td>
<td>
<p> </p>
</td>
<td>
<p>October 31, 2016</p>
</td>
</tr>
<tr>
<td>
<p>.NET Core 2.0</p>
</td>
<td>
<p><code >dotnetcore2.0</code></p>
</td>
<td>
<p>Amazon Linux</p>
</td>
<td>
<p> </p>
</td>
<td>
<p>May 30, 2019</p>
</td>
</tr>
<tr>
<td>
<p>.NET Core 1.0</p>
</td>
<td>
<p><code >dotnetcore1.0</code></p>
</td>
<td>
<p>Amazon Linux</p>
</td>
<td>
<p> </p>
</td>
<td>
<p>July 30, 2019</p>
</td>
</tr>
</table></div>
This is what I've got so far:
from bs4 import BeautifulSoup
import pprint

# NOTE: `soup` is not created in this snippet; it is assumed to be a
# BeautifulSoup object built from the page beforehand, e.g.
# soup = BeautifulSoup(html, "html.parser").
headers = []
tables = soup.find_all("div", class_="table-container")
for table in tables:
    # Only handle the container whose table carries this caption text.
    # `string=` is the current name of the deprecated `text=` argument.
    correct_table = bool(table.find_all(string="Runtime end of support dates"))
    if correct_table:
        rows = table.find_all("tr")
        for row in rows:
            # Header row: collect the text of every <th> without a class.
            # This also picks up the full-width caption cell in the first
            # header row, not just the five column names.
            if row.find_all("th", class_=None):
                for header in row.find_all("th"):
                    headers.append(header.string)
        # Dump the raw <td> tags of every row (empty list for header rows).
        for row in rows:
            pprint.pprint(row.find_all("td"))
CodePudding user response:
Try:
from bs4 import BeautifulSoup

html_doc = """... your HTML from the question ..."""
soup = BeautifulSoup(html_doc, "html.parser")

# The first <th> is the full-width caption ("Runtime end of support dates"),
# so skip it and keep only the real column names.
header = [th.get_text(strip=True) for th in soup.select("th")[1:]]

out = []
# `tr:has(td)` matches only rows that contain data cells, which leaves out
# the two header rows inside <thead>.
for row in soup.select("tr:has(td)"):
    # strip=True turns the blank "Deprecation Phase 1" cells into "".
    tds = [td.get_text(strip=True) for td in row.select("td")]
    # Pair each column name with the cell in the same position.
    out.append(dict(zip(header, tds)))

print(out)
Prints (output reformatted here for readability):
[
{
"Name": ".NET Core 3.1",
"Identifier": "dotnetcore3.1",
"Operating system": "Amazon Linux 2",
"Deprecation Phase 1": "Jan 20, 2023",
"Deprecation Phase 2": "Feb 20, 2023",
},
{
"Name": "Node.js 12",
"Identifier": "nodejs12.x",
"Operating system": "Amazon Linux 2",
"Deprecation Phase 1": "Nov 14, 2022",
"Deprecation Phase 2": "Dec 14, 2022",
},
{
"Name": ".NET Core 2.1",
"Identifier": "dotnetcore2.1",
"Operating system": "Amazon Linux",
"Deprecation Phase 1": "Jan 5, 2022",
"Deprecation Phase 2": "Apr 13, 2022",
},
{
"Name": "Python 3.6",
"Identifier": "python3.6",
"Operating system": "Amazon Linux",
"Deprecation Phase 1": "July 18, 2022",
"Deprecation Phase 2": "Aug 29, 2022",
},
{
"Name": "Python 2.7",
"Identifier": "python2.7",
"Operating system": "Amazon Linux",
"Deprecation Phase 1": "July 15, 2021",
"Deprecation Phase 2": "May 30, 2022",
},
{
"Name": "Ruby 2.5",
"Identifier": "ruby2.5",
"Operating system": "Amazon Linux",
"Deprecation Phase 1": "July 30, 2021",
"Deprecation Phase 2": "March 31, 2022",
},
{
"Name": "Node.js 10.x",
"Identifier": "nodejs10.x",
"Operating system": "Amazon Linux 2",
"Deprecation Phase 1": "July 30, 2021",
"Deprecation Phase 2": "Feb 14, 2022",
},
{
"Name": "Node.js 8.10",
"Identifier": "nodejs8.10",
"Operating system": "Amazon Linux",
"Deprecation Phase 1": "",
"Deprecation Phase 2": "March 6, 2020",
},
{
"Name": "Node.js 6.10",
"Identifier": "nodejs6.10",
"Operating system": "Amazon Linux",
"Deprecation Phase 1": "",
"Deprecation Phase 2": "August 12, 2019",
},
{
"Name": "Node.js 4.3 edge",
"Identifier": "nodejs4.3-edge",
"Operating system": "Amazon Linux",
"Deprecation Phase 1": "",
"Deprecation Phase 2": "April 30, 2019",
},
{
"Name": "Node.js 4.3",
"Identifier": "nodejs4.3",
"Operating system": "Amazon Linux",
"Deprecation Phase 1": "",
"Deprecation Phase 2": "March 6, 2020",
},
{
"Name": "Node.js 0.10",
"Identifier": "nodejs",
"Operating system": "Amazon Linux",
"Deprecation Phase 1": "",
"Deprecation Phase 2": "October 31, 2016",
},
{
"Name": ".NET Core 2.0",
"Identifier": "dotnetcore2.0",
"Operating system": "Amazon Linux",
"Deprecation Phase 1": "",
"Deprecation Phase 2": "May 30, 2019",
},
{
"Name": ".NET Core 1.0",
"Identifier": "dotnetcore1.0",
"Operating system": "Amazon Linux",
"Deprecation Phase 1": "",
"Deprecation Phase 2": "July 30, 2019",
},
]