Scrap Data Project Python-CodePudding

Following code should go on the website "https://www.destatis.de/DE/Themen/Gesellschaft-Umwelt/Bevoelkerung/Geburten/Tabellen/lebendgeborene-vorl.html" and get the data of that table there. The x axis data is in the column "Monate" and the y axis values are in "Geborene Kinder". I also put the xpath of the columns, where the data should be scrapped. But it doesn't work.

import tkinter as tk
from tkinter import ttk
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
import json
import matplotlib.pyplot as plt
import os
from time import sleep as wait

options = Options()
options.binary_location = r'C:\Program Files\Mozilla Firefox\firefox.exe'


class DataManager:
    def __init__(self):
        # Create the main window
        self.root = tk.Tk()
        self.root.title("Data Manager")

        # Create the grid layout
        self.mainframe = ttk.Frame(self.root, padding="3 3 12 12")
        self.mainframe.grid(column=0, row=0, sticky=(tk.N, tk.W, tk.E, tk.S))
        self.mainframe.columnconfigure(0, weight=1)
        self.mainframe.rowconfigure(0, weight=1)

        # Create the "Get Data" button
        self.get_data_button = ttk.Button(self.mainframe, text="Get Data", command=self.get_data)
        self.get_data_button.grid(column=1, row=1)

        # Create the "Load Data" button
        self.load_data_button = ttk.Button(self.mainframe, text="Load Data", command=self.load_data)
        self.load_data_button.grid(column=2, row=1)

        # Create the combo box for selecting plot type
        self.plot_type = tk.StringVar()
        self.plot_type_combo = ttk.Combobox(self.mainframe, textvariable=self.plot_type)
        self.plot_type_combo['values'] = ('Scatter', 'Line')
        self.plot_type_combo.grid(column=3, row=1)

        # Create the "Generate" button
        self.generate_button = ttk.Button(self.mainframe, text="Generate", command=self.generate_plot)
        self.generate_button.grid(column=4, row=1)

        # Create the "Quit" button
        self.quit_button = ttk.Button(self.mainframe, text="Quit", command=self.root.destroy)
        self.quit_button.grid(column=5, row=1)

    def get_data(self):
        """
        Scrapes data from the website and saves it as a json file
        """
        # Open the website using selenium webdriver
        driver = webdriver.Firefox(options=options)
        driver.get("https://www.destatis.de/DE/Themen/Gesellschaft-Umwelt/Bevoelkerung/Geburten/Tabellen/lebendgeborene-vorl.html")

        # wait 5 seconds
        wait(5)

        # Close "Hinweis zum Datenschutz"
        datenschutz = driver.find_element_by_xpath("/html/body/div[3]/div/div[1]/div/div/div/div/div[2]/div/p/button")
        datenschutz.click()

        # Wait 5 Seconds
        wait(5)

        # Scrape the data using xpath
        x_axis_data = driver.find_elements_by_xpath("/html/body/div[2]/div/div/main/div/div[3]/div/table/thead/tr/th[2]")
        y_axis_data = driver.find_elements_by_xpath("/html/body/div[2]/div/div/main/div/div[3]/div/table/thead/tr/th[3]")

        # Convert the data to lists
        x_axis_data = [x.text for x in x_axis_data]
        y_axis_data = [y.text for y in y_axis_data]

        # Create a dictionary with the data
        data = {"Month": x_axis_data, "Births": y_axis_data}

        # Create the data directory if it does not exist
        if not os.path.exists("data"):
            os.mkdir("data")

        # Save the data to a json file
        with open("data/scrapped_data.json", "w") as f:
            json.dump(data, f)

        # Close the webdriver
        driver.close()

    def load_data(self):
        """
        Loads the data from the json file and changes the column names
        """
        # Load the data from the json file
        with open("data/scrapped_data.json", "r") as f:
            data = json.load(f)

        # Change the column names
        data["Month"] = data.pop("Monate")
        data["Births"] = data.pop("Geborene Kinder")

        # Save the data to the class variable
        self.data = data

    def generate_plot(self):
        """
        Generates a plot based on the selected plot type and the loaded data
        """
        # Get the selected plot type
        plot_type = self.plot_type.get()

        # Check if data is loaded
        if not hasattr(self, "data"):
            print("Please load the data first")
            return

        # Check the plot type and generate the plot
        if plot_type == "Scatter":
            plt.scatter(self.data["Month"], self.data["Births"])
        elif plot_type == "Line":
            plt.plot(self.data["Month"], self.data["Births"])

        # Show the plot
        plt.show()


data_manager = DataManager()
tk.mainloop()

I tried to get the data with the Xpath, but it didn't help. I expect, that the code can scrap the data off the columns, save it to a json file and then plot a graph.

CodePudding user response：

Those xpaths for x_axis_data and y_axis_data select just the header titles of the two columns, that's why you don't get the row values.

Looking at the html of the table, we notice that the cells containing the months have a unique class Vorspalte, so we can select them with the css selector td[class=Vorspalte]. The cells containing the values are simply the siblings of the month cells, so we can select them with td[class=Vorspalte] td.

x_axis_data = driver.find_elements(By.CSS_SELECTOR, 'td[class=Vorspalte]')
y_axis_data = driver.find_elements(By.CSS_SELECTOR, 'td[class=Vorspalte] td')

And the output is

>>> x_axis_data = [x.text for x in x_axis_data]
>>> x_axis_data
['Januar',
 'Februar',
 'März',
 'April',
 'Mai',
 'Juni',
 'Juli',
 'August',
 'September',
 'Oktober']

CodePudding user response：

I now changed the code and now it is working:

        for i in range(1, 11):
        if i == 1:
            x_path = f'/html/body/div[2]/div/div/main/div/div[3]/div/table/tbody/tr[{i}]/td[2]'
            y_path = f'/html/body/div[2]/div/div/main/div/div[3]/div/table/tbody/tr[{i}]/td[3]'
            x_axis.append(driver.find_element_by_xpath(x_path).text)
            y_axis.append(driver.find_element_by_xpath(y_path).text)
        else:
            x_path = f'/html/body/div[2]/div/div/main/div/div[3]/div/table/tbody/tr[{i}]/td[1]'
            y_path = f'/html/body/div[2]/div/div/main/div/div[3]/div/table/tbody/tr[{i}]/td[2]'
            x_axis.append(driver.find_element_by_xpath(x_path).text)
            y_axis.append(driver.find_element_by_xpath(y_path).text)