Reddit Community List using Python


I am trying to scrape Reddit data using Python. The result I get contains information for only a single subreddit, not the complete list.

What I Tried:

import requests
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import pandas as pd

driver = webdriver.Chrome(r"C:\Users\MSZ\Reddit-scrap\chromedriver")
url="https://www.reddit.com/"
driver.get(url)
Communities=[]
#content = driver.page_source
time.sleep(15)

driver.find_element("id", "header-search-bar").send_keys("BTC")
time.sleep(5)
driver.find_element("id", "header-search-bar").send_keys(Keys.ENTER)
time.sleep(5)
community=driver.find_element('xpath', '//*[@id="AppRouter-main-content"]/div/div/div[2]/div/div/div[1]/div/div[1]/a[3]/button')
community.click()
time.sleep(10)
colist=driver.find_elements('xpath', '//*[@id="AppRouter-main-content"]/div/div/div[2]/div/div/div[2]/div/div[2]/div/div/div[1]/div/a/div/div[1]/div/h6')

for comunity in colist:
    #getting all the Communities
    Name=comunity.find_element('xpath', '//*[@id="AppRouter-main-content"]/div/div/div[2]/div/div/div[2]/div/div[2]/div/div/div[1]/div/a/div/div[1]/div/h6')
    Members=comunity.find_element('xpath', '//*[@id="AppRouter-main-content"]/div/div/div[2]/div/div/div[2]/div/div[2]/div/div/div[1]/div/a/div/div[1]/div/p/span')
    Description=comunity.find_element('xpath', '//*[@id="AppRouter-main-content"]/div/div/div[2]/div/div/div[2]/div/div[2]/div/div/div[1]/div/a/div/div[1]/p')

    # Saving community info
    community_info = [Name.text, Members.text, Description.text]

    Communities.append(community_info)
driver.quit()


communitydf = pd.DataFrame(Communities)
communitydf.columns = ['Community', 'Members', 'Description']
communitydf.to_csv('community_details.csv', index=False)

time.sleep(5)

What I Want:

The above code only fetches the first record, but I want all the subreddits returned by the search query. I am new to Python and I think I mixed up the logic.

Any help will be appreciated.

CodePudding user response:

Firstly, you do not wait for all the communities to load; Reddit loads search results lazily, so you need to scroll the page to the end before reading them. Secondly, you search with the same absolute XPath on every iteration of the loop, which will always point to the same single element.
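
For the second point, the fix inside your original loop is to scope each lookup to the current element with a leading "." in the XPath instead of repeating the absolute path. A minimal sketch, assuming the result cards carry the data-testid attributes used in the code further below:

# A '.' prefix makes the XPath relative to each card; an absolute
# '//' path re-matches the first result on every iteration.
for card in driver.find_elements('xpath', '//a[@data-testid="subreddit-link"]'):
    Name = card.find_element('xpath', './/h6').text
    Members = card.find_element('xpath', './/span').text
    Description = card.find_elements('xpath', './/p')[-1].text
    Communities.append([Name, Members, Description])

Putting both fixes together (scrolling first, then parsing the fully loaded page with BeautifulSoup):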

import time
from bs4 import BeautifulSoup
from selenium import webdriver
import pandas as pd


def get_communities(name: str):
    driver = webdriver.Chrome(r"C:\Users\MSZ\Reddit-scrap\chromedriver")
    url = f"https://www.reddit.com/search/?q={name}&type=sr"
    driver.get(url)
    last_height = driver.execute_script("return document.body.scrollHeight")

    # Scroll to the bottom until the page height stops growing, so that
    # every lazily loaded community card is present in the DOM.
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

    # Parse the fully loaded page, then release the browser.
    soup = BeautifulSoup(driver.page_source, 'lxml')
    driver.quit()

    communities = []
    for x in soup.find('div', {'data-testid': 'communities-list'}).find_all('a', {'data-testid': 'subreddit-link'}):
        communities.append({
            'Name': x.find('h6').get_text(),
            'Members': x.find('span').get_text(),
            'Description': x.find_all('p')[-1].get_text()
        })
    return communities


df = pd.DataFrame(get_communities('BTC'))
df.to_csv('community_details.csv', index=False)

But I recommend using the Reddit API instead.
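
For example, with the PRAW wrapper (pip install praw) the same search takes a few lines and is not affected by markup changes. The credentials below are placeholders; you get real ones by registering a "script" app at https://www.reddit.com/prefs/apps:

import praw
import pandas as pd

# Placeholder credentials: create a "script" app on your Reddit
# account to obtain a real client_id and client_secret.
reddit = praw.Reddit(
    client_id="YOUR_CLIENT_ID",
    client_secret="YOUR_CLIENT_SECRET",
    user_agent="community-list-script",
)

# Each result is a Subreddit object with the name, subscriber count
# and description available as attributes.
rows = [
    {
        'Community': sub.display_name,
        'Members': sub.subscribers,
        'Description': sub.public_description,
    }
    for sub in reddit.subreddits.search('BTC')
]
pd.DataFrame(rows).to_csv('community_details.csv', index=False)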
