I am trying to scrape Reddit data using Python. The result I get contains information for only a single subreddit, not for the complete list of subreddits.
What I Tried:
import requests
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import pandas as pd
# Search Reddit for "BTC", switch to the communities tab, collect the name,
# member count and description of every listed community, and write them to
# community_details.csv.

# XPath of the <a> card that wraps one community in the results list. The
# per-field XPaths below are RELATIVE to this card (they start with "./").
# An XPath starting with "//" is evaluated against the whole document even
# when called on an element, so it would return the first community every time.
CARD_XPATH = (
    '//*[@id="AppRouter-main-content"]/div/div/div[2]/div/div/div[2]'
    '/div/div[2]/div/div/div[1]/div/a'
)
NAME_XPATH = './div/div[1]/div/h6'
MEMBERS_XPATH = './div/div[1]/div/p/span'
DESCRIPTION_XPATH = './div/div[1]/p'


def _first_text(element, xpath):
    """Return the text of the first node matching *xpath* under *element*, or ''."""
    matches = element.find_elements('xpath', xpath)
    return matches[0].text if matches else ''


Communities = []
driver = webdriver.Chrome(r"C:\Users\MSZ\Reddit-scrap\chromedriver")
try:
    url = "https://www.reddit.com/"
    driver.get(url)
    time.sleep(15)

    # Type the query, then submit it with ENTER.
    driver.find_element("id", "header-search-bar").send_keys("BTC")
    time.sleep(5)
    driver.find_element("id", "header-search-bar").send_keys(Keys.ENTER)
    time.sleep(5)

    # Switch the search results to the communities tab.
    community = driver.find_element('xpath', '//*[@id="AppRouter-main-content"]/div/div/div[2]/div/div/div[1]/div/div[1]/a[3]/button')
    community.click()
    time.sleep(10)

    for card in driver.find_elements('xpath', CARD_XPATH):
        name = _first_text(card, NAME_XPATH)
        if not name:
            # Not a community card (no heading) -- nothing to record.
            continue
        Communities.append([
            name,
            _first_text(card, MEMBERS_XPATH),
            _first_text(card, DESCRIPTION_XPATH),
        ])
finally:
    # Always close the browser, even if a lookup above raised.
    driver.quit()

# Passing the column names to the constructor also works when nothing was
# scraped; assigning .columns afterwards raises on an empty frame.
communitydf = pd.DataFrame(Communities, columns=['Community', 'Members', 'Description'])
communitydf.to_csv('community_details.csv', index=False)
What I Want:
The above code only fetches the first record, but I want to retrieve all the subreddits returned by the search query. I am new to Python, and I think I have mixed up the logic.
Any help will be appreciated.
CodePudding user response:
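Firstly, you do not wait for all the communities to load; for this you need to scroll the page to the end. Secondly, inside the loop you search for the same absolute XPath every time, which will always point to the same specific element (the first match in the document) instead of the element belonging to the current community.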
import time
from bs4 import BeautifulSoup
from selenium import webdriver
import pandas as pd
def ger_communities(name: str):
    """Return the communities Reddit lists for the search term *name*.

    Opens the community search results page in Chrome, scrolls until the
    page height stops growing so that lazily loaded results are present,
    and parses the final page source with BeautifulSoup.

    Args:
        name: Search term; it is URL-encoded before being placed in the query.

    Returns:
        A list of dicts with the keys 'Name', 'Members' and 'Description',
        one per community link found. An empty list is returned when the
        page contains no communities container.
    """
    from urllib.parse import quote_plus

    driver = webdriver.Chrome(r"C:\Users\MSZ\Reddit-scrap\chromedriver")
    try:
        url = f"https://www.reddit.com/search/?q={quote_plus(name)}&type=sr"
        driver.get(url)

        # Keep scrolling to the bottom until the document height is stable,
        # i.e. no further results were appended after the last scroll.
        last_height = driver.execute_script("return document.body.scrollHeight")
        while True:
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(2)
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height

        page_source = driver.page_source
    finally:
        # Release the browser even if navigation or scrolling failed.
        driver.quit()

    soup = BeautifulSoup(page_source, 'lxml')
    container = soup.find('div', {'data-testid': 'communities-list'})
    if container is None:
        # No results container on the page (e.g. the search matched nothing).
        return []

    return [
        {
            'Name': link.find('h6').get_text(),
            'Members': link.find('span').get_text(),
            # The description is the last <p> inside the community link.
            'Description': link.find_all('p')[-1].get_text(),
        }
        for link in container.find_all('a', {'data-testid': 'subreddit-link'})
    ]
# Collect the communities matching "BTC" and store them as a CSV file
# without the DataFrame index column.
records = ger_communities('BTC')
df = pd.DataFrame(records)
df.to_csv('community_details.csv', index=False)
However, I recommend using the Reddit API instead of scraping the page.