Site to be web-scraped: https://api.github.com/repos/angular/angular-cli/issues?state=all&per_page=1&page=1
import requests
import bs4

page_number = 1
number = []
state = []
created_at = []
closed_at = []

while page_number < 100:
    base_url = f'https://api.github.com/repos/angular/angular-cli/issues?state=all&per_page=1&page={page_number}'
    result = requests.get(base_url)
    soup = bs4.BeautifulSoup(result.text, 'lxml')
    # drop the leading "[{" of the raw JSON text and split it on commas
    test_string = soup.select('p')[0].getText()[2:]
    my_list = test_string.split(',')
    # pick the fields by the positions they had on the first page
    number.append(my_list[8])
    state.append(my_list[29])
    created_at.append(my_list[35])
    closed_at.append(my_list[37])
    page_number = page_number + 1
My data of interest are number, state, created_at and closed_at. When I print their lists, the results are incorrect, probably because the index of each field changes on other pages and I based the indexing only on the first page.
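A minimal illustration of that indexing problem, using two hypothetical, shortened issue payloads: any variable-length field such as labels shifts the comma-split position of every later field.

issue_a = '{"number": 1, "labels": [], "state": "open"}'
issue_b = '{"number": 2, "labels": ["bug", "triage"], "state": "closed"}'

for text in (issue_a, issue_b):
    parts = text.split(',')
    # "state" lands at a different index in each payload (2 vs. 3)
    print([i for i, p in enumerate(parts) if '"state"' in p])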
CodePudding user response:
No need to parse with BeautifulSoup; the data already comes in a nice JSON format. However, there is a rate limit of 60 requests per hour for the API.
import requests
import pandas as pd

page_number = 1
rows = []
while page_number < 100:
    base_url = f'https://api.github.com/repos/angular/angular-cli/issues?state=all&per_page=1&page={page_number}'
    result = requests.get(base_url).json()
    rows += result  # each response is a list of issue dicts; accumulate them
    print(page_number)
    page_number = page_number + 1

df = pd.DataFrame(rows)
output = df[['number', 'state', 'created_at', 'closed_at']]
print(output)

Output sample:
number state created_at closed_at
0 23719 open 2022-08-10T09:38:18Z None
1 23718 open 2022-08-10T09:20:48Z None
2 23717 open 2022-08-10T08:56:29Z None
3 23716 open 2022-08-10T08:13:01Z None
4 23715 open 2022-08-10T07:08:05Z None
5 23714 open 2022-08-09T23:36:26Z None
6 23713 open 2022-08-09T22:44:22Z None
7 23712 open 2022-08-09T22:11:42Z None
8 23711 open 2022-08-09T20:22:43Z None
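As an aside on that rate limit: the remaining quota can be read from the response headers. A small sketch, assuming the standard X-RateLimit-* headers the GitHub REST API sends back:

import requests

resp = requests.get('https://api.github.com/repos/angular/angular-cli/issues?state=all&per_page=1&page=1')
# GitHub reports the quota in these response headers
print(resp.headers.get('X-RateLimit-Limit'))      # e.g. 60 for unauthenticated calls
print(resp.headers.get('X-RateLimit-Remaining'))  # requests left in the current window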
CodePudding user response:
I changed the URL to return 100 results per page instead of 1 to reduce the number of requests needed. I can now get more data if I set the starting page number to, say, 230. It worked perfectly until I tried to get everything starting from page 1. The error I get is: 'str' object has no attribute 'keys'. That doesn't happen when I don't fetch all the data from page 1, e.g. when I start at page 230. In addition, I believe I was able to increase my rate limit from 60 to 5000 requests per hour with an authentication token.
#page_number = 230
page_number = 1
rows = []
# keep fetching pages until an empty list comes back
while requests.get(f'https://api.github.com/repos/angular/angular-cli/issues?state=all&per_page=100&page={page_number}').json() != []:
    base_url = f'https://api.github.com/repos/angular/angular-cli/issues?state=all&per_page=100&page={page_number}'
    result = requests.get(base_url).json()
    rows += result
    #print(page_number)
    page_number = page_number + 1

df = pd.DataFrame(rows)
df[['number', 'state', 'created_at', 'closed_at']]
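For reference, a minimal sketch of how such a token can be attached to each request and how the loop can guard against the API answering with an error object (a dict with a 'message' key) instead of a list of issues. GITHUB_TOKEN is a hypothetical placeholder, not part of the code above:

import requests
import pandas as pd

GITHUB_TOKEN = '...'  # placeholder: a personal access token raises the hourly limit
headers = {'Authorization': f'token {GITHUB_TOKEN}'}

page_number = 1
rows = []
while True:
    base_url = f'https://api.github.com/repos/angular/angular-cli/issues?state=all&per_page=100&page={page_number}'
    result = requests.get(base_url, headers=headers).json()
    # a rate-limit or auth error comes back as a dict, not as a list of issues
    if not isinstance(result, list) or result == []:
        break
    rows += result
    page_number = page_number + 1

df = pd.DataFrame(rows)
print(df[['number', 'state', 'created_at', 'closed_at']])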