I have an issue: I would like to scrape all French stocks (10 pages) from this website: https://www.zonebourse.com/bourse/actions/Europe-3/France-51
But when I go to the 2nd page, the end of the URL is random. For example:
once it's zonebourse.com/bourse/actions/Europe-3/France-51/_BF3L50/
then another time it's zonebourse.com/bourse/actions/Europe-3/France-51/_4FXnaa/
etc.
It's the same issue for the other pages too.
There is no pattern! How can I handle this?
Here is my code, written as if the URLs were:
zonebourse.com/bourse/actions/Europe-3/France-51/1
zonebourse.com/bourse/actions/Europe-3/France-51/2
...
import requests
from bs4 import BeautifulSoup

links = []
for i in range(10):
    url = 'https://www.zonebourse.com/bourse/actions/Europe-3/France-51/' + str(i)
    response = requests.get(url)
    if response.ok:
        soup = BeautifulSoup(response.text, 'lxml')
        tds = soup.find_all('td')
        for td in tds:
            a = td.find('a')
            if a is not None:
                link = a['href']
                if link.startswith('/cours/action/'):
                    links.append('https://www.zonebourse.com' + link)
print(links)
Thanks!
CodePudding user response:
The table on those pages is loaded via a POST request to an internal endpoint (moteurs_results.php) that takes the page number in its payload, so you can ignore the random URL suffixes and query that endpoint directly, varying only the "page" field. Try:
import json
import requests
import pandas as pd
from bs4 import BeautifulSoup
# Payload sent to the screener endpoint; "page" is the only field
# that needs to change between requests.
json_data = {
    "TRBC": 0,
    "TRBC_chain": [""],
    "aSectors": [[], [], [], [], []],
    "aLists": [[], [], [], []],
    "markets": [
        40, 43, 44, 45, 46, 47, 48, 49, 50, 51, 53, 54, 55, 56, 57, 58,
        59, 60, 61, 62, 63, 64, 67, 69, 70, 71, 72, 73, 74, 75, 76, 77,
        78, 79, 80, 81, 82,
    ],
    "capi_min": 2,
    "capi_max": 10,
    "liqu_min": 0,
    "liqu_max": 10,
    "tri": [0, 1, 2, 3, 4, 5],
    "ord": ["N", "N", "N", "D", "N", "N"],
    "special_option_news": "",
    "special_option_date": "",
    "special_dynamic": "",
    "special_partner": "",
    "result_mode": 7,
    "crit": [],
    "page": 1,
    "sMode": "AF2",
}

api_url = "https://www.zonebourse.com/outils/mods_a/moteurs_results.php?ResultMode=7&model=3"

all_data = []
# The loop target json_data["page"] writes each page number straight
# into the payload; widen the range (e.g. range(1, 11)) for all 10 pages.
for json_data["page"] in range(1, 3):
    data = {
        "Req": json.dumps(json_data),
        "bJSON": "true",
        "scrollMode": "false",
    }
    soup = BeautifulSoup(
        requests.post(api_url, data=data).content, "html.parser"
    )
    # Skip the header row, then flatten each row's cells to text.
    for row in soup.table.select("tr")[1:]:
        tds = [td.get_text(strip=True, separator=" ") for td in row]
        all_data.append(tds)

# The colN names are placeholders for the unlabelled columns.
headers = ["col0", "Société", "Cours", "col3", "Capitalisation (M$)",
           "Varia. 1janv", "Secteur", "col7"]

print(pd.DataFrame(all_data, columns=headers).head().to_markdown())
Prints:
|    | col0 | Société                                | Cours  | col3 | Capitalisation (M$) | Varia. 1janv | Secteur                               | col7 |
|---:|:-----|:---------------------------------------|-------:|:-----|:--------------------|:-------------|:--------------------------------------|:-----|
|  0 |      | LVMH MOËT HENNESSY LOUIS VUITTON.. EUR |  647.5 |      | 332 839             | -8.97%       | Habillement et accessoires - Autres   |      |
|  1 |      | NESTLÉ S.A. CHF                        | 114.58 |      | 326 734             | -9.64%       | Industrie agroalimentaire - Autres    |      |
|  2 |      | ROCHE HOLDING AG CHF                   |  315.4 |      | 266 801             | -16.54%      | Pharmacies - Autres                   |      |
|  3 |      | NOVO NORDISK A/S DKK                   |  792.5 |      | 242 615             | 8.12%        | Pharmacies - Autres                   |      |
|  4 |      | SHELL PLC GBX                          |   2290 |      | 198 550             | 44.25%       | Pétrole et gaz - compagnies intégrées |      |
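If what you actually need is the list of /cours/action/ links (as in your original code) rather than the table text, you can pull the hrefs out of the same responses. A minimal sketch, reusing json_data and api_url from the snippet above, and assuming the rows returned by the endpoint carry the same relative /cours/action/ hrefs as the regular pages:

links = []
for json_data["page"] in range(1, 11):  # all 10 pages
    data = {"Req": json.dumps(json_data), "bJSON": "true", "scrollMode": "false"}
    soup = BeautifulSoup(requests.post(api_url, data=data).content, "html.parser")
    # the [href^=...] CSS selector matches anchors whose href starts with that prefix
    for a in soup.select('a[href^="/cours/action/"]'):
        links.append("https://www.zonebourse.com" + a["href"])
print(links)

If the same stock can show up on more than one page, wrap the result in set() to deduplicate.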