Home > Blockchain >  Trying to scrape content of webpage
Trying to scrape content of webpage

Time:06-12

I want to get the word list from this webpage: https://10fastfingers.com/typing-test/english (the list of words you are supposed to type). I have managed to find the wordlist element in the HTML, but I just can't get its contents. Can somebody help me? Thank you.

Here is my code:

from bs4 import BeautifulSoup
from urllib.request import urlopen , Request
import requests

# Download the typing-test page, look up the word-list container in the
# static HTML, and save the raw page to disk for inspection.
url = "https://10fastfingers.com/typing-test/german"

# Send a browser-like User-Agent instead of urllib's default one, and close
# the HTTP response as soon as the body has been read.
with urlopen(Request(url, headers={'User-Agent': 'Mozilla'})) as response:
    page = response.read()  # raw bytes of the HTML document
# page = requests.get(url).text
soup = BeautifulSoup(page, "lxml")

# find() returns None when nothing matches, so guard before chaining the
# second lookup instead of failing with AttributeError.
match = soup.find("div", class_="row main-layout")
wordlist = match.find("div", {'id': 'wordlist'}) if match is not None else None

# `page` is bytes: write it in binary mode. str(page) would store the
# "b'...'" repr (with escaped newlines) rather than the actual HTML.
with open("page content.txt", "wb") as file:
    file.write(page)
print(wordlist)

CodePudding user response:

You're not going to get the words by requesting the HTML page, because the server doesn't include them in the HTML source. It turns out the page loads the words dynamically by making a POST request after the HTML is loaded.

The POST request is to the url: https://10fastfingers.com/speedtests/get_words and you can see exactly what your own request headers, cookies, and data are if you open the "Network" tab in your browser's developer tools and refresh the page.

Here is some example code I wrote to perform the same POST request in Python, instead of the browser:

# import brotli
import requests
from pprint import pprint

# Replay the POST request the page makes to load its word list, then print
# diagnostics about the response followed by the word list itself.
url = "https://10fastfingers.com/speedtests/get_words"

# Form fields of the POST body, as captured from the browser's request.
# NOTE(review): "speedtest_id" presumably selects the language ("1" returned
# English words in the run shown below) -- confirm against the site.
data = {
    "speedtest_mode": "",
    "speedtest_id": "1",
}

# Session cookies copied from a browser session; take your own values from
# the "Network" tab of the browser's developer tools.
cookies = {
    "CAKEPHP": "af57ps7vtafsa3firr35453gtb",
    "CookieConsent": "{stamp:'ojcUTA3IjOXHgdsGXA9u9toM7iRROW28zzy1o3KlbxZssbrOj3a81Q==',necessary:true,preferences:true,statistics:true,marketing:true,ver:2,utc:1654954695891,region:'us'}",
    "CakeCookie[lang]": "Q2FrZQ==.5exP",
    "CakeCookie[alternate_language_suggestion]": "Q2FrZQ==.9PBdWA==",
}

# Browser headers to mimic the original XHR. Three headers are deliberately
# NOT set by hand:
#   * "accept-encoding": requests only advertises "br" when a Brotli decoder
#     is installed; forcing "gzip, deflate, br" without one yields a
#     compressed, unreadable response body.
#   * "content-length": requests computes it from `data`.
#   * "cookie": requests builds it from the `cookies` dict above.
headers = {
    "accept": "*/*",
    "accept-language": "en-US,en;q=0.9",
    "content-type": "application/x-www-form-urlencoded; charset=UTF-8",
    "dnt": "1",
    "origin": "https://10fastfingers.com",
    "referer": "https://10fastfingers.com/typing-test/english",
    "sec-fetch-dest": "empty",
    "sec-fetch-mode": "cors",
    "sec-fetch-site": "same-origin",
    "sec-gpc": "1",
    "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.115 Safari/537.36",
    "x-requested-with": "XMLHttpRequest",
}

# A timeout keeps the script from hanging forever on an unresponsive server.
response_object = requests.post(
    url, data=data, cookies=cookies, headers=headers, timeout=30
)
print("vvv APPARENT ENCODING vvv")
print(response_object.apparent_encoding)
print()
print("vvv ENCODING vvv")
print(response_object.encoding)
print()
print("vvv HEADERS vvv")
# vars() exposes the header object's internal storage, which is why the
# printed structure is keyed by "_store".
pprint(vars(response_object.headers))
print()
print("vvv RESPONSE TEXT vvv")
print(response_object.text)
print()
# Manual fallback if the body still arrives Brotli-compressed and undecoded
# (requires `import brotli`):
# print("vvv BROTLI DECODED CONTENT vvv")
# print(brotli.decompress(response_object.content))

... and the results from the print statements:

vvv APPARENT ENCODING vvv
ascii

vvv ENCODING vvv
UTF-8

vvv HEADERS vvv
{'_store': OrderedDict([('date', ('Date', 'Sat, 11 Jun 2022 14:40:21 GMT')),
                        ('content-type',
                         ('Content-Type', 'text/html; charset=UTF-8')),
                        ('transfer-encoding', ('Transfer-Encoding', 'chunked')),
                        ('connection', ('Connection', 'keep-alive')),
                        ('vary', ('vary', 'Accept-Encoding')),
                        ('x-cache', ('x-cache', 'Miss from cloudfront')),
                        ('via',
                         ('via',
                          '1.1 af9b5a8e96971e0d2d7c6fed1b8873b2.cloudfront.net '
                          '(CloudFront)')),
                        ('x-amz-cf-pop', ('x-amz-cf-pop', 'MIA3-P4')),
                        ('x-amz-cf-id',
                         ('x-amz-cf-id',
                          'sMSHv1sVN869z-CZrV2R9q46zGssFwR_tk2vjc7_8Wmcy9zBCuryIg==')),
                        ('cf-cache-status', ('CF-Cache-Status', 'DYNAMIC')),
                        ('expect-ct',
                         ('Expect-CT',
                          'max-age=604800, '
                          'report-uri="https://report-uri.cloudflare.com/cdn-cgi/beacon/expect-ct"')),
                        ('report-to',
                         ('Report-To',
                          '{"endpoints":[{"url":"https:\\/\\/a.nel.cloudflare.com\\/report\\/v3?s=GU1vlFpOWgC+Qg9aQZKz2TJGr6bSAoaytdXlGkSdINz6YsKN/nbJg9OkQx1HzzKhzCRHLaGZOaVIaLLQr00u6Z2m3sJnXGOzIbyu1uFhk7SkDtlbxlHBMFNhS++De/R1f0Mo"}],"group":"cf-nel","max_age":604800}')),
                        ('nel',
                         ('NEL',
                          '{"success_fraction":0,"report_to":"cf-nel","max_age":604800}')),
                        ('server', ('Server', 'cloudflare')),
                        ('cf-ray', ('CF-RAY', '719b19f31ad6220f-MIA')),
                        ('content-encoding', ('Content-Encoding', 'br'))])}

vvv RESPONSE TEXT vvv
and|put|ask|own|the|much|come|work|grow|some|along|other|near|letter|we|soon|play|new|always|few|until|their|once|water|there|home|both|story|saw|enough|tree|not|well|tell|you|sentence|need|add|below|all|family|quick|so|our|who|or|which|around|letter|hear|where|then|she|paper|our|back|miss|it|mean|too|add|young|mother|on|almost|to|now|had|four|men|never|got|end|year|two|in|move|state|river|because|even|line|down|while|did|has|house|do|stop|about|large|at|paper|above|it's|help|not|between|no|question|turn|watch|next|only|as|earth|school|family|place|and|animal|any|change|house|here|change|begin|point|would|and|until|about|run|song|up|never|leave|walk|often|much|tell|go|food|are|again|example|quite|is|man|away|feet|of|some|name|few|always|start|ask|learn|small|be|many|first|hand|far|great|idea|no|good|why|you|side|children|night|know|hand|name|see|those|thought|as|how|picture|does|country|almost|new|time|than|grow|way|with|always|children|do|made|eye|below|quickly|know|before|sound|year|later|after|open|Indian|never|every|hear|sometimes|your|got|on|sound|first|run|book|only|paper|even|want|for|each|are|him|they|stop|may|three|me|where|about|is|go|ask|took|small|in|said|far|set|does|just|went|came|different|other|hard|us|line|such|what|soon|really|came|run|when|watch|why|this|not|but|does|live|mean|white|miss|but|then|which|tree|second|being|quick|something|find|big|together|his|up|get|home|over|school|must|idea|light|he|river|those|more|often|are|page|don't|last|day|through|over|work|three|still|people|tree|enough|near|often|try|side|around|near|be|was|let|these|oil|around|year|end|know|got|country|give|America|America|world|these|cut|learn|them|add|might|hear|being|of|they|change|boy|night|girl|it|high|such|said|say|eye|too|Indian|may|our|she|sentence|should|her|like|more|day|car|live|important|found|kind|boy|find|how|far|side|keep|start|if|who|mean|hard|in|both|close|them|will|land|book|page|have|really|kind|show|use|away|again|little|give|four|back|he|said|most|wri

Please note that in the run shown above the response body was Brotli-compressed (Content-Encoding: br), which is separate from its character encoding (UTF-8). requests only decompresses Brotli automatically when a Brotli library (brotli or brotlicffi) is installed. Earlier, when running the script, the body was not decoded for me, so I was receiving gibberish that I had to decompress with brotli.decompress(response_object.content) to get the actual words...

CodePudding user response:

This answer is modified from the awesome answer by @hmomin, except this answer gets the words in German instead of English as you requested:

import requests

# Replay the POST request the page makes to load its word list and print
# the returned words.
url = "https://10fastfingers.com/speedtests/get_words"

# Form fields of the POST body. "speedtest_id" is "2" here to request the
# German word list.
data = {
    "speedtest_mode": "",
    "speedtest_id": "2",
}

# Session cookies copied from a browser session; take your own values from
# the "Network" tab of the browser's developer tools.
cookies = {
    "CAKEPHP": "c79f210mdl6n2jpp962g9vjk9f",
    "CookieConsent": "{stamp:'6CPboWb OVPYxb8oz6CfbjxqXJzoYqXEDiPE62FxLheS24AUkkmksA==',necessary:true,preferences:true,statistics:true,marketing:true,ver:2,utc:1654956041620,region:'us'}",
    "CakeCookie[lang]": "Q2FrZQ==.5exP",
    "CakeCookie[alternate_language_suggestion]": "Q2FrZQ==.9PBdWA==",
}

# Browser headers to mimic the original XHR. Three headers are deliberately
# NOT set by hand:
#   * "accept-encoding": requests only advertises "br" when a Brotli decoder
#     is installed; forcing "gzip, deflate, br" without one yields a
#     compressed, unreadable response body.
#   * "content-length": requests computes it from `data`.
#   * "cookie": requests builds it from the `cookies` dict above.
headers = {
    "accept": "*/*",
    "accept-language": "en-US,en;q=0.9",
    "cache-control": "no-cache",
    "content-type": "application/x-www-form-urlencoded; charset=UTF-8",
    "dnt": "1",
    "origin": "https://10fastfingers.com",
    "pragma": "no-cache",
    "referer": "https://10fastfingers.com/typing-test/german",
    "sec-fetch-dest": "empty",
    "sec-fetch-mode": "cors",
    "sec-fetch-site": "same-origin",
    "sec-gpc": "1",
    "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.99 Safari/537.36",
    "x-requested-with": "XMLHttpRequest",
}

# A timeout keeps the script from hanging forever on an unresponsive server.
response_object = requests.post(
    url, data=data, cookies=cookies, headers=headers, timeout=30
)
words = response_object.text
print(words)

CodePudding user response:

The website uses AJAX to render the content. You can get the content with the request below; parsing it is up to you :)

import  requests
# Browser-like headers copied from the page's XHR request.
headers = {
    'authority': '10fastfingers.com',
    'accept': '*/*',
    'accept-language': 'en,en-US;q=0.9,ar;q=0.8',
    # 'content-length': '0',
    # Requests sorts cookies= alphabetically
    # 'cookie': 'CAKEPHP=frcdkr9lui4ceuktt7imo0c7f0; _ga=GA1.2.365908205.1654969329; _gid=GA1.2.518644676.1654969329; CookieConsent={stamp:'XCkdg91rCyjPGD5k/I6ib24LWrg88uqVGZ17NwZ4unZU1JRQIxBfmw==',necessary:true,preferences:true,statistics:true,marketing:true,ver:2,utc:1654969345495,region:'ma'}; CakeCookie[lang]=Q2FrZQ==.5exP; CakeCookie[alternate_language_suggestion]=Q2FrZQ==.9PBdWA==; _gat=1',
    'dnt': '1',
    'origin': 'https://10fastfingers.com',
    'referer': 'https://10fastfingers.com/typing-test/english',
    'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="102", "Google Chrome";v="102"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'sec-fetch-dest': 'empty',
    'sec-fetch-mode': 'cors',
    'sec-fetch-site': 'same-origin',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36',
    'x-requested-with': 'XMLHttpRequest',
}

# Endpoints of the site's AJAX-rendered high-score widgets. Only the first
# one is requested below; the other two are listed as alternatives to swap in.
Top_Ranking_url = 'https://10fastfingers.com/speedtests/render_highscore_get_top_ranking/1/1'
Tests_taken_url = "https://10fastfingers.com/speedtests/render_highscore_get_tests_taken"
Global_Challenge_url = "https://10fastfingers.com/speedtests/render_highscore_get_global_challenge"

# A timeout keeps the script from hanging forever on an unresponsive server.
response = requests.post(Top_Ranking_url, headers=headers, timeout=30)
# Prints the raw response body (bytes); parsing it is left to the reader.
print(response.content)
  • Related