Home > Software engineering >  I need help to fix scraping graphql API
I need help to fix scraping graphql API

Time:04-06

Using the Network tab of the Chrome DevTools, I was able to copy the GraphQL request ("Copy as cURL (bash)"), paste it into Insomnia, and turn it into a working Python request. Now something has changed on the provider's side: I can no longer even run the cURL command in Insomnia — I only get a 400 response. My previous code now produces an error message that I cannot resolve myself. I would be very grateful for a working solution.

My code that worked so far is:

import requests
import json

def scrape_digitec(search='lg', offset=0):
    """POST the ``ENTER_SEARCH`` GraphQL query to digitec.ch and print the reply.

    The request body is a hand-written JSON template into which the search
    term and the paging offset are spliced. The response object, the
    pretty-printed JSON body and the compact JSON body are printed to
    stdout, in that order.

    Args:
        search: Search term sent as the ``query`` GraphQL variable.
        offset: Paging offset sent as the ``offset`` GraphQL variable; an
            int or a numeric string such as ``'0'``.

    Raises:
        ValueError: If ``offset`` cannot be converted with ``int()``.
            ``response.json()`` also raises if the body is not valid JSON.
    """
    url = "https://www.digitec.ch/api/graphql"
    headers = {
        "authority": "www.digitec.ch",
        "accept": "application/json",
        "accept-language": "de-CH",
        "cache-control": "no-cache",
        "content-type": "application/json",
        "origin": "https://www.digitec.ch",
        "pragma": "no-cache",
        "referer": "https://www.digitec.ch/search?q=bang olufsen",
        "sec-ch-ua": '"Not A;Brand";v="99", "Chromium";v="100", "Google Chrome";v="100"',
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": '"Windows"',
        "sec-fetch-dest": "empty",
        "sec-fetch-mode": "cors",
        "sec-fetch-site": "same-origin",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36",
        "x-dg-country": "ch",
        "x-dg-mandator": "406802",
        "x-dg-portal": "25",
        "x-dg-testgroup": "Default"
    }

    # The template is one JSON document. Adjacent string literals inside the
    # parentheses are concatenated by the parser, so the text can span
    # several source lines without embedding line breaks in the payload.
    # The two variables are serialised with json.dumps so that quotes or
    # backslashes in the search term cannot break the JSON; its default
    # ASCII-only output also keeps the body safe to send as a plain str.
    payload = (
        '{"query":"query ENTER_SEARCH(\\t$query: String!\\t$sortOrder: ProductSort\\t$limit: Int = 9\\t$offset: Int = 0\\t$filters: [SearchFilter]\\t$include: [String!]\\t$exclude: [String!]\\t$searchQueryId: String\\t$siteId: String) {\\tsearch(\\t\\tquery: $query\\t\\tfilters: $filters\\t\\tsearchQueryId: $searchQueryId\\t\\tsiteId: $siteId\\t) {\\t\\tproducts(limit: $limit, offset: $offset, sortOrder: $sortOrder) {\\t\\t\\ttotal\\t\\t\\thasMore\\t\\t\\tnextOffset\\t\\t\\tresults {\\t\\t\\t\\t...ProductSearchResult\\t\\t\\t\\t__typename\\t\\t\\t}\\t\\t\\t__typename\\t\\t}\\t\\tfilters(include: $include, exclude: $exclude) {\\t\\t\\tproduct {\\t\\t\\t\\tidentifier\\t\\t\\t\\tname\\t\\t\\t\\tfilterType\\t\\t\\t\\tscore\\t\\t\\t\\ttooltip {\\t\\t\\t\\t\\t...FilterTooltipResult\\t\\t\\t\\t\\t__typename\\t\\t\\t\\t}\\t\\t\\t\\t...CheckboxSearchFilterResult\\t\\t\\t\\t...RangeSearchFilterResult\\t\\t\\t\\t__typename\\t\\t\\t}\\t\\t\\t__typename\\t\\t}\\t\\tmagazinePages(limit: 3) {\\t\\t\\tids {\\t\\t\\t\\tid\\t\\t\\t\\tscore\\t\\t\\t\\t__typename\\t\\t\\t}\\t\\t\\ttotal\\t\\t\\t__typename\\t\\t}\\t\\tauthors(limit: 3) {\\t\\t\\tids {\\t\\t\\t\\tid\\t\\t\\t\\tscore\\t\\t\\t\\t__typename\\t\\t\\t}\\t\\t\\ttotal\\t\\t\\t__typename\\t\\t}\\t\\tdiscussions(limit: 3) {\\t\\t\\tids {\\t\\t\\t\\tid\\t\\t\\t\\tscore\\t\\t\\t\\t__typename\\t\\t\\t}\\t\\t\\ttotal\\t\\t\\t__typename\\t\\t}\\t\\tquestions(limit: 3) {\\t\\t\\tids {\\t\\t\\t\\tid\\t\\t\\t\\tscore\\t\\t\\t\\t__typename\\t\\t\\t}\\t\\t\\ttotal\\t\\t\\t__typename\\t\\t}\\t\\tratings(limit: 3) {\\t\\t\\tids {\\t\\t\\t\\tid\\t\\t\\t\\tscore\\t\\t\\t\\t__typename\\t\\t\\t}\\t\\t\\ttotal\\t\\t\\t__typename\\t\\t}\\t\\tproductTypes(limit: 24) {\\t\\t\\ttotal\\t\\t\\tresults '
        '{\\t\\t\\t\\tid\\t\\t\\t\\tname\\t\\t\\t\\tprimarySynonyms\\t\\t\\t\\tisVisible\\t\\t\\t\\tdescription\\t\\t\\t\\tmetaDescription\\t\\t\\t\\timageUrl\\t\\t\\t\\tsearchScore\\t\\t\\t\\t__typename\\t\\t\\t}\\t\\t\\t__typename\\t\\t}\\t\\tbrands(limit: 24) {\\t\\t\\ttotal\\t\\t\\tresults {\\t\\t\\t\\tid\\t\\t\\t\\ttitle\\t\\t\\t\\tsearchScore\\t\\t\\t\\t__typename\\t\\t\\t}\\t\\t\\t__typename\\t\\t}\\t\\thelp(limit: 3) {\\t\\t\\tids {\\t\\t\\t\\tid\\t\\t\\t\\tscore\\t\\t\\t\\t__typename\\t\\t\\t}\\t\\t\\ttotal\\t\\t\\thasMore\\t\\t\\tresults {\\t\\t\\t\\tsearchScore\\t\\t\\t\\ttitle\\t\\t\\t\\tid\\t\\t\\t\\turl\\t\\t\\t\\t__typename\\t\\t\\t}\\t\\t\\t__typename\\t\\t}\\t\\t_meta {\\t\\t\\tqueryInfo {\\t\\t\\t\\tcorrectedQuery\\t\\t\\t\\tdidYouMeanQuery\\t\\t\\t\\tlastProductSearchPass\\t\\t\\t\\texecutedSearchTerm\\t\\t\\t\\ttestGroup\\t\\t\\t\\tisManagedQuery\\t\\t\\t\\tisRerankedQuery\\t\\t\\t\\t__typename\\t\\t\\t}\\t\\t\\tredirectionUrl\\t\\t\\tportalReferral {\\t\\t\\t\\tproductCount\\t\\t\\t\\tportalName\\t\\t\\t\\turl\\t\\t\\t\\tproductImageUrls\\t\\t\\t\\t__typename\\t\\t\\t}\\t\\t\\t__typename\\t\\t}\\t\\t__typename\\t}}fragment ProductSearchResult on ProductSearchResultItem {\\tsearchScore\\tmandatorSpecificData {\\t\\t...ProductMandatorSpecific\\t\\t__typename\\t}\\tproduct {\\t\\t...ProductMandatorIndependent\\t\\t__typename\\t}\\toffer {\\t\\t...ProductOffer\\t\\t__typename\\t}\\t__typename}fragment FilterTooltipResult on FilterTooltip {\\ttext\\tmoreInformationLink\\t__typename}fragment CheckboxSearchFilterResult on CheckboxSearchFilter {\\toptions {\\t\\tidentifier\\t\\tname\\t\\tproductCount\\t\\tscore\\t\\treferenceValue {\\t\\t\\tvalue\\t\\t\\tunit {\\t\\t\\t\\tabbreviation\\t\\t\\t\\t__typename\\t\\t\\t}\\t\\t\\t__typename\\t\\t}\\t\\tpreferredValue {\\t\\t\\tvalue\\t\\t\\tunit {\\t\\t\\t\\tabbreviation\\t\\t\\t\\t__typename\\t\\t\\t}\\t\\t\\t__typename\\t\\t}\\t\\ttooltip '
        '{\\t\\t\\t...FilterTooltipResult\\t\\t\\t__typename\\t\\t}\\t\\t__typename\\t}\\t__typename}fragment RangeSearchFilterResult on RangeSearchFilter {\\treferenceMin\\tpreferredMin\\treferenceMax\\tpreferredMax\\treferenceStepSize\\tpreferredStepSize\\trangeMergeInfo {\\t\\tisBottomMerged\\t\\tisTopMerged\\t\\t__typename\\t}\\treferenceUnit {\\t\\tabbreviation\\t\\t__typename\\t}\\tpreferredUnit {\\t\\tabbreviation\\t\\t__typename\\t}\\trangeFilterDataPoint {\\t\\t...RangeFilterDataPointResult\\t\\t__typename\\t}\\t__typename}fragment ProductMandatorSpecific on MandatorSpecificData {\\tisBestseller\\tisDeleted\\tshowroomSites\\tsectorIds\\t__typename}fragment ProductMandatorIndependent on ProductV2 {\\tid\\tproductId\\tname\\tnameProperties\\tproductTypeId\\tproductTypeName\\tbrandId\\tbrandName\\taverageRating\\ttotalRatings\\ttotalQuestions\\tisProductSet\\timages {\\t\\turl\\t\\theight\\t\\twidth\\t\\t__typename\\t}\\tenergyEfficiency {\\t\\tenergyEfficiencyColorType\\t\\tenergyEfficiencyLabelText\\t\\tenergyEfficiencyLabelSigns\\t\\tenergyEfficiencyImage {\\t\\t\\turl\\t\\t\\theight\\t\\t\\twidth\\t\\t\\t__typename\\t\\t}\\t\\t__typename\\t}\\tseo {\\t\\tseoProductTypeName\\t\\tseoNameProperties\\t\\tproductGroups {\\t\\t\\tproductGroup1\\t\\t\\tproductGroup2\\t\\t\\tproductGroup3\\t\\t\\tproductGroup4\\t\\t\\t__typename\\t\\t}\\t\\tgtin\\t\\t__typename\\t}\\thasVariants\\tsmallDimensions\\tbasePrice {\\t\\tpriceFactor\\t\\tvalue\\t\\t__typename\\t}\\t__typename}fragment ProductOffer on OfferV2 {\\tid\\tproductId\\tofferId\\tshopOfferId\\tprice {\\t\\tamountIncl\\t\\tamountExcl\\t\\tcurrency\\t\\tfraction\\t\\t__typename\\t}\\tdeliveryOptions {\\t\\tmail {\\t\\t\\tclassification\\t\\t\\tfutureReleaseDate\\t\\t\\t__typename\\t\\t}\\t\\tpickup {\\t\\t\\tsiteId\\t\\t\\tclassification\\t\\t\\tfutureReleaseDate\\t\\t\\t__typename\\t\\t}\\t\\tdetailsProvider '
        '{\\t\\t\\tproductId\\t\\t\\tofferId\\t\\t\\tquantity\\t\\t\\ttype\\t\\t\\t__typename\\t\\t}\\t\\t__typename\\t}\\tlabel\\ttype\\tvolumeDiscountPrices {\\t\\tminAmount\\t\\tprice {\\t\\t\\tamountIncl\\t\\t\\tamountExcl\\t\\t\\tcurrency\\t\\t\\t__typename\\t\\t}\\t\\tisDefault\\t\\t__typename\\t}\\tsalesInformation {\\t\\tnumberOfItems\\t\\tnumberOfItemsSold\\t\\tisEndingSoon\\t\\tvalidFrom\\t\\t__typename\\t}\\tincentiveText\\tisIncentiveCashback\\tisNew\\tisSalesPromotion\\thideInProductDiscovery\\tcanAddToBasket\\thidePrice\\tinsteadOfPrice {\\t\\ttype\\t\\tprice {\\t\\t\\tamountIncl\\t\\t\\tamountExcl\\t\\t\\tcurrency\\t\\t\\tfraction\\t\\t\\t__typename\\t\\t}\\t\\t__typename\\t}\\tminOrderQuantity\\t__typename}fragment RangeFilterDataPointResult on RangeFilterDataPoint {\\tcount\\treferenceValue {\\t\\tvalue\\t\\tunit {\\t\\t\\tabbreviation\\t\\t\\t__typename\\t\\t}\\t\\t__typename\\t}\\tpreferredValue {\\t\\tvalue\\t\\tunit {\\t\\t\\tabbreviation\\t\\t\\t__typename\\t\\t}\\t\\t__typename\\t}\\t__typename}\",\"variables\":{\"limit\":100,\"offset\":'
        + json.dumps(int(offset))
        + ',\"query\":'
        + json.dumps(search)
        + ',\"filters\":[],\"sortOrder\":null,\"include\":[\"bra\",\"pt\",\"pr\"],\"exclude\":[\"off\"],\"searchQueryId\":\"4ce81461-09e2-4f7a-bb9a-8f6f8503fdc4\",\"siteId\":null},\"operationName\":\"ENTER_SEARCH\"}'
    )

    # A timeout keeps the call from blocking forever if the server stalls.
    response = requests.request("POST", url, data=payload, headers=headers, timeout=30)
    print(response)
    data = response.json()
    print(json.dumps(data, indent=2))
    print(json.dumps(data))

if __name__ == "__main__":
    # Run the scraper only when this file is executed as a script.
    scrape_digitec()

CodePudding user response:

You need to build your payload as a JSON-serializable Python structure (dictionaries/lists) and pass it via the `json` parameter instead of `data`:

import requests
import json

def scrape_digitec(search='lg', offset=0):
    """POST the ``ENTER_SEARCH`` GraphQL query to digitec.ch and print the reply.

    The request body is built as Python lists/dicts and handed to
    ``requests`` through the ``json`` parameter, which serialises it. The
    response object, the pretty-printed JSON body and the compact JSON body
    are printed to stdout, in that order.

    Args:
        search: Search term sent as the ``query`` GraphQL variable.
        offset: Paging offset sent as the ``offset`` GraphQL variable.

    Raises:
        ``response.json()`` raises if the response body is not valid JSON.
    """
    url = "https://www.digitec.ch/api/graphql"
    headers = {
        "authority": "www.digitec.ch",
        "accept": "application/json",
        "accept-language": "de-CH",
        "cache-control": "no-cache",
        "content-type": "application/json",
        "origin": "https://www.digitec.ch",
        "pragma": "no-cache",
        "referer": "https://www.digitec.ch/search?q=bang olufsen",
        "sec-ch-ua": '"Not A;Brand";v="99", "Chromium";v="100", "Google Chrome";v="100"',
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": '"Windows"',
        "sec-fetch-dest": "empty",
        "sec-fetch-mode": "cors",
        "sec-fetch-site": "same-origin",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36",
        "x-dg-country": "ch",
        "x-dg-mandator": "406802",
        "x-dg-portal": "25",
        "x-dg-testgroup": "Default"
    }

    # GraphQL document. Adjacent string literals inside the parentheses are
    # concatenated by the parser, so the text can span several source lines
    # without changing the query that is sent.
    query = (
        "query ENTER_SEARCH($query: String!, $sortOrder: ProductSort, $limit: Int = 9, $offset: Int = 0, $filters: [SearchFilter], $include: [String!], $exclude: [String!], $searchQueryId: String, $siteId: String) {\n  search(\n    query: $query\n    filters: $filters\n    searchQueryId: $searchQueryId\n    siteId: $siteId\n  ) {\n    products(limit: $limit, offset: $offset, sortOrder: $sortOrder) {\n      total\n      hasMore\n      nextOffset\n      results {\n        ...ProductSearchResult\n        __typename\n      }\n      __typename\n    }\n    filters(include: $include, exclude: $exclude) {\n      product {\n        identifier\n        name\n        filterType\n        score\n        tooltip {\n          ...FilterTooltipResult\n          __typename\n        }\n        ...CheckboxSearchFilterResult\n        ...RangeSearchFilterResult\n        __typename\n      }\n      __typename\n    }\n    magazinePages(limit: 3) {\n      ids {\n        id\n        score\n        __typename\n      }\n      total\n      __typename\n    }\n    authors(limit: 3) {\n      ids {\n        id\n        score\n        __typename\n      }\n      total\n      __typename\n    }\n    discussions(limit: 3) {\n      ids {\n        id\n        score\n        __typename\n      }\n      total\n      __typename\n    }\n    questions(limit: 3) {\n      ids {\n        id\n        score\n        __typename\n      }\n      total\n      __typename\n    }\n    ratings(limit: 3) {\n      ids {\n        id\n        score\n        __typename\n      }\n      total\n      __typename\n    }\n    productTypes(limit: 24) {\n      total\n      results {\n        id\n        name\n        primarySynonyms\n        isVisible\n        description\n        metaDescription\n        imageUrl\n        searchScore\n        __typename\n      }\n      __typename\n    }\n    brands(limit: 24) {\n      total\n      results {\n        id\n        title\n        searchScore\n        __typename\n      }\n      "
        "__typename\n    }\n    _meta {\n      queryInfo {\n        correctedQuery\n        didYouMeanQuery\n        lastProductSearchPass\n        executedSearchTerm\n        testGroup\n        isManagedQuery\n        isRerankedQuery\n        __typename\n      }\n      redirectionUrl\n      portalReferral {\n        productCount\n        portalName\n        url\n        productImageUrls\n        __typename\n      }\n      __typename\n    }\n    __typename\n  }\n}\n\nfragment ProductSearchResult on ProductSearchResultItem {\n  searchScore\n  mandatorSpecificData {\n    ...ProductMandatorSpecific\n    __typename\n  }\n  product {\n    ...ProductMandatorIndependent\n    __typename\n  }\n  offer {\n    ...ProductOffer\n    __typename\n  }\n  __typename\n}\n\nfragment FilterTooltipResult on FilterTooltip {\n  text\n  moreInformationLink\n  __typename\n}\n\nfragment CheckboxSearchFilterResult on CheckboxSearchFilter {\n  options {\n    identifier\n    name\n    productCount\n    score\n    referenceValue {\n      value\n      unit {\n        abbreviation\n        __typename\n      }\n      __typename\n    }\n    preferredValue {\n      value\n      unit {\n        abbreviation\n        __typename\n      }\n      __typename\n    }\n    tooltip {\n      ...FilterTooltipResult\n      __typename\n    }\n    __typename\n  }\n  __typename\n}\n\nfragment RangeSearchFilterResult on RangeSearchFilter {\n  referenceMin\n  preferredMin\n  referenceMax\n  preferredMax\n  referenceStepSize\n  preferredStepSize\n  rangeMergeInfo {\n    isBottomMerged\n    isTopMerged\n    __typename\n  }\n  referenceUnit {\n    abbreviation\n    __typename\n  }\n  preferredUnit {\n    abbreviation\n    __typename\n  }\n  rangeFilterDataPoint {\n    ...RangeFilterDataPointResult\n    __typename\n  }\n  __typename\n}\n\nfragment ProductMandatorSpecific on MandatorSpecificData {\n  isBestseller\n  isDeleted\n  showroomSites\n  sectorIds\n  __typename\n}\n\nfragment ProductMandatorIndependent on ProductV2 {\n  "
        "id\n  productId\n  name\n  nameProperties\n  productTypeId\n  productTypeName\n  brandId\n  brandName\n  averageRating\n  totalRatings\n  totalQuestions\n  isProductSet\n  images {\n    url\n    height\n    width\n    __typename\n  }\n  energyEfficiency {\n    energyEfficiencyColorType\n    energyEfficiencyLabelText\n    energyEfficiencyLabelSigns\n    energyEfficiencyImage {\n      url\n      height\n      width\n      __typename\n    }\n    __typename\n  }\n  seo {\n    seoProductTypeName\n    seoNameProperties\n    productGroups {\n      productGroup1\n      productGroup2\n      productGroup3\n      productGroup4\n      __typename\n    }\n    gtin\n    __typename\n  }\n  hasVariants\n  smallDimensions\n  basePrice {\n    priceFactor\n    value\n    __typename\n  }\n  __typename\n}\n\nfragment ProductOffer on OfferV2 {\n  id\n  productId\n  offerId\n  shopOfferId\n  price {\n    amountIncl\n    amountExcl\n    currency\n    fraction\n    __typename\n  }\n  deliveryOptions {\n    mail {\n      classification\n      futureReleaseDate\n      __typename\n    }\n    pickup {\n      siteId\n      classification\n      futureReleaseDate\n      __typename\n    }\n    detailsProvider {\n      productId\n      offerId\n      quantity\n      type\n      __typename\n    }\n    __typename\n  }\n  label\n  type\n  volumeDiscountPrices {\n    minAmount\n    price {\n      amountIncl\n      amountExcl\n      currency\n      __typename\n    }\n    isDefault\n    __typename\n  }\n  salesInformation {\n    numberOfItems\n    numberOfItemsSold\n    isEndingSoon\n    validFrom\n    __typename\n  }\n  incentiveText\n  isIncentiveCashback\n  isNew\n  isSalesPromotion\n  hideInProductDiscovery\n  canAddToBasket\n  hidePrice\n  insteadOfPrice {\n    type\n    price {\n      amountIncl\n      amountExcl\n      currency\n      fraction\n      __typename\n    }\n    __typename\n  }\n  minOrderQuantity\n  __typename\n}\n\nfragment RangeFilterDataPointResult on RangeFilterDataPoint {\n  "
        "count\n  referenceValue {\n    value\n    unit {\n      abbreviation\n      __typename\n    }\n    __typename\n  }\n  preferredValue {\n    value\n    unit {\n      abbreviation\n      __typename\n    }\n    __typename\n  }\n  __typename\n}\n"
    )

    # The operation is wrapped in a one-element list.
    # NOTE(review): presumably the endpoint expects a batch of operations —
    # confirm against a current browser request.
    payload = [{
        "operationName": "ENTER_SEARCH",
        "variables": {
            "limit": 24,
            "offset": offset,
            "query": search,
            "filters": [],
            # "sortOrder" and "siteId" are deliberately not sent.
            "include": ["bra", "pt", "pr", "off"],
            "searchQueryId": "e1b620fc-bf9c-41c6-85c0-cc49e5d12e25",
        },
        "query": query,
    }]

    # A timeout keeps the call from blocking forever if the server stalls.
    response = requests.post(url, json=payload, headers=headers, timeout=30)
    print(response)
    data = response.json()
    print(json.dumps(data, indent=2))
    print(json.dumps(data))


if __name__ == "__main__":
    # Run the scraper only when this file is executed as a script.
    scrape_digitec()
  • Related