Home > Enterprise >  AWS Athena query JSON array with AND Condition
AWS Athena query JSON array with AND Condition

Time:05-08

I have JSON data like this saved in S3. I am using ATHENA to write select statements.

  {
   "sample_data":{
      "people":[
         {
            "firstName":"Emily",
            "address":{
               "streetAddress":"101",
               "city":"abc",
               "state":"",
               "phoneNumbers":[
                  {
                     "type":"home",
                     "number":"3"
                  },
                  {
                     "type":"city",
                     "number":"4"
                  }
               ]
            }
         },
          {
            "firstName":"Smily",
            "address":{
               "streetAddress":"102",
               "city":"def",
               "state":"",
               "phoneNumbers":[
                  {
                     "type":"home",
                     "number":"1"
                  },
                  {
                     "type":"city",
                     "number":"1"
                  }
               ]
            }
         }
      ]
   }
}

How Can I write a select statement that selects streetaddress and city where home>2 and city=4;

I tried UNNEST but that did not help.

Expected Output:

streetAddress  city
101            abc   

Tried this UNNEST but it extracted phoneNumbers to multiple rows. So can not query by both home and city as they are in different rows now.

SELECT  idx,JSON_EXTRACT_SCALAR(x.n, '$.address.streetaddress') as streetaddress,
JSON_EXTRACT_SCALAR(x.n, '$.address.city') as city, JSON_EXTRACT_SCALAR(x.m, '$.type') as type, JSON_EXTRACT_SCALAR(x.m, '$.number')  as value
  FROM sample_data1 cross join
  UNNEST (CAST(JSON_EXTRACT(sample_data,'$.people') AS ARRAY<JSON>)) AS x(n)
  CROSS JOIN
  UNNEST (CAST(JSON_EXTRACT(x.n,'$.address.phonenumbers') AS ARRAY<JSON>))  WITH ordinality AS x(m,idx) ;

CodePudding user response:

unnest flattens data into multiple rows, so you can process the array without unnesting using array functions. Version of Presto currently used by Athena does not support any_match so you will need to use cardinality filter combination (and it does not support filtering via json path):

-- sample data
WITH dataset (json_str) AS (
    VALUES (
            json '{
            "firstName":"Emily",
            "address":{
               "streetAddress":"101",
               "city":"abc",
               "state":"",
               "phoneNumbers":[
                  {
                     "type":"home",
                     "number":"11"
                  },
                  {
                     "type":"city",
                     "number":"4"
                  }
               ]
            }
         }'
        ),
        (
            json '{
            "firstName":"Smily",
            "address":{
               "streetAddress":"102",
               "city":"def",
               "state":"",
               "phoneNumbers":[
                  {
                     "type":"home",
                     "number":"1"
                  },
                  {
                     "type":"city",
                     "number":"1"
                  }
               ]
            }
         }'
        )
) -- query
select street_address,
    city
from (
        select JSON_EXTRACT_SCALAR(json_str, '$.address.streetAddress') as street_address,
            JSON_EXTRACT_SCALAR(json_str, '$.address.city') as city,
            cast(
                JSON_EXTRACT(json_str, '$.address.phoneNumbers') as array(json)
            ) phones
        from dataset
    )
where cardinality(
        filter(
            phones,
            js->json_extract_scalar(js, '$.type') = 'home'
                and try_cast(json_extract_scalar(js, '$.number') as integer) > 2
        )
    ) > 0 -- check for home
    and
    cardinality(
        filter(
            phones,
            js->json_extract_scalar(js, '$.type') = 'city'
                and json_extract_scalar(js, '$.number') = '4'
        )
    ) > 0 -- check for city

Output:

street_address city
101 abc
  • Related