I have JSON data like this saved in S3. I am using ATHENA to write select statements.
{
"sample_data":{
"people":[
{
"firstName":"Emily",
"address":{
"streetAddress":"101",
"city":"abc",
"state":"",
"phoneNumbers":[
{
"type":"home",
"number":"3"
},
{
"type":"city",
"number":"4"
}
]
}
},
{
"firstName":"Smily",
"address":{
"streetAddress":"102",
"city":"def",
"state":"",
"phoneNumbers":[
{
"type":"home",
"number":"1"
},
{
"type":"city",
"number":"1"
}
]
}
}
]
}
}
How Can I write a select statement that selects streetaddress and city where home>2 and city=4;
I tried UNNEST but that did not help.
Expected Output:
streetAddress city
101 abc
Tried this UNNEST but it extracted phoneNumbers to multiple rows. So can not query by both home and city as they are in different rows now.
SELECT idx,JSON_EXTRACT_SCALAR(x.n, '$.address.streetaddress') as streetaddress,
JSON_EXTRACT_SCALAR(x.n, '$.address.city') as city, JSON_EXTRACT_SCALAR(x.m, '$.type') as type, JSON_EXTRACT_SCALAR(x.m, '$.number') as value
FROM sample_data1 cross join
UNNEST (CAST(JSON_EXTRACT(sample_data,'$.people') AS ARRAY<JSON>)) AS x(n)
CROSS JOIN
UNNEST (CAST(JSON_EXTRACT(x.n,'$.address.phonenumbers') AS ARRAY<JSON>)) WITH ordinality AS x(m,idx) ;
CodePudding user response:
unnest
flattens data into multiple rows, so you can process the array without unnesting using array functions. Version of Presto currently used by Athena does not support any_match
so you will need to use cardinality
filter
combination (and it does not support filtering via json path):
-- sample data
WITH dataset (json_str) AS (
VALUES (
json '{
"firstName":"Emily",
"address":{
"streetAddress":"101",
"city":"abc",
"state":"",
"phoneNumbers":[
{
"type":"home",
"number":"11"
},
{
"type":"city",
"number":"4"
}
]
}
}'
),
(
json '{
"firstName":"Smily",
"address":{
"streetAddress":"102",
"city":"def",
"state":"",
"phoneNumbers":[
{
"type":"home",
"number":"1"
},
{
"type":"city",
"number":"1"
}
]
}
}'
)
) -- query
select street_address,
city
from (
select JSON_EXTRACT_SCALAR(json_str, '$.address.streetAddress') as street_address,
JSON_EXTRACT_SCALAR(json_str, '$.address.city') as city,
cast(
JSON_EXTRACT(json_str, '$.address.phoneNumbers') as array(json)
) phones
from dataset
)
where cardinality(
filter(
phones,
js->json_extract_scalar(js, '$.type') = 'home'
and try_cast(json_extract_scalar(js, '$.number') as integer) > 2
)
) > 0 -- check for home
and
cardinality(
filter(
phones,
js->json_extract_scalar(js, '$.type') = 'city'
and json_extract_scalar(js, '$.number') = '4'
)
) > 0 -- check for city
Output:
street_address | city |
---|---|
101 | abc |