Consider the below sample where data
is of type array<struct<key:string,value:string>>
, with the repeated keys of 'Date', 'Country' and 'Brand':
source | data.key | data.value |
---|---|---|
first_file | Date | 2022-12-14 |
Country | Germany | |
Brand | Mercedes | |
second_file | Date | 2022-12-13 |
Country | Sweden | |
Brand | Volvo |
My desired result is:
source | Date | Country | Brand |
---|---|---|---|
first_file | 2022-12-14 | Germany | Mercedes |
second_file | 2022-12-13 | Sweden | Volvo |
Any help on how I can reach that result? Also, how would you alter your solution if the keys were not repeated, i.e. if they were unique?
Thanks!
CodePudding user response:
WITH
tmp AS (
SELECT
source,
key,
value
FROM
UNNEST(ARRAY<STRUCT<source string, data ARRAY<STRUCT<key string, value string>>>>[
("first_file", [("Date","2022-12-14"), ("Country","Germany"),("Brand","Mercedes")]),
("second_file", [("Date","2022-12-13"), ("Country","Sweden"),("Brand","Volvo")])
]),
UNNEST(data) ) -- unnest data first
SELECT
source, date[SAFE_OFFSET(0)] as date, country[SAFE_OFFSET(0)] as country, brand[SAFE_OFFSET(0)] as brand,
FROM
tmp PIVOT (
ARRAY_AGG(value IGNORE NULLS) FOR key IN ("Date", "Country", "Brand")) -- pivot table
CodePudding user response:
You can use the following query :
with sources AS
(
select
'first_file' as source,
[
struct('Date' as key, '2022-12-14' as value),
struct('Country' as key, 'Germany' as value),
struct('Brand' as key, 'Mercedes' as value)
] as data
UNION ALL
select
'second_file' as source,
[
struct('Date' as key, '2022-12-13' as value),
struct('Country' as key, 'Sweden' as value),
struct('Brand' as key, 'Volvo' as value)
] as data
)
select
source,
(SELECT value FROM UNNEST(data) WHERE key = 'Date') AS Date,
(SELECT value FROM UNNEST(data) WHERE key = 'Country') AS Country,
(SELECT value FROM UNNEST(data) WHERE key = 'Brand') AS Brand,
from sources;
You can also use an udf
to centralize the logic :
CREATE TEMP FUNCTION getValue(k STRING, arr ANY TYPE) AS
((SELECT value FROM UNNEST(arr) WHERE key = k));
with sources AS
(
select
'first_file' as source,
[
struct('Date' as key, '2022-12-14' as value),
struct('Country' as key, 'Germany' as value),
struct('Brand' as key, 'Mercedes' as value)
] as data
UNION ALL
select
'second_file' as source,
[
struct('Date' as key, '2022-12-13' as value),
struct('Country' as key, 'Sweden' as value),
struct('Brand' as key, 'Volvo' as value)
] as data
)
SELECT
source,
getValue('Date', data) AS Date,
getValue('Country', data) AS Country,
getValue('Brand', data) AS Brand
FROM sources;
The result is :