Reading JSON array in Redshift
Introduction#
Currently, reading all elements in JSON array is not possible in Redshift. For e.g. if you want to read Manufacturer and model as columns from following JSON
Remarks#
dim_id | number | info | manufacturer | model |
---|---|---|---|---|
200 | 1 | Nissan~Sentra^Nissan~Maxima^Ford~Taurus^Ford~Escort^ | Nissan | Sentra |
200 | 2 | Nissan~Sentra^Nissan~Maxima^Ford~Taurus^Ford~Escort^ | Nissan | Maxima |
200 | 3 | Nissan~Sentra^Nissan~Maxima^Ford~Taurus^Ford~Escort^ | Ford | Taurus |
200 | 4 | Nissan~Sentra^Nissan~Maxima^Ford~Taurus^Ford~Escort^ | Ford | Escort |
— Create a sample JSON with ARRAY
create table car_sample(dim_id integer, info varchar(2000)); insert into car_sample values (200, ’{“cars”: [ { “Manufacturer”: “Nissan”, “Models”: [{“Name”:“Sentra”, “doors”:4}, {“Name”:“Maxima”, “doors”:4} ]}, {“Manufacturer”: “Ford”, “Models”: [{“Name”:“Taurus”, “doors”:4}, {“Name”:“Escort”, “doors”:4} ]} ] }‘)
— Create a supporting table for CROSS JOIN
create table series1_10 (number integer );
insert into series1_10 values (1);
insert into series1_10 values (2);
insert into series1_10 values (3);
insert into series1_10 values (4);
insert into series1_10 values (5);
insert into series1_10 values (6);
insert into series1_10 values (7);
insert into series1_10 values (8);
insert into series1_10 values (9);
insert into series1_10 values (10);
— UDF for extracting JSON array into one ^ delimited string
CREATE OR REPLACE FUNCTION f_extractJson (jsonVar varchar) RETURNS varchar IMMUTABLE as $$
def myfunc(myParm):
import json
cars=json.loads(jsonVar)
parsedString=”
for car in cars[“cars”]:
for model in car[“Models”]:
parsedString=parsedString+car[“Manufacturer”]+’~‘+model[“Name”]+’^’
return parsedString
return myfunc(jsonVar)
$$ LANGUAGE plpythonu;
— Check the data
select dim_id, f_extractJson(info) from car_sample;
— Pivot rows
WITH w1 AS (select dim_id, f_extractJson(info) info from car_sample)
select dim_id,number, info, split_part(split_part(info,’^‘,number),’’, 1) ’, 2) Model
Manufacturer, split_part(split_part(info,’^‘,number),’
from w1 cross join series1_10
where number <= regexp_count(info,’[=^=]’) ;