Home > database >  transform a complex json object from a web api to multiple rows in a dataframe in Azure Databricks u
transform a complex json object from a web api to multiple rows in a dataframe in Azure Databricks u

Time:06-01

I have a JSON file that is received from a REST API. An example of the return is like this:

{
    "d": {
        "results": [
            {
                "__metadata": {
                    "uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='60000033',userId='60000033')",
                    "type": "EmpEmployment"
                },
                "personIdExternal": "60000033",
                "userId": "60000033",
                "hiringNotCompleted": false,
                "isECRecord": true,
                "lastModifiedDateTime": "/Date(1642917586000 0000)/",
                "endDate": "/Date(1675123200000)/",
                "createdDateTime": "/Date(1641473919000 0000)/",
                "createdOn": "/Date(1641473919000)/",
                "originalStartDate": "/Date(1501545600000)/",
                "customDate1": "/Date(1501545600000)/",
                "customString17": null,
                "customString18": null,
                "customString19": null,
                "assignmentClass": "ST",
                "lastModifiedBy": "This Dude",
                "okToRehire": true,
                "customString4": null,
                "customString3": "3",
                "customString2": null,
                "assignmentIdExternal": "60000033",
                "customString16": null,
                "lastModifiedOn": "/Date(1642917586000)/",
                "customString1": null,
                "createdBy": "This Dudette",
                "seniorityDate": "/Date(1501545600000)/",
                "startDate": "/Date(1659398400000)/",
                "customString16Nav": {
                    "__deferred": {
                        "uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='60000033',userId='60000033')/customString16Nav"
                    }
                },
                "customString1Nav": {
                    "__deferred": {
                        "uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='60000033',userId='60000033')/customString1Nav"
                    }
                },
                "customString18Nav": {
                    "__deferred": {
                        "uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='60000033',userId='60000033')/customString18Nav"
                    }
                },
                "customString3Nav": {
                    "__deferred": {
                        "uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='60000033',userId='60000033')/customString3Nav"
                    }
                },
                "paymentInformationNav": {
                    "__deferred": {
                        "uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='60000033',userId='60000033')/paymentInformationNav"
                    }
                },
                "empJobRelationshipNav": {
                    "__deferred": {
                        "uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='60000033',userId='60000033')/empJobRelationshipNav"
                    }
                },
                "personNav": {
                    "__deferred": {
                        "uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='60000033',userId='60000033')/personNav"
                    }
                },
                "empWorkPermitNav": {
                    "__deferred": {
                        "uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='60000033',userId='60000033')/empWorkPermitNav"
                    }
                },
                "photoNav": {
                    "__deferred": {
                        "uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='60000033',userId='60000033')/photoNav"
                    }
                },
                "compInfoNav": {
                    "__deferred": {
                        "uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='60000033',userId='60000033')/compInfoNav"
                    }
                },
                "userNav": {
                    "__deferred": {
                        "uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='60000033',userId='60000033')/userNav"
                    }
                },
                "customString2Nav": {
                    "__deferred": {
                        "uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='60000033',userId='60000033')/customString2Nav"
                    }
                },
                "customString19Nav": {
                    "__deferred": {
                        "uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='60000033',userId='60000033')/customString19Nav"
                    }
                },
                "jobInfoNav": {
                    "__deferred": {
                        "uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='60000033',userId='60000033')/jobInfoNav"
                    }
                },
                "wfRequestNav": {
                    "__deferred": {
                        "uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='60000033',userId='60000033')/wfRequestNav"
                    }
                },
                "costDistributionNav": {
                    "__deferred": {
                        "uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='60000033',userId='60000033')/costDistributionNav"
                    }
                },
                "empPayCompNonRecurringNav": {
                    "__deferred": {
                        "uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='60000033',userId='60000033')/empPayCompNonRecurringNav"
                    }
                }
            },
            {
                "__metadata": {
                    "uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='100003',userId='100003')",
                    "type": "EmpEmployment"
                },
                "personIdExternal": "100003",
                "userId": "100003",
                "hiringNotCompleted": false,
                "isECRecord": true,
                "lastModifiedDateTime": "/Date(1638051713000 0000)/",
                "endDate": null,
                "createdDateTime": "/Date(1638051713000 0000)/",
                "createdOn": "/Date(1638051713000)/",
                "originalStartDate": "/Date(1635724800000)/",
                "customDate1": null,
                "customString17": null,
                "customString18": null,
                "customString19": null,
                "assignmentClass": "ST",
                "lastModifiedBy": "This Dudette",
                "okToRehire": null,
                "customString4": null,
                "customString3": null,
                "customString2": null,
                "assignmentIdExternal": "100003",
                "customString16": null,
                "lastModifiedOn": "/Date(1638051713000)/",
                "customString1": null,
                "createdBy": "This Dude",
                "seniorityDate": "/Date(1635724800000)/",
                "startDate": "/Date(1635724800000)/",
                "customString16Nav": {
                    "__deferred": {
                        "uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='100003',userId='100003')/customString16Nav"
                    }
                },
                "customString1Nav": {
                    "__deferred": {
                        "uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='100003',userId='100003')/customString1Nav"
                    }
                },
                "customString18Nav": {
                    "__deferred": {
                        "uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='100003',userId='100003')/customString18Nav"
                    }
                },
                "customString3Nav": {
                    "__deferred": {
                        "uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='100003',userId='100003')/customString3Nav"
                    }
                },
                "paymentInformationNav": {
                    "__deferred": {
                        "uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='100003',userId='100003')/paymentInformationNav"
                    }
                },
                "empJobRelationshipNav": {
                    "__deferred": {
                        "uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='100003',userId='100003')/empJobRelationshipNav"
                    }
                },
                "personNav": {
                    "__deferred": {
                        "uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='100003',userId='100003')/personNav"
                    }
                },
                "empWorkPermitNav": {
                    "__deferred": {
                        "uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='100003',userId='100003')/empWorkPermitNav"
                    }
                },
                "photoNav": {
                    "__deferred": {
                        "uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='100003',userId='100003')/photoNav"
                    }
                },
                "compInfoNav": {
                    "__deferred": {
                        "uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='100003',userId='100003')/compInfoNav"
                    }
                },
                "userNav": {
                    "__deferred": {
                        "uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='100003',userId='100003')/userNav"
                    }
                },
                "customString2Nav": {
                    "__deferred": {
                        "uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='100003',userId='100003')/customString2Nav"
                    }
                },
                "customString19Nav": {
                    "__deferred": {
                        "uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='100003',userId='100003')/customString19Nav"
                    }
                },
                "jobInfoNav": {
                    "__deferred": {
                        "uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='100003',userId='100003')/jobInfoNav"
                    }
                },
                "wfRequestNav": {
                    "__deferred": {
                        "uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='100003',userId='100003')/wfRequestNav"
                    }
                },
                "costDistributionNav": {
                    "__deferred": {
                        "uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='100003',userId='100003')/costDistributionNav"
                    }
                },
                "empPayCompNonRecurringNav": {
                    "__deferred": {
                        "uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='100003',userId='100003')/empPayCompNonRecurringNav"
                    }
                }
            }
        ]
    }
}

I'm at present just looking to pull the userId & startDate from the JSON. I've tried using the explode command as shown in this example.

result

Some help in being pointed in the right direction would be great please. Am I better just building a custom schema and trying to parse the JSON into that?

All I'm looking to do is return the result as per the image but each userId and startDate on on its own row due to them relating to each other.

CodePudding user response:

Explode results to get them into row

df = spark.read.json("./sample.json", multiLine=True)
df2 = df.withColumn('d', explode(col('d.results')))
df2.select(df2.d.userId, df2.d.startDate).show(10,False)

 -------- --------------------- 
|d.userId|d.startDate          |
 -------- --------------------- 
|60000033|/Date(1659398400000)/|
|100003  |/Date(1635724800000)/|
 -------- --------------------- 

You can add as many attributes as required e.g

df.select(explode(col('d.results'))).\
    selectExpr("col.userId","col.startDate","col.lastModifiedBy").\
    show(10,False)

 -------- --------------------- -------------- 
|userId  |startDate            |lastModifiedBy|
 -------- --------------------- -------------- 
|60000033|/Date(1659398400000)/|This Dude     |
|100003  |/Date(1635724800000)/|This Dudette  |
 -------- --------------------- -------------- 
  • Related