I need to extract the value of the attribute whose namespace is GLOBALID from JSON in the following format. For example, from the JSON below I need to extract "123456":
[{
"namespace": "GLOBALID",
"key": "ID",
"value": "123456"
},
{
"namespace": "RPS",
"key": "ID",
"value": "12xt12"
}
]
However, when I use select get_json_object(json_column, '$.GLOBALID.value')
it does not extract the value.
CodePudding user response:
For PySpark, create a DataFrame from the records and filter it.
# Build a DataFrame from the records, filter on the `namespace` column,
# and collect the matching `value`s on the driver.
# NOTE: the variable was renamed from `json` to `records` so it no longer
# shadows the stdlib `json` module.
records = [
    {
        "namespace": "GLOBALID",
        "key": "ID",
        "value": "123456"
    },
    {
        "namespace": "RPS",
        "key": "ID",
        "value": "12xt12"
    }
]
df = spark.createDataFrame(records)
df.show()
# +---+---------+------+
# |key|namespace| value|
# +---+---------+------+
# | ID| GLOBALID|123456|
# | ID| RPS|12xt12|
# +---+---------+------+

# Collect the matching rows and pull the `value` field out of each Row.
# This avoids the `.rdd.map(lambda x: x[0])` round-trip, which converts
# the DataFrame to an RDD just to index into each row.
values = [row.value for row in df.filter("namespace = 'GLOBALID'").select("value").collect()]
# values == ['123456']
CodePudding user response:
You will need to parse the JSON string as a JSON value and then apply the filtering. This is needed because Spark does not allow filter expressions in JSONPath.
from pyspark.sql import functions as F
from pyspark.sql.types import ArrayType, StringType, StructType, StructField

# Single-row DataFrame holding the raw JSON array as a string column.
df = spark.createDataFrame([('[{"namespace":"GLOBALID","key":"ID","value":"123456"},{"namespace":"RPS","key":"ID","value":"12xt12"}]', ), ], ("json_column", ))

# Schema for one array element; only the fields we need are declared.
entry_schema = StructType([
    StructField('namespace', StringType(),),
    StructField('value', StringType(),),
])
json_schema = ArrayType(entry_schema)

# Parse the string into an array of structs, explode to one row per
# element, keep the GLOBALID element, and project its value.
parsed = df.select(F.explode(F.from_json("json_column", json_schema)).alias("json"))
matched = parsed.filter(F.col("json")["namespace"] == "GLOBALID")
matched.select("json.value").show()
Output
------
| value|
------
|123456|
------