I need to rename columns such as "InstanceId" to "Instance Id", i.e. insert a space before each inner capital letter.
I have this DataFrame in Spark:
| InstanceId | InstanceType |
|------------|--------------|
| 1          | Wsss         |
| 2          | Xles         |
I would like to transform it into this:
| Instance Id | Instance Type |
|-------------|---------------|
| 1           | Wsss          |
| 2           | Xles          |
CodePudding user response:
If your DataFrame doesn't have nested columns, you can simply do something like this:
// Put a space in front of every uppercase letter; `trim` removes the leading
// space that appears when the column name itself starts with a capital.
val renamedCols = df.columns.map { columnName =>
  columnName.replaceAll("([A-Z])", " $1").trim
}
// `toDF` assigns the new names positionally, so only top-level columns are renamed.
val newDF = df.toDF(renamedCols: _*)
If you have a nested schema, consider using a recursive rename method similar to the one in this SO answer:
import org.apache.spark.sql.types._
/**
 * Returns a copy of `schema` in which every field name, at every nesting level,
 * has been passed through `rename`.
 *
 * The traversal descends into structs, array element types and map key/value
 * types, so fields of structs nested inside arrays-of-arrays or maps are renamed
 * as well. Data types, field metadata and all nullability flags (`nullable`,
 * `containsNull`, `valueContainsNull`) are carried over unchanged.
 *
 * @param schema the schema whose field names should be rewritten
 * @param rename function mapping an existing field name to its new name
 * @return a new `StructType` with the same shape as `schema` and renamed fields
 */
def renameAllCols(schema: StructType, rename: String => String): StructType = {
  // Renames the direct fields of a struct and recurses into each field's type.
  def renameStruct(struct: StructType): StructType =
    StructType(struct.fields.map { field =>
      field.copy(name = rename(field.name), dataType = renameType(field.dataType))
    })

  // Rebuilds a data type, renaming the fields of any struct found inside it.
  def renameType(dtype: DataType): DataType = dtype match {
    case struct: StructType =>
      renameStruct(struct)
    case ArrayType(elementType, containsNull) =>
      // Keep the original containsNull flag instead of forcing it to true.
      ArrayType(renameType(elementType), containsNull)
    case MapType(keyType, valueType, valueContainsNull) =>
      MapType(renameType(keyType), renameType(valueType), valueContainsNull)
    case other =>
      // Types without nested fields have nothing to rename.
      other
  }

  renameStruct(schema)
}
A dummy example using the renameAllCols method:
import org.apache.spark.sql.functions._
import spark.implicits._
// Rename rule: insert a space before every uppercase letter, then trim the
// leading space produced when the name starts with a capital.
val renameFcn: String => String = name => name.replaceAll("([A-Z])", " $1").trim

// Nested record used both as a struct column and as the element of an array column.
case class C(CountId: Int, CountValue: Int)

// Sample data mixing flat columns, a struct, an array of structs and an array of ints.
val df = Seq(
  (10, "a", C(1, 2), Seq(C(11, 12), C(13, 14)), Seq(101, 102)),
  (20, "b", C(3, 4), Seq(C(15, 16)), Seq(103))
).toDF("InstanceId", "InstanceTag", "TestCount", "ActualCountSequence", "TrialSequence")

// Re-create the DataFrame over the same rows, but with the renamed schema.
val renamedSchema = renameAllCols(df.schema, renameFcn)
val newDF = spark.createDataFrame(df.rdd, renamedSchema)

newDF.show()
/*
+-----------+------------+----------+---------------------+--------------+
|Instance Id|Instance Tag|Test Count|Actual Count Sequence|Trial Sequence|
+-----------+------------+----------+---------------------+--------------+
|         10|           a|    {1, 2}| [{11, 12}, {13, 14}]|    [101, 102]|
|         20|           b|    {3, 4}|           [{15, 16}]|         [103]|
+-----------+------------+----------+---------------------+--------------+
*/

newDF.printSchema()
/*
root
 |-- Instance Id: integer (nullable = false)
 |-- Instance Tag: string (nullable = true)
 |-- Test Count: struct (nullable = true)
 |    |-- Count Id: integer (nullable = false)
 |    |-- Count Value: integer (nullable = false)
 |-- Actual Count Sequence: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- Count Id: integer (nullable = false)
 |    |    |-- Count Value: integer (nullable = false)
 |-- Trial Sequence: array (nullable = true)
 |    |-- element: integer (containsNull = false)
*/