Rename columns with words together in Spark

I need to change column names like "InstanceId" to "Instance Id".

I have this DataFrame in Spark:

| InstanceId | InstanceType |
|------------|--------------|
| 1          | Wsss         |
| 2          | Xles         |

I would like to transform it into this:

| Instance Id | Instance Type |
|-------------|---------------|
| 1           | Wsss          |
| 2           | Xles          |

CodePudding user response:

If your DataFrame doesn't have nested columns, you can simply do something like this:

// Insert a space before every uppercase letter, then trim the leading space:
// "InstanceId" -> "Instance Id"
val renamedCols = df.columns.map(_.replaceAll("([A-Z])", " $1").trim)
val newDF = df.toDF(renamedCols: _*)
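
If you prefer renaming one column at a time, an equivalent sketch using the standard withColumnRenamed API (same regex as above) would be:

// Fold over the column names, renaming each in turn
val newDF2 = df.columns.foldLeft(df) { (acc, c) =>
  acc.withColumnRenamed(c, c.replaceAll("([A-Z])", " $1").trim)
}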

In case you have a nested schema, consider a recursive rename method like the following (adapted from a Stack Overflow answer):

import org.apache.spark.sql.types._

// Recursively apply `rename` to every field name, descending into structs
// and arrays of structs (map types and arrays of arrays are left untouched)
def renameAllCols(schema: StructType, rename: String => String): StructType = {
  def recurRename(schema: StructType): Seq[StructField] = schema.fields.map {
    case StructField(name, dtype: StructType, nullable, meta) =>
      StructField(rename(name), StructType(recurRename(dtype)), nullable, meta)
    case StructField(name, dtype: ArrayType, nullable, meta) if dtype.elementType.isInstanceOf[StructType] =>
      // Preserve the array's containsNull flag rather than hard-coding `true`
      StructField(rename(name), ArrayType(StructType(recurRename(dtype.elementType.asInstanceOf[StructType])), dtype.containsNull), nullable, meta)
    case StructField(name, dtype, nullable, meta) =>
      StructField(rename(name), dtype, nullable, meta)
  }
  StructType(recurRename(schema))
}

A dummy example using the renameAllCols method:

import org.apache.spark.sql.functions._
import spark.implicits._  // assumes an existing SparkSession named `spark`

// Split camelCase names into words: "InstanceId" -> "Instance Id"
val renameFcn = (s: String) => s.replaceAll("([A-Z])", " $1").trim

case class C(CountId: Int, CountValue: Int)

val df = Seq(
  (10, "a", C(1, 2), Seq(C(11, 12), C(13, 14)), Seq(101, 102)),
  (20, "b", C(3, 4), Seq(C(15, 16)), Seq(103))
).toDF("InstanceId", "InstanceTag", "TestCount", "ActualCountSequence", "TrialSequence")

// Apply the renamed schema to the existing rows without transforming the data
val newDF = spark.createDataFrame(df.rdd, renameAllCols(df.schema, renameFcn))

newDF.show
/*
+-----------+------------+----------+---------------------+--------------+
|Instance Id|Instance Tag|Test Count|Actual Count Sequence|Trial Sequence|
+-----------+------------+----------+---------------------+--------------+
|         10|           a|    {1, 2}| [{11, 12}, {13, 14}]|    [101, 102]|
|         20|           b|    {3, 4}|           [{15, 16}]|         [103]|
+-----------+------------+----------+---------------------+--------------+
*/

newDF.printSchema
/*
root
 |-- Instance Id: integer (nullable = false)
 |-- Instance Tag: string (nullable = true)
 |-- Test Count: struct (nullable = true)
 |    |-- Count Id: integer (nullable = false)
 |    |-- Count Value: integer (nullable = false)
 |-- Actual Count Sequence: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- Count Id: integer (nullable = false)
 |    |    |-- Count Value: integer (nullable = false)
 |-- Trial Sequence: array (nullable = true)
 |    |-- element: integer (containsNull = false)
*/
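
One caveat: once column names contain spaces, SQL-style expressions need backticks around them. A small usage sketch, assuming the newDF from above:

// String-based select accepts the name as-is
newDF.select("Instance Id").show()

// selectExpr (and SQL queries) parse expressions, so the name must be backquoted
newDF.selectExpr("`Instance Id`", "`Instance Tag`").show()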