With Spark 1.6.0, there is a pivot function to transform/transpose your data. In your case it requires some preprocessing to get the data into the right shape for pivoting. Here is an example of how I'd do it:
// Pivots the input records so that every (predictor, measure) pair becomes its
// own column (e.g. "P1.num1"), with one output row per icaoCode.
// Requires Spark 1.6.0+ for DataFrame.pivot.
def doPivot(): Unit = {
  val sqlContext: SQLContext = new org.apache.spark.sql.SQLContext(sc)

  // dummy data
  val records = Seq(
    Input("P1", "OTHH", 1.1, 1.2),
    Input("P1", "ZGGG", 2.1, 2.2),
    Input("P2", "OTHH", 3.1, 3.2))
  val inputDf = sqlContext.createDataFrame(records)

  // Build the pivoted column header, e.g. "P1" + "." + "num1" -> "P1.num1".
  val buildName: ((String, String) => String) =
    (predictor: String, num: String) => predictor + "." + num
  val udfBuildName = udf(buildName)

  // Attach the fully-qualified column name for each of the two measures.
  val withNames = inputDf
    .withColumn("num1-complete", udfBuildName(col("predictor"), lit("num1")))
    .withColumn("num2-complete", udfBuildName(col("predictor"), lit("num2")))

  // Melt to long format: one row per (icaoCode, measure value, header name).
  // Note: unionAll is the Spark 1.6 name (renamed to union in Spark 2.x).
  val longFormat = withNames
    .select(col("icaoCode"), col("num1") as "num", col("num1-complete") as "value")
    .unionAll(withNames
      .select(col("icaoCode"), col("num2") as "num", col("num2-complete") as "value"))

  // Pivot back to wide format. mean is used only because pivot requires an
  // aggregate; each (icaoCode, value) group holds a single number here.
  val pivoted = longFormat.groupBy(col("icaoCode")).pivot("value").mean("num")
  pivoted.show()
}

// One input record: a predictor with two numeric measures for an airport code.
case class Input(predictor: String, icaoCode: String, num1: Double, num2: Double)
The final dataframe should work for you:
+--------+-------+-------+-------+-------+ |icaoCode|P1.num1|P1.num2|P2.num1|P2.num2| +--------+-------+-------+-------+-------+ | OTHH| 1.1| 1.2| 3.1| 3.2| | ZGGG| 2.1| 2.2| null| null| +--------+-------+-------+-------+-------+