I need to pivot more than one column in a PySpark dataframe. Sample dataframe:
from pyspark.sql import functions as F

d = [(100,1,23,10),(100,2,45,11),(100,3,67,12),(100,4,78,13),
     (101,1,23,10),(101,2,45,13),(101,3,67,14),(101,4,78,15),
     (102,1,23,10),(102,2,45,11),(102,3,67,16),(102,4,78,18)]

mydf = spark.createDataFrame(d, ['id','day','price','units'])
mydf.show()
# +---+---+-----+-----+
# | id|day|price|units|
# +---+---+-----+-----+
# |100|  1|   23|   10|
# |100|  2|   45|   11|
# |100|  3|   67|   12|
# |100|  4|   78|   13|
# |101|  1|   23|   10|
# |101|  2|   45|   13|
# |101|  3|   67|   14|
# |101|  4|   78|   15|
# |102|  1|   23|   10|
# |102|  2|   45|   11|
# |102|  3|   67|   16|
# |102|  4|   78|   18|
# +---+---+-----+-----+

Now, if I need to get the price column into a row for each id based on day, I can use the pivot method:
pvtdf = mydf.withColumn('combcol', F.concat(F.lit('price_'), mydf['day'])) \
            .groupby('id').pivot('combcol').agg(F.first('price'))
pvtdf.show()
# +---+-------+-------+-------+-------+
# | id|price_1|price_2|price_3|price_4|
# +---+-------+-------+-------+-------+
# |100|     23|     45|     67|     78|
# |101|     23|     45|     67|     78|
# |102|     23|     45|     67|     78|
# +---+-------+-------+-------+-------+

So when I also need the units column transposed in the same way as price, I have to build one more DataFrame like the one above for units and then join the two on "id", roughly as sketched below.
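For clarity, the two-DataFrame approach I mean would look roughly like this (just a sketch; the intermediate names price_pvt, units_pvt and both are mine):

# pivot price and units separately, then join the two results on id
price_pvt = mydf.withColumn('combcol', F.concat(F.lit('price_'), mydf['day'])) \
                .groupby('id').pivot('combcol').agg(F.first('price'))
units_pvt = mydf.withColumn('combcol', F.concat(F.lit('units_'), mydf['day'])) \
                .groupby('id').pivot('combcol').agg(F.first('units'))
both = price_pvt.join(units_pvt, 'id')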
But when I have more columns like these, I tried writing a function to do it:

def pivot_udf(df, *cols):
    # start from the distinct ids and join one pivoted DataFrame per column
    mydf = df.select('id').drop_duplicates()
    for c in cols:
        mydf = mydf.join(
            df.withColumn('combcol', F.concat(F.lit('{}_'.format(c)), df['day']))
              .groupby('id').pivot('combcol').agg(F.first(c)),
            'id')
    return mydf

pivot_udf(mydf, 'price', 'units').show()
# +---+-------+-------+-------+-------+-------+-------+-------+-------+
# | id|price_1|price_2|price_3|price_4|units_1|units_2|units_3|units_4|
# +---+-------+-------+-------+-------+-------+-------+-------+-------+
# |100|     23|     45|     67|     78|     10|     11|     12|     13|
# |101|     23|     45|     67|     78|     10|     13|     14|     15|
# |102|     23|     45|     67|     78|     10|     11|     16|     18|
# +---+-------+-------+-------+-------+-------+-------+-------+-------+

Is this good practice, and is there a better way of doing it?