Try this in PySpark. One way of doing this is with window functions:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window

spark = SparkSession.builder \
    .appName('SO') \
    .getOrCreate()

sc = spark.sparkContext

df = sc.parallelize([
    ("new south wales", "aus", 4, 4, 4),
    ("victoria", "aus", 4, 4, 4),
    ("queensland", "aus", 3, 5, 5),
    ("south australia", "aus", 1, 2, 2)
]).toDF(["province_state", "country_region", "2/1/2020", "2/10/2020", "2/11/2020"])

df.show()
# +---------------+--------------+--------+---------+---------+
# | province_state|country_region|2/1/2020|2/10/2020|2/11/2020|
# +---------------+--------------+--------+---------+---------+
# |new south wales|           aus|       4|        4|        4|
# |       victoria|           aus|       4|        4|        4|
# |     queensland|           aus|       3|        5|        5|
# |south australia|           aus|       1|        2|        2|
# +---------------+--------------+--------+---------+---------+

w = Window().partitionBy('country_region')
w1 = Window().partitionBy('country_region').orderBy('country_region')

# replace every date column with its sum over the country_region partition
for column in df.columns:
    if column not in ['country_region', 'province_state']:
        df = df.withColumn(column, F.sum(column).over(w))

# keep a single row per country
df1 = df.withColumn("r_no", F.row_number().over(w1)).where(F.col('r_no') == 1)

# replace province_state with a placeholder and drop the helper column
df1.select(F.lit('_').alias('province_state'),
           *[column for column in df1.columns if column not in ['province_state']]) \
   .drop(F.col('r_no')).show()

# +--------------+--------------+--------+---------+---------+
# |province_state|country_region|2/1/2020|2/10/2020|2/11/2020|
# +--------------+--------------+--------+---------+---------+
# |             _|           aus|      12|       15|       15|
# +--------------+--------------+--------+---------+---------+
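
If you only need the country-level totals (and not the per-row window sums), the same numbers can also be produced with a plain groupBy aggregation. The sketch below is just for comparison, not part of the window-function approach above; it reuses `spark` and `F` from the snippet above and rebuilds the input as `input_df`, because the loop above has already overwritten the date columns of `df`.

# Minimal sketch: same totals via groupBy instead of window functions.
# Assumes `spark` and `F` from the snippet above; the input is rebuilt here
# because `df` was modified in place by the window-sum loop.
input_df = spark.createDataFrame(
    [("new south wales", "aus", 4, 4, 4), ("victoria", "aus", 4, 4, 4),
     ("queensland", "aus", 3, 5, 5), ("south australia", "aus", 1, 2, 2)],
    ["province_state", "country_region", "2/1/2020", "2/10/2020", "2/11/2020"])

date_cols = [c for c in input_df.columns
             if c not in ['country_region', 'province_state']]

input_df.groupBy('country_region') \
    .agg(*[F.sum(c).alias(c) for c in date_cols]) \
    .select(F.lit('_').alias('province_state'), 'country_region', *date_cols) \
    .show()
# +--------------+--------------+--------+---------+---------+
# |province_state|country_region|2/1/2020|2/10/2020|2/11/2020|
# +--------------+--------------+--------+---------+---------+
# |             _|           aus|      12|       15|       15|
# +--------------+--------------+--------+---------+---------+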