# You can use explode and getItem as follows.
#
# Starting from this form:
# +------+--------------------------------------------------------------+
# |Person|res                                                           |
# +------+--------------------------------------------------------------+
# |Bob   |[[562,Food,June,1], [380,Household,Sept,4], [880,Food,Sept,2]]|
# +------+--------------------------------------------------------------+
#
# NOTE: `df` is the input DataFrame shown above; it is expected to already
# exist when this snippet runs.
import pyspark.sql.functions as F

# Explode `res` to have one row for each item in `res`. The item is exposed
# as a new `exploded_data` column alongside all of the original columns.
exploded_df = df.select("*", F.explode("res").alias("exploded_data"))
exploded_df.show(truncate=False)

# Then use getItem to create a separate column per field. Items can be
# fetched either by name (as done here) or by index, e.g. getItem(0) etc.
for field in ("Amount", "Budget", "Month", "Cluster"):
    exploded_df = exploded_df.withColumn(
        field, F.col("exploded_data").getItem(field)
    )

exploded_df.select("Person", "Amount", "Budget", "Month", "Cluster").show(
    10, truncate=False
)
# Result:
# +------+------+---------+-----+-------+
# |Person|Amount|Budget   |Month|Cluster|
# +------+------+---------+-----+-------+
# |Bob   |562   |Food     |June |1      |
# |Bob   |380   |Household|Sept |4      |
# |Bob   |880   |Food     |Sept |2      |
# +------+------+---------+-----+-------+
#
# You can then drop unnecessary columns. Hope this helps, good luck!
lang-py