Return to Answer

added 20 characters in body

edited Feb 16, 2018 at 9:24

2.7k
24
26

# Split an array column into one row per element and one column per field.
#
# starting from this form (df is assumed to already exist with these columns):
# +------+--------------------------------------------------------------+
# |Person|res                                                           |
# +------+--------------------------------------------------------------+
# |Bob   |[[562,Food,June,1], [380,Household,Sept,4], [880,Food,Sept,2]]|
# +------+--------------------------------------------------------------+

import pyspark.sql.functions as F

# explode res to have one row for each item in res
exploded_df = df.select("*", F.explode("res").alias("exploded_data"))
exploded_df.show(truncate=False)

# then use getItem to create separate columns
# either get by name or by index e.g. getItem(0) etc
# NOTE(review): index access presumably only applies when the elements of
# res are arrays; struct fields are addressed by name - confirm against
# df's schema.
exploded_df = exploded_df.withColumn(
    "Amount", F.col("exploded_data").getItem("Amount")
)
exploded_df = exploded_df.withColumn(
    "Budget", F.col("exploded_data").getItem("Budget")
)
exploded_df = exploded_df.withColumn(
    "Month", F.col("exploded_data").getItem("Month")
)
exploded_df = exploded_df.withColumn(
    "Cluster", F.col("exploded_data").getItem("Cluster")
)

exploded_df.select("Person", "Amount", "Budget", "Month", "Cluster").show(
    10, truncate=False
)

# expected output:
# +------+------+---------+-----+-------+
# |Person|Amount|Budget   |Month|Cluster|
# +------+------+---------+-----+-------+
# |Bob   |562   |Food     |June |1      |
# |Bob   |380   |Household|Sept |4      |
# |Bob   |880   |Food     |Sept |2      |
# +------+------+---------+-----+-------+

# Split an array column into one row per element and one column per field.
#
# starting from this form (df is assumed to already exist with these columns):
# +------+--------------------------------------------------------------+
# |Person|res                                                           |
# +------+--------------------------------------------------------------+
# |Bob   |[[562,Food,June,1], [380,Household,Sept,4], [880,Food,Sept,2]]|
# +------+--------------------------------------------------------------+

import pyspark.sql.functions as F

# explode res to have one row for each item in res
exploded_df = df.select("*", F.explode("res").alias("exploded_data"))
exploded_df.show(truncate=False)

# then use getItem to create separate columns
# either get by name or by index
# NOTE(review): index access presumably only applies when the elements of
# res are arrays; struct fields are addressed by name - confirm against
# df's schema.
exploded_df = exploded_df.withColumn(
    "Amount", F.col("exploded_data").getItem("Amount")
)
exploded_df = exploded_df.withColumn(
    "Budget", F.col("exploded_data").getItem("Budget")
)
exploded_df = exploded_df.withColumn(
    "Month", F.col("exploded_data").getItem("Month")
)
exploded_df = exploded_df.withColumn(
    "Cluster", F.col("exploded_data").getItem("Cluster")
)

exploded_df.select("Person", "Amount", "Budget", "Month", "Cluster").show(
    10, truncate=False
)

# expected output:
# +------+------+---------+-----+-------+
# |Person|Amount|Budget   |Month|Cluster|
# +------+------+---------+-----+-------+
# |Bob   |562   |Food     |June |1      |
# |Bob   |380   |Household|Sept |4      |
# |Bob   |880   |Food     |Sept |2      |
# +------+------+---------+-----+-------+

# Split an array column into one row per element and one column per field.
#
# starting from this form (df is assumed to already exist with these columns):
# +------+--------------------------------------------------------------+
# |Person|res                                                           |
# +------+--------------------------------------------------------------+
# |Bob   |[[562,Food,June,1], [380,Household,Sept,4], [880,Food,Sept,2]]|
# +------+--------------------------------------------------------------+

import pyspark.sql.functions as F

# explode res to have one row for each item in res
exploded_df = df.select("*", F.explode("res").alias("exploded_data"))
exploded_df.show(truncate=False)

# then use getItem to create separate columns
# either get by name or by index e.g. getItem(0) etc
# NOTE(review): index access presumably only applies when the elements of
# res are arrays; struct fields are addressed by name - confirm against
# df's schema.
exploded_df = exploded_df.withColumn(
    "Amount", F.col("exploded_data").getItem("Amount")
)
exploded_df = exploded_df.withColumn(
    "Budget", F.col("exploded_data").getItem("Budget")
)
exploded_df = exploded_df.withColumn(
    "Month", F.col("exploded_data").getItem("Month")
)
exploded_df = exploded_df.withColumn(
    "Cluster", F.col("exploded_data").getItem("Cluster")
)

exploded_df.select("Person", "Amount", "Budget", "Month", "Cluster").show(
    10, truncate=False
)

# expected output:
# +------+------+---------+-----+-------+
# |Person|Amount|Budget   |Month|Cluster|
# +------+------+---------+-----+-------+
# |Bob   |562   |Food     |June |1      |
# |Bob   |380   |Household|Sept |4      |
# |Bob   |880   |Food     |Sept |2      |
# +------+------+---------+-----+-------+

Source Link

answered Feb 16, 2018 at 9:18

mkaran

2.7k
24
26

You can use explode and getItem as follows:

# Split an array column into one row per element and one column per field.
#
# starting from this form (df is assumed to already exist with these columns):
# +------+--------------------------------------------------------------+
# |Person|res                                                           |
# +------+--------------------------------------------------------------+
# |Bob   |[[562,Food,June,1], [380,Household,Sept,4], [880,Food,Sept,2]]|
# +------+--------------------------------------------------------------+

import pyspark.sql.functions as F

# explode res to have one row for each item in res
exploded_df = df.select("*", F.explode("res").alias("exploded_data"))
exploded_df.show(truncate=False)

# then use getItem to create separate columns
# either get by name or by index
# NOTE(review): index access presumably only applies when the elements of
# res are arrays; struct fields are addressed by name - confirm against
# df's schema.
exploded_df = exploded_df.withColumn(
    "Amount", F.col("exploded_data").getItem("Amount")
)
exploded_df = exploded_df.withColumn(
    "Budget", F.col("exploded_data").getItem("Budget")
)
exploded_df = exploded_df.withColumn(
    "Month", F.col("exploded_data").getItem("Month")
)
exploded_df = exploded_df.withColumn(
    "Cluster", F.col("exploded_data").getItem("Cluster")
)

exploded_df.select("Person", "Amount", "Budget", "Month", "Cluster").show(
    10, truncate=False
)

# expected output:
# +------+------+---------+-----+-------+
# |Person|Amount|Budget   |Month|Cluster|
# +------+------+---------+-----+-------+
# |Bob   |562   |Food     |June |1      |
# |Bob   |380   |Household|Sept |4      |
# |Bob   |880   |Food     |Sept |2      |
# +------+------+---------+-----+-------+

You can then drop the columns you no longer need (for example, `res` and `exploded_data`). Hope this helps, good luck!

Collectives™ on Stack Overflow

Return to Answer