Skip to main content
code highlighting
Source Link
desertnaut
  • 60.8k
  • 32
  • 155
  • 183

Here's how you can do custom naming

import pyspark from pyspark.sql import functions as sf sc = pyspark.SparkContext() sqlc = pyspark.SQLContext(sc) df = sqlc.createDataFrame([('row11','row12'), ('row21','row22')], ['colname1', 'colname2']) df.show() 
import pyspark from pyspark.sql import functions as sf sc = pyspark.SparkContext() sqlc = pyspark.SQLContext(sc) df = sqlc.createDataFrame([('row11','row12'), ('row21','row22')], ['colname1', 'colname2']) df.show() 

gives,

+--------+--------+ |colname1|colname2| +--------+--------+ | row11| row12| | row21| row22| +--------+--------+ 
+--------+--------+ |colname1|colname2| +--------+--------+ | row11| row12| | row21| row22| +--------+--------+ 

create new column by concatenating:

df = df.withColumn('joined_column', sf.concat(sf.col('colname1'),sf.lit('_'), sf.col('colname2'))) df.show() +--------+--------+-------------+ |colname1|colname2|joined_column| +--------+--------+-------------+ | row11| row12| row11_row12| | row21| row22| row21_row22| +--------+--------+-------------+ 
df = df.withColumn('joined_column', sf.concat(sf.col('colname1'),sf.lit('_'), sf.col('colname2'))) df.show() +--------+--------+-------------+ |colname1|colname2|joined_column| +--------+--------+-------------+ | row11| row12| row11_row12| | row21| row22| row21_row22| +--------+--------+-------------+ 

Here's how you can do custom naming

import pyspark from pyspark.sql import functions as sf sc = pyspark.SparkContext() sqlc = pyspark.SQLContext(sc) df = sqlc.createDataFrame([('row11','row12'), ('row21','row22')], ['colname1', 'colname2']) df.show() 

gives,

+--------+--------+ |colname1|colname2| +--------+--------+ | row11| row12| | row21| row22| +--------+--------+ 

create new column by concatenating:

df = df.withColumn('joined_column', sf.concat(sf.col('colname1'),sf.lit('_'), sf.col('colname2'))) df.show() +--------+--------+-------------+ |colname1|colname2|joined_column| +--------+--------+-------------+ | row11| row12| row11_row12| | row21| row22| row21_row22| +--------+--------+-------------+ 

Here's how you can do custom naming

import pyspark from pyspark.sql import functions as sf sc = pyspark.SparkContext() sqlc = pyspark.SQLContext(sc) df = sqlc.createDataFrame([('row11','row12'), ('row21','row22')], ['colname1', 'colname2']) df.show() 

gives,

+--------+--------+ |colname1|colname2| +--------+--------+ | row11| row12| | row21| row22| +--------+--------+ 

create new column by concatenating:

df = df.withColumn('joined_column', sf.concat(sf.col('colname1'),sf.lit('_'), sf.col('colname2'))) df.show() +--------+--------+-------------+ |colname1|colname2|joined_column| +--------+--------+-------------+ | row11| row12| row11_row12| | row21| row22| row21_row22| +--------+--------+-------------+ 
Source Link
muon
  • 14.2k
  • 13
  • 74
  • 94

Here's how you can do custom naming

import pyspark from pyspark.sql import functions as sf sc = pyspark.SparkContext() sqlc = pyspark.SQLContext(sc) df = sqlc.createDataFrame([('row11','row12'), ('row21','row22')], ['colname1', 'colname2']) df.show() 

gives,

+--------+--------+ |colname1|colname2| +--------+--------+ | row11| row12| | row21| row22| +--------+--------+ 

create new column by concatenating:

df = df.withColumn('joined_column', sf.concat(sf.col('colname1'),sf.lit('_'), sf.col('colname2'))) df.show() +--------+--------+-------------+ |colname1|colname2|joined_column| +--------+--------+-------------+ | row11| row12| row11_row12| | row21| row22| row21_row22| +--------+--------+-------------+