I have two datasets that I want to INNER JOIN to produce a new table with the desired data. I used Spark SQL and managed to get it working. But now I want to try it with map() and filter() — is that possible?
This is my code using Spark SQL:
import org.apache.spark.SparkContext
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession

/** Joins a census CSV with a zip-code CSV and writes the Inglewood /
  * Los Angeles male/female totals out as parquet.
  */
object hello {

  def main(args: Array[String]): Unit = {
    // Single place for app name / master. Do NOT also build a SparkContext
    // by hand: SparkSession creates and owns its own context, and a second
    // `new SparkContext(conf)` in the same JVM can fail with
    // "Only one SparkContext may be running" — and the original code only
    // stopped that extra context (sc.stop()), never the session it used.
    val conf = new SparkConf()
      .setMaster("local")
      .setAppName("quest9")

    val spark = SparkSession.builder()
      .config(conf)
      .getOrCreate()

    // Both CSVs carry a header row; with no explicit schema every column
    // is loaded as a string.
    val zipCodes = spark.read
      .format("csv")
      .option("header", "true")
      .load("/home/hdfs/Documents/quest_9/doc/zip.csv")

    val census = spark.read
      .format("csv")
      .option("header", "true")
      .load("/home/hdfs/Documents/quest_9/doc/census.csv")

    // Register temp views so the query below can reference them by name.
    census.createOrReplaceTempView("census")
    zipCodes.createOrReplaceTempView("zip")

    // Inner join on Zip_Code, restricted to Inglewood / Los Angeles.
    //val query = spark.sql("SELECT * FROM census")
    val query = spark.sql("SELECT DISTINCT census.Total_Males AS male, census.Total_Females AS female FROM census INNER JOIN zip ON census.Zip_Code=zip.Zip_Code WHERE zip.City = 'Inglewood' AND zip.County = 'Los Angeles'")

    query.show()
    query.write.parquet("/home/hdfs/Documents/population/census/IDE/census.parquet")

    // Stopping the session also stops the SparkContext it owns.
    spark.stop()
  }
}
You should prefer `dataframe.join()` over `map` or `filter` for this. Why would you not use it? See stackoverflow.com/questions/40343625/… or jaceklaskowski.gitbooks.io/mastering-spark-sql/… or stackoverflow.com/questions/36800174/…