Update: With the approach below we can achieve a uniform distribution:
- Fetch the primary key of the table.
- Find the key minimum and maximum values.
- Run the Spark JDBC read with those values as the lower and upper partition bounds.
/**
  * Reads a MySQL table over JDBC in key-range partitions and writes the result as Parquet.
  *
  * The primary-key column of the table and its minimum/maximum values are queried first;
  * they are then passed to Spark as the partition column and bounds so the read is split
  * into ranges of about 5000 key values each.
  *
  * @param args command-line arguments (their parsing is elided in this snippet)
  */
def main(args: Array[String]): Unit = {
  // parsing input parameters ...

  val qualifiedTable = s"${config("schema")}.${config("table")}"

  // Column 5 of MySQL's SHOW KEYS output is Column_name.
  val primaryKey = executeQuery(
    url, user, password,
    s"SHOW KEYS FROM $qualifiedTable WHERE Key_name = 'PRIMARY'"
  ).getString(5)

  val result = executeQuery(
    url, user, password,
    s"select min(${primaryKey}), max(${primaryKey}) from $qualifiedTable"
  )

  // Parse as Long: key columns are frequently BIGINT, and Int arithmetic on
  // (max - min) can overflow. A NULL bound (e.g. no rows) is reported explicitly
  // instead of surfacing as a NumberFormatException on null.
  def keyBound(column: Int, label: String): Long =
    Option(result.getString(column))
      .map(_.toLong)
      .getOrElse(
        throw new IllegalStateException(
          s"$label($primaryKey) on $qualifiedTable returned NULL; is the table empty?"
        )
      )

  val minKey = keyBound(1, "min")
  val maxKey = keyBound(2, "max")

  // Target width of each partition's key range.
  val keysPerPartition = 5000L
  val numPartitions = (maxKey - minKey) / keysPerPartition + 1

  val spark = SparkSession.builder().appName("Spark reading jdbc").getOrCreate()

  // Kept as `var`: the elided manipulation step below may reassign it.
  var df = spark.read
    .format("jdbc")
    .option("url", s"${url}${config("schema")}")
    .option("driver", "com.mysql.jdbc.Driver")
    // The bounds only determine the partition stride; they do not filter rows.
    .option("lowerBound", minKey)
    .option("upperBound", maxKey)
    .option("numPartitions", numPartitions)
    .option("partitionColumn", primaryKey)
    .option("dbtable", config("table"))
    .option("user", user)
    .option("password", password)
    .load()

  // some data manipulations here ...

  df.repartition(10).write.mode(SaveMode.Overwrite).parquet(outputPath)
}