
To make sure it does not fail for string, date, and timestamp columns (F.isnan is only defined for numeric columns, so those types are skipped):

import pyspark.sql.functions as F

def count_missings(spark_df, sort=True):
    """
    Counts number of nulls and nans in each column
    """
    # F.when(...) without an otherwise() yields null when the condition is
    # false, and F.count skips nulls, so only the missing rows are counted.
    df = spark_df.select([
        F.count(F.when(F.isnan(c) | F.isnull(c), c)).alias(c)
        for (c, c_type) in spark_df.dtypes
        if c_type not in ('timestamp', 'string', 'date')
    ]).toPandas()

    if len(df) == 0:
        print("There are no missing values!")
        return None

    if sort:
        return df.rename(index={0: 'count'}).T.sort_values("count", ascending=False)

    return df

If you want to see the columns sorted by the number of nans and nulls in descending order:

count_missings(spark_df)

# | Col_A | 10 |
# | Col_C |  2 |
# | Col_B |  1 |

If you don't want ordering and prefer to see the counts as a single row:

count_missings(spark_df, False)

# | Col_A | Col_B | Col_C |
# |    10 |     1 |     2 |
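
For a quick end-to-end check, here is a minimal sketch that builds a toy DataFrame and runs the function. The local SparkSession, column names, and sample values are illustrative assumptions, not part of the original answer.

from pyspark.sql import SparkSession

# Hypothetical local session and toy data, only to exercise count_missings.
spark = SparkSession.builder.master("local[1]").appName("missing_demo").getOrCreate()

spark_df = spark.createDataFrame(
    [(1.0, None, float("nan")),
     (None, 2.0, 3.0),
     (float("nan"), None, 4.0)],
    schema="Col_A double, Col_B double, Col_C double",
)

print(count_missings(spark_df))
# Expected: Col_A -> 2, Col_B -> 2, Col_C -> 1 (sorted descending)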
