Here are 2 sample approaches:
- using a dict and a UDF
- using a second DataFrame to join with
# Demo: add an airport-abbreviation column to a Spark DataFrame in two ways:
#   1. look each airport name up in a plain dict through a UDF
#   2. join against a second DataFrame holding the name -> abbreviation pairs
#
# NOTE(review): `spark` and `sc` are not defined in this snippet; presumably
# they are the SparkSession / SparkContext supplied by the pyspark shell or a
# notebook -- confirm before running this as a standalone script.
from pyspark.sql.functions import col, udf
from pyspark.sql.types import StringType  # public home of StringType

# Sample input: one "<airport name>, <state> <country>" record per line.
# The backslash after the opening quotes suppresses a leading empty line.
s = """\
SEATTLE TACOMA AIRPORT, WA US
MIAMI INTERNATIONAL AIRPORT, FL US
SAN FRANCISCO INTERNATIONAL AIRPORT, CA US
MIAMI INTERNATIONAL AIRPORT, FL US
MIAMI INTERNATIONAL AIRPORT, FL US
SAN FRANCISCO INTERNATIONAL AIRPORT, CA US
SEATTLE TACOMA AIRPORT, WA US"""

# Airport name -> abbreviation lookup table.
abbr = {
    "SEATTLE TACOMA AIRPORT": "SEA",
    "MIAMI INTERNATIONAL AIRPORT": "MIA",
    "SAN FRANCISCO INTERNATIONAL AIRPORT": "SFO",
}

# Parse the sample lines as CSV; with no header the columns are named
# _c0 (airport name) and _c1 (the text after the comma).
df = spark.read.csv(sc.parallelize(s.splitlines()))

print("=== df ===")
df.show()

# =================================
# 1. using a UDF
# =================================
print("=== using a UDF ===")
# dict.get() instead of abbr[...]: a null or unmapped airport yields a null
# abbreviation rather than a KeyError that would fail the whole Spark job.
udf_airport_to_abbr = udf(lambda airport: abbr.get(airport), StringType())
df.withColumn("abbr", udf_airport_to_abbr("_c0")).show()

# =================================
# 2. using a join
# =================================
# you may want to create this df in some different way ;)
# Each (name, abbreviation) pair is rendered as a "name,abbreviation" CSV line
# and parsed back, so this DataFrame also gets the default _c0 / _c1 columns.
df_abbrs = spark.read.csv(sc.parallelize(["%s,%s" % x for x in abbr.items()]))

print("=== df_abbrs ===")
df_abbrs.show()

print("=== using a join ===")
# Joins on the shared airport-name column _c0. Both inputs also carry a _c1
# column, so the result has two columns named _c1.
df.join(df_abbrs, on="_c0").show()
Output:
=== df ===
+--------------------+------+
|                 _c0|   _c1|
+--------------------+------+
|SEATTLE TACOMA AI...| WA US|
|MIAMI INTERNATION...| FL US|
|SAN FRANCISCO INT...| CA US|
|MIAMI INTERNATION...| FL US|
|MIAMI INTERNATION...| FL US|
|SAN FRANCISCO INT...| CA US|
|SEATTLE TACOMA AI...| WA US|
+--------------------+------+

=== using a UDF ===
+--------------------+------+----+
|                 _c0|   _c1|abbr|
+--------------------+------+----+
|SEATTLE TACOMA AI...| WA US| SEA|
|MIAMI INTERNATION...| FL US| MIA|
|SAN FRANCISCO INT...| CA US| SFO|
|MIAMI INTERNATION...| FL US| MIA|
|MIAMI INTERNATION...| FL US| MIA|
|SAN FRANCISCO INT...| CA US| SFO|
|SEATTLE TACOMA AI...| WA US| SEA|
+--------------------+------+----+

=== df_abbrs ===
+--------------------+---+
|                 _c0|_c1|
+--------------------+---+
|SEATTLE TACOMA AI...|SEA|
|MIAMI INTERNATION...|MIA|
|SAN FRANCISCO INT...|SFO|
+--------------------+---+

=== using a join ===
+--------------------+------+---+
|                 _c0|   _c1|_c1|
+--------------------+------+---+
|SEATTLE TACOMA AI...| WA US|SEA|
|SEATTLE TACOMA AI...| WA US|SEA|
|SAN FRANCISCO INT...| CA US|SFO|
|SAN FRANCISCO INT...| CA US|SFO|
|MIAMI INTERNATION...| FL US|MIA|
|MIAMI INTERNATION...| FL US|MIA|
|MIAMI INTERNATION...| FL US|MIA|
+--------------------+------+---+