I did this in Python. In addition, I suggest using integers as keys rather than strings.
from pyspark.sql.types import *

# sc and sqlContext are assumed to be provided by the Spark shell (Spark 1.x).
samples = sc.parallelize([
    (" abonsanto@fakemail.com ", "Alberto", "Bonsanto"),
    (" mbonsanto@fakemail.com ", "Miguel", "Bonsanto"),
    (" stranger@fakemail.com ", "Stranger", "Weirdo"),
    (" dbonsanto@fakemail.com ", "Dakota", "Bonsanto")
])

keys = sc.parallelize([
    (" abonsanto@fakemail.com ",),
    (" mbonsanto@fakemail.com ",),
    (" dbonsanto@fakemail.com ",)
])

complex_schema = StructType([
    StructField("email", StringType(), True),
    StructField("first_name", StringType(), True),
    StructField("last_name", StringType(), True)
])

simple_schema = StructType([
    StructField("email", StringType(), True)
])

df1 = sqlContext.createDataFrame(samples, complex_schema)
df2 = sqlContext.createDataFrame(keys, simple_schema)

df1.show()
df2.show()

# Keep only the rows of df1 whose email does not appear in df2:
# a left outer join, then a null check on the right-hand side's key.
# (Note: .show() returns None, so assign the DataFrame first, then show it.)
df3 = df1.join(df2, df1.email == df2.email, "left_outer").where(df2.email.isNull())
df3.show()
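On Spark 2.0+ the same filtering can be expressed directly as a left anti join, which keeps only the rows of the left DataFrame that have no match on the right, so the explicit null check is unnecessary. A minimal sketch, assuming df1 and df2 are built as above:

# Minimal sketch for Spark 2.0+: "left_anti" is equivalent to the
# left_outer + isNull trick, but returns only df1's columns.
df3 = df1.join(df2, df1.email == df2.email, "left_anti")
df3.show()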