You can read each file into a DataFrame with spark-csv and then join the two DataFrames (the surrounding text was garbled; this is the apparent intent). Suppose the files contain the following data:
# Create two small sample CSV files for the example.
# The original used "..." placeholder paths and had lost the indentation of
# the `with` bodies (invalid Python); real temp-file paths are used here so
# the snippet runs as-is.
import os
import tempfile

_tmpdir = tempfile.mkdtemp()
file1 = os.path.join(_tmpdir, "file1.csv")
file2 = os.path.join(_tmpdir, "file2.csv")

# file1 has a header row ("c_id,name"); file2 has no header.
with open(file1, "w") as fw:
    fw.write("c_id,name\n121210,abc\n121211,pqr")
with open(file2, "w") as fw:
    fw.write("121211,0,0\n121210,0,1")
Read the first file, using its header row and letting spark-csv infer the column types:
# Read file1 with spark-csv: first line is treated as a header and the
# column types are inferred from the data.
csv_reader = sqlContext.read.format('com.databricks.spark.csv')
df1 = csv_reader.options(header='true', inferSchema='true').load(file1)
The second file has no header, so supply an explicit schema instead:
# file2 has no header row, so declare its three non-nullable LongType
# columns explicitly before reading.
column_names = ("c_id", "cn_id", "cn_value")
fields = [StructField(name, LongType(), False) for name in column_names]
schema = StructType(fields)

df2 = (sqlContext.read
       .format('com.databricks.spark.csv')
       .options(header='false')
       .schema(schema)
       .load(file2))
Finally, join the two DataFrames on the shared `c_id` column:
# Inner-join the two frames on the shared customer-id column and display
# the result.
join_condition = df1["c_id"] == df2["c_id"]
combined = df1.join(df2, join_condition)
combined.show()
# Expected output (reconstructed from the sample data above):
# +------+----+------+-----+--------+
# |  c_id|name|  c_id|cn_id|cn_value|
# +------+----+------+-----+--------+
# |121210| abc|121210|    0|       1|
# |121211| pqr|121211|    0|       0|
# +------+----+------+-----+--------+
Edit
With plain RDDs, you could instead key the second dataset by its first field and join it against the first:
# Re-key file2's rows as (first_field, rest_of_row) pairs so they line up
# with file1_fields for an RDD join.
file1_fields.join(file2_fields.map(lambda row: (row[0], row[1:])))