Actions vs. Transformations
- Collect (action) - return all elements of the data set as an array in the driver. This is usually useful after a filter or other operation that returns a fairly small subset of the data.
spark-sql doc
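select(*cols) - projects a set of expressions and returns a new DataFrame.
Parameters: cols - list of column names (string) or expressions (Column). If one of the column names is "*", that column is expanded to include all columns in the current DataFrame.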
df.select('*').collect()
[Row(age=2, name=u'Alice'), Row(age=5, name=u'Bob')]
df.select('name', 'age').collect()
[Row(name=u'Alice', age=2), Row(name=u'Bob', age=5)]
df.select(df.name, (df.age + 10).alias('age')).collect()
[Row(name=u'Alice', age=12), Row(name=u'Bob', age=15)]
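select(column-name1,column-name2,etc), called on a DataFrame, returns a new DataFrame that holds only the columns that were passed to select().
For example, assume df has several columns, including "name" and "value".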
df2 = df.select("name","value")
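df2 holds only two columns ("name" and "value") out of all the columns of df.
df2, being the result of select, stays on the executors and is not brought to the driver (as it would be with collect()).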
sql-programming-guide
df.printSchema()
# root
# |-- age: long (nullable = true)
# |-- name: string (nullable = true)
# Select only the "name" column
df.select("name").show()
# +-------+
# | name|
# +-------+
# |Michael|
# | Andy|
# | Justin|
# +-------+
collect() (spark docs)
>>> l = [('Alice', 1)]
>>> spark.createDataFrame(l).collect()
[Row(_1=u'Alice', _2=1)]
>>> spark.createDataFrame(l, ['name', 'age']).collect()
[Row(name=u'Alice', age=1)]
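To print all elements on the driver, you can use collect() to first bring the RDD to the driver node: rdd.collect().foreach(println). This can cause the driver to run out of memory, though, because collect() fetches the entire RDD to a single machine; if you only need to print a few elements of the RDD, a safer approach is to use take(): rdd.take(100).foreach(println).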