The cache and persist operators let you store a dataset in memory (and, optionally, on disk) so it can be reused across computations.
Caching a Spark RDD (a recap):
One of the most important capabilities in Spark is persisting (caching) a dataset in memory across operations. When you persist an RDD, each node stores any partitions of it that it computes in memory and reuses them in other actions on that dataset (or on datasets derived from it). This allows future actions to be much faster (often by more than 10x). Caching is a key tool for iterative algorithms and fast interactive use.
You can mark an RDD to be persisted using the persist() or cache() methods on it. The first time it is computed in an action, it will be kept in memory on the nodes. Spark's cache is fault-tolerant — if any partition of a cached RDD is lost, it will automatically be recomputed using the transformations that originally created it.
For example (this is plain RDD code, not Spark SQL yet!):
rdd2.persist(StorageLevel.MEMORY_AND_DISK).count
Note the trailing action.
The count action here forces the RDD to actually be computed, and therefore cached with the requested storage level (memory first, spilling to disk).
At this point a natural question arises:
if we call persist, why do we also need an action such as count?
The answer is that persisting is lazy: the data is materialized in the cache only when an action forces the computation (hence the count above).
In other words, persist merely "marks" the dataset (its underlying RDD) as cacheable; nothing is stored until an action actually runs.

You can check whether a Dataset was cached or persisted by inspecting its plan with explain (or via QueryExecution.optimizedPlan) — a cached query shows an InMemoryRelation node.
val q1 = spark.range(10).groupBy('id % 5).count.cache
scala> q1.explain
== Physical Plan ==
InMemoryTableScan [(id % 5)#84L, count#83L]
   +- InMemoryRelation [(id % 5)#84L, count#83L], true, 10000, StorageLevel(disk, memory, deserialized, 1 replicas)
         +- *HashAggregate(keys=[(id#77L % 5)#88L], functions=[count(1)])
            +- Exchange hashpartitioning((id#77L % 5)#88L, 200)
               +- *HashAggregate(keys=[(id#77L % 5) AS (id#77L % 5)#88L], functions=[partial_count(1)])
                  +- *Range (0, 10, step=1, splits=8)
scala> println(q1.queryExecution.optimizedPlan.numberedTreeString)
00 InMemoryRelation [(id % 5)#84L, count#83L], true, 10000, StorageLevel(disk, memory, deserialized, 1 replicas)
01    +- *HashAggregate(keys=[(id#77L % 5)#88L], functions=[count(1)])
02       +- Exchange hashpartitioning((id#77L % 5)#88L, 200)
03          +- *HashAggregate(keys=[(id#77L % 5) AS (id#77L % 5)#88L], functions=[partial_count(1)])
04             +- *Range (0, 10, step=1, splits=8)
// Cache sample table range5 using pure SQL
// That registers range5 to contain the output of range(5) function
spark.sql("CACHE TABLE range5 AS SELECT * FROM range(5)")
val q2 = spark.sql("SELECT * FROM range5")
scala> q2.explain
== Physical Plan ==
InMemoryTableScan [id#39L]
   +- InMemoryRelation [id#39L], true, 10000, StorageLevel(disk, memory, deserialized, 1 replicas), `range5`
         +- *Range (0, 5, step=1, splits=8)
The InMemoryTableScan operator (together with its child InMemoryRelation) is the marker that a query — or part of it — is cached: its results will be read from the in-memory cache rather than recomputed from the source.
Note that, unlike the Dataset operators, Spark SQL's CACHE TABLE statement is eager by default — it runs the query (and thus materializes the underlying cached RDD) immediately, as the relevant fragment of Spark's source shows:
if (!isLazy) {
sparkSession.table(tableIdent).count()
}
To summarize: the Dataset operators cache and persist are lazy, while SQL's CACHE TABLE is eager by default (use CACHE LAZY TABLE for lazy caching).