I'm not sure whether this question is still relevant for the current version of pySpark, but here is a solution I developed a couple of weeks after posting the question. The code is pretty ugly and possibly inefficient, but I'm posting it here because of the ongoing interest in the subject.
```python
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext
from py4j.protocol import Py4JJavaError  # kept from the original; handy for catching JVM-side errors

myConf = SparkConf(loadDefaults=True)
sc = SparkContext(conf=myConf)
hc = HiveContext(sc)

def chunks(lst, k):
    """Yield chunks of close to equal size (roughly k of them)."""
    n = max(1, len(lst) // k)  # integer division, guarded so the slice step is never 0
    for i in range(0, len(lst), n):
        yield lst[i:i + n]

def reconstruct_rdd(lst, num_parts):
    rdd = None
    for part, partition in enumerate(chunks(lst, num_parts)):
        print("Partition", part, "started...")
        # Assumed completion -- the original code breaks off at this point.
        # Parallelize each chunk on its own and union it into the running RDD,
        # so that no single sc.parallelize() call has to handle the whole list.
        piece = sc.parallelize(partition)
        rdd = piece if rdd is None else rdd.union(piece)
        print("Partition", part, "done")
    return rdd
```
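For reference, here is a minimal usage sketch. The list `big_list`, the chunk count, and the column names are hypothetical stand-ins, not part of the original answer:

```python
# Hypothetical example: rebuild a large driver-side list as a single RDD
# in 100 chunks rather than one giant sc.parallelize() call.
big_list = [(i, i * i) for i in range(1000000)]  # stand-in for the real data

rdd = reconstruct_rdd(big_list, num_parts=100)
print(rdd.count())  # should equal len(big_list)

# The chunked RDD can then be turned into a DataFrame via the HiveContext.
df = hc.createDataFrame(rdd, ["x", "x_squared"])
```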