Pyspark Unittests Team Guide
1. You need to download Unzip spark from the site and unzip it. Or if you already have a working Spark and Python distribution, just install pyspark :pip install pyspark
2. :
export SPARK_HOME="/home/eugene/spark-1.6.0-bin-hadoop2.6"
export PYTHONPATH="$SPARK_HOME/python/:$SPARK_HOME/python/lib/py4j-0.9-src.zip:$PYTHONPATH"
export PATH="SPARK_HOME/bin:$PATH"
.profile . Spark, .
3. , :
PYSPARK_SUBMIT_ARGS="--jars path/to/hive/jars/jar.jar,path/to/other/jars/jar.jar --conf spark.driver.userClassPathFirst=true --master local[*] pyspark-shell"
PYSPARK_PYTHON="/home/eugene/anaconda3/envs/ste/bin/python3"
? . Pyspark py4j java- Spark. , Kafka Python TestHiveContext Scala, , .
"".
4. pyspark/tests.py, pyspark/streaming/tests.py, pyspark/sql/tests.py, pyspark/ml/tests.py, pyspark/mllib/tests.py, TestCase pyspark. ( pyspark/sql/tests.py):
class HiveContextSQLTests(ReusedPySparkTestCase):
@classmethod
def setUpClass(cls):
ReusedPySparkTestCase.setUpClass()
cls.tempdir = tempfile.NamedTemporaryFile(delete=False)
try:
cls.sc._jvm.org.apache.hadoop.hive.conf.HiveConf()
except py4j.protocol.Py4JError:
cls.tearDownClass()
raise unittest.SkipTest("Hive is not available")
except TypeError:
cls.tearDownClass()
raise unittest.SkipTest("Hive is not available")
os.unlink(cls.tempdir.name)
_scala_HiveContext =\
cls.sc._jvm.org.apache.spark.sql.hive.test.TestHiveContext(cls.sc._jsc.sc())
cls.sqlCtx = HiveContext(cls.sc, _scala_HiveContext)
cls.testData = [Row(key=i, value=str(i)) for i in range(100)]
cls.df = cls.sc.parallelize(cls.testData).toDF()
@classmethod
def tearDownClass(cls):
ReusedPySparkTestCase.tearDownClass()
shutil.rmtree(cls.tempdir.name, ignore_errors=True)
-jars Hive PYSPARK_SUBMIT_ARGS,
:
class SQLContextTests(ReusedPySparkTestCase):
def test_get_or_create(self):
sqlCtx = SQLContext.getOrCreate(self.sc)
self.assertTrue(SQLContext.getOrCreate(self.sc) is sqlCtx)
, pyspark pip, test.py, . Spark .
TestCase : python -m unittest test.py