Well, there is indeed more than one way to use approxQuantile with a Spark dataframe, although the lack of adequate documentation — a recurring problem with many Spark (and especially PySpark) features — makes this confusing. To start with, there are not one but two approxQuantile methods; the first one is part of the standard DataFrame class, so you do not need to import DataFrameStatFunctions:
# Check the Spark version in use (the behavior described here was observed on 2.x).
spark.version
# Toy dataset of (name, role, salary) records.
sampleData = [("bob","Developer",125000),("mark","Developer",108000),("carl","Tester",70000),("peter","Developer",185000),("jon","Tester",65000),("roman","Tester",82000),("simon","Developer",98000),("eric","Developer",144000),("carlos","Tester",75000),("henry","Developer",110000)]
df = spark.createDataFrame(sampleData, schema=["Name","Role","Salary"])
df.show()
# First flavor: approxQuantile is available directly on the DataFrame —
# no DataFrameStatFunctions import needed.
# Arguments: column name, list of probabilities (0.5 = median), relative error.
med = df.approxQuantile("Salary", [0.5], 0.25)
med  # e.g. [98000.0] — approximate; exact value may vary (relativeError=0.25 is high)
The second one belongs to DataFrameStatFunctions; but if you call it directly on the class, as in the first attempt below, you get a TypeError — the correct usage is to wrap the dataframe first:
from pyspark.sql import DataFrameStatFunctions as statFunc
# Wrong: approxQuantile is an instance method, so calling it on the class
# with a string as the first argument raises a TypeError.
med2 = statFunc.approxQuantile( "Salary", [0.5], 0.25)
# Correct: construct DataFrameStatFunctions around the DataFrame first.
med2 = statFunc(df).approxQuantile( "Salary", [0.5], 0.25)
med2  # e.g. [82000.0] — approximate; may differ from the first result
You will not find a simple example of this in the PySpark documentation (it took me some time to figure it out myself)... And the best part? The two values are not equal:
med == med2
I suspect this is due to the non-deterministic algorithm used (the relative error argument, 0.25, is deliberately high here); even re-running with the same toy data you may get different values. In any case, this is not the reason why you cannot use approxQuantile to fill values in a new dataframe column — even with the correct syntax you get a different error:
df2 = df.withColumn('median_salary', statFunc(df).approxQuantile( "Salary", [0.5], 0.25))
Here, col refers to the second argument of the withColumn operation, and the error message says that it is not of type Column — indeed, approxQuantile does not return a Column but a plain Python list, as you can easily verify:
type(statFunc(df).approxQuantile( "Salary", [0.5], 0.25))
So, for filling column values Spark expects arguments of type Column, and you cannot use lists; here is an example of creating a new column with mean values per Role instead of median ones:
# Working alternative: window aggregates return Columns, so they can be
# used with withColumn. Example: mean salary per Role.
import pyspark.sql.functions as func
from pyspark.sql import Window
windowSpec = Window.partitionBy(df['Role'])  # one window partition per Role
df2 = df.withColumn('mean_salary', func.mean(df['Salary']).over(windowSpec))
df2.show()
which works, because, contrary to approxQuantile, mean returns a Column:
type(func.mean(df['Salary']).over(windowSpec))