First, create some example data with a city, its base population, a yearly growth rate, and the year:
import pandas as pd
pdf = pd.DataFrame({
    'city': {0: 'one', 1: 'one', 2: 'one', 3: 'two', 4: 'three', 5: 'three'},
    'population': {0: 100, 1: 100, 2: 100, 3: 200, 4: 1000, 5: 1000},
    'rate': {0: 0.10, 1: 0.11, 2: 0.12, 3: 0.10, 4: 0.21, 5: 0.22},
    'year': {0: 1, 1: 2, 2: 3, 3: 1, 4: 2, 5: 3}})
df = sqlContext.createDataFrame(pdf)
df.show()
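This assumes an environment where sqlContext is already defined (for example an older Spark shell). On Spark 2.x and later the entry point is a SparkSession instead; a minimal equivalent sketch:

from pyspark.sql import SparkSession

# Spark 2.x+ entry point; createDataFrame accepts the pandas DataFrame directly
spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(pdf)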
Next, import the required functions and define the windows:
import sys
from pyspark.sql.window import Window
# Note: sum here shadows the Python built-in
from pyspark.sql.functions import exp, log, sum, first, col, coalesce

# Rows of the same city, ordered by year
w = Window.partitionBy("city").orderBy("year")
# The same window restricted to all rows strictly before the current one
wr = w.rowsBetween(-sys.maxsize, -1)
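On newer Spark versions (2.1+) there is an explicit window-boundary constant, so the lower bound can be written without sys.maxsize; a sketch of the same window:

# Equivalent on Spark 2.1+: from the start of the partition up to, but not including, the current row
wr = w.rowsBetween(Window.unboundedPreceding, -1)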
Now define the column expressions. Spark has no cumulative-product aggregate, so the compounded rate is computed as the exponential of a running sum of log(1 + rate) over all preceding years; first("population") gives the base population for each city, and coalesce falls back to the raw population for rows where the preceding-rows window is empty:
# Running sum of log(1 + rate) over all preceding years within the city
log_sum = sum(log(col("rate") + 1)).over(wr)
# exp of the log-sum is the product of (1 + rate) over those years
cumulative_rate = exp(log_sum).alias("cumulative_rate")
# Population in the first (base) year of each city
base_population = first("population").over(w).alias("base_population")
# Fall back to the raw population where there are no preceding rows
current_population = coalesce(
    cumulative_rate * base_population,
    col("population")
).alias("current_population")
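As a quick sanity check against the sample data above: for city 'one' in year 3 the preceding-rows window covers years 1 and 2, so

import math

# exp(log(1.10) + log(1.11)) is just 1.10 * 1.11
print(math.exp(math.log(1 + 0.10) + math.log(1 + 0.11)))  # ~1.221, the cumulative_rate
print(1.10 * 1.11 * 100)                                   # ~122.1, the current_population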
Finally, select everything along with the new column:
df.select("*", current_population).show()
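With the sample data this should give roughly 100, 110, and 122.1 for city 'one' in years 1 to 3, 200 for city 'two', and 1000 and 1210 for city 'three'; the first year of each city falls back to the raw population because its preceding-rows window is empty. Window functions can shuffle the output order, so for a deterministic display you can sort first:

df.select("*", current_population).orderBy("city", "year").show()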