I think zero323 has a better answer. It is annoying that Spark does not support this out of the box, given how easy it is to implement. For posterity, here is the function I use:
    def trunc(date, format):
        """Wraps Spark's trunc function to also support day, minute, and hour."""
        import re
        import pyspark.sql.functions as func

        # Hack to get the column name from either a Column object or a plain string:
        try:
            colname = re.match(r"Column<.?'(.*)'>", str(date)).groups()[0]
        except AttributeError:
            colname = date

        alias = "trunc(%s, %s)" % (colname, format)

        if format in ('year', 'YYYY', 'yy', 'month', 'mon', 'mm'):
            # Year/month truncation is handled natively by Spark
            return func.trunc(date, format).alias(alias)
        elif format in ('day', 'DD'):
            # date_sub with 0 days drops the time component, truncating to the day
            return func.date_sub(date, 0).alias(alias)
        elif format in ('min',):
            # Floor (rather than round) the Unix timestamp so this is a true truncation to the minute
            return (func.floor(func.unix_timestamp(date) / 60) * 60).cast("timestamp").alias(alias)
        elif format in ('hour',):
            # Same idea for hours: floor to the most recent hour boundary
            return (func.floor(func.unix_timestamp(date) / 3600) * 3600).cast("timestamp").alias(alias)
        else:
            raise ValueError("Unsupported format: %s" % format)
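For example, a minimal usage sketch (the DataFrame, column name, and sample values below are hypothetical, not from the original answer):

    from pyspark.sql import SparkSession
    import pyspark.sql.functions as func

    spark = SparkSession.builder.getOrCreate()

    # Hypothetical example data: two events inside the same hour
    df = spark.createDataFrame(
        [("2016-03-11 09:00:07",), ("2016-03-11 09:59:59",)],
        ["event_time"],
    ).withColumn("event_time", func.col("event_time").cast("timestamp"))

    # Truncate to the hour with the helper above; both rows become 2016-03-11 09:00:00
    df.select(trunc("event_time", "hour")).show(truncate=False)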