Pandas date_range drastically slows down

I have a sample data set and I want to draw several samples from it, for example 1000 sample blocks, each consisting of 500 data points from the original sample data. I wrote this little function in Python:

import timeit
import pandas as pd
import numpy as np
# Business-day index of 10000 periods, starting from 2000-01-01.
index = pd.date_range("20000101", periods=10000, freq='B')
# 10000 x 15 matrix of standard-normal draws, one row per index entry.
sample_data = np.random.randn(10000, 15)
sample_data_df = pd.DataFrame(sample_data, index=index)
def f(n, sample_data_df, f):
    """Resample the data to frequency ``f`` and draw ``n`` rows with replacement.

    Each resampling bin is reduced, per column, to ``prod(1 + x) - 1``.

    Args:
        n: Number of rows to draw from the resampled frame.
        sample_data_df: DataFrame with a datetime-like index (required by
            ``resample``).
        f: Resampling frequency, e.g. ``'BM'``. Inside the body this
            parameter shadows the function's own name.

    Returns:
        DataFrame with ``n`` rows sampled with replacement. Each row keeps
        the bin label it had in the resampled frame, so labels may repeat.
    """
    # ``axis=0`` is already resample's default and the keyword is deprecated
    # in pandas 2.x, so it is not passed explicitly.
    compounded = (1 + sample_data_df).resample(f).prod() - 1
    return compounded.sample(n, replace=True)


# Time N independent draws of 500 resampled rows each and print the
# wall-clock duration in seconds.
N = 1000
start_time = timeit.default_timer()
a = [f(500, sample_data_df, 'BM') for _ in range(N)]
elapsed = timeit.default_timer() - start_time
print(elapsed)

If I run this code, it takes 35.8964748383 seconds. However, I would like to attach a fresh date index to each of the blocks, so I uncomment the lines in the function, i.e.

def f(n, sample_data_df, f):
    """Draw ``n`` resampled rows with replacement and give them a regular index.

    The data is resampled to frequency ``f``; each bin is reduced, per
    column, to ``prod(1 + x) - 1``. ``n`` rows are then drawn with
    replacement and relabelled with a new date range.

    Args:
        n: Number of rows to draw from the resampled frame.
        sample_data_df: DataFrame with a datetime-like index (required by
            ``resample``).
        f: Frequency used both for resampling and for the new index, e.g.
            ``'BM'``. Inside the body this parameter shadows the function's
            own name.

    Returns:
        DataFrame with ``n`` rows whose index is
        ``pd.date_range(start=sample_data_df.index[0], periods=n, freq=f)``.
    """
    # ``axis=0`` is already resample's default and the keyword is deprecated
    # in pandas 2.x, so it is not passed explicitly.
    compounded = (1 + sample_data_df).resample(f).prod() - 1
    out = compounded.sample(n, replace=True)
    # Replace the (possibly repeated) sampled bin labels with a regular range.
    out_index = pd.date_range(start=sample_data_df.index[0],
                              periods=len(out.index),
                              freq=f)
    # The original assigned the undefined name ``output_index`` here, which
    # raised NameError on every call.
    out.index = out_index
    return out

With these lines enabled, the same run takes 72.2418179512 seconds.

, ? 35.8964748383 - .

+4
1

:

  • ​​

, :

%timeit (1+sample_data_df).resample('BM', axis=0).prod()-1
21.7 ms ± 170 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
%timeit pd.date_range(start="20000101", periods=500, freq='BM')
21.4 ms ± 272 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)

22 , , , , 150'000 .

1000, ( ). , , . , (lru_cache) (, dfs, lists...). , , , :

from functools import lru_cache
class Sampler():
    """Draw random rows from a resampled view of a DataFrame.

    The resampled frame (per-bin ``prod(df + 1) - 1``) is cached so repeated
    draws do not redo the resampling. The cache holds a single entry and is
    stored on the instance, so separate ``Sampler`` objects never evict each
    other's entry and the cache does not outlive the instance.
    """

    def __init__(self, df):
        """Store ``df``, a DataFrame with a datetime-like index."""
        self.df = df
        # Single-entry cache: ``((hash_df, freq), resampled_frame)`` or None.
        self._resample_cache = None

    def get_resampled_sample(self, n, freq):
        """Return ``n`` rows drawn with replacement from the resampled frame.

        Args:
            n: Number of rows to draw.
            freq: Resampling frequency, e.g. ``'BM'``.
        """
        resampled = self._wraper_resample_prod(freq)
        return resampled.sample(n, replace=True)

    def _wraper_resample_prod(self, freq):
        """Hash the current values of ``self.df`` and fetch the resampled frame.

        Only the values are hashed, so reassigning ``self.df`` to a frame
        with different values invalidates the cached result; a change to the
        index alone is not detected.
        """
        hash_df = hash(self.df.values.tobytes())
        return self._resample_prod(hash_df, freq)

    def _resample_prod(self, hash_df, freq):
        """Return the resampled frame for ``(hash_df, freq)``, computing on a miss.

        Args:
            hash_df: Hash of the current data; used only as part of the
                cache key.
            freq: Resampling frequency.
        """
        key = (hash_df, freq)
        # getattr keeps this working for instances built without __init__.
        cached = getattr(self, "_resample_cache", None)
        if cached is None or cached[0] != key:
            # ``axis=0`` is resample's default and the keyword is deprecated
            # in pandas 2.x, so it is not passed explicitly.
            cached = (key, (self.df + 1).resample(freq).prod() - 1)
            self._resample_cache = cached
        return cached[1]

, df . , .

%timeit [sampler.get_resampled_sample(500, 'BM') for i in range(1000)]
881 ms ± 10.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

, , pd.date_range .

class Sampler():
    """Draw random rows from a resampled DataFrame and relabel them by date.

    Both the resampled frame (per-bin ``prod(df + 1) - 1``) and the generated
    date range are cached so repeated draws skip that work. Each cache holds
    a single entry and is stored on the instance, so separate ``Sampler``
    objects never evict each other's entries and the caches do not outlive
    the instance.
    """

    def __init__(self, df):
        """Store ``df``, a DataFrame with a datetime-like index."""
        self.df = df
        # Single-entry caches, each ``(key, value)`` or None.
        self._resample_cache = None
        self._date_range_cache = None

    def update_df(self, df):
        """Replace the underlying DataFrame."""
        self.df = df

    def get_resampled_sample(self, n, freq):
        """Return ``n`` resampled rows drawn with replacement, on a regular index.

        Args:
            n: Number of rows to draw.
            freq: Frequency used for both the resampling and the new index,
                e.g. ``'BM'``.

        Returns:
            DataFrame with ``n`` rows indexed by a date range that starts at
            ``self.df.index[0]`` and has ``n`` periods of ``freq``.
        """
        resampled = self._wraper_resample_prod(freq)
        df = resampled.sample(n, replace=True)
        df.index = self._create_date_range(self.df.index[0], n, freq)
        return df

    def _wraper_resample_prod(self, freq):
        """Hash the current values of ``self.df`` and fetch the resampled frame.

        Only the values are hashed, so a frame with different values
        invalidates the cached result; a change to the index alone is not
        detected.
        """
        hash_df = hash(self.df.values.tobytes())
        return self._resample_prod(hash_df, freq)

    def _resample_prod(self, hash_df, freq):
        """Return the resampled frame for ``(hash_df, freq)``, computing on a miss.

        Args:
            hash_df: Hash of the current data; used only as part of the
                cache key.
            freq: Resampling frequency.
        """
        key = (hash_df, freq)
        # getattr keeps this working for instances built without __init__.
        cached = getattr(self, "_resample_cache", None)
        if cached is None or cached[0] != key:
            # ``axis=0`` is resample's default and the keyword is deprecated
            # in pandas 2.x, so it is not passed explicitly.
            cached = (key, (self.df + 1).resample(freq).prod() - 1)
            self._resample_cache = cached
        return cached[1]

    def _create_date_range(self, start, periods, freq):
        """Return ``pd.date_range(start, periods, freq)``, reusing the last result.

        The returned index object is shared between calls with the same
        arguments.
        """
        key = (start, periods, freq)
        cached = getattr(self, "_date_range_cache", None)
        if cached is None or cached[0] != key:
            cached = (key, pd.date_range(start=start, periods=periods, freq=freq))
            self._date_range_cache = cached
        return cached[1]

:

%timeit [sampler.get_resampled_sample(500, 'BM') for i in range(1000)]
1.11 s ± 43.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
+3

Source: https://habr.com/ru/post/1687437/


All Articles