, , , .
2 pandas dataframes. - dataframes. numpy, dataframes.
, 1000 x 500 . .
# Build a 1000 x 500 frame of random 0/1 draws (binomial with n=1, p=0.5).
A_init = pd.DataFrame(np.random.binomial(1, .5, (1000, 500)))
# Label the columns as (group, bit): every run of 10 consecutive columns forms
# one group.  Floor division keeps the group count an int; with true division
# Python 3 hands range() a float, which raises TypeError.
A_init.columns = pd.MultiIndex.from_product(
    [range(A_init.shape[1] // 10), range(10)]
)
# A is a second name for the same object as A_init, not a copy.
A = A_init
, A a MultiIndex 10.
@Divakar , .
10 - 8. , , 2.
# Bit weights 1, 2, 4, ..., 512: one power of two per position in a group of 10.
twos = np.power(2, np.arange(10))
10 ,
# Move column level 0 (the group) into the row index, collapse each remaining
# row of 10 values to a single number via a dot product with the bit weights,
# then pivot the groups back out into columns.
AtB = (
    A.stack(level=0)
    .dot(twos)
    .unstack()
)
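I stack level 0, so each row's 50 groups of 10 columns become separate rows, take the dot product with `twos`, and then unstack.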
The result is 1000 x 50, with values ranging from 0 to 1023.
, B - , 1024 . B B = B.sort_values().reset_index(drop=True).
, , , .
# Peek at the top-left corner of AtB.  .loc slices by label and includes both
# endpoints, so this selects row and column labels up to and including 2.
AtB.loc[:2, :2]

(0, 0), 951 , 10 A B 951. , !!! , . , , B !!! 0 1023. , B. .
, A B dataframe , A B. , B.
def FindAinB(A, B):
    """Encode every run of 10 consecutive columns of ``A`` as one integer.

    Column ``k`` within a run is weighted by ``2 ** k`` and the weighted
    values are summed, so for 0/1 data each run becomes a number in 0..1023.

    Parameters
    ----------
    A : pandas.DataFrame
        Frame whose number of columns is a multiple of 10.
    B : object
        Not used by this function; kept so the call signature is unchanged.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``A`` and one column per run of 10 columns.

    Raises
    ------
    AssertionError
        If the number of columns of ``A`` is not a multiple of 10.
    """
    assert A.shape[1] % 10 == 0, 'Number of columns in A is not a multiple of 10'
    # Relabel a shallow copy so the caller's column labels are left untouched.
    grouped = A.copy(deep=False)
    # Floor division: range() needs an int, and `/` yields a float on Python 3.
    grouped.columns = pd.MultiIndex.from_product(
        [range(A.shape[1] // 10), range(10)]
    )
    twos = 2 ** np.arange(10)
    # stack(0) moves the group level into the row index, dot() collapses the
    # 10 bit columns to one number, unstack() pivots the groups back to columns.
    return grouped.stack(0).dot(twos).unstack()
def FindAinB2(A, B):
    """Encode every run of 10 consecutive columns of ``A`` via bit shifts.

    The value in position ``k`` of a run is shifted left by ``k`` bits and the
    shifted values are summed, so for 0/1 data each run becomes a number in
    0..1023.

    Parameters
    ----------
    A : pandas.DataFrame
        Integer-valued frame whose number of columns is a multiple of 10.
    B : object
        Not used by this function; kept so the call signature is unchanged.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``A`` and one column per run of 10 columns.

    Raises
    ------
    AssertionError
        If the number of columns of ``A`` is not a multiple of 10.
    """
    assert A.shape[1] % 10 == 0, 'Number of columns in A is not a multiple of 10'
    # Relabel a shallow copy so the caller's column labels are left untouched.
    grouped = A.copy(deep=False)
    # Floor division: range() needs an int, and `/` yields a float on Python 3.
    grouped.columns = pd.MultiIndex.from_product(
        [range(A.shape[1] // 10), range(10)]
    )
    stacked = grouped.stack(0)
    # Shift on the underlying array explicitly instead of relying on operator
    # dispatch between a DataFrame and an ndarray.
    shifted = np.left_shift(stacked.values, np.arange(10))
    return pd.Series(shifted.sum(axis=1), index=stacked.index).unstack()
@Divakar (, , Divakar)
def FindAinB3(A, B):
    """Encode every run of 10 consecutive columns of ``A`` using NumPy only.

    The values are viewed as rows of 10, the value in position ``k`` is shifted
    left by ``k`` bits, and each row of 10 is summed, so for 0/1 data each run
    becomes a number in 0..1023.

    Parameters
    ----------
    A : pandas.DataFrame
        Integer-valued frame whose number of columns is a multiple of 10.
    B : object
        Not used by this function; kept so the call signature is unchanged.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``A``, with one column per run of 10 columns.

    Raises
    ------
    AssertionError
        If the number of columns of ``A`` is not a multiple of 10.
    """
    assert A.shape[1] % 10 == 0, 'Number of columns in A is not a multiple of 10'
    n_rows, n_cols = A.shape
    # Each row of `bits` is one run of 10 consecutive columns of A.
    bits = A.values.reshape(-1, 10)
    # 'ij->i' sums along each row of the shifted bits.
    codes = np.einsum('ij->i', bits << np.arange(10))
    # Spell out the column count: reshape(n_rows, -1) cannot infer it when A
    # has zero rows and raises ValueError.
    return pd.DataFrame(codes.reshape(n_rows, n_cols // 10), index=A.index)
def f(A):
    """Compact variant: encode each run of 10 columns of ``A`` as one integer.

    The values are viewed as rows of 10, the value in position ``k`` is shifted
    left by ``k`` bits, and each row of 10 is summed.

    Parameters
    ----------
    A : pandas.DataFrame
        Integer-valued frame whose number of columns is a multiple of 10.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``A``, with one column per run of 10 columns.
    """
    n_rows, n_cols = A.shape
    codes = np.einsum('ij->i', A.values.reshape(-1, 10) << np.arange(10))
    # Spell out the column count: reshape(n_rows, -1) cannot infer it when A
    # has zero rows and raises ValueError.
    return pd.DataFrame(codes.reshape(n_rows, n_cols // 10), index=A.index)
# Run the one-line variant on the sample frame A.
f(A)
Timing
FindAinB3
