# Approach 1: list of sets as output -
def divakar_v1(df):
    """Group column-0 values by the column-1 key and return a list of sets.

    Parameters
    ----------
    df : object exposing ``.values`` as a 2-D array (e.g. a DataFrame)
        Column 0 holds the values to collect, column 1 the grouping key.

    Returns
    -------
    list of set
        One set per distinct key, ordered by ascending key.
    """
    table = df.values
    keys = table[:, 1]
    # The default sort kind is enough for sets; use
    # .argsort(kind='mergesort') to keep the row order inside each group.
    order = keys.argsort()
    sorted_keys = keys[order]
    # A new group starts wherever the sorted key strictly increases.
    boundaries = np.flatnonzero(sorted_keys[1:] > sorted_keys[:-1]) + 1
    groups = np.split(table[order, 0], boundaries)
    return [set(group) for group in groups]
# Approach 2: list of sets as output -
def divakar_v2(df):
    """Group column-0 values by the column-1 key and return a list of sets.

    The rows are sorted by the key column once, then each run of equal keys
    is sliced out of the sorted array and converted to a set.

    Parameters
    ----------
    df : object exposing ``.values`` as a 2-D array (e.g. a DataFrame)
        Column 0 holds the values to collect, column 1 the grouping key.

    Returns
    -------
    list of set
        One set per distinct key, ordered by ascending key.
    """
    data = df.values
    a = data[data[:, 1].argsort()]
    # Exclusive end index of every group: each position where the sorted key
    # strictly increases, plus the row count for the last group.  The row
    # count is a.shape[0]; a.size would be rows * columns and overshoot.
    stop = np.append(np.nonzero(a[1:, 1] > a[:-1, 1])[0] + 1, a.shape[0])
    # Every group starts where the previous one ended; the first starts at 0.
    start = np.append(0, stop[:-1])
    items = a[:, 0]
    return [set(items[lo:hi]) for lo, hi in zip(start, stop)]
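If duplicates within each 'OrderID' / 'ItemID' group are acceptable, the set() conversion can be skipped and a list of lists returned instead, as in the next two approaches.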
# Approach 3: list of lists as output -
def divakar_v3(df):
    """Group column-0 values by the column-1 key and return a list of lists.

    Parameters
    ----------
    df : object exposing ``.values`` as a 2-D array (e.g. a DataFrame)
        Column 0 holds the values to collect, column 1 the grouping key.

    Returns
    -------
    list of list
        One list per distinct key, ordered by ascending key.  Duplicates are
        kept; elements are the array's own scalars.
    """
    table = df.values
    keys = table[:, 1]
    # Use .argsort(kind='mergesort') to keep the row order inside each group;
    # the default sort kind does not promise it.
    order = keys.argsort()
    sorted_keys = keys[order]
    # A new group starts wherever the sorted key strictly increases.
    boundaries = np.flatnonzero(sorted_keys[1:] > sorted_keys[:-1]) + 1
    groups = np.split(table[order, 0], boundaries)
    return [list(group) for group in groups]
# Approach 4: list of lists as output -
def divakar_v4(df):
    """Group column-0 values by the column-1 key and return a list of lists.

    The rows are sorted by the key column once, column 0 is converted to a
    plain Python list, and each run of equal keys is sliced out of that list.

    Parameters
    ----------
    df : object exposing ``.values`` as a 2-D array (e.g. a DataFrame)
        Column 0 holds the values to collect, column 1 the grouping key.

    Returns
    -------
    list of list
        One list per distinct key, ordered by ascending key.  Duplicates are
        kept; elements are native Python objects produced by ``tolist()``.
    """
    data = df.values
    a = data[data[:, 1].argsort()]
    # Exclusive end index of every group: each position where the sorted key
    # strictly increases, plus the row count for the last group.  The row
    # count is a.shape[0]; a.size would be rows * columns and overshoot.
    stop = np.append(np.nonzero(a[1:, 1] > a[:-1, 1])[0] + 1, a.shape[0])
    # Every group starts where the previous one ended; the first starts at 0.
    start = np.append(0, stop[:-1])
    # Slicing one Python list avoids a per-group array-to-list conversion.
    items = a[:, 0].tolist()
    return [items[lo:hi] for lo, hi in zip(start, stop)]
# Timings -
In [145]: np.random.seed(123)
...: N = 100000
...: df = pd.DataFrame(np.random.randint(30,size=(N,2)))
...: df.columns = ['ItemID','OrderID']
...:
In [146]: %timeit divakar_v1(df)
...: %timeit divakar_v2(df)
...: %timeit divakar_v3(df)
...: %timeit divakar_v4(df)
...:
10 loops, best of 3: 21.1 ms per loop
10 loops, best of 3: 21.7 ms per loop
100 loops, best of 3: 16.7 ms per loop
100 loops, best of 3: 12.3 ms per loop