UPDATE: I found that using a format that packs 3 matrix elements into one byte is actually quite fast. In the example below, the speed penalty is less than 2x compared to direct multiplication using `@`, while the space savings are greater than 20x (one byte per three elements instead of one 8-byte float per element).
>>> Y = np.random.randint(0, 5, (3000, 3000), dtype = np.int8)
>>> i, j = np.triu_indices(3000, 1)
>>> Y[i, j] = Y[j, i]
>>> values = np.array([0.3, 0.5, 0.6, 0.9, 2.0])
>>> Ycmp = (np.reshape(Y, (1000, 3, 3000)) * np.array([25, 5, 1], dtype=np.int8)[None, :, None]).sum(axis=1, dtype=np.int8)
>>> full = values[Y]
>>> x @ full @ x
>>> vtable = values[np.transpose(np.unravel_index(np.arange(125), (5,5,5)))]
>>> np.dot(np.concatenate([(vtable * np.bincount(row, x, minlength=125)[:, None]).sum(axis=0) for row in Ycmp]), x)
>>> timeit('x @ full @ x', globals=globals(), number=100)
>>> timeit('np.dot(np.concatenate([(vtable * np.bincount(row, x, minlength=125)[:, None]).sum(axis=0) for row in Ycmp]), x)', globals=globals(), number=100)
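The simpler variants below store one code per matrix element and use `np.bincount`: the first processes every row, the second uses the symmetry of the matrix and only touches the diagonal and the strict upper triangle.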
>>> Y = np.random.randint(0, 5, (1000, 1000), dtype = np.int8)
>>> i, j = np.triu_indices(1000, 1)
>>> Y[i, j] = Y[j, i]
>>> values = np.array([0.3, 0.5, 0.6, 0.9, 2.0])
>>> full = values[Y]
>>> x = np.random.random((1000,))
>>> x @ full @ x
>>> np.dot([(values * np.bincount(row, x)).sum() for row in Y], x)
>>> upper = Y[i, j]
>>> diag = np.diagonal(Y)
>>> boundaries = np.r_[0, np.cumsum(np.arange(999, 0, -1))]
>>> (values*np.bincount(diag, x*x)).sum() + 2 * np.dot([(values*np.bincount(upper[boundaries[i]:boundaries[i+1]], x[i+1:],minlength=5)).sum() for i in range(999)], x[:-1])