Looking for a fast way to compute pairwise distances of many strings

I have a list of ~1 million unique 16-character strings (an array called VEC), and I want to calculate the minimum pairwise Hamming distance for each one in Python (an array called RES). Basically, I compute the full pairwise distance matrix one row at a time, but store only the minimum value of each row in RES.

VEC= ['AAAAAAAAAAAAAAAA','AAAAAAAAAAAAAAAT','AAAAGAAAAAATAAAA'...]

So dist(VEC[1], VEC[2]) = 1, dist(VEC[1], VEC[3]) = 2, etc., and RES[1] = 1 (counting the strings from 1). Using the tips and tricks from these pages, I came up with:

#METHOD#1:
import Levenshtein
import numpy

# Brute force: for every string, measure the Hamming distance to the whole
# list and keep only the smallest non-zero value.
# RES holds one float per string, pre-filled with the placeholder 99.
RES = numpy.ones(len(VEC)) * 99
for row, word in enumerate(VEC):
    # One full row of the distance matrix, including the comparison of
    # `word` with itself.
    row_dist = numpy.array([Levenshtein.hamming(word, other) for other in VEC])
    # Keep only strictly positive distances so the zero from the
    # self-comparison cannot be selected as the minimum.
    RES[row] = row_dist[row_dist > 0].min()

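On a test VEC with only 10 000 entries this takes about 70 seconds, but extrapolated to the full million it would take about 8 days. My approach seems wasteful, since I recompute the symmetric half of the distance matrix, so I tried computing only half of the matrix while updating RES for each row as I go along: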

#METHOD #2:
import Levenshtein
import numpy

# Half-matrix variant: every pair (i, j) with j > i is measured exactly once,
# and that single distance updates the running minimum of both strings.
# RES holds one float per string, pre-filled with the placeholder 99.
RES = 99 * numpy.ones(len(VEC))
for i in range(len(VEC) - 1):
    # Distances from VEC[i] to every later string; dist[k] belongs to
    # VEC[i + 1 + k].
    dist = numpy.array(
        [Levenshtein.hamming(VEC[i], VEC[j]) for j in range(i + 1, len(VEC))]
    )
    # RES[i] may already hold a smaller value contributed by an earlier row.
    RES[i] = min(dist.min(), RES[i])
    # Update RES for the later strings as you go along. `tail` is a view into
    # RES, so the element-wise minimum is written straight back in place,
    # replacing a Python-level loop over every j > i.
    tail = RES[i + 1:]
    numpy.minimum(tail, dist, out=tail)

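Not surprisingly, this 2nd approach takes almost twice as long (117 seconds), since it is less efficient at running the loops. Can anyone suggest an improvement or change that would make the calculation faster?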

0
2

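If you only need the nearest neighbor of each bit array (ignoring itself), and you can tolerate a small chance of getting only an approximate nearest neighbor, consider implementing the "bit sampling" "locality-sensitive hash" for Hamming distance, building three hash tables. From each 128-bit input, sample 16 bits, 3 times, and use those 16-bit samples as keys. The values of the hash tables are lists of all 128-bit inputs that produced that sampled key. Once all of the inputs are in the LSH index, simply:

  • for each input, perform the same 3-fold sampling
  • find the nearest neighbor in each of the three lists (with distance > 0), and keep the best one overall

Both loading and querying are very fast, even for a million inputs. The bitarray library is a good foundation for this.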

+1

I tried using numpy. Here is my code:

#!/usr/bin/env python

import numpy as np
import time

def gen_data(n, length=16):
    """Return an ``(n, length)`` array of random uppercase-letter codes.

    Every element is the code point of a letter drawn from 'A'..'Z'. The
    values are whole numbers stored as float64, so callers keep receiving
    the same dtype as before.

    Args:
        n: Number of rows (strings) to generate.
        length: Number of characters per row. Defaults to 16.

    Returns:
        A float64 ``numpy.ndarray`` of shape ``(n, length)``.
    """
    # A single vectorized draw instead of one Python-level call per row.
    # randint's upper bound is exclusive, hence ord('Z') + 1.
    codes = np.random.randint(ord('A'), ord('Z') + 1, size=(n, length))
    return codes.astype(np.float64)

def distance_from_array(i, arr):
    """Return the index of the row of ``arr`` closest to row ``i``.

    Closeness is the number of positions in which two rows differ (the
    Hamming distance between them). Note that the return value is the
    *index* of the nearest row, not the distance itself.

    Args:
        i: Index of the reference row.
        arr: 2-D array with one item per row.

    Returns:
        Index of the first row, other than ``i``, with the fewest differing
        positions. If ``arr`` has only one row, that row's index is returned
        because there is no other candidate.
    """
    # Broadcasting compares row i against every row; summing the boolean
    # mismatches along axis 1 yields one count per row.
    mismatches = np.sum(arr[i] != arr, axis=1)
    # Exclude row i itself. The mask must be strictly larger than the largest
    # possible count (the row length): a mask equal to the row length only
    # ties with rows that differ everywhere, and argmin could then pick i.
    mismatches[i] = arr.shape[1] + 1
    return np.argmin(mismatches)

# Benchmark: build one million random rows, then time the nearest-row lookup
# for the first 200 rows only.
data = gen_data(1000000)
start = time.time()
# NOTE: despite its name, `distances` collects the return values of
# distance_from_array, which are row indices (argmin), not distance values.
distances = [distance_from_array(i, data) for i in range(200)]
end = time.time()
# The parenthesized form prints the elapsed seconds on both Python 2 and
# Python 3; the bare `print` statement is a syntax error on Python 3.
print(end - start)

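The basic idea is to let numpy do the work, using it for the sum and the argmin. To avoid getting the row's own index back as the result, that row's entry is overridden first so that it cannot be the minimum.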

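On my machine, computing 200 rows takes about 10 seconds. For all 1 000 000 rows that extrapolates to roughly 13 hours. That is still a long time, so it may be worth splitting the rows across several processes; see http://docs.python.org/2/library/multiprocessing.html#module-multiprocessing.pool, for example.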

0

Source: https://habr.com/ru/post/1680308/


All Articles