Much faster computing time in Python

I have this code that reorders the list of company names according to jaccard distance. It is working fine.

However, if I use this code for the names of 30 thousand companies, the calculation time is too long. For example, I ran this code 2 hours ago and was still processing.

How can I run this code much faster ?. Maybe some libraries or a change in structure?

      def jack(a,b):
            x=a.split()
            y=b.split()
            k=float(len(set(x)&set(y)))/float(len((set(x) | set(y))))
            return k

t=['bancorp', 'bancorp', 'bancorp ali', 'bancorp puno', 'bancorp amo', 'gas eu', 'gas', 'profeta', 'bancorp america', 'uni', 'gas for', 'gas tr']

out = [] # this will be the sorted list
for index, val1 in enumerate(t): # work through each item in the original list
    if val1 not in out: # if we haven't already put this item in the new list
        out.append(val1) # put this item in the new list
    for val2 in t[index+1:]: # search the rest of the list
        if val2 not in out: # if we haven't already put this item in the new list
            if jack(val1, val2) >= 0.5: # and the new item is close to the current item
                out.append(val2) # add the new item too

Then output:

print out

['bancorp', 'bancorp ali', 'bancorp puno', 'bancorp amo', 'bancorp america', 'gas eu', 'gas', 'gas for', 'gas tr', 'profeta', 'uni']
+4
source share
4 answers

Accepting the suggestions of others above, I flipped my own efficient code. As noted by Niklas B., the main improvement is a decrease from O (n ^ 3) to O (n ^ 2).

from __future__ import division
import itertools

def jack(a,b):
    #print "jack", a, b, len(a & b) / len(a | b)
    return len(a & b) / len(a | b)

def jacksort(t):
    # precompute the word set of each input
    sts = [(i, it, set(it.split())) for i, it in enumerate(t)]
    # allow O(1) testing for 'word already in output'
    os = set()
    out = [] # this will be the sorted list
    # work through each item in the original list
    for index, val1, sval1 in sts:
        if not val1 in os:
            out.append(val1) # put this item in the new list
            os.add(val1)
        for index2, val2, sval2 in itertools.islice(sts, index+1, len(sts)):
            # search the rest of the list
            if val2 in os: continue
            if jack(sval1, sval2) >= 0.5:
                # the new item is close to the current item
                out.append(val2) # add the new item too
                os.add(val2)

    return out

def main(n=100):
    t=['bancorp', 'bancorp', 'bancorp ali', 'bancorp puno', 'bancorp amo',
        'gas eu', 'gas', 'profeta', 'bancorp america', 'uni', 'gas for',
        'gas tr']
    t += [" ".join(w.split()) for w in open("/usr/share/dict/words").read().split()]
    t = t[:n]
    jacksort(t)

n - . /usr/share/dict/words:/usr/share/dict/american-english debian package wamerican version 7.1-1.

:

   10: 10 loops, best of 3: 30.6 msec per loop
   20: 10 loops, best of 3: 28.9 msec per loop
   50: 10 loops, best of 3: 29.6 msec per loop
  100: 10 loops, best of 3: 31.7 msec per loop
  200: 10 loops, best of 3: 38.7 msec per loop
  500: 10 loops, best of 3: 85.1 msec per loop
 1000: 10 loops, best of 3: 261 msec per loop
 2000: 10 loops, best of 3: 1.01 sec per loop
 5000: 10 loops, best of 3: 6.16 sec per loop
10000: 10 loops, best of 3: 25.3 sec per loop

, :

   10: 10 loops, best of 3: 34.1 msec per loop
   20: 10 loops, best of 3: 34.3 msec per loop
   50: 10 loops, best of 3: 33.9 msec per loop
  100: 10 loops, best of 3: 43.1 msec per loop
  200: 10 loops, best of 3: 74.9 msec per loop
  500: 10 loops, best of 3: 415 msec per loop
 1000: 10 loops, best of 3: 2.35 sec per loop
 2000: 10 loops, best of 3: 14.8 sec per loop
 5000: [did not finish while preparing this post]

timeit :

$ for i in 10 20 50 100 200 500 1000 2000 5000 10000; do printf "%5d: " $i; python -mtimeit 'import ojack as jack' 'jack.main('$i')'; done

Python 2.7.5-5 Debian amd64, i5-3320m. , -, , , Big-O.

, "" - , 5-7 . , , , . , n = 30000 393 . , O (n ^ 2) (25.3 * 9 = 227.7), - log n, -, O (1) Python.

+2
from functools import partial
print sorted(list(set(t)), key=partial(jack,min(t)),reverse=True)

( ... , - )

0

, jack, .

# untested code, but you get the idea
jackDict = dict(map(lambda x: (x, set(x.split())), t))

def jack(a,b):
  x = jackDict[a]
  y = jackDict[b]
  xy = x&y              # single set operation per call to jack()
  return float(len(xy))/float(len(x)+len(y)-len(xy)))

, |, & ,

def jack(a,b):
  x = jackDict[a]
  y = jackDict[b]
  xy = x|y              # single set operation per call to jack()
  return float(len(x)+len(y)-len(xy))/float(len(xy)))

jack, , .

0

.

t = ['bancorp', 'bancorp', 'bancorp ali', 'bancorp puno', 'bancorp amo', 'gas eu', 'gas', 'profeta', 'bancorp america', 'uni', 'gas for', 'gas tr']

def jack(a, b):
    x, y = a.split(), b.split()
    return float(len(set(x) & set(y))) / len((set(x) | set(y)))

seen = set()
source = list(t)
res = []
while source:
    val = source.pop(0)
    if val not in seen:
        seen.add(val)
        res.append(val)
    tmp = []
    for val2 in source:
        if jack(val, val2) >= 0.5 and val2 not in seen:
            seen.add(val2)
            tmp.append(val2)
    tmp.sort(key=lambda i: jack(val, i), reverse=True)
    res.extend(tmp)

print(res)

:

  • , O (1) x in s;
  • .

(, OP!)

, . :

  • . , , jack(), , .

  • , . .

- :

source = list(t)
pre_computed = {}
while source:
    val = source.pop(0)
    pre_computed[val] = max(source or [''], key=lambda i: jack(val, i))
source2 = list(t)
last = source2.pop(0)
res = [last]
while True:
    val = pre_computed[last]
    if val: res.append(val)
    else: break
    last = val

, , :

['bancorp', 'bancorp ali', 'bancorp puno', 'bancorp amo', 'bancorp america', 'uni', 'gas for', 'gas tr']
0

Source: https://habr.com/ru/post/1535417/


All Articles