Speed ​​issues when converting a list to a dictionary

I am having some problems with the speed of converting lists to dictionaries, where the following operation takes about 90% of the total time:

def list2dict(list_):
    return_dict = {}

    for idx, word in enumerate(list_):
        if word in return_dict:
            raise ValueError("duplicate string found in list: %s" % (word))
        return_dict[word] = idx

    return return_dict

I'm having problems seeing exactly what causes this. Are there any obvious bottlenecks you see in the code, or suggestions on how to speed it up?

Thanks.

+4
source share
5 answers

EDIT:

I realized that I put this at the top, because it is larger - it turns out that a slight tuning of the OP code gives a rather large performance bump.

def list2dict(list_):    # OLD
    return_dict = {}
    for idx, word in enumerate(list_):
        if word in return_dict: # this compare is happening every iteration!
            raise ValueError("duplicate string found in list: %s" % (word))
        return_dict[word] = idx
    return return_dict

def list2dictNEW(list_): #NEW HOTNESS
    return_dict = {}
    for idx, word in enumerate(list_):
        return_dict[word] = idx # overwrite if you want to, because...
    if len(return_dict) == len(list_): return return_dict
    # if the lengths aren't the same, something got overwritten so we
    # won't return. If they ARE the same, toss it back with only one
    # compare (rather than n compares in the original
    else: raise ValueError("There were duplicates in list {}".format(list_))

DEMO:
>>> timeit(lambda: list2dictNEW(TEST))
1.9117132451798682
>>> timeit(lambda: list2dict(TEST)):
2.2543816669587216
# gains of a third of a second per million iterations!
# that a 15.2% speed bost

There are no obvious answers, but you can try something like:

def list2dict(list_):
    return_dict = dict()
    for idx,word in enumerate(list_):
        return_dict.setdefault(word,idx)
    return return_dict

list.index, , , , , . ( timeit.timeit)

def list2dict(list_):
    set_ = set(list_)
    return {word:list_.index(word) for word in set_}

. :

TEST = ['a','b','c','d','e','f','g','h','i','j'] # 10 items

def list2dictA(list_): # build set and index word
    set_ = set(list_)
    return {word:list_.index(word) for word in set_}

def list2dictB(list_): # setdefault over enumerate(list)
    return_dict = dict()
    for idx,word in enumerate(list_):
        return_dict.setdefault(word,idx)
    return return_dict

def list2dictC(list_): # dict comp over enumerate(list)
    return_dict = {word:idx for idx,word in enumerate(list_)}
    if len(return_dict) == len(list_):
        return return_dict
    else:
        raise ValueError("Duplicate string found in list")

def list2dictD(list_): # Original example from Question
    return_dict = {}
    for idx, word in enumerate(list_):
        if word in return_dict:
            raise ValueError("duplicate string found in list: %s" % (word))
        return_dict[word] = idx
    return return_dict

>>> timeit(lambda: list2dictA(TEST))
5.336584700190931
>>> timeit(lambda: list2dictB(TEST))
2.7587691306531
>>> timeit(lambda: list2dictC(TEST))
2.1609074989233292
>>> timeit(lambda: list2dictD(TEST))
2.2543816669587216
+1

list_. adsmith list2dictC() ( ~ 80 ). , list2dictE() 8% .

def list2dictC(list_):  # dict comp over enumerate(list)
    return_dict = {word: idx for idx, word in enumerate(list_)}
    if len(return_dict) == len(list_):
        return return_dict
    else:
        raise ValueError("Duplicate string found in list")

def list2dictE(list_):  # Faster for lists with ~80 items or more
    l = len(list_)
    return_dict = dict(zip(list_, range(l)))
    if len(return_dict) == l:
        return return_dict
    else:
        raise ValueError("Duplicate string found in list")

, , , , , , - l = len(list_); if l < 80: ... else: .... if, . 80 , python 2.7, 3.3.

+1
def l2d(list_):
    dupes = set(filter(lambda x: a.count(x) > 1, a))
    if len(dupes) > 0:
        raise ValueError('duplicates: %s' % (dupes))
    return dict((word, idx) for (idx, word) in enumerate(list_))

?

0

pandas, , , , , , " "

> TEST = # english scrabble dictionary, 83882 entries
> def mylist2dict(list_):
      return_dict = pd.Series(index=list_, data=np.arange(len(list_)))
      if not return_dict.index.is_unique:
          raise ValueError
      return return_dict
> %timeit list2dict(TEST)
10 loops, best of 3: 28.8 ms per loop
> %timeit mylist2dict(TEST)
100 loops, best of 3: 18.8 ms per loop
0

2,36 . OPs , , , - .

def mkdct(dct):
    l = len(dct-1)
    return {dct[x]:(l-x) for x in range(len(dct), -1, -1)}

: .

0

Source: https://habr.com/ru/post/1529073/


All Articles