Search for the index of pair elements

Given the purpose ('b', 'a')and inputs:

x0 = ('b', 'a', 'z', 'z')
x1 = ('b', 'a', 'z', 'z')
x2 = ('z', 'z', 'a', 'a')
x3 = ('z', 'b', 'a', 'a')

The goal is to find the location of the continuous element ('b', 'a')and get the result:

>>> find_ba(x0)
0
>>> find_ba(x1)
0
>>> find_ba(x2)
None
>>> find_ba(x3)
1

Using the recipe pairwise:

from itertools import tee
def pairwise(iterable):
    "s -> (s0,s1), (s1,s2), (s2, s3), ..."
    a, b = tee(iterable)
    next(b, None)
    return zip(a, b)

I could do this to get the desired result:

def find_ba(x, target=('b', 'a')):
    try:
        return next(i for i, pair in enumerate(pairwise(x)) if pair == target)
    except StopIteration:
        return None

But that would require me to go through all pairs of characters until I find the first instance. Is there a way to find the index of pairwise elements without cyclizing all the characters?


Answering a @MatthiasFripp question in the comments:

Are your items in lists or types (as shown) or in a generator (e.g. reading from a file descriptor)?

x * - all tuples of rows. This way they can access through the index. But if the answer / solution can work on tuples and a generator, it will be great!

, ? .

. > 2.

+6
15

O(n) ( ), , (, , ), .

:

?

( O(n) ), :

from itertools import count

def find_ab(tup):
    for idx in count(start=1, step=2):
        try:
            if tup[idx] == 'b':
                if tup[idx+1] == 'a':
                    return idx
            elif tup[idx] == 'a':
                if tup[idx-1] == 'b':
                    return idx-1
        except IndexError:
            break

, , 'b' 'a'.

, , :

log(n), . , O(n*log(n)), , . , .

bucket ( hashtables)

, - (a dict) , .

, (O(n)) O(1):

d = {}
for idx, pair in enumerate(pairwise(x0)):
    if pair not in d:    # keep only the first index for each pair
        d[pair] = idx

>>> d.get(('b', 'a'), None)
0

, , " " (, ), .

:

  • O(n)
  • O(log(n)) ( )
  • O(1) lookups ( , "" ).

, , . , () , , " ". , .

, iteration_utilities.successive 1. , , 1,5-2 , . , ( ) , !

, . , "" . , . dict ( O(1)). , // , .

, , :

. , (pairwise -recipe) , . , 1 . 'z', 200 . , ( , ). , , , pure-python ( , C, Cython NumPy, Pandas ). next ( itertools.izip python2 zip), , .

. , , , (, ) .


" / ". Fortunatly " " , . :

  • Cormen, et. al -
  • Sedgewick and Wayne -

  • :

  • :
  • : "Hashtable" ( a dict).

python wiki python: "TimeComplexity" . "Get Item" "in".


1 : .

+13

, .

, .

def consecutive_index(src,sample):
    result = None
    il = [src.index(a) for a in sample if a in src]
    if len(il) == len(sample) and len(range(il[0],il[-1]))==1:
        result = il[0]
    return result



x0 = ('b', 'a', 'z', 'z')
x1 = ('b', 'a', 'z', 'z')
x2 = ('z', 'z', 'a', 'a')
x3 = ('z', 'b', 'a', 'a')
sample = ('b', 'a')

##TEST your given combinations.
print consecutive_index(x0,sample) #expected 0
print consecutive_index(x1,sample) #expected 0
print consecutive_index(x2,sample) #expected None
print consecutive_index(x3,sample) #expected 1
+2

, , ? . findPair , . findPairs .

import re

# Function looks for all non-overlapping occurrences of pair (b, a) 
# and returns a list containing their starting positions
def findPairs(x, b, a):
    x = str().join(x)
    y = str().join([str(b), str(a)])
    try:
        return [x.regs[0][0] for x in list(re.finditer(y, x))]
    except AttributeError:
        return None

# Function looks for first occurrence of the pair (b, a) 
# and returns starting position if there was a match 
# or None when the match was not found
def findPair(x, b, a):
    x = str().join(x)
    y = str().join([str(b), str(a)])
    try:
        return re.search(y, x).regs[0][0]
    except AttributeError:
        return None


if __name__ == "__main__":
    # first occurrence
    x0 = ('b', 'a', 'z', 'z')
    x1 = ('b', 'a', 'z', 'z')
    x2 = ('z', 'z', 'a', 'a')
    x3 = ('z', 'b', 'a', 'a')

    outx0 = findPair(x0, 'b', 'a')  # 0
    outx1 = findPair(x1, 'b', 'a')  # 0
    outx2 = findPair(x2, 'b', 'a')  # None
    outx3 = findPair(x3, 'b', 'a')  # 1

    # multiple occurrences:
    x4 = ('z', 'b', 'a', 'a', 'z', 'b', 'a', 'a')
    outx4 = findPairs(x4, 'b', 'a')  # [1, 5]

EDIT:

/ , , find() :

def findPairNoRe(x, b, a):
    y = str().join([str(b), str(a)])
    res = str().join(x).find(y)
    if res == -1:
        return None
    else:
        return res
+1

, . multiprocessing (. ). -, ( O (n)) .

, , :

def find_ba(tup, target):
    last_check = len(tup)-len(target)
    for i, c in enumerate(tup):
        # note: the test below only uses c 95% of the time, 
        # which makes it pretty fast
        if c == target[0] and i <= last_check and tup[i:i+len(target)] == target:
            return i
    return None

, , @MSeifert, :

def find_ba(tup, target):
    import itertools
    search = set(target)
    target_len = len(target)
    for i in count(start=1, step=target_len):
        try:
            if tup[i] in search:  # O(1) reverse lookup
                # search in this neighborhood
                c = tup[i]
                j = 0
                while True:
                    try:
                        # find next occurrence of c in the target
                        j = target[j:].index(c)
                    except ValueError:  # no more occurrences of c in target
                        break
                    # align tup and target and check for a match
                    if j >= i and tup[i-j:i-j+target_len] == target:
                        return i-j
        except IndexError:
            break
    return None

, , Python C-:

def find_ba(x, target):
    # assuming x and target are both strings
    pos = x.find(target)
    return pos if pos >= 0 else None

( , , , , .)

, ( , ). , , (, ):

import itertools
def find_ba(lst, target):
    a, b = itertools.tee(lst)
    next(b)
    for i, pair in enumerate(zip(a, b)):
        if pair == target:
            return i
    return None

. Python 2.7 itertools.izip zip Python 2.7.

- multiprocessing. , multiprocessing.Pool.map . , , itertools.islice, , multiprocessing.Pool.map, ; . , .

+1

: , , . , , , , .

, , , . , (, , Python , C). .

(, !)

def find_ba( x, target=('b','a'), separator = '|' ):
   t = separator.join(target)
   try:
        return  ( separator.join([ c for c in x]).index(t) ) / 2
   except ValueError:
        return None

(, , SW/l10O/Il0O/01L1lO00/22 ).

0

itertools, , :

import itertools
def check(x, target):
    for t in itertools.izip(x, itertools.islice(x, 1, len(x))):
        if t == target:
            return True
    return False
check(x0, ('b', 'a'))
True

EDIT: zip python3

0

nigel222, ( ), , , , , , .

, , , O (1) .

...
my_pairwise = set(pairwise(x))
found_subsequences = [subsequence
                      for subsequence in collection_of_subsequences
                      if subsequence in my_pairwise]

, O (n) x , - O (1).

0

,

def look_up(needle, haystack):
    i = ''.join(haystack).find(''.join(needle))
    return i if i > -1 else None

, , :

x0 = ('b', 'a', 'z', 'z')
x1 = ('b', 'a', 'z', 'z')
x2 = ('z', 'z', 'a', 'a')
x3 = ('z', 'b', 'a', 'a')
ba = ('b', 'a')

:

print(look_up(ba, x0)) # Prints: 0
print(look_up(ba, x1)) # Prints: 0
print(look_up(ba, x2)) # Prints: None
print(look_up(ba, x3)) # Prints: 1

:

def look_up_multiple(needle, haystack):
    needle_str = ''.join(needle)
    haystack_str = ''.join(haystack)
    indexes = []
    i = 0
    while i < len(haystack_str):
        i = haystack_str.find(needle_str, i)
        if i > -1:
            indexes.append(i)
        i += 2
    return indexes

:

x = ('b', 'a', 'z', 'z', 'b', 'a')
ba = ('b', 'a')

print(look_up_multiple(ba, x)) # Prints: [0, 4]
0

, .

def findba(x,target):
    x1 = "".join(x) 
    target1 = "".join(target)
    if target1 in x1:
        return x1.index(target1)
    else:
        return None

ab = ('b','a')
x0 = ('b', 'a', 'z', 'z')
x1 = ('b', 'a', 'z', 'z')
x2 = ('z', 'z', 'a', 'a')
x3 = ('z', 'b', 'a', 'a')

print findba(x0,ab)
print findba(x1,ab)
print findba(x2,ab)
print findba(x3,ab)
0

, . ( Python 3):

from itertools import islice, tee

def find_ba(x):
    pairs = zip(*(islice(g, i, None) for i, g in enumerate(tee(x, 2))))
    return next(
        (i for i, pair in enumerate(pairs) if pair == ('b', 'a')),
        None)
0

target, index . , int target. , 'b' . .

, , .

def find_ba(x, target=('b','a')):
    try:
        ind = 0
        while ind < len(x):
            ind += x[ind:].index(target[0])
            if x[ind+1] == target[1]:
                return ind
            ind += 1
    except ValueError:
        return None

:

# 100 random letters
letters = ['f', 'y', 'h', 'u', 't', 'l', 'y', 'u', 'm', 'z', 'a', 'a',
           'i', 't', 'g', 'm', 'b', 'l', 'z', 'q', 'g', 'f', 'f', 'b', 
           'b', 'a', 'c', 'z', 'n', 'j', 'v', 'b', 'k', 'j', 'y', 'm', 
           'm', 'f', 'z', 'x', 'f', 'q', 'w', 'h', 'p', 'x', 't', 'n', 
           'm', 'd', 'z', 'q', 'v', 'h', 'b', 'f', 'q', 'd', 'b', 's', 
           'a', 't', 'j', 'm', 'h', 'r', 'd', 'n', 'e', 'k', 'y', 'z', 
           'd', 'e', 'x', 'h', 'r', 'z', 'b', 'n', 'q', 'v', 't', 'q', 
           'f', 'w', 'b', 'w', 'f', 'c', 'f', 'h', 'q', 'o', 'r', 'f', 
           'w', 'w', 'n', 'v']
find_ba(letters)  # 24

, zip :

def find_ba1(x):
    try:
        return [(i,j) for i,j in zip(x[:-1], x[1:])].index(('b', 'a'))
    except ValueError:
        return None

:

%timeit find_ba(letters)
100000 loops, best of 3: 2.31 µs per loop

%timeit find_ba1(letters)
100000 loops, best of 3: 8.4 µs per loop
0

- (.. , ) , O (n). (.. ), , , , : , ( "b" "b" "a" , "b" , , ) , ( , , ). , , , , , . , , ( len (list)/len (target)), ( , 'b ',' a ' , ). , , , , . , multi- parallelism , . ( , , python concurrency, multi- parallelism - , ).

, , . , . , , , , . , , , , .

0

:

numpy, .

#np.roll(x1,-1) shifts the list leftwise one element. np.core.defchararray.add builds a paired sequence. 
np.where(np.core.defchararray.add(x1,np.roll(x1,-1)) == 'ba')[0]

Test

for x in [x0,x1,x2,x3]:
    print (np.where(np.core.defchararray.add(x,np.roll(x,-1)) == 'ba'))[0]

[0]
[0]
[]
[1]
0

MSeifert . MSeifert, , , . , . , , - . .

05/09/2017 :
@Matthias Fripp, 10k 100k . Mine 10k , 100k . . , "" , @MSeifert , , .

import random # to generate data
# Set up data
x0 = ('b', 'a', 'z', 'z')
x1 = ('b', 'a', 'z', 'z')
x2 = ('z', 'z', 'a', 'a')
x3 = ('z', 'b', 'a', 'a')
x4 = tuple([random.choice(x3) for i in xrange(10000)])
x5 = tuple([random.choice(x3) for i in xrange(100000)])

# Set up functions
# My code
def findPairwise(x,target):
    currentX = x
    cumulatedIdx=0
    while(1):
        try:
            idx = currentX.index(target[0])
            try:
                if currentX[idx+1] == target[1]:
                    return(idx+cumulatedIdx)
            except:
                pass
        except:
            break
        currentX = currentX[idx+2:]
        cumulatedIdx += idx+2

# MSeifert method
from itertools import count
def find_ab(tup,target):
    for idx in count(start=1, step=2):
        try:
            if tup[idx] == target[0]:
                if tup[idx+1] == target[1]:
                    return idx
            elif tup[idx] == target[1]:
                if tup[idx-1] == target[0]:
                    return idx-1
        except IndexError:
            break

In [109]: %timeit findPairwise(x0,target)
The slowest run took 8.66 times longer than the fastest. This could mean that an intermediate result is being cached.
1000000 loops, best of 3: 1.27 µs per loop

In [110]: %timeit find_ab(x0,target)
The slowest run took 5.49 times longer than the fastest. This could mean that an intermediate result is being cached.
100000 loops, best of 3: 2.04 µs per loop

In [111]: %timeit findPairwise(x1,target)
The slowest run took 4.75 times longer than the fastest. This could mean that an intermediate result is being cached.
100000 loops, best of 3: 1.46 µs per loop

In [112]: %timeit find_ab(x1,target)
The slowest run took 5.04 times longer than the fastest. This could mean that an intermediate result is being cached.
100000 loops, best of 3: 1.99 µs per loop

In [113]: %timeit findPairwise(x2,target)
The slowest run took 4.66 times longer than the fastest. This could mean that an intermediate result is being cached.
100000 loops, best of 3: 2.56 µs per loop

In [114]: %timeit find_ab(x2,target)
The slowest run took 5.89 times longer than the fastest. This could mean that an intermediate result is being cached.
100000 loops, best of 3: 4.25 µs per loop

In [115]: %timeit findPairwise(x3,target)
The slowest run took 8.59 times longer than the fastest. This could mean that an intermediate result is being cached.
1000000 loops, best of 3: 1.28 µs per loop

In [116]: %timeit find_ab(x3,target)
The slowest run took 6.66 times longer than the fastest. This could mean that an intermediate result is being cached.
1000000 loops, best of 3: 1.65 µs per loop

In [151]: %timeit findPairwise(x4,target)
The slowest run took 5.46 times longer than the fastest. This could mean that an intermediate result is being cached.
1000000 loops, best of 3: 1.27 µs per loop

In [152]: %timeit find_ab(x4,target)
The slowest run took 6.21 times longer than the fastest. This could mean that an intermediate result is being cached.
100000 loops, best of 3: 1.92 µs per loop

In [153]: %timeit findPairwise(x5,target)
1000 loops, best of 3: 325 µs per loop

In [154]: %timeit find_ab(x5,target)
The slowest run took 4.35 times longer than the fastest. This could mean that an intermediate result is being cached.
100000 loops, best of 3: 3.45 µs per loop
0

, , , , . , ( ).

# store first occurrence of each unique 2-char string (O(n))
x1_first = dict()
target_len = 2
for i in range(len(x1)):
    x1_first.setdefault(x1[i:i+target_len], i)

# find first occurrence of a particular string without looping (O(1))
print x1_first.get(('a', 'b'), None)

. @MSeifert, , . , dicts , . , , (, 10 ), (. bisect). .

0

Source: https://habr.com/ru/post/1016788/


All Articles