How to quickly match if a point is between 2 points, for large lists

I have a dictionary with information about individual positions: position_infoand information about functions feature_info. I have to find in which functions (maybe several) the positions are located, so that I can annotate the positions. Now I use:

feature_info = [[1, 10, 'a'],[15, 30, 'b'],[40, 60, 'c'],[55, 71, 'd'],[73, 84, 'e']]
position_info = {5:'some info', 16:'some other info', 75:'last info'}
for pos in position_info.keys():
    for info in feature_info:
        if info[0] <= pos < info[1]:
            print(pos, position_info[pos], info[2])

The problem is that it feature_infocontains 800k + features and position_info150k positions, and it's pretty slow. I can optimize it a bit, but there are probably already methods that do it better than I can, but I haven't found them.

EDIT

So, for example, this is one of the ways you can speed it up:

for info in feature_info:
    for pos in position_info.keys():
        if info[0] <= pos < info[1]:
            print(pos, position_info[pos], info[2])
        if pos > info[1]:
            break

, , ( , ). .

?

3

import timeit

setup = """
from bisect import bisect
import pandas as pd
import random
import numpy as np

position_info = {}

random_number = random.sample(range(9000), 8000)
random_feature_start = random.sample(range(90000), 5000)
random_feature_length = np.random.choice(1000, 5000, replace=True)

for i in random_number:
    position_info[i] = 'test'
feature_info = []
for index, i in enumerate(random_feature_start):
    feature_info.append([i, i+random_feature_length[index],'test'])

"""

p1 = """
sections = sorted(r for a, b, c in feature_info for r in (a,b))
for pos in position_info:
    feature_info[int(bisect(sections, pos) / 2)]
"""

p2 = """
# feature info to dataframe
feature_df = pd.DataFrame(feature_info)

# rename feature df columns
feature_df.rename(index=str, columns={0: "start", 1: "end",2:'name'}, inplace=True)

# positions to dataframe
position_df = pd.DataFrame.from_dict(position_info, orient='index')
position_df['key'] = position_df.index

# merge dataframes
feature_df['merge'] = 1
position_df['merge'] = 1
merge_df = feature_df.merge(position_df, on='merge')
merge_df.drop(['merge'], inplace=True, axis=1)

# filter where key between start and end
merge_df = merge_df.loc[(merge_df.key > merge_df.start) & (merge_df.key < merge_df.end)] 
"""

p3 = """
feature_df = pd.DataFrame(feature_info)
position_df = pd.DataFrame(position_info, index=[0])
hits = position_df.apply(lambda col: (feature_df [0] <= col.name) & (col.name < feature_df [1])).values.nonzero()
for f, p in zip(*hits):
    position_info[position_df.columns[p]]
    feature_info[f]
"""

print('bisect:',timeit.timeit(p1, setup=setup, number = 3))
print('panda method 1:',timeit.timeit(p2, setup=setup, number = 3))
print('panda method 2:',timeit.timeit(p3, setup=setup, number = 3))

bisect: 0.08317881799985116
panda 1: 29.6151025639997
panda 2: 16.90901438500032

bisect , , .

feature_info = [[1, 10, 'a'],[15, 30, 'b'],[40, 60, 'c'],[55, 71, 'd'],[2, 8, 'a_new']]

, pandas.

+4
4

- : pandas. Pandas , .

feature_df = pd.DataFrame(feature_info)
position_df = pd.DataFrame(position_info, index=[0])
hits = position_df.apply(lambda col: (feature_df[0] <= col.name) & (col.name < feature_df[1])).values.nonzero()
for feature, position in zip(*hits):
    print(position_info[position_df.columns[p]], "between", feature_info[f])
+1

bisect .

, . , , .

feature_info[n][0:1] - , ( ) 2.

from bisect import bisect

feature_info = [[1, 10, 'a'],[15, 30, 'b'],[40, 60, 'c'],[55, 71, 'd'],[73, 84, 'e']]
position_info = {5:'some info', 16:'some other info', 75:'last info'}
sections = sorted(r for a, b, c in feature_info for r in (a,b))

for pos in position_info:
  print(pos, feature_info[bisect(sections, pos) / 2])

( , ):

(16, [15, 30, 'b'])
(75, [73, 84, 'e'])
(5, [1, 10, 'a'])
+1

OK?

:

  • "" (, ) - (index, start/end, feature). .

( ):

  • " "
  • :
    • : , :
      • , .
    • ,
    • ,

, :

  • ,
  • () .

, - - . O (N + M), ( current_features ).

, ; , .

+1

Also used pandas. First converts them to dataframes, then combines them, and then filters where the position information is between the columns of function information.

import pandas as pd

feature_info = [[1, 10, 'a'],[15, 30, 'b'],[40, 60, 'c'],[55, 71, 'd'],[73, 84, 'e']]
position_info = {5:'some info', 16:'some other info', 75:'last info'}

# feature info to dataframe
feature_df = pd.DataFrame(feature_info)

# rename feature df columns
feature_df.rename(index=str, columns={0: "start", 1: "end",2:'name'}, inplace=True)

# positions to dataframe
position_df = pd.DataFrame.from_dict(position_info, orient='index')
position_df['key'] = position_df.index

# merge dataframes
feature_df['merge'] = 1
position_df['merge'] = 1
merge_df = feature_df.merge(position_df, on='merge')
merge_df.drop(['merge'], inplace=True, axis=1)

# filter where key between start and end
merge_df = merge_df.loc[(merge_df.key > merge_df.start) & (merge_df.key < merge_df.end)] 
+1
source

Source: https://habr.com/ru/post/1696318/


All Articles