I have a dictionary with information about individual positions: position_infoand information about functions feature_info. I have to find in which functions (maybe several) the positions are located, so that I can annotate the positions. Now I use:
feature_info = [[1, 10, 'a'],[15, 30, 'b'],[40, 60, 'c'],[55, 71, 'd'],[73, 84, 'e']]
position_info = {5:'some info', 16:'some other info', 75:'last info'}
for pos in position_info.keys():
for info in feature_info:
if info[0] <= pos < info[1]:
print(pos, position_info[pos], info[2])
The problem is that it feature_infocontains 800k + features and position_info150k positions, and it's pretty slow. I can optimize it a bit, but there are probably already methods that do it better than I can, but I haven't found them.
EDIT
So, for example, this is one of the ways you can speed it up:
for info in feature_info:
for pos in position_info.keys():
if info[0] <= pos < info[1]:
print(pos, position_info[pos], info[2])
if pos > info[1]:
break
, , ( , ). .
?
3
import timeit
setup = """
from bisect import bisect
import pandas as pd
import random
import numpy as np
position_info = {}
random_number = random.sample(range(9000), 8000)
random_feature_start = random.sample(range(90000), 5000)
random_feature_length = np.random.choice(1000, 5000, replace=True)
for i in random_number:
position_info[i] = 'test'
feature_info = []
for index, i in enumerate(random_feature_start):
feature_info.append([i, i+random_feature_length[index],'test'])
"""
p1 = """
sections = sorted(r for a, b, c in feature_info for r in (a,b))
for pos in position_info:
feature_info[int(bisect(sections, pos) / 2)]
"""
p2 = """
# feature info to dataframe
feature_df = pd.DataFrame(feature_info)
# rename feature df columns
feature_df.rename(index=str, columns={0: "start", 1: "end",2:'name'}, inplace=True)
# positions to dataframe
position_df = pd.DataFrame.from_dict(position_info, orient='index')
position_df['key'] = position_df.index
# merge dataframes
feature_df['merge'] = 1
position_df['merge'] = 1
merge_df = feature_df.merge(position_df, on='merge')
merge_df.drop(['merge'], inplace=True, axis=1)
# filter where key between start and end
merge_df = merge_df.loc[(merge_df.key > merge_df.start) & (merge_df.key < merge_df.end)]
"""
p3 = """
feature_df = pd.DataFrame(feature_info)
position_df = pd.DataFrame(position_info, index=[0])
hits = position_df.apply(lambda col: (feature_df [0] <= col.name) & (col.name < feature_df [1])).values.nonzero()
for f, p in zip(*hits):
position_info[position_df.columns[p]]
feature_info[f]
"""
print('bisect:',timeit.timeit(p1, setup=setup, number = 3))
print('panda method 1:',timeit.timeit(p2, setup=setup, number = 3))
print('panda method 2:',timeit.timeit(p3, setup=setup, number = 3))
bisect: 0.08317881799985116
panda 1: 29.6151025639997
panda 2: 16.90901438500032
bisect , , .
feature_info = [[1, 10, 'a'],[15, 30, 'b'],[40, 60, 'c'],[55, 71, 'd'],[2, 8, 'a_new']]
, pandas.