Profile histograms in Python

Question

Profile histograms in Python

I am trying to make a profile histogram for two pandas.DataFrame columns. I would not expect this to be available directly in pandas, but there seems to be nothing in matplotlib either. I have searched around and cannot find it in any package other than rootpy. Before I write this myself, I thought I would ask whether there is a small package that provides profile histograms, possibly under a different name.

If you don’t know what I mean by “profile histogram”, look at the ROOT implementation. http://root.cern.ch/root/html/TProfile.html

+4

python matplotlib pandas histogram

Keith May 17 '14 at 8:53

source share

5 answers

.

import pandas as pd
from pandas import Series, DataFrame
import numpy as np
import matplotlib.pyplot as plt

def Profile(x, y, nbins, xmin, xmax, ax):
    """Draw a profile histogram of ``y`` versus ``x`` on ``ax``.

    The interval from ``xmin`` to ``xmax`` is split into ``nbins``
    equal-width bins.  For every bin the mean of ``y`` is drawn at the bin
    center with a vertical error bar equal to the standard error of the
    mean (sample standard deviation divided by the square root of the
    number of non-null ``y`` values in the bin) and a horizontal bar of
    half the bin width.

    Parameters
    ----------
    x, y : array-like or pandas.Series
        Paired values.  Rows whose ``x`` falls below ``xmin`` or at/above
        ``xmax`` are ignored.
    nbins : int
        Number of bins; must be at least 1.
    xmin, xmax : float
        Lower and upper edge of the binned range.
    ax : object
        Anything exposing a matplotlib-style ``errorbar`` method.

    Returns
    -------
    The ``ax`` that was passed in.

    Raises
    ------
    ValueError
        If ``nbins`` is smaller than 1.
    """
    if nbins < 1:
        raise ValueError("nbins must be a positive integer")

    df = DataFrame({'x': x, 'y': y}).astype(float)

    # linspace keeps the edges floating point even for integer bounds
    binedges = np.linspace(xmin, xmax, nbins + 1)
    bincenters = 0.5 * (binedges[:-1] + binedges[1:])
    half_width = (xmax - xmin) / (2.0 * nbins)

    # np.digitize labels in-range rows 1..nbins; 0 and nbins + 1 are the
    # underflow / overflow labels and are discarded by the reindex below.
    df['bin'] = np.digitize(df['x'].values, binedges)

    stats = df.groupby('bin')['y'].agg(['count', 'mean', 'std'])
    # Empty bins become NaN rows so every array has exactly nbins entries.
    stats = stats.reindex(np.arange(1, nbins + 1))

    ymean = stats['mean'].values
    ymeanerror = (stats['std'] / np.sqrt(stats['count'])).values

    # fmt='none' draws the error bars only, without markers or a line.
    ax.errorbar(bincenters, ymean, yerr=ymeanerror, xerr=half_width,
                fmt='none')
    return ax


def Profile_Matrix(frame, nbins=100, range_padding=0.05):
    """Draw an n-by-n grid of profile histograms for the numeric columns.

    The layout follows pandas' scatter-matrix plot: the panel in row ``i``
    and column ``j`` shows the profile of the row column (on y) binned in
    the column column (on x).  Tick labels are only shown on the left-most
    column and the bottom row.

    Parameters
    ----------
    frame : pandas.DataFrame
        Input data; only its numeric columns are plotted.
    nbins : int, optional
        Number of x bins used in every panel.
    range_padding : float, optional
        Fraction of each column's value range that is added (half on each
        side) to the binned x range.

    Returns
    -------
    numpy.ndarray
        The 2-D array of axes created by ``plt.subplots``.

    Raises
    ------
    ValueError
        If ``frame`` has no numeric columns.
    """
    df = frame.select_dtypes(include=[np.number])
    n = df.columns.size
    if n == 0:
        raise ValueError("frame has no numeric columns to profile")

    fig, axes = plt.subplots(nrows=n, ncols=n, squeeze=False)

    # no gaps between subplots
    fig.subplots_adjust(wspace=0, hspace=0)

    mask = df.notnull()

    # Padded (min, max) of every column, used as the binned x range.
    boundaries_list = []
    for a in df.columns:
        values = df[a].values[mask[a].values]
        rmin_, rmax_ = np.min(values), np.max(values)
        rdelta_ext = (rmax_ - rmin_) * range_padding / 2.
        boundaries_list.append((rmin_ - rdelta_ext, rmax_ + rdelta_ext))

    for i, a in enumerate(df.columns):
        for j, b in enumerate(df.columns):
            # Use only rows where both columns are present.
            common = (mask[a] & mask[b]).values
            xmin, xmax = boundaries_list[j]

            ax = axes[i, j]
            # x is the column variable b and y the row variable a, so the
            # data matches the axis labels set below and every panel in a
            # grid column shares the same x variable.
            Profile(df[b][common], df[a][common], nbins, xmin, xmax, ax)
            ax.set_xlim(xmin, xmax)

            ax.set_xlabel(b, visible=True)
            ax.xaxis.set_visible(True)
            ax.xaxis.set_ticks_position('bottom')
            ax.xaxis.set_label_position('bottom')
            plt.setp(ax.get_xticklabels(), rotation=90)

            ax.set_ylabel(a, visible=True)
            ax.yaxis.set_visible(True)
            ax.yaxis.set_ticks_position('left')
            ax.yaxis.set_label_position('left')

            # Only the outer panels keep their axes.
            if j != 0:
                ax.yaxis.set_visible(False)
            if i != n - 1:
                ax.xaxis.set_visible(False)

    for ax in axes.flat:
        plt.setp(ax.get_xticklabels(), fontsize=8)
        plt.setp(ax.get_yticklabels(), fontsize=8)

    return axes

+2

Keith 19 '14 15:39

You can do this with scipy.stats.binned_statistic.

import scipy.stats
import numpy
import matplotlib.pyplot as plt

# Toy data: random x values and y = x plus Gaussian noise (sigma = 0.2).
n_samples = 10000
x = numpy.random.rand(n_samples)
noise = scipy.stats.norm(0, 0.2).rvs(n_samples)
y = x + noise

# Bin the means of y and y**2 in 50 x bins over (0, 1) with a single call.
means_result = scipy.stats.binned_statistic(
    x, [y, y**2], bins=50, range=(0, 1), statistic='mean'
)
bin_edges = means_result.bin_edges
means, means2 = means_result.statistic

# Per-bin spread from Var(y) = <y^2> - <y>^2.
standard_deviations = numpy.sqrt(means2 - numpy.square(means))
bin_centers = 0.5 * (bin_edges[:-1] + bin_edges[1:])

plt.errorbar(x=bin_centers, y=means, yerr=standard_deviations, linestyle='none', marker='.')

+2

jsw 22 . '16 21:11

@Keith , , . , , .

: , ROOT ProfileHistogram, . . matplotlib.

, . pandas, pandas . ROOT , .

So what you want to do is: split the data into bins in x and, for each bin, compute a statistic of y.

This can be done with np.digitize together with the pandas groupby and aggregate methods.

:

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# just some random numbers to get startet
x = np.random.uniform(-2, 2, 10000)
y = np.random.normal(x**2, np.abs(x) + 1)
df = pd.DataFrame({'x': x, 'y': y})


# calculate which bin each row belongs to, based on `x`
# np.digitize needs the bin edges: 101 edges give 100 equally sized bins
bins = np.linspace(-2, 2, 101)
df['bin'] = np.digitize(x, bins=bins)
bin_centers = 0.5 * (bins[:-1] + bins[1:])
bin_width = bins[1] - bins[0]

# group by bin, so we can calculate stuff
binned = df.groupby('bin')
# calculate mean and standard error of the mean for y in each bin
result = binned['y'].agg(['mean', 'sem'])
# Align to the in-range bin labels 1..100: an empty bin becomes a NaN row
# and the under/overflow labels (0 and 101) are dropped, so the length
# always matches bin_centers.
result = result.reindex(np.arange(1, len(bins)))
result['x'] = bin_centers
result['xerr'] = bin_width / 2

# plot it

result.plot(
    x='x',
    y='mean',
    xerr='xerr',
    yerr='sem',
    linestyle='none',
    capsize=0,
    color='black',
)
plt.savefig('result.png', dpi=300)

ROOT;)

+2

MaxNoe 22 . '16 21:46

As far as I know, matplotlib does not allow you to create profile histograms directly. Instead, you can take a look at the Hippodraw package developed at SLAC, which can be used as a Python extension module. Here is an example of a profile histogram:

http://www.slac.stanford.edu/grp/ek/hippodraw/datareps_root.html#datareps_profilehist

0

Antlab May 17 '14 at 12:30

source share

Ruggero Turra · Accepted Answer · 2016-12-20T16:32:17+0000

Use seaborn. The data is the same as in @MaxNoe's answer.

import numpy as np
import seaborn as sns

# just some random numbers to get started
x = np.random.uniform(-2, 2, 10000)
y = np.random.normal(x**2, np.abs(x) + 1)

# x_bins=10 groups x into 10 bins; fit_reg=False (the documented boolean
# form) suppresses the regression line.
sns.regplot(x=x, y=y, x_bins=10, fit_reg=False)

( , y, ,...)

Profile histograms in Python

More articles: