Reading a file and building a CDF in Python

I need to read a long file with a timestamp in seconds, as well as a CDF schedule using numpy or scipy. I tried with numpy, but it seems that the output is NOT what it should be. Code below: any suggestions are appreciated.

import numpy as np
import matplotlib.pyplot as plt

data = np.loadtxt('Filename.txt')
sorted_data = np.sort(data)
cumulative = np.cumsum(sorted_data)

plt.plot(cumulative)
plt.show()
+4
source share
5 answers

You have two options:

1: you can save the data first. This is easy to do with the function numpy.histogram:

import numpy as np
import matplotlib.pyplot as plt

data = np.loadtxt ('Filename.txt')

# Choose how many bins you want here
num_bins = 20

# Use the histogram function to bin the data
counts, bin_edges = np.histogram(data, bins=num_bins, normed=True)

# Now find the cdf
cdf = np.cumsum(counts)

# And finally plot the cdf
plt.plot(bin_edges[1:], cdf)

plt.show()

2: numpy.cumsum sorted_data , , ( . fooobar.com/questions/113786/...):

import numpy as np

import matplotlib.pyplot as plt

data = np.loadtxt('Filename.txt')

sorted_data = np.sort(data)

yvals=np.arange(len(sorted_data))/float(len(sorted_data)-1)

plt.plot(sorted_data,yvals)

plt.show()

+13

:

  • : .
  • float

numpy.histogram, , . density=False, :

, 1,

, .

import numpy as np
import matplotlib.pyplot as plt

def cdf(data):

    data_size=len(data)

    # Set bins edges
    data_set=sorted(set(data))
    bins=np.append(data_set, data_set[-1]+1)

    # Use the histogram function to bin the data
    counts, bin_edges = np.histogram(data, bins=bins, density=False)

    counts=counts.astype(float)/data_size

    # Find the cdf
    cdf = np.cumsum(counts)

    # Plot the cdf
    plt.plot(bin_edges[0:-1], cdf,linestyle='--', marker="o", color='b')
    plt.ylim((0,1))
    plt.ylabel("CDF")
    plt.grid(True)

    plt.show()

, :

#[ 0.   0.   0.1  0.1  0.2  0.2  0.3  0.3  0.4  0.4  0.6  0.8  1.   1.2]
data = np.concatenate((np.arange(0,0.5,0.1),np.arange(0.6,1.4,0.2),np.arange(0,0.5,0.1)))
cdf(data)

:

Cdf


cdf, ( ):

import numpy as np
import matplotlib.pyplot as plt
from scipy.interpolate import interp1d

def cdf(data):

    data_size=len(data)

    # Set bins edges
    data_set=sorted(set(data))
    bins=np.append(data_set, data_set[-1]+1)

    # Use the histogram function to bin the data
    counts, bin_edges = np.histogram(data, bins=bins, density=False)

    counts=counts.astype(float)/data_size

    # Find the cdf
    cdf = np.cumsum(counts)

    x = bin_edges[0:-1]
    y = cdf

    f = interp1d(x, y)
    f2 = interp1d(x, y, kind='cubic')

    xnew = np.linspace(0, max(x), num=1000, endpoint=True)

    # Plot the cdf
    plt.plot(x, y, 'o', xnew, f(xnew), '-', xnew, f2(xnew), '--')
    plt.legend(['data', 'linear', 'cubic'], loc='best')
    plt.title("Interpolation")
    plt.ylim((0,1))
    plt.ylabel("CDF")
    plt.grid(True)

    plt.show()

Interpolation

+3

plt.plot(sorted_data, np.linspace(0,1,sorted_data.size)

,

+2

Here's an implementation that is slightly more efficient if there are many duplicate values ​​(since we need to sort only unique values). And it displays CDF as a step function, which, strictly speaking, is.

import sys

import numpy as np
import matplotlib.pyplot as plt

from collections import Counter


def read_data(fp):
    t = []
    for line in fp:
        x = float(line.rstrip())
        t.append(x)
    return t


def main(script, filename=None):
    if filename is None:
        fp = sys.stdin
    else:
        fp = open(filename)

    t = read_data(fp)
    counter = Counter(t)

    xs = counter.keys()
    xs.sort()

    ys = np.cumsum(counter.values()).astype(float)
    ys /= ys[-1]

    options = dict(linewidth=3, alpha=0.5)
    plt.step(xs, ys, where='post', **options)
    plt.xlabel('Values')
    plt.ylabel('CDF')
    plt.show()


if __name__ == '__main__':
    main(*sys.argv)
+1
source

Following is the step of my implementation:

1.sort your data

2. Calculate the cumulative probability of each x '

import numpy as np
import matplotlib.pyplab as plt

def cdf(data):
    n = len(data)
    x = np.sort(data) # sort your data
    y = np.arange(1, n + 1) / n # calculate cumulative probability
    return x, y

x_data, y_data = cdf(your_data)
plt.plot(x_data, y_data) 

Example:

test_data = np.random.normal(size= 100)
x_data, y_data = ecdf(test_data)
plt.plot(x_data, y_data, marker= '.', linestyle= 'none')

Figure: Graphics Link

0
source

Source: https://habr.com/ru/post/1547174/


All Articles