Python multi-line regression using OLS code with specific data?

Question

Python multi-line regression using OLS code with specific data?

I use the ols.py code loaded in the scipy Cookbook (the download is in the first paragraph with bold OLS), but I need to understand, and not use random data for the ols function to perform multiple linear regression.

I have a specific dependent variable y and three explanatory variables. Every time I try to put my variables instead of random variables, this gives me an error:

TypeError: this constructor takes no arguments.

Can anyone help? Is it possible to do this?

Here is a copy of the ols code I'm trying to use with the variables I'm trying to enter

 from __future__ import division from scipy import c_, ones, dot, stats, diff from scipy.linalg import inv, solve, det from numpy import log, pi, sqrt, square, diagonal from numpy.random import randn, seed import time class ols: """ Author: Vincent Nijs (+ ?) Email: v-nijs at kellogg.northwestern.edu Last Modified: Mon Jan 15 17:56:17 CST 2007 Dependencies: See import statement at the top of this file Doc: Class for multi-variate regression using OLS Input: dependent variable y_varnm = string with the variable label for y x = independent variables, note that a constant is added by default x_varnm = string or list of variable labels for the independent variables Output: There are no values returned by the class. Summary provides printed output. All other measures can be accessed as follows: Step 1: Create an OLS instance by passing data to the class m = ols(y,x,y_varnm = 'y',x_varnm = ['x1','x2','x3','x4']) Step 2: Get specific metrics To print the coefficients: >>> print mb To print the coefficients p-values: >>> print mp """ y = [29.4, 29.9, 31.4, 32.8, 33.6, 34.6, 35.5, 36.3, 37.2, 37.8, 38.5, 38.8, 38.6, 38.8, 39, 39.7, 40.6, 41.3, 42.5, 43.9, 44.9, 45.3, 45.8, 46.5, 77.1, 48.2, 48.8, 50.5, 51, 51.3, 50.7, 50.7, 50.6, 50.7, 50.6, 50.7] #tuition x1 = [376, 407, 438, 432, 433, 479, 512, 543, 583, 635, 714, 798, 891, 971, 1045, 1106, 1218, 1285, 1356, 1454, 1624, 1782, 1942, 2057, 2179, 2271, 2360, 2506, 2562, 2700, 2903, 3319, 3629, 3874, 4102, 4291] #research and development x2 = [28740.00, 30952.00, 33359.00, 35671.00, 39435.00, 43338.00, 48719.00, 55379.00, 63224.00, 72292.00, 80748.00, 89950.00, 102244.00, 114671.00, 120249.00, 126360.00, 133881.00, 141891.00, 151993.00, 160876.00, 165350.00, 165730.00, 169207.00, 183625.00, 197346.00, 212152.00, 226402.00, 267298.00, 277366.00, 276022.00, 288324.00, 299201.00, 322104.00, 347048.00, 372535.00, 397629.00] #one/none parents x3 = [11610, 12143, 12486, 13015, 13028, 13327, 14074, 14094, 14458, 14878, 15610, 15649, 15584, 16326, 16379, 16923, 17237, 17088, 17634, 18435, 19327, 19712, 21424, 21978, 22684, 22597, 22735, 22217, 22214, 22655, 23098, 23602, 24013, 24003, 21593, 22319] def __init__(self,y,x1,y_varnm = 'y',x_varnm = ''): """ Initializing the ols class. """ self.y = y #self.x1 = c_[ones(x1.shape[0]),x1] self.y_varnm = y_varnm if not isinstance(x_varnm,list): self.x_varnm = ['const'] + list(x_varnm) else: self.x_varnm = ['const'] + x_varnm # Estimate model using OLS self.estimate() def estimate(self): # estimating coefficients, and basic stats self.inv_xx = inv(dot(self.xT,self.x)) xy = dot(self.xT,self.y) self.b = dot(self.inv_xx,xy) # estimate coefficients self.nobs = self.y.shape[0] # number of observations self.ncoef = self.x.shape[1] # number of coef. self.df_e = self.nobs - self.ncoef # degrees of freedom, error self.df_r = self.ncoef - 1 # degrees of freedom, regression self.e = self.y - dot(self.x,self.b) # residuals self.sse = dot(self.e,self.e)/self.df_e # SSE self.se = sqrt(diagonal(self.sse*self.inv_xx)) # coef. standard errors self.t = self.b / self.se # coef. t-statistics self.p = (1-stats.t.cdf(abs(self.t), self.df_e)) * 2 # coef. p-values self.R2 = 1 - self.e.var()/self.y.var() # model R-squared self.R2adj = 1-(1-self.R2)*((self.nobs-1)/(self.nobs-self.ncoef)) # adjusted R-square self.F = (self.R2/self.df_r) / ((1-self.R2)/self.df_e) # model F-statistic self.Fpv = 1-stats.f.cdf(self.F, self.df_r, self.df_e) # F-statistic p-value def dw(self): """ Calculates the Durbin-Waston statistic """ de = diff(self.e,1) dw = dot(de,de) / dot(self.e,self.e); return dw def omni(self): """ Omnibus test for normality """ return stats.normaltest(self.e) def JB(self): """ Calculate residual skewness, kurtosis, and do the JB test for normality """ # Calculate residual skewness and kurtosis skew = stats.skew(self.e) kurtosis = 3 + stats.kurtosis(self.e) # Calculate the Jarque-Bera test for normality JB = (self.nobs/6) * (square(skew) + (1/4)*square(kurtosis-3)) JBpv = 1-stats.chi2.cdf(JB,2); return JB, JBpv, skew, kurtosis def ll(self): """ Calculate model log-likelihood and two information criteria """ # Model log-likelihood, AIC, and BIC criterion values ll = -(self.nobs*1/2)*(1+log(2*pi)) - (self.nobs/2)*log(dot(self.e,self.e)/self.nobs) aic = -2*ll/self.nobs + (2*self.ncoef/self.nobs) bic = -2*ll/self.nobs + (self.ncoef*log(self.nobs))/self.nobs return ll, aic, bic def summary(self): """ Printing model output to screen """ # local time & date t = time.localtime() # extra stats ll, aic, bic = self.ll() JB, JBpv, skew, kurtosis = self.JB() omni, omnipv = self.omni() # printing output to screen print '\n==============================================================================' print "Dependent Variable: " + self.y_varnm print "Method: Least Squares" print "Date: ", time.strftime("%a, %d %b %Y",t) print "Time: ", time.strftime("%H:%M:%S",t) print '# obs: %5.0f' % self.nobs print '# variables: %5.0f' % self.ncoef print '==============================================================================' print 'variable coefficient std. Error t-statistic prob.' print '==============================================================================' for i in range(len(self.x_varnm)): print '''% -5s % -5.6f % -5.6f % -5.6f % -5.6f''' % tuple([self.x_varnm[i],self.b[i],self.se[i],self.t[i],self.p[i]]) print '==============================================================================' print 'Models stats Residual stats' print '==============================================================================' print 'R-squared % -5.6f Durbin-Watson stat % -5.6f' % tuple([self.R2, self.dw()]) print 'Adjusted R-squared % -5.6f Omnibus stat % -5.6f' % tuple([self.R2adj, omni]) print 'F-statistic % -5.6f Prob(Omnibus stat) % -5.6f' % tuple([self.F, omnipv]) print 'Prob (F-statistic) % -5.6f JB stat % -5.6f' % tuple([self.Fpv, JB]) print 'Log likelihood % -5.6f Prob(JB) % -5.6f' % tuple([ll, JBpv]) print 'AIC criterion % -5.6f Skew % -5.6f' % tuple([aic, skew]) print 'BIC criterion % -5.6f Kurtosis % -5.6f' % tuple([bic, kurtosis]) print '==============================================================================' if __name__ == '__main__': ########################## ### testing the ols class ########################## # intercept is added, by default m = ols(y,x1,y_varnm = 'y',x_varnm = ['x1','x2','x3']) m.summary()

+6

python scipy regression

Rencam Sep 17 '11 at 10:58

source share

3 answers

thomash · Answer 1 · 2013-11-03T16:13:53+0000

You must distinguish between two cases: i) you just want to solve the equation. ii) you also want to find out statistical information about your model. You can do i) with np.linalg.lstsq; and for ii) you better use statsmodels.

Below you will find an example with two solutions:

 # The standard imports import numpy as np import pandas as pd # For the statistic from statsmodels.formula.api import ols def generatedata(): ''' Generate and show the data ''' x = np.linspace(-5,5,101) (X,Y) = np.meshgrid(x,x) # To get reproducable values, I provide a seed value np.random.seed(987654321) Z = -5 + 3*X-0.5*Y+np.random.randn(np.shape(X)[0], np.shape(X)[1]) return (X.flatten(),Y.flatten(),Z.flatten()) def regressionmodel(X,Y,Z): '''Multilinear regression model, calculating fit, P-values, confidence intervals etc.''' # Convert the data into a Pandas DataFrame df = pd.DataFrame({'x':X, 'y':Y, 'z':Z}) # Fit the model model = ols("z ~ x + y", df).fit() # Print the summary print(model.summary()) return model._results.params # should be array([-4.99754526, 3.00250049, -0.50514907]) def linearmodel(X,Y,Z): '''Just fit the plane''' M = np.vstack((np.ones(len(X)), X, Y)).T bestfit = np.linalg.lstsq(M,Z)[0] print('Best fit plane:', bestfit) return bestfit if __name__ == '__main__': (X,Y,Z) = generatedata() regressionmodel(X,Y,Z) linearmodel(X,Y,Z)

user333700 · Answer 2 · 2011-09-19T04:14:31+0000

possibly using http://pypi.python.org/pypi/scikits.statsmodels and it has more features

 import numpy as np import scikits.statsmodels.api as sm y = [29.4, 29.9, 31.4, 32.8, 33.6, 34.6, 35.5, 36.3, 37.2, 37.8, 38.5, 38.8, 38.6, 38.8, 39, 39.7, 40.6, 41.3, 42.5, 43.9, 44.9, 45.3, 45.8, 46.5, 77.1, 48.2, 48.8, 50.5, 51, 51.3, 50.7, 50.7, 50.6, 50.7, 50.6, 50.7] #tuition x1 = [376, 407, 438, 432, 433, 479, 512, 543, 583, 635, 714, 798, 891, 971, 1045, 1106, 1218, 1285, 1356, 1454, 1624, 1782, 1942, 2057, 2179, 2271, 2360, 2506, 2562, 2700, 2903, 3319, 3629, 3874, 4102, 4291] #research and development x2 = [28740.00, 30952.00, 33359.00, 35671.00, 39435.00, 43338.00, 48719.00, 55379.00, 63224.00, 72292.00, 80748.00, 89950.00, 102244.00, 114671.00, 120249.00, 126360.00, 133881.00, 141891.00, 151993.00, 160876.00, 165350.00, 165730.00, 169207.00, 183625.00, 197346.00, 212152.00, 226402.00, 267298.00, 277366.00, 276022.00, 288324.00, 299201.00, 322104.00, 347048.00, 372535.00, 397629.00] #one/none parents x3 = [11610, 12143, 12486, 13015, 13028, 13327, 14074, 14094, 14458, 14878, 15610, 15649, 15584, 16326, 16379, 16923, 17237, 17088, 17634, 18435, 19327, 19712, 21424, 21978, 22684, 22597, 22735, 22217, 22214, 22655, 23098, 23602, 24013, 24003, 21593, 22319] x = np.column_stack((x1,x2,x3)) #stack explanatory variables into an array x = sm.add_constant(x, prepend=True) #add a constant res = sm.OLS(y,x).fit() #create a model and fit it print res.params print res.bse print res.summary()

Foo bah · Answer 3 · 2011-09-18T00:02:14+0000

the ols function takes a second independent dataset as the second argument. Try

 m = ols(y, [x1, x2, x3], ...)

although I suspect you might need to wrap it in numpy arrays:

 x = numpy.ndarray([3, len(x1)]) x[0] = numpy.array(x1) x[1] = numpy.array(x2) x[2] = numpy.array(x3) m = ols(y, x, ...)

Python multi-line regression using OLS code with specific data?

More articles: