Regresión lineal con Python y statsmodels
La regresión lineal es un modelo usado para aproximar la relación de dependencia entre una variable dependiente Y, y variables independientes X.
Por ejemplo, puede predecir cuánto de las ventas de un vendedor (la variable dependiente) se debe a variables independientes como su educación, años de experiencia, horas trabajadas, etc.
El módulo statsmodels permite la estimación por mínimos cuadrados ordinarios (OLS), mínimos cuadrados ponderados (WLS) y mínimos cuadrados generalizados (GLS).
import numpy as np import statsmodels.api as sm spector_data = sm.datasets.spector.load() spector_data.exog = sm.add_constant(spector_data.exog, prepend=False) mod = sm.OLS(spector_data.endog, spector_data.exog) res = mod.fit() print(res.summary())
Resultado
OLS Regression Results ============================================================================== Dep. Variable: y R-squared: 0.416 Model: OLS Adj. R-squared: 0.353 Method: Least Squares F-statistic: 6.646 Date: Mon, 14 May 2018 Prob (F-statistic): 0.00157 Time: 21:48:12 Log-Likelihood: -12.978 No. Observations: 32 AIC: 33.96 Df Residuals: 28 BIC: 39.82 Df Model: 3 Covariance Type: nonrobust ============================================================================== coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------ x1 0.4639 0.162 2.864 0.008 0.132 0.796 x2 0.0105 0.019 0.539 0.594 -0.029 0.050 x3 0.3786 0.139 2.720 0.011 0.093 0.664 const -1.4980 0.524 -2.859 0.008 -2.571 -0.425 ============================================================================== Omnibus: 0.176 Durbin-Watson: 2.346 Prob(Omnibus): 0.916 Jarque-Bera (JB): 0.167 Skew: 0.141 Prob(JB): 0.920 Kurtosis: 2.786 Cond. No. 176. ==============================================================================
Mínimos cuadrados ordinarios (OLS)
%matplotlib inline from __future__ import print_function import numpy as np import statsmodels.api as sm import matplotlib.pyplot as plt from statsmodels.sandbox.regression.predstd import wls_prediction_std np.random.seed(9876789) #Datos artificiales nsample = 100 x = np.linspace(0, 10, 100) X = np.column_stack((x, x**2)) beta = np.array([1, 0.1, 10]) e = np.random.normal(size=nsample) X = sm.add_constant(X) y = np.dot(X, beta) + e model = sm.OLS(y, X) results = model.fit() print(results.summary()) print('Parameters: ', results.params) print('R2: ', results.rsquared)
Mínimos cuadrados ponderados (WLS)
%matplotlib inline from __future__ import print_function import numpy as np from scipy import stats import statsmodels.api as sm import matplotlib.pyplot as plt from statsmodels.sandbox.regression.predstd import wls_prediction_std from statsmodels.iolib.table import (SimpleTable, default_txt_fmt) np.random.seed(1024) #Datos artificiales: Heteroscedasticidad 2 grupos. nsample = 50 x = np.linspace(0, 20, nsample) X = np.column_stack((x, (x - 5)**2)) X = sm.add_constant(X) beta = [5., 0.5, -0.01] sig = 0.5 w = np.ones(nsample) w[nsample * 6//10:] = 3 y_true = np.dot(X, beta) e = np.random.normal(size=nsample) y = y_true + sig * w * e X = X[:,[0,1]] #Conociendo la verdadera varianza de la heterocedasticidad mod_wls = sm.WLS(y, X, weights=1./(w ** 2)) res_wls = mod_wls.fit() print(res_wls.summary())
Mínimos cuadrados generalizados (GLS)
from __future__ import print_function import statsmodels.api as sm import numpy as np from statsmodels.iolib.table import (SimpleTable, default_txt_fmt) # El conjunto de datos de Longley es un conjunto de datos de series de tiempo data = sm.datasets.longley.load() data.exog = sm.add_constant(data.exog) print(data.exog[:5]) # Primero obtendremos los residuos del ajuste. ols_resid = sm.OLS(data.endog, data.exog).fit().resid resid_fit = sm.OLS(ols_resid[1:], sm.add_constant(ols_resid[:-1])).fit() print(resid_fit.tvalues[1]) print(resid_fit.pvalues[1]) rho = resid_fit.params[1] from scipy.linalg import toeplitz toeplitz(range(5)) order = toeplitz(range(len(ols_resid))) sigma = rho**order gls_model = sm.GLS(data.endog, data.exog, sigma=sigma) gls_results = gls_model.fit() glsar_model = sm.GLSAR(data.endog, data.exog, 1) glsar_results = glsar_model.iterative_fit(1) print(glsar_results.summary())
Written by Tutor