Haocheng Ding
Wednesday Nov 30th, 2022
The random numbers generated in Python are not truly random; they are pseudorandom.
By setting a random seed, we can therefore reproduce exactly the same sequence of random numbers.
import random
a = range(1,11)
random.seed(32611) ## 32611 can be replaced by any number.
random.sample(a,2)
## [3, 8]
random.seed(32611) ## if you keep the same random seed, you will end up with the exact same result.
random.sample(a,2)
## [3, 8]
random.seed(32608) ## if you change the random seed, you will end up with a different result.
random.sample(a,2)
## [6, 7]
random.seed(32611)
random.sample(range(1,11),2);random.sample(range(1,11),2);random.sample(range(1,11),2)
## [3, 8]
## [5, 9]
## [1, 9]
random.seed(32611)
random.sample(range(1,11),2);random.sample(range(1,11),2);random.sample(range(1,11),2)
## [3, 8]
## [5, 9]
## [1, 9]
For the normal distribution:
from scipy.stats import norm
scipy.stats.norm.cdf(x, loc=0, scale=1): CDF of the normal distribution, \(\Phi(x)=P(Z\leq x)\), used for p-values. (pnorm() in R)
scipy.stats.norm.pdf(x, loc=0, scale=1): normal density function \(\phi(x) = \Phi'(x)\). (dnorm() in R)
scipy.stats.norm.ppf(q, loc=0, scale=1): quantile function of the normal distribution, \(q(y)=\Phi^{-1}(y)\), i.e. \(\Phi(q(y))=y\). (qnorm() in R)
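A minimal sketch of how these three functions relate, assuming the standard normal:
import scipy.stats
scipy.stats.norm.cdf(1.96)   ## P(Z <= 1.96), about 0.975
scipy.stats.norm.pdf(0)      ## density at 0, about 0.399
scipy.stats.norm.ppf(0.975)  ## the quantile q with Phi(q) = 0.975, about 1.96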
Distribution | Random number generation (numpy.random) |
---|---|
Binomial | numpy.random.binomial |
Poisson | numpy.random.poisson |
Geometric | numpy.random.geometric |
Negative binomial | numpy.random.negative_binomial |
Uniform | numpy.random.uniform |
Exponential | numpy.random.exponential |
Normal | numpy.random.normal |
Gamma | numpy.random.gamma |
Beta | numpy.random.beta |
Student t | numpy.random.standard_t |
F | numpy.random.f |
Chi-squared | numpy.random.chisquare |
Weibull | numpy.random.weibull |
Log-normal | numpy.random.lognormal |
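For example, sampling from two of the distributions in the table above (a quick sketch; parameter names follow NumPy's conventions):
import numpy as np
np.random.seed(32611)
np.random.binomial(n=10, p=0.5, size=5)   ## 5 draws from Binomial(10, 0.5)
np.random.normal(loc=0, scale=1, size=5)  ## 5 draws from N(0, 1)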
Distribution | Density / mass function (scipy.stats) |
---|---|
Binomial | scipy.stats.binom.pmf
Poisson | scipy.stats.poisson.pmf
Geometric | scipy.stats.geom.pmf
Negative binomial | scipy.stats.nbinom.pmf
Uniform | scipy.stats.uniform.pdf |
Exponential | scipy.stats.expon.pdf |
Normal | scipy.stats.norm.pdf |
Gamma | scipy.stats.gamma.pdf |
Beta | scipy.stats.beta.pdf |
Student t | scipy.stats.t.pdf |
F | scipy.stats.f.pdf |
Chi-squared | scipy.stats.chi2.pdf |
Weibull | scipy.stats.weibull_min.pdf |
Log-normal | scipy.stats.lognorm.pdf |
Distribution | CDF (scipy.stats) |
---|---|
Binomial | scipy.stats.binom.cdf |
Poisson | scipy.stats.poisson.cdf |
Geometric | scipy.stats.geom.cdf
Negative binomial | scipy.stats.nbinom.cdf |
Uniform | scipy.stats.uniform.cdf |
Exponential | scipy.stats.expon.cdf |
Normal | scipy.stats.norm.cdf |
Gamma | scipy.stats.gamma.cdf |
Beta | scipy.stats.beta.cdf |
Student t | scipy.stats.t.cdf |
F | scipy.stats.f.cdf |
Chi-squared | scipy.stats.chi2.cdf |
Weibull | scipy.stats.weibull_min.cdf |
Log-normal | scipy.stats.lognorm.cdf |
Distribution | Quantile function (scipy.stats) |
---|---|
Binomial | scipy.stats.binom.ppf |
Poisson | scipy.stats.poisson.ppf |
Geometric | scipy.stats.geom.ppf
Negative binomial | scipy.stats.nbinom.ppf |
Uniform | scipy.stats.uniform.ppf |
Exponential | scipy.stats.expon.ppf |
Normal | scipy.stats.norm.ppf |
Gamma | scipy.stats.gamma.ppf |
Beta | scipy.stats.beta.ppf |
Student t | scipy.stats.t.ppf |
F | scipy.stats.f.ppf |
Chi-squared | scipy.stats.chi2.ppf |
Weibull | scipy.stats.weibull_min.ppf |
Log-normal | scipy.stats.lognorm.ppf |
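As a quick sketch tying the three SciPy tables together for one distribution (Binomial(10, 0.5) is used here only as an illustration):
import scipy.stats
scipy.stats.binom.pmf(5, n=10, p=0.5)    ## P(X = 5)
scipy.stats.binom.cdf(5, n=10, p=0.5)    ## P(X <= 5)
scipy.stats.binom.ppf(0.5, n=10, p=0.5)  ## smallest k with P(X <= k) >= 0.5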
\[f(x;\mu,\sigma)=\cfrac{1}{\sqrt{2\pi\sigma^2}}e^{-\frac{(x-\mu)^2}{2\sigma^2}}\]
import numpy as np
import matplotlib.pyplot as plt
class Gaussian:
    @staticmethod
    def plot(mean, std, lower_bound=None, upper_bound=None, resolution=None,
             title=None, x_label=None, y_label=None, legend_label=None, legend_location="best"):
        # default plotting range: mean +/- 4 standard deviations
        lower_bound = (mean - 4*std) if lower_bound is None else lower_bound
        upper_bound = (mean + 4*std) if upper_bound is None else upper_bound
        resolution = 100 if resolution is None else resolution
        title = title or "Gaussian Distribution"
        x_label = x_label or "x"
        y_label = y_label or "Density"
        legend_label = legend_label or "μ={}, σ={}".format(mean, std)
        X = np.linspace(lower_bound, upper_bound, resolution)
        dist_X = Gaussian._distribution(X, mean, std)
        plt.title(title)
        plt.plot(X, dist_X, label=legend_label)
        plt.xlabel(x_label)
        plt.ylabel(y_label)
        plt.legend(loc=legend_location)
        return plt
    @staticmethod
    def _distribution(X, mean, std):
        # normal density evaluated at each point of X
        return 1./(np.sqrt(2*np.pi)*std)*np.exp(-0.5 * (1./std*(X - mean))**2)
plot = Gaussian.plot(0, 1)
plot = Gaussian.plot(1, 1)
plot = Gaussian.plot(0, 2)
plot.show()
\[f(x;v)=\cfrac{\Gamma(\cfrac{v+1}{2})}{\sqrt{v\pi}\Gamma(v/2)}(1+\cfrac{x^2}{v})^{-\cfrac{v+1}{2}}\]
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats
class Student_t:
    @staticmethod
    def plot(df, lower_bound=None, upper_bound=None, resolution=None,
             title=None, x_label=None, y_label=None, legend_label=None, legend_location="best"):
        resolution = 100 if resolution is None else resolution
        title = title or "Student t Distribution"
        x_label = x_label or "x"
        y_label = y_label or "Density"
        legend_label = legend_label or "df={}".format(df)
        X = np.linspace(lower_bound, upper_bound, resolution)
        dist_X = Student_t._distribution(X, df)
        plt.title(title)
        plt.plot(X, dist_X, label=legend_label)
        plt.xlabel(x_label)
        plt.ylabel(y_label)
        plt.legend(loc=legend_location)
        return plt
    @staticmethod
    def _distribution(X, df):
        # Student t density via scipy
        return scipy.stats.t.pdf(X, df)
plot = Student_t.plot(2, lower_bound=-4, upper_bound=4)
plot = Student_t.plot(4, lower_bound=-4, upper_bound=4)
plot = Student_t.plot(10, lower_bound=-4, upper_bound=4)
plot = Student_t.plot(float('inf'), lower_bound=-4, upper_bound=4) # df -> infinity corresponds to N(0,1); scipy may emit the RuntimeWarning shown below
## /Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/scipy/stats/_continuous_distns.py:6314: RuntimeWarning: invalid value encountered in subtract
## Px = (np.exp(sc.gammaln((r+1)/2)-sc.gammaln(r/2))
\[f(x;k)=\cfrac{1}{2^{k/2}\Gamma(k/2)}x^{k/2-1}e^{-x/2}\]
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats
class Chi2:
    @staticmethod
    def plot(df, lower_bound=None, upper_bound=None, resolution=None,
             title=None, x_label=None, y_label=None, legend_label=None, legend_location="best"):
        resolution = 100 if resolution is None else resolution
        title = title or "Chi-square Distribution"
        x_label = x_label or "x"
        y_label = y_label or "Density"
        legend_label = legend_label or "df={}".format(df)
        X = np.linspace(lower_bound, upper_bound, resolution)
        dist_X = Chi2._distribution(X, df)
        plt.title(title)
        plt.plot(X, dist_X, label=legend_label)
        plt.xlabel(x_label)
        plt.ylabel(y_label)
        plt.legend(loc=legend_location)
        return plt
    @staticmethod
    def _distribution(X, df):
        # chi-square density via scipy
        return scipy.stats.chi2.pdf(X, df)
plot = Chi2.plot(1, lower_bound=0, upper_bound=4)
plot = Chi2.plot(2, lower_bound=0, upper_bound=4)
plot = Chi2.plot(5, lower_bound=0, upper_bound=4)
plot = Chi2.plot(10, lower_bound=0, upper_bound=4)
plt.ylim([0, 1])
## (0.0, 1.0)
\[f(k;\lambda)=\cfrac{\lambda^k e^{-\lambda}}{k!},\quad k=0,1,2,\dots\]
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats
x = np.arange(0, 11) # Poisson support: non-negative integers
# Poisson pmf values for four different rates
y1 = scipy.stats.poisson.pmf(x, mu=1)
y2 = scipy.stats.poisson.pmf(x, mu=2)
y3 = scipy.stats.poisson.pmf(x, mu=3)
y4 = scipy.stats.poisson.pmf(x, mu=4)
fig, axs = plt.subplots(2, 2)
plt.setp(axs, ylim=(0, .5)) # common y-axis limit for all panels
# draw one pmf per panel
axs[0, 0].bar(x, y1); axs[0, 0].set_title('mu=1')
axs[0, 1].bar(x, y2); axs[0, 1].set_title('mu=2')
axs[1, 0].bar(x, y3); axs[1, 0].set_title('mu=3')
axs[1, 1].bar(x, y4); axs[1, 1].set_title('mu=4')
plt.show()
\[f(x;\alpha,\beta)=\cfrac{\Gamma(\alpha+\beta)}{\Gamma(\alpha)\Gamma(\beta)}x^{\alpha-1}(1-x)^{\beta-1}\]
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats
class Beta:
    @staticmethod
    def plot(a, b, lower_bound=None, upper_bound=None, resolution=None,
             title=None, x_label=None, y_label=None, legend_label=None, legend_location="best"):
        resolution = 100 if resolution is None else resolution
        title = title or "Beta Distribution"
        x_label = x_label or "Proportion (p)"
        y_label = y_label or "Density"
        legend_label = legend_label or "a={},b={}".format(a, b)
        X = np.linspace(lower_bound, upper_bound, resolution)
        dist_X = Beta._distribution(X, a, b)
        plt.title(title)
        plt.plot(X, dist_X, label=legend_label)
        plt.xlabel(x_label)
        plt.ylabel(y_label)
        plt.legend(loc=legend_location)
        return plt
    @staticmethod
    def _distribution(X, a, b):
        # beta density via scipy
        return scipy.stats.beta.pdf(X, a, b)
plot = Beta.plot(a=0.25, b=0.25, lower_bound=0, upper_bound=1)
plot = Beta.plot(a=2, b=2, lower_bound=0, upper_bound=1)
plot = Beta.plot(a=2, b=5, lower_bound=0, upper_bound=1)
plot = Beta.plot(a=12, b=2, lower_bound=0, upper_bound=1)
plot = Beta.plot(a=20, b=0.75, lower_bound=0, upper_bound=1)
plot = Beta.plot(a=1, b=1, lower_bound=0, upper_bound=1)
plt.ylim([0, 6])
import random
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm
from numpy import linspace
from scipy.stats import gaussian_kde
np.random.seed(32611) ## seed NumPy's generator (random.seed would not affect np.random)
z = np.random.normal(size=1000) ## generate 1000 samples from N(0,1).
plt.hist(x=z, bins=50, color='gray',alpha=0.7, rwidth=0.8, density=True)
# plot theoretical pdf
x_axis = np.arange(-4, 4, 0.01)
plt.plot(x_axis, norm.pdf(x_axis, 0, 1),color='red',label='Theoretical')
# plot empirical pdf
kde = gaussian_kde(z)
dist_space = linspace( min(z), max(z), 100 )
plt.plot(dist_space, kde(dist_space), color='#104E8B',label='Empirical')
plt.legend(loc="upper left")
plt.grid(axis='y', alpha=0.75)
plt.xlabel('z')
plt.ylabel('Density')
plt.title('Histogram of z')
plt.show()
import random
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats
from statsmodels.distributions.empirical_distribution import ECDF
np.random.seed(32611)
z = np.random.normal(size=1000) ## generate 1000 samples from N(0,1).
# plot theoretical cdf
x = np.linspace(-3, 3, 1000)
y = scipy.stats.norm.cdf(x)
plt.plot(x, y,label='Theoretical distribution')
# plot empirical distribution
ecdf = ECDF(z)
plt.plot(ecdf.x, ecdf.y, label='Empirical distribution')
plt.legend(loc="upper left")
plt.xlabel('x')
plt.ylabel('Distribution')
plt.title('Empirical distribution')
plt.show()
p-value: the probability, when the null hypothesis is true, that the test statistic is as extreme as or more extreme than the observed statistic.
A p-value is a tail (cumulative) probability, so when the null distribution is normal it can be computed with scipy.stats.norm.cdf.
Example: the observed Z value equals 2 and the null distribution is \(N(0,1)\).
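A minimal sketch of the computation (two-sided, then one-sided upper-tail p-value for Z = 2), which gives the two values below:
import scipy.stats
2 * (1 - scipy.stats.norm.cdf(2))  ## two-sided p-value
1 - scipy.stats.norm.cdf(2)        ## one-sided (upper-tail) p-value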
## 0.04550026389635842
## 0.02275013194817921
scipy.stats.norm.ppf(): returns the quantile whose cumulative probability (area under the density curve to its left) equals the input.
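For example (a sketch matching the three values printed below):
scipy.stats.norm.ppf(0.975)  ## upper 2.5% quantile of N(0,1)
scipy.stats.norm.ppf(0.025)  ## lower 2.5% quantile of N(0,1)
scipy.stats.norm.ppf(0.5)    ## median of N(0,1)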
## 1.959963984540054
## -1.9599639845400545
## 0.0
import random
import numpy as np
import scipy.stats
np.random.seed(32611)
group = np.random.normal(size=10)
scipy.stats.ttest_1samp(group,popmean=0)
## Ttest_1sampResult(statistic=-0.6804947657552574, pvalue=0.5133149813477859)
import random
import numpy as np
import scipy.stats
np.random.seed(32611)
group1 = np.random.normal(size=10)
group2 = np.random.normal(size=10,loc=1.2)
scipy.stats.ttest_ind(group1,group2)
## Ttest_indResult(statistic=-3.3392393385155357, pvalue=0.003650753743877968)
import random
import numpy as np
import scipy.stats
np.random.seed(32611)
pre = np.random.normal(size=10)
post = np.random.normal(size=10,loc=1.2)
scipy.stats.ttest_rel(pre,post)
## Ttest_relResult(statistic=-4.681530684381899, pvalue=0.0011497193250913657)
The rank-sum test compares the medians (locations) of two independent samples without any distributional assumption.
It is a non-parametric test.
\(H_0\): the two samples have the same median.
import random
import numpy as np
import scipy.stats
np.random.seed(32611)
group1 = np.random.normal(size=10)
group2 = np.random.normal(size=10,loc=1.2)
scipy.stats.ranksums(group1, group2)
## RanksumsResult(statistic=-2.192193943453518, pvalue=0.028365505605209992)
Use the signed-rank test when the data come in matched pairs.
\(H_0\): the two paired samples have the same median (the paired differences have zero median).
import random
import numpy as np
import scipy.stats
np.random.seed(32611)
group1 = np.random.normal(size=10)
group2 = np.random.normal(size=10,loc=1.2)
scipy.stats.wilcoxon(group1, group2)
## WilcoxonResult(statistic=1.0, pvalue=0.00390625)
##                Disease
## Treatment      Y      N
##     Y         "a"    "c"
##     N         "b"    "d"
##            Myocardial Infarction
## Treatment     Yes       No
## Placebo       189     10845
## Aspirin       104     10933
from scipy.stats import chi2_contingency
observed = [[189, 10845], [104,10933]]
chi2_contingency(observed)
## Output details
#(test statistic, p-value, df, expected frequencies)
## (24.42905651522594, 7.709707547507586e-07, 1, array([[ 146.48008699, 10887.51991301],
## [ 146.51991301, 10890.48008699]]))
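As a sanity check on the expected frequencies in the output above, each expected count equals (row total × column total) / grand total; for the (Placebo, Yes) cell:
row_total = 189 + 10845                  ## Placebo row
col_total = 189 + 104                    ## "Yes" column
grand_total = 189 + 10845 + 104 + 10933
row_total * col_total / grand_total      ## about 146.48, matching the expected count above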
##            Myocardial Infarction
## Treatment     Yes       No
## Placebo       189     10845
## Aspirin       104     10933
from scipy.stats import fisher_exact
observed = [[189, 10845], [104,10933]]
fisher_exact(observed)
## Output details
#(odds ratio, p-value)
## (1.8320539419087136, 5.032835599791868e-07)
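The reported odds ratio can be verified directly from the 2x2 table:
(189 * 10933) / (104 * 10845)  ## about 1.832, matching the first value returned by fisher_exact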
import random
import numpy as np
import scipy.stats
np.random.seed(32611)
group1 = np.random.normal(size=10)
group2 = np.random.normal(size=10,loc=1.2)
scipy.stats.pearsonr(group1,group2) # Pearson correlation: requires Gaussian assumption.
## (-0.3272316520982679, 0.35603198241933337)
scipy.stats.spearmanr(group1,group2) # Spearman correlation: non-parametric, no distribution assumption needed.
## SpearmanrResult(correlation=-0.13939393939393938, pvalue=0.7009318849100584)
Test if random variables X and Y are from the same distribution.
\(H_0\): X and Y are from the same distribution.
import random
import numpy as np
import scipy.stats
np.random.seed(32611)
group1 = np.random.normal(size=50) # N(0,1)
group2 = np.random.normal(size=50,loc=1.2) # N(1.2,1)
scipy.stats.kstest(group1, group2)
## KstestResult(statistic=0.34, pvalue=0.005841778142694731)
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
mtcars = pd.read_csv("https://vincentarelbundock.github.io/Rdatasets/csv/datasets/mtcars.csv", index_col=0)
#Setting response and predictors
y = mtcars.mpg
X = mtcars[["cyl"]]
#Fitting simple Linear Regression Model
linr_model = LinearRegression().fit(X, y)
#Model Fitting Results
print('estimated intercept', 'estimated slope (cyl)', 'R-square')
print(linr_model.intercept_, linr_model.coef_[0], r2_score(y, linr_model.predict(X)))
## estimated intercept estimated slope (cyl) R-square
## 37.88457648546145 -2.875790139064476 0.7261800050938048
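The output below shows the first ten rows of the predictor and their fitted values; a sketch of the calls that would produce it (the exact statements are assumed):
X.head(10)                  ## first ten rows of the cyl predictor
linr_model.predict(X)[:10]  ## fitted mpg values for those ten rows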
## cyl
## Mazda RX4 6
## Mazda RX4 Wag 6
## Datsun 710 4
## Hornet 4 Drive 6
## Hornet Sportabout 8
## Valiant 6
## Duster 360 8
## Merc 240D 4
## Merc 230 4
## Merc 280 6
## array([20.62983565, 20.62983565, 26.38141593, 20.62983565, 14.87825537,
## 20.62983565, 14.87825537, 26.38141593, 26.38141593, 20.62983565])
import statsmodels.api as sm
#define response variable
y = mtcars['mpg']
#define predictor variables
x = mtcars[['cyl']]
#add constant to predictor variables
x = sm.add_constant(x)
#fit linear regression model
model1 = sm.OLS(y, x).fit()
#view model summary
print(model1.summary())
## OLS Regression Results
## ==============================================================================
## Dep. Variable: mpg R-squared: 0.726
## Model: OLS Adj. R-squared: 0.717
## Method: Least Squares F-statistic: 79.56
## Date: Fri, 02 Dec 2022 Prob (F-statistic): 6.11e-10
## Time: 21:03:28 Log-Likelihood: -81.653
## No. Observations: 32 AIC: 167.3
## Df Residuals: 30 BIC: 170.2
## Df Model: 1
## Covariance Type: nonrobust
## ==============================================================================
## coef std err t P>|t| [0.025 0.975]
## ------------------------------------------------------------------------------
## const 37.8846 2.074 18.268 0.000 33.649 42.120
## cyl -2.8758 0.322 -8.920 0.000 -3.534 -2.217
## ==============================================================================
## Omnibus: 1.007 Durbin-Watson: 1.670
## Prob(Omnibus): 0.604 Jarque-Bera (JB): 0.874
## Skew: 0.380 Prob(JB): 0.646
## Kurtosis: 2.720 Cond. No. 24.1
## ==============================================================================
##
## Notes:
## [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
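Individual pieces of this summary can also be pulled directly from the fitted results object using standard statsmodels attributes:
model1.params      ## estimated coefficients (const, cyl)
model1.pvalues     ## p-values for each coefficient
model1.rsquared    ## R-squared (0.726 here)
model1.conf_int()  ## 95% confidence intervals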
y = mtcars['mpg']
x = mtcars[['cyl','disp']]
x = sm.add_constant(x)
model2 = sm.OLS(y, x).fit()
print(model2.summary())
## OLS Regression Results
## ==============================================================================
## Dep. Variable: mpg R-squared: 0.760
## Model: OLS Adj. R-squared: 0.743
## Method: Least Squares F-statistic: 45.81
## Date: Fri, 02 Dec 2022 Prob (F-statistic): 1.06e-09
## Time: 21:03:30 Log-Likelihood: -79.573
## No. Observations: 32 AIC: 165.1
## Df Residuals: 29 BIC: 169.5
## Df Model: 2
## Covariance Type: nonrobust
## ==============================================================================
## coef std err t P>|t| [0.025 0.975]
## ------------------------------------------------------------------------------
## const 34.6610 2.547 13.609 0.000 29.452 39.870
## cyl -1.5873 0.712 -2.230 0.034 -3.043 -0.131
## disp -0.0206 0.010 -2.007 0.054 -0.042 0.000
## ==============================================================================
## Omnibus: 3.200 Durbin-Watson: 1.596
## Prob(Omnibus): 0.202 Jarque-Bera (JB): 2.660
## Skew: 0.701 Prob(JB): 0.264
## Kurtosis: 2.822 Cond. No. 1.27e+03
## ==============================================================================
##
## Notes:
## [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
## [2] The condition number is large, 1.27e+03. This might indicate that there are
## strong multicollinearity or other numerical problems.
y = mtcars['mpg']
x = mtcars.loc[:, mtcars.columns != 'mpg'] # exclude mpg
x = sm.add_constant(x)
model3 = sm.OLS(y, x).fit()
print(model3.summary())
## OLS Regression Results
## ==============================================================================
## Dep. Variable: mpg R-squared: 0.869
## Model: OLS Adj. R-squared: 0.807
## Method: Least Squares F-statistic: 13.93
## Date: Fri, 02 Dec 2022 Prob (F-statistic): 3.79e-07
## Time: 21:03:32 Log-Likelihood: -69.855
## No. Observations: 32 AIC: 161.7
## Df Residuals: 21 BIC: 177.8
## Df Model: 10
## Covariance Type: nonrobust
## ==============================================================================
## coef std err t P>|t| [0.025 0.975]
## ------------------------------------------------------------------------------
## const 12.3034 18.718 0.657 0.518 -26.623 51.229
## cyl -0.1114 1.045 -0.107 0.916 -2.285 2.062
## disp 0.0133 0.018 0.747 0.463 -0.024 0.050
## hp -0.0215 0.022 -0.987 0.335 -0.067 0.024
## drat 0.7871 1.635 0.481 0.635 -2.614 4.188
## wt -3.7153 1.894 -1.961 0.063 -7.655 0.224
## qsec 0.8210 0.731 1.123 0.274 -0.699 2.341
## vs 0.3178 2.105 0.151 0.881 -4.059 4.694
## am 2.5202 2.057 1.225 0.234 -1.757 6.797
## gear 0.6554 1.493 0.439 0.665 -2.450 3.761
## carb -0.1994 0.829 -0.241 0.812 -1.923 1.524
## ==============================================================================
## Omnibus: 1.907 Durbin-Watson: 1.861
## Prob(Omnibus): 0.385 Jarque-Bera (JB): 1.747
## Skew: 0.521 Prob(JB): 0.418
## Kurtosis: 2.526 Cond. No. 1.22e+04
## ==============================================================================
##
## Notes:
## [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
## [2] The condition number is large, 1.22e+04. This might indicate that there are
## strong multicollinearity or other numerical problems.
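Since all three models are fit to the same data, they can be compared with the AIC values reported in the summaries above (lower is better); a minimal sketch:
print(model1.aic, model2.aic, model3.aic)  ## about 167.3, 165.1, 161.7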