Introduction to Biostatistical Computing PHC 6937

Basic statistical inference in Python

Haocheng Ding

Wednesday Nov 30th, 2022

Random samples from a given pool of numbers

import random
a = range(1,11)
random.sample(a, 2)
## [1, 5]
random.sample(a, 2)
## [9, 1]
random.sample(a, 2)
## [1, 4]

Are they really random? Can we generate the same random numbers (for reproducibility)?

import random
a = range(1,11)
random.seed(32611) ## 32611 can be replaced by any number.
random.sample(a,2)
## [3, 8]
random.seed(32611) ## if you keep the same random seed, you will end up with the exact same result.
random.sample(a,2)
## [3, 8]
random.seed(32608) ## if you update the random seed, you will end up with a different result.
random.sample(a,2)
## [6, 7]

Each time the seed is set, the same sequence follows

random.seed(32611)
random.sample(range(1,11),2);random.sample(range(1,11),2);random.sample(range(1,11),2)
## [3, 8]
## [5, 9]
## [1, 9]
random.seed(32611)
random.sample(range(1,11),2);random.sample(range(1,11),2);random.sample(range(1,11),2)
## [3, 8]
## [5, 9]
## [1, 9]

Sampling from a given pool of numbers with replacement

import random
import string

random.sample(range(1,11),10) ## default is without replacement
## [5, 2, 4, 6, 8, 1, 9, 7, 3, 10]
random.choices(range(1,11),k=10) ## with replacement
## [9, 4, 8, 8, 1, 8, 3, 2, 3, 2]
random.choices(string.ascii_uppercase,k=10)
## ['N', 'C', 'I', 'T', 'I', 'M', 'M', 'Z', 'N', 'V']

Random number generation from the normal distribution

For the normal distribution:

from scipy.stats import norm
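
A quick sketch of drawing normal variates (the parameters and sizes here are illustrative):

import numpy as np
from scipy.stats import norm

np.random.seed(32611)
norm.rvs(loc=0, scale=1, size=3)          ## 3 draws from N(0,1) via scipy
np.random.normal(loc=0, scale=1, size=3)  ## equivalent draws via numpy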

Random variable simulators in Python

Distribution Python command
Binomial numpy.random.binomial
Poisson numpy.random.poisson
Geometric numpy.random.geometric
Negative binomial numpy.random.negative_binomial
Uniform numpy.random.uniform
Exponential numpy.random.exponential
Normal numpy.random.normal
Gamma numpy.random.gamma
Beta numpy.random.beta
Student t numpy.random.standard_t
F numpy.random.f
Chi-squared numpy.random.chisquare
Weibull numpy.random.weibull
Log-normal numpy.random.lognormal
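
For example, drawing from a few of these (a minimal sketch; the parameter values are illustrative):

import numpy as np

np.random.seed(32611)
np.random.binomial(n=10, p=0.5, size=5)   ## 5 draws from Binomial(10, 0.5)
np.random.poisson(lam=2, size=5)          ## 5 draws from Poisson(2)
np.random.normal(loc=0, scale=1, size=5)  ## 5 draws from N(0, 1)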

Random variable density in Python

Distribution Python command
Binomial scipy.stats.binom.pmf
Poisson scipy.stats.poisson.pmf
Geometric scipy.stats.geom.pmf
Negative binomial scipy.stats.nbinom.pmf
Uniform scipy.stats.uniform.pdf
Exponential scipy.stats.expon.pdf
Normal scipy.stats.norm.pdf
Gamma scipy.stats.gamma.pdf
Beta scipy.stats.beta.pdf
Student t scipy.stats.t.pdf
F scipy.stats.f.pdf
Chi-squared scipy.stats.chi2.pdf
Weibull scipy.stats.weibull_min.pdf
Log-normal scipy.stats.lognorm.pdf
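
Discrete distributions use .pmf and continuous ones use .pdf. A small sketch evaluating each (the arguments are illustrative):

import scipy.stats

scipy.stats.binom.pmf(3, n=10, p=0.5) ## P(X = 3) for X ~ Binomial(10, 0.5): 120/1024 ≈ 0.117
scipy.stats.norm.pdf(0)               ## N(0,1) density at 0: 1/sqrt(2*pi) ≈ 0.399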

Random variable distribution tail probability in Python

Note that .cdf returns the lower-tail probability \(P(X\le x)\); the upper-tail probability is available via the corresponding .sf (survival function).

Distribution Python command
Binomial scipy.stats.binom.cdf
Poisson scipy.stats.poisson.cdf
Geometric scipy.stats.geom.cdf
Negative binomial scipy.stats.nbinom.cdf
Uniform scipy.stats.uniform.cdf
Exponential scipy.stats.expon.cdf
Normal scipy.stats.norm.cdf
Gamma scipy.stats.gamma.cdf
Beta scipy.stats.beta.cdf
Student t scipy.stats.t.cdf
F scipy.stats.f.cdf
Chi-squared scipy.stats.chi2.cdf
Weibull scipy.stats.weibull_min.cdf
Log-normal scipy.stats.lognorm.cdf
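
For instance, for the standard normal (a short sketch):

import scipy.stats

scipy.stats.norm.cdf(1.96) ## lower-tail probability P(Z <= 1.96) ≈ 0.975
scipy.stats.norm.sf(1.96)  ## upper-tail probability P(Z > 1.96) ≈ 0.025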

Random variable distribution quantile in Python

Distribution Python command
Binomial scipy.stats.binom.ppf
Poisson scipy.stats.poisson.ppf
Geometric scipy.stats.geom.ppf
Negative binomial scipy.stats.nbinom.ppf
Uniform scipy.stats.uniform.ppf
Exponential scipy.stats.expon.ppf
Normal scipy.stats.norm.ppf
Gamma scipy.stats.gamma.ppf
Beta scipy.stats.beta.ppf
Student t scipy.stats.t.ppf
F scipy.stats.f.ppf
Chi-squared scipy.stats.chi2.ppf
Weibull scipy.stats.weibull_min.ppf
Log-normal scipy.stats.lognorm.ppf
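
Since .ppf is the inverse of .cdf, the two round-trip (a quick sketch):

import scipy.stats

scipy.stats.chi2.ppf(0.95, df=1) ## 95th percentile of chi-square(1) ≈ 3.84
scipy.stats.chi2.cdf(3.84, df=1) ## recovers approximately 0.95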

Normal distribution

\[f(x;\mu,\sigma)=\cfrac{1}{\sqrt{2\pi\sigma^2}}e^{-\frac{(x-\mu)^2}{2\sigma^2}}\]

import numpy as np
import matplotlib.pyplot as plt


class Gaussian:
  @staticmethod
  def plot(mean, std, lower_bound=None, upper_bound=None, resolution=None,
    title=None, x_label=None, y_label=None, legend_label=None, legend_location="best"):
    
    lower_bound = ( mean - 4*std ) if lower_bound is None else lower_bound
    upper_bound = ( mean + 4*std ) if upper_bound is None else upper_bound
    resolution  = 100 if resolution is None else resolution
    
    title        = title        or "Gaussian Distribution"
    x_label      = x_label      or "x"
    y_label      = y_label      or "Density"
    legend_label = legend_label or "μ={}, σ={}".format(mean, std)
    
    X = np.linspace(lower_bound, upper_bound, resolution)
    dist_X = Gaussian._distribution(X, mean, std)
    
    plt.title(title)
    
    plt.plot(X, dist_X, label=legend_label)
    
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.legend(loc=legend_location)
    
    return plt
  
  @staticmethod
  def _distribution(X, mean, std):
    return 1./(np.sqrt(2*np.pi)*std)*np.exp(-0.5 * (1./std*(X - mean))**2)
  
plot = Gaussian.plot(0, 1)
plot = Gaussian.plot(1, 1)
plot = Gaussian.plot(0, 2)
plot.show()

t distribution

\[f(x;v)=\cfrac{\Gamma\!\left(\cfrac{v+1}{2}\right)}{\sqrt{v\pi}\,\Gamma(v/2)}\left(1+\cfrac{x^2}{v}\right)^{-\frac{v+1}{2}}\]

import numpy as np
import matplotlib.pyplot as plt
import scipy.stats


class Student_t:
  @staticmethod
  def plot(df, lower_bound=None, upper_bound=None, resolution=None,
    title=None, x_label=None, y_label=None, legend_label=None, legend_location="best"):
    
    resolution  = 100 if resolution is None else resolution
    
    title        = title        or "Student t Distribution"
    x_label      = x_label      or "x"
    y_label      = y_label      or "Density"
    legend_label = legend_label or "df={}".format(df)
    
    X = np.linspace(lower_bound, upper_bound, resolution)
    dist_X = Student_t._distribution(X, df)
    
    plt.title(title)
    
    plt.plot(X, dist_X, label=legend_label)
    
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.legend(loc=legend_location)
    
    return plt
  
  @staticmethod
  def _distribution(X, df):
    return scipy.stats.t.pdf(X, df)
  
plot = Student_t.plot(2,  lower_bound=-4, upper_bound=4)
plot = Student_t.plot(4,  lower_bound=-4, upper_bound=4)
plot = Student_t.plot(10,  lower_bound=-4, upper_bound=4)
plot = Student_t.plot(float('inf'),  lower_bound=-4, upper_bound=4) ## df=inf is the N(0,1) limit; scipy emits a warning here
## /Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/scipy/stats/_continuous_distns.py:6314: RuntimeWarning: invalid value encountered in subtract
##   Px = (np.exp(sc.gammaln((r+1)/2)-sc.gammaln(r/2))
plot.show()

Chi-square distribution

\[f(x;k)=\cfrac{1}{2^{k/2}\Gamma(k/2)}x^{k/2-1}e^{-x/2}\]

import numpy as np
import matplotlib.pyplot as plt
import scipy.stats


class Chi2:
  @staticmethod
  def plot(df, lower_bound=None, upper_bound=None, resolution=None,
    title=None, x_label=None, y_label=None, legend_label=None, legend_location="best"):
    
    resolution  = 100 if resolution is None else resolution
    
    title        = title        or "Chi-square Distribution"
    x_label      = x_label      or "x"
    y_label      = y_label      or "Density"
    legend_label = legend_label or "df={}".format(df)
    
    X = np.linspace(lower_bound, upper_bound, resolution)
    dist_X = Chi2._distribution(X, df)
    
    plt.title(title)
    
    plt.plot(X, dist_X, label=legend_label)
    
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.legend(loc=legend_location)
    
    return plt
  
  @staticmethod
  def _distribution(X, df):
    return scipy.stats.chi2.pdf(X, df)
  
plot = Chi2.plot(1,  lower_bound=0, upper_bound=4)
plot = Chi2.plot(2,  lower_bound=0, upper_bound=4)
plot = Chi2.plot(5,  lower_bound=0, upper_bound=4)
plot = Chi2.plot(10,  lower_bound=0, upper_bound=4)
plt.ylim([0, 1])
## (0.0, 1.0)
plot.show()

Relationship between Normal distribution and Chi-square distribution

If \(Z\sim N(0,1)\), then \(Z^2\sim\chi_1^2\), so sorted squared normal samples plotted against sorted \(\chi_1^2\) samples should fall near the identity line.

## verification via qq-plot

import numpy as np
import matplotlib.pyplot as plt
import scipy.stats

np.random.seed(32611) ## seed numpy's generator (random.seed() would not affect np.random)

x1 = np.random.normal(size=1000)
x2 = np.random.chisquare(df=1,size=1000)

x1_sorted = np.sort(np.square(x1))
x2_sorted = np.sort(x2)

plt.xlim([0, 5])
plt.ylim([0, 5])
plt.plot(x1_sorted,x2_sorted,"o")
plt.show()
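
An alternative numerical check (a hedged sketch): a one-sample KS test of the squared normal draws against the \(\chi_1^2\) CDF should not reject.

import numpy as np
import scipy.stats

np.random.seed(32611)
z = np.random.normal(size=1000)
scipy.stats.kstest(np.square(z), 'chi2', args=(1,)) ## a large p-value is consistent with Z^2 ~ chi-square(1)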

Poisson distribution

\[f(k;\lambda)=\cfrac{\lambda^ke^{-\lambda}}{k!},\quad k=0,1,2,\dots\]

import numpy as np
import matplotlib.pyplot as plt
import scipy.stats

x = np.arange(0, 10) ## the Poisson pmf is supported on the non-negative integers
# poisson distribution data for y-axis
y1 = scipy.stats.poisson.pmf(x, mu=1)
y2 = scipy.stats.poisson.pmf(x, mu=2)
y3 = scipy.stats.poisson.pmf(x, mu=3)
y4 = scipy.stats.poisson.pmf(x, mu=4)

fig, axs = plt.subplots(2, 2)

plt.setp(axs, ylim=(0,.5)) # set y-axis limit
axs[0, 0].bar(x, y1,color='black')
axs[0, 0].set_title('lambda=1')
axs[0, 1].bar(x, y2,color='red')
axs[0, 1].set_title('lambda=2')
axs[1, 0].bar(x, y3,color='green')
axs[1, 0].set_title('lambda=3')
axs[1, 1].bar(x, y4)
axs[1, 1].set_title('lambda=4')


for ax in axs.flat:
    ax.set(xlabel='x', ylabel='density')

# Hide x labels and tick labels for top plots and y ticks for right plots.
for ax in axs.flat:
    ax.label_outer()
plt.show()

Beta distribution

\[f(x;\alpha,\beta)=\cfrac{\Gamma(\alpha+\beta)}{\Gamma(\alpha)\Gamma(\beta)}x^{\alpha-1}(1-x)^{\beta-1}\]

import numpy as np
import matplotlib.pyplot as plt
import scipy.stats


class Beta:
  @staticmethod
  def plot(a, b, lower_bound=None, upper_bound=None, resolution=None,
    title=None, x_label=None, y_label=None, legend_label=None, legend_location="best"):
    
    resolution  = 100 if resolution is None else resolution
    
    title        = title        or "Beta Distribution"
    x_label      = x_label      or "Proportion (p)"
    y_label      = y_label      or "Density"
    legend_label = legend_label or "a={},b={}".format(a,b)
    
    X = np.linspace(lower_bound, upper_bound, resolution)
    dist_X = Beta._distribution(X, a, b)
    
    plt.title(title)
    
    plt.plot(X, dist_X, label=legend_label)
    
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.legend(loc=legend_location)
    
    return plt
  
  @staticmethod
  def _distribution(X, a, b):
    return scipy.stats.beta.pdf(X, a, b)
  
plot = Beta.plot(a=0.25, b=0.25, lower_bound=0, upper_bound=1)
plot = Beta.plot(a=2, b=2,  lower_bound=0, upper_bound=1)
plot = Beta.plot(a=2, b=5,  lower_bound=0, upper_bound=1)
plot = Beta.plot(a=12, b=2, lower_bound=0, upper_bound=1)
plot = Beta.plot(a=20, b=0.75, lower_bound=0, upper_bound=1)
plot = Beta.plot(a=1, b=1, lower_bound=0, upper_bound=1)
plt.ylim([0, 6])
plot.show()

Beta distribution properties
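
For \(X\sim\text{Beta}(\alpha,\beta)\), \(E[X]=\cfrac{\alpha}{\alpha+\beta}\) and \(\text{Var}(X)=\cfrac{\alpha\beta}{(\alpha+\beta)^2(\alpha+\beta+1)}\); Beta(1,1) is the Uniform(0,1) distribution. A quick simulation check of these moments (a minimal sketch; the sample values will vary slightly):

import numpy as np

np.random.seed(32611)
a, b = 2, 5
x = np.random.beta(a, b, size=100000)
x.mean() ## close to a/(a+b) = 2/7 ≈ 0.286
x.var()  ## close to a*b/((a+b)**2*(a+b+1)) = 10/392 ≈ 0.0255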

Samples from Normal distribution

import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm, gaussian_kde

np.random.seed(32611) ## seed numpy's generator (random.seed() would not affect np.random)
z = np.random.normal(size=1000) ## generate 1000 samples from N(0,1).
plt.hist(x=z, bins=50, color='gray',alpha=0.7, rwidth=0.8, density=True)

# plot theoretical pdf
x_axis = np.arange(-4, 4, 0.01)
plt.plot(x_axis, norm.pdf(x_axis, 0, 1),color='red',label='Theoretical')

# plot empirical pdf
kde = gaussian_kde(z)
dist_space = np.linspace( min(z), max(z), 100 )
plt.plot(dist_space, kde(dist_space), color='#104E8B',label='Empirical')

plt.legend(loc="upper left")
plt.grid(axis='y', alpha=0.75)
plt.xlabel('z')
plt.ylabel('Density')
plt.title('Histogram of z')
plt.show()

Check CDF

import numpy as np
import matplotlib.pyplot as plt
import scipy.stats
from statsmodels.distributions.empirical_distribution import ECDF

np.random.seed(32611) ## seed numpy's generator (random.seed() would not affect np.random)
z = np.random.normal(size=1000) ## generate 1000 samples from N(0,1).



# plot theoretical cdf
x = np.linspace(-3, 3, 1000)
y = scipy.stats.norm.cdf(x)
plt.plot(x, y,label='Theoretical distribution')

# plot empirical distribution
ecdf = ECDF(z)
plt.plot(ecdf.x, ecdf.y, label='Empirical distribution')

plt.legend(loc="upper left")
plt.xlabel('x')
plt.ylabel('Distribution')
plt.title('Empirical vs. theoretical CDF')
plt.show()

p-value

Suppose the observed z-statistic is 2. The p-value is the probability, under the null hypothesis, of a result at least as extreme:

import scipy.stats

2*(1-scipy.stats.norm.cdf(2)) ## Two-sided test (equivalently 2*scipy.stats.norm.sf(2))
## 0.04550026389635842
1-scipy.stats.norm.cdf(2) ## One-sided test (equivalently scipy.stats.norm.sf(2))
## 0.02275013194817921

Quantile

scipy.stats.norm.ppf(): returns the quantile corresponding to a given cumulative probability, i.e., the inverse of the CDF.

import scipy.stats

scipy.stats.norm.ppf(0.975)
## 1.959963984540054
scipy.stats.norm.ppf(.025)
## -1.9599639845400545
scipy.stats.norm.ppf(.5)
## 0.0

Commonly used statistical tests

One-sample T test

import random
import numpy as np
import scipy.stats

random.seed(32611) ## caution: this seeds Python's random module only; the numpy draws below would need np.random.seed()
group = np.random.normal(size=10)

scipy.stats.ttest_1samp(group,popmean=0)
## Ttest_1sampResult(statistic=-0.6804947657552574, pvalue=0.5133149813477859)
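
The statistic can be verified by hand: \(t=\bar{x}/(s/\sqrt{n})\), where \(s\) is the sample standard deviation. A minimal sketch (the data here are illustrative, not from the slides):

import numpy as np
import scipy.stats

x = np.array([0.2, -0.5, 1.1, 0.3, -0.8])
t = x.mean() / (x.std(ddof=1) / np.sqrt(len(x))) ## same value as scipy.stats.ttest_1samp(x, popmean=0).statistic
t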

Two-sample T test

import random
import numpy as np
import scipy.stats

random.seed(32611) ## caution: this seeds Python's random module only; the numpy draws below would need np.random.seed()
group1 = np.random.normal(size=10)
group2 = np.random.normal(size=10,loc=1.2)

scipy.stats.ttest_ind(group1,group2) ## assumes equal variances; pass equal_var=False for Welch's t-test
## Ttest_indResult(statistic=-3.3392393385155357, pvalue=0.003650753743877968)

Two-sample paired T test

import random
import numpy as np
import scipy.stats

random.seed(32611) ## caution: this seeds Python's random module only; the numpy draws below would need np.random.seed()
pre = np.random.normal(size=10)
post = np.random.normal(size=10,loc=1.2)

scipy.stats.ttest_rel(pre,post)
## Ttest_relResult(statistic=-4.681530684381899, pvalue=0.0011497193250913657)
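
The paired test is equivalent to a one-sample t test on the within-pair differences (a quick check using the arrays above):

scipy.stats.ttest_1samp(pre - post, popmean=0) ## same statistic and p-value as ttest_rel(pre, post)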

Two-sample Wilcoxon rank-sum test

import random
import numpy as np
import scipy.stats

random.seed(32611) ## caution: this seeds Python's random module only; the numpy draws below would need np.random.seed()
group1 = np.random.normal(size=10)
group2 = np.random.normal(size=10,loc=1.2)

scipy.stats.ranksums(group1, group2)
## RanksumsResult(statistic=-2.192193943453518, pvalue=0.028365505605209992)

Two-sample paired Wilcoxon signed-rank test

import random
import numpy as np
import scipy.stats

random.seed(32611) ## caution: this seeds Python's random module only; the numpy draws below would need np.random.seed()
group1 = np.random.normal(size=10)
group2 = np.random.normal(size=10,loc=1.2)

scipy.stats.wilcoxon(group1, group2)
## WilcoxonResult(statistic=1.0, pvalue=0.00390625)

Chi-square test

##          Disease
## Treatment Y   N  
##         Y "a" "c"
##         N "b" "d"
##          Myocardial Infarction
## Treatment Yes    No
##   Placebo 189 10845
##   Aspirin 104 10933
from scipy.stats import chi2_contingency

observed = [[189, 10845], [104,10933]]
chi2_contingency(observed)

## Output details
#(test statistic, p-value, df, expected frequencies)
## (24.42905651522594, 7.709707547507586e-07, 1, array([[  146.48008699, 10887.51991301],
##        [  146.51991301, 10890.48008699]]))
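
The expected frequencies come from the table margins: \(E_{ij}=(\text{row}_i\ \text{total})\times(\text{column}_j\ \text{total})/n\). A quick sketch reproducing the first expected cell:

import numpy as np

observed = np.array([[189, 10845], [104, 10933]])
row = observed.sum(axis=1) ## row totals
col = observed.sum(axis=0) ## column totals
n = observed.sum()
row[0] * col[0] / n ## ≈ 146.48, matching the expected-frequency array above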

Fisher’s exact test

##          Myocardial Infarction
## Treatment Yes    No
##   Placebo 189 10845
##   Aspirin 104 10933
from scipy.stats import fisher_exact

observed = [[189, 10845], [104,10933]]
fisher_exact(observed)

## Output details
#(odds ratio, p-value)
## (1.8320539419087136, 5.032835599791868e-07)
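
The reported odds ratio is the cross-product ratio \(ad/(bc)\) of the 2×2 table:

(189 * 10933) / (10845 * 104) ## ≈ 1.832, matching the first element of the output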

Correlation test

import random
import numpy as np
import scipy.stats

random.seed(32611) ## caution: this seeds Python's random module only; the numpy draws below would need np.random.seed()
group1 = np.random.normal(size=10)
group2 = np.random.normal(size=10,loc=1.2)

scipy.stats.pearsonr(group1,group2) # Pearson correlation: the significance test assumes (bivariate) normality.
## (-0.3272316520982679, 0.35603198241933337)
scipy.stats.spearmanr(group1,group2) # Spearman correlation: non-parametric, no distribution assumption needed.
## SpearmanrResult(correlation=-0.13939393939393938, pvalue=0.7009318849100584)

Kolmogorov-Smirnov (KS) test

import random
import numpy as np
import scipy.stats

random.seed(32611) ## caution: this seeds Python's random module only; the numpy draws below would need np.random.seed()
group1 = np.random.normal(size=50) # N(0,1)
group2 = np.random.normal(size=50,loc=1.2) # N(1.2,1)


scipy.stats.kstest(group1, group2) ## with two arrays this is the two-sample KS test (equivalent to scipy.stats.ks_2samp)
## KstestResult(statistic=0.34, pvalue=0.005841778142694731)

Fit linear model in Python

import pandas as pd
from sklearn.linear_model import LinearRegression

mtcars = pd.read_csv("https://vincentarelbundock.github.io/Rdatasets/csv/datasets/mtcars.csv", index_col=0)

#Setting response and predictors
y = mtcars.mpg
X = mtcars[["cyl"]]

#Fitting simple Linear Regression Model
linr_model = LinearRegression().fit(X, y)

#Model Fitting Results
print('estimated intercept', 'estimated slope (cyl)', 'R-square')
## estimated intercept estimated slope (cyl) R-square
print(linr_model.intercept_, linr_model.coef_[0],linr_model.score(X,y)) 
## 37.88457648546145 -2.875790139064476 0.7261800050938048
mtcars[["cyl"]].head(10)
##                    cyl
## Mazda RX4            6
## Mazda RX4 Wag        6
## Datsun 710           4
## Hornet 4 Drive       6
## Hornet Sportabout    8
## Valiant              6
## Duster 360           8
## Merc 240D            4
## Merc 230             4
## Merc 280             6
linr_model.predict(mtcars[["cyl"]])[0:10]
## array([20.62983565, 20.62983565, 26.38141593, 20.62983565, 14.87825537,
##        20.62983565, 14.87825537, 26.38141593, 26.38141593, 20.62983565])
import statsmodels.api as sm

#define response variable
y = mtcars['mpg']

#define predictor variables
x = mtcars[['cyl']]

#add constant to predictor variables
x = sm.add_constant(x)

#fit linear regression model
model1 = sm.OLS(y, x).fit()

#view model summary
print(model1.summary())
##                             OLS Regression Results                            
## ==============================================================================
## Dep. Variable:                    mpg   R-squared:                       0.726
## Model:                            OLS   Adj. R-squared:                  0.717
## Method:                 Least Squares   F-statistic:                     79.56
## Date:                Fri, 02 Dec 2022   Prob (F-statistic):           6.11e-10
## Time:                        21:03:28   Log-Likelihood:                -81.653
## No. Observations:                  32   AIC:                             167.3
## Df Residuals:                      30   BIC:                             170.2
## Df Model:                           1                                         
## Covariance Type:            nonrobust                                         
## ==============================================================================
##                  coef    std err          t      P>|t|      [0.025      0.975]
## ------------------------------------------------------------------------------
## const         37.8846      2.074     18.268      0.000      33.649      42.120
## cyl           -2.8758      0.322     -8.920      0.000      -3.534      -2.217
## ==============================================================================
## Omnibus:                        1.007   Durbin-Watson:                   1.670
## Prob(Omnibus):                  0.604   Jarque-Bera (JB):                0.874
## Skew:                           0.380   Prob(JB):                        0.646
## Kurtosis:                       2.720   Cond. No.                         24.1
## ==============================================================================
## 
## Notes:
## [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
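
Individual pieces of the fit are available as attributes of the results object (a short sketch using model1):

model1.params     ## estimated coefficients (const, cyl)
model1.pvalues    ## coefficient p-values
model1.conf_int() ## 95% confidence intervals
model1.rsquared   ## R-squared
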
y = mtcars['mpg']
x = mtcars[['cyl','disp']]
x = sm.add_constant(x)
model2 = sm.OLS(y, x).fit()
print(model2.summary())
##                             OLS Regression Results                            
## ==============================================================================
## Dep. Variable:                    mpg   R-squared:                       0.760
## Model:                            OLS   Adj. R-squared:                  0.743
## Method:                 Least Squares   F-statistic:                     45.81
## Date:                Fri, 02 Dec 2022   Prob (F-statistic):           1.06e-09
## Time:                        21:03:30   Log-Likelihood:                -79.573
## No. Observations:                  32   AIC:                             165.1
## Df Residuals:                      29   BIC:                             169.5
## Df Model:                           2                                         
## Covariance Type:            nonrobust                                         
## ==============================================================================
##                  coef    std err          t      P>|t|      [0.025      0.975]
## ------------------------------------------------------------------------------
## const         34.6610      2.547     13.609      0.000      29.452      39.870
## cyl           -1.5873      0.712     -2.230      0.034      -3.043      -0.131
## disp          -0.0206      0.010     -2.007      0.054      -0.042       0.000
## ==============================================================================
## Omnibus:                        3.200   Durbin-Watson:                   1.596
## Prob(Omnibus):                  0.202   Jarque-Bera (JB):                2.660
## Skew:                           0.701   Prob(JB):                        0.264
## Kurtosis:                       2.822   Cond. No.                     1.27e+03
## ==============================================================================
## 
## Notes:
## [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
## [2] The condition number is large, 1.27e+03. This might indicate that there are
## strong multicollinearity or other numerical problems.
y = mtcars['mpg']
x = mtcars.loc[:, mtcars.columns != 'mpg'] # exclude mpg
x = sm.add_constant(x)
model3 = sm.OLS(y, x).fit()
print(model3.summary())
##                             OLS Regression Results                            
## ==============================================================================
## Dep. Variable:                    mpg   R-squared:                       0.869
## Model:                            OLS   Adj. R-squared:                  0.807
## Method:                 Least Squares   F-statistic:                     13.93
## Date:                Fri, 02 Dec 2022   Prob (F-statistic):           3.79e-07
## Time:                        21:03:32   Log-Likelihood:                -69.855
## No. Observations:                  32   AIC:                             161.7
## Df Residuals:                      21   BIC:                             177.8
## Df Model:                          10                                         
## Covariance Type:            nonrobust                                         
## ==============================================================================
##                  coef    std err          t      P>|t|      [0.025      0.975]
## ------------------------------------------------------------------------------
## const         12.3034     18.718      0.657      0.518     -26.623      51.229
## cyl           -0.1114      1.045     -0.107      0.916      -2.285       2.062
## disp           0.0133      0.018      0.747      0.463      -0.024       0.050
## hp            -0.0215      0.022     -0.987      0.335      -0.067       0.024
## drat           0.7871      1.635      0.481      0.635      -2.614       4.188
## wt            -3.7153      1.894     -1.961      0.063      -7.655       0.224
## qsec           0.8210      0.731      1.123      0.274      -0.699       2.341
## vs             0.3178      2.105      0.151      0.881      -4.059       4.694
## am             2.5202      2.057      1.225      0.234      -1.757       6.797
## gear           0.6554      1.493      0.439      0.665      -2.450       3.761
## carb          -0.1994      0.829     -0.241      0.812      -1.923       1.524
## ==============================================================================
## Omnibus:                        1.907   Durbin-Watson:                   1.861
## Prob(Omnibus):                  0.385   Jarque-Bera (JB):                1.747
## Skew:                           0.521   Prob(JB):                        0.418
## Kurtosis:                       2.526   Cond. No.                     1.22e+04
## ==============================================================================
## 
## Notes:
## [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
## [2] The condition number is large, 1.22e+04. This might indicate that there are
## strong multicollinearity or other numerical problems.