Linear Regression

Simple linear regression

1. simulate data

x = rnorm(100, mean=3, sd=1)
error = rnorm(100, mean=0, sd=2)
y = 2.1+1.25*x+error

data=data.frame(cbind(x,y))
head(data)
##           x          y
## 1 3.9793640  2.5761405
## 2 5.1386303  8.4661888
## 3 2.0430265 10.0727015
## 4 3.6100217  6.7243519
## 5 3.3215997  7.2804373
## 6 3.3140954  7.6002772
plot(data)
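
The data are drawn with rnorm, so each run of this chunk produces different numbers; calling set.seed() before the rnorm calls (any fixed value will do) makes the results reproducible. Overlaying the true line y = 2.1 + 1.25x on the scatterplot is a quick sanity check that the simulation behaves as intended:

# optional: run set.seed(1) before the rnorm calls for reproducible output
plot(data)
abline(a = 2.1, b = 1.25, col = "blue")   # the true line used to generate y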

2. fit the linear regression model

result = lm(data$y~data$x)
summary(result)
## 
## Call:
## lm(formula = data$y ~ data$x)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.7515 -1.1403  0.0264  1.2466  5.3070 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   2.0891     0.5685   3.675 0.000389 ***
## data$x        1.3101     0.1906   6.872  5.9e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.983 on 98 degrees of freedom
## Multiple R-squared:  0.3252, Adjusted R-squared:  0.3183 
## F-statistic: 47.22 on 1 and 98 DF,  p-value: 5.903e-10
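
Beyond the summary table, coef() and confint() pull out the point and interval estimates, and abline() can add the fitted line to the scatterplot. A minimal sketch using only base R:

coef(result)                  # estimated intercept and slope
confint(result)               # 95% confidence intervals for the coefficients
plot(data)
abline(result, col = "red")   # fitted regression line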

3. residual plots

plot(result$residuals)
abline(a=0,b=0, col="red")

hist(result$residuals)
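
plot(result$residuals) shows the residuals against the observation index; plotting them against the fitted values is usually more informative for spotting non-constant variance or leftover structure. A minimal sketch:

plot(result$fitted.values, result$residuals,
     xlab = "fitted values", ylab = "residuals")
abline(h = 0, col = "red")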

4. check the normality assumption

shapiro.test(result$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  result$residuals
## W = 0.9902, p-value = 0.6804
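
A normal Q-Q plot is a useful graphical complement to the Shapiro-Wilk test; residuals that are roughly normal fall close to the reference line:

qqnorm(result$residuals)
qqline(result$residuals, col = "red")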

5. prediction

x_new = 0.7
y_new = result$coefficients[1]+result$coefficients[2]*x_new
y_new
## (Intercept) 
##    3.006198
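
The prediction above is computed by hand from the coefficients. predict() gives the same point prediction and can add confidence or prediction intervals, but the model needs to be fitted with a formula that uses bare variable names (y ~ x rather than data$y ~ data$x) so that newdata can be matched by column name. A sketch under that assumption, refitting the same model:

result2 = lm(y ~ x, data = data)
predict(result2, newdata = data.frame(x = 0.7))
predict(result2, newdata = data.frame(x = 0.7), interval = "prediction")   # with a 95% prediction interval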

6. a quadratic relationship

Here the simulated y depends on x only through x^2 (note that the error term from the simple-regression simulation above is reused), but the model fitted below is still a straight line.

x = rnorm(100, mean=0, sd=1)
y = 2.1+1.25*x^2+error
data=data.frame(cbind(x,y))
result = lm(data$y~data$x)
summary(result)
## 
## Call:
## lm(formula = data$y ~ data$x)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.7467 -1.6504 -0.0636  1.3304  8.3432 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.34842    0.26429  12.670   <2e-16 ***
## data$x      -0.06559    0.28175  -0.233    0.816    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.617 on 98 degrees of freedom
## Multiple R-squared:  0.0005527,  Adjusted R-squared:  -0.009646 
## F-statistic: 0.0542 on 1 and 98 DF,  p-value: 0.8164
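
The slope estimate is essentially zero: with x centred at 0, x and x^2 are uncorrelated, so a straight-line fit misses the relationship entirely. Adding a squared term to the model recovers it; a minimal sketch (I(x^2) keeps ^ from being interpreted as part of the formula):

result_quad = lm(y ~ x + I(x^2), data = data)
summary(result_quad)

plot(data$x, data$y)   # the curvature is visible in the scatterplot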

Multiple linear regression

1. simulate data

x1 = rnorm(100, mean=3, sd=1)
x2 = rnorm(100, mean=2.5, sd=2.1)

error = rnorm(100, mean=0, sd=2)
y = 2.1 + 1.25*x1 - 3*x2 + error
data=data.frame(cbind(y,x1,x2))

2. fit the linear regression model

result = lm(y~x1+x2,data=data)
summary(result)
## 
## Call:
## lm(formula = y ~ x1 + x2, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.3176 -1.4438 -0.2222  1.3928  5.4977 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   1.5958     0.7768   2.054   0.0426 *  
## x1            1.2331     0.2355   5.235 9.56e-07 ***
## x2           -2.7818     0.1183 -23.506  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.14 on 97 degrees of freedom
## Multiple R-squared:  0.8533, Adjusted R-squared:  0.8503 
## F-statistic: 282.1 on 2 and 97 DF,  p-value: < 2.2e-16
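
The estimates can be checked against the true coefficients used in the simulation (2.1, 1.25 and -3), and because the model was fitted with a data argument, predict() works directly with newdata. A short sketch (the x1 and x2 values below are just illustrative):

coef(result)      # compare with the true values 2.1, 1.25, -3
confint(result)   # 95% confidence intervals
predict(result, newdata = data.frame(x1 = 3, x2 = 2.5))   # prediction at illustrative x1, x2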

3. residual plots

plot(result$residuals)
abline(a=0,b=0, col="red")

hist(result$residuals)
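
Base R also provides a standard set of diagnostic plots for lm objects (residuals vs fitted, normal Q-Q, scale-location, residuals vs leverage), which can be drawn in a single panel:

par(mfrow = c(2, 2))   # 2 x 2 plotting layout
plot(result)
par(mfrow = c(1, 1))   # reset the layout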

4. check the normality assumption

shapiro.test(result$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  result$residuals
## W = 0.97942, p-value = 0.1198

Logistic regression

1. simulate data

age <- round(runif(100, 18, 80))
log_odds = -2.2 + 0.02*age
p = 1/(1 + exp(-log_odds))
y <- rbinom(n = 100, size = 1, prob = p)
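
With intercept -2.2 and slope 0.02, the true success probability rises from roughly 0.14 at age 18 to roughly 0.35 at age 80. Plotting it makes the relatively weak signal visible (plogis() is the inverse logit, i.e. the logistic CDF, in base R):

curve(plogis(-2.2 + 0.02*x), from = 18, to = 80,
      xlab = "age", ylab = "true P(y = 1)")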

2. fit the logistic regression model

mod <- glm(y ~ age, family = "binomial")
summary(mod)
## 
## Call:
## glm(formula = y ~ age, family = "binomial")
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.7056  -0.7052  -0.7047  -0.7041   1.7414  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.270e+00  7.990e-01  -1.589    0.112
## age          8.239e-05  1.473e-02   0.006    0.996
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 105.38  on 99  degrees of freedom
## Residual deviance: 105.38  on 98  degrees of freedom
## AIC: 109.38
## 
## Number of Fisher Scoring iterations: 4
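
In this simulated sample the estimated age effect is essentially zero, which is not surprising for a true slope of only 0.02 with 100 binary observations. Predicted probabilities for new observations come from predict() with type = "response"; since the formula uses the bare variable name age, newdata is matched by column name (the age value below is just an illustrative choice):

predict(mod, newdata = data.frame(age = 50), type = "response")   # predicted P(y = 1) at age 50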