During my WhatsApp user experience redesign I started questioning:
Why do people spread misinformation?
The answer is that people would rather trigger an intense emotion than a subtle one.
View My UX Case Study Here
This led me to question whether these strong feelings could be quantitatively analyzed and connected to misclassifying COVID-related claims.
I found a study conducted by Harvard students on South Korean adults and their responses to a COVID-19 survey. The source can be found here: Harvard Study Data
I used their dataset to build a predictive model with principal component analysis to determine whether a person would share misinformation based on:
○ Belief of false COVID statements
○ Age
○ Gender
○ Political Ideology
○ Anger towards virus
○ Anxiety towards virus
○ Education
○ Income
Source: Harvard Study Data
The sample consists of 513 people from South Korea, matched to the proportions of key Korean demographics.
It is a data frame with 513 observations and 37 variables. Responses to the following 12 false statements regarding COVID-19 were encoded as 0 or 1 for both trust and sharing.
The following 12 statements were presented:
Q1: Even if completely cured, COVID-19 patients suffer from life-long lung damage
Q2: A runny nose is a symptom of cold, not COVID-19
Q3: Hair dryers can kill the virus
Q4: Drinking alcohol can kill the virus
Q5: Only certain age groups, races, or ethnicities are vulnerable to the virus
Q6: Antibiotics or flu vaccines can prevent the disease
Q7: You can test yourself for COVID-19 by holding your breath for 10 seconds
Q8: Garlic can prevent infection
Q9: Gargling with salt water can eliminate the virus
Q10: The virus can penetrate into the body through contact
Q11: Smoking can kill the virus
Q12: Drinking hot water or hot tea can reduce the chances of getting infected
FNtrust(1-12) - binary responses for believing each of the 12 statements
FNsharing(1-12) - binary responses for sharing each of the 12 statements
Anger - anger towards the pandemic (6-point Likert scale)
Anxiety - fear towards the pandemic (6-point Likert scale)
Ideology - 7-point scale (1 being extremely conservative, 7 being extremely liberal)
Gender - gender (female is 1)
Age - age in years
Educ - education (1: no elementary school diploma to 12: master's degree or higher)
Income - income (1: less than ₩10,000,000 to 11: more than ₩100,000,000)
st - sum of trust responses per participant
ss - sum of sharing responses per participant
Dataset: Harvard Dataset
Load in the dataset and create a working copy:
library(readxl)
# Read the survey responses from the Excel workbook
harvard <- read_excel("C:/Users/ksree/Desktop/comp_app/harvard.xlsx",
    sheet = "Sheet16")
View(harvard)
# Copy into a plain data frame so the original import stays untouched
data <- as.data.frame(harvard)
Remove the binary answers to statements 1-12 for trust and sharing (I use the per-participant sums st and ss for these variables):
data = subset(data, select = -c(No, FNtrust1,FNtrust2,FNtrust3,FNtrust4,FNtrust5,FNtrust6,FNtrust7,FNtrust8,
FNtrust9,FNtrust10,FNtrust11,FNtrust12,FNtrust13, FNsharing1,FNsharing2,FNsharing3,FNsharing4,FNsharing5,
FNsharing6,FNsharing7,FNsharing8,FNsharing9,FNsharing10,FNsharing11,FNsharing12, FNsharing13))
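The same drop can be written more compactly by matching the column-name prefix rather than listing every column; a minimal sketch of an equivalent call, assuming every per-statement column begins with FNtrust or FNsharing:
# Equivalent drop via a name pattern instead of an explicit column list
drop_cols <- c("No", grep("^FN(trust|sharing)", names(data), value = TRUE))
data <- data[, setdiff(names(data), drop_cols)]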
Profile the data to see any immediate anomalies and check the variables:
View(data)
Needed packages
library(faraway)
library(psych)
##
## Attaching package: 'psych'
## The following object is masked from 'package:faraway':
##
## logit
Get descriptive statistics for the overall dataset, and identify any visible correlations among variables, outliers, means, and skew:
plot(data)
summary(data)
## anger anxiety ideology gender
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :0.0000
## 1st Qu.:3.000 1st Qu.:3.000 1st Qu.:3.000 1st Qu.:0.0000
## Median :4.000 Median :4.000 Median :4.000 Median :1.0000
## Mean :3.928 Mean :3.926 Mean :4.109 Mean :0.5224
## 3rd Qu.:5.000 3rd Qu.:5.000 3rd Qu.:5.000 3rd Qu.:1.0000
## Max. :6.000 Max. :6.000 Max. :7.000 Max. :1.0000
## age region educ income
## Min. :17.00 Min. : 1.000 Min. : 2.000 Min. : 1.000
## 1st Qu.:35.00 1st Qu.: 3.000 1st Qu.: 8.000 1st Qu.: 3.000
## Median :49.00 Median : 6.000 Median :10.000 Median : 5.000
## Mean :48.19 Mean : 6.665 Mean : 9.158 Mean : 5.374
## 3rd Qu.:62.00 3rd Qu.:10.000 3rd Qu.:10.000 3rd Qu.: 7.000
## Max. :94.00 Max. :19.000 Max. :12.000 Max. :12.000
## st ss
## Min. : 0.000 Min. : 0.000
## 1st Qu.: 2.000 1st Qu.: 1.000
## Median : 3.000 Median : 3.000
## Mean : 3.639 Mean : 3.238
## 3rd Qu.: 5.000 3rd Qu.: 5.000
## Max. :13.000 Max. :13.000
describe(data)
Create a correlation plot to visualize the correlations among the variables. The correlations are not as high as hypothesized:
library(GGally)
## Warning: package 'GGally' was built under R version 4.0.3
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
##
## Attaching package: 'GGally'
## The following object is masked from 'package:faraway':
##
## happy
ggcorr(data, method = c("everything", "pearson"))
Split the data into training and test sets to validate the model at the end:
require(caTools)
## Loading required package: caTools
## Warning: package 'caTools' was built under R version 4.0.3
set.seed(4321)
# Note: sample.split() is usually given the outcome vector (e.g. data$ss);
# passing the whole data frame recycles a column-length logical over the
# rows, which is why the split is 359/154 rather than an exact 75/25.
sample = sample.split(data, SplitRatio = 0.75)
train = subset(data, sample == TRUE)
test = subset(data, sample == FALSE)
describe(train)
describe(test)
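For reference, the more typical caTools pattern splits on the outcome vector, so the ratio applies per row; a sketch under that assumption (train_alt and test_alt are hypothetical names):
# Conventional usage: split on the outcome so ~75% of rows land in train
split_vec <- sample.split(data$ss, SplitRatio = 0.75)
train_alt <- subset(data, split_vec == TRUE)
test_alt <- subset(data, split_vec == FALSE)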
Standardize and scale the data so that the mean is 0 and the standard deviation is 1 (we are comparing measurements with different units). It is important to scale the data after splitting into train and test sets; strictly speaking, the test set should be scaled with the training means and standard deviations (see the sketch after these two blocks).
train$anger_scaled<-scale(train$anger)
train$anxiety_scaled<-scale(train$anxiety)
train$ideology_scaled<-scale(train$ideology)
train$gender_scaled<-scale(train$gender)
train$age_scaled<-scale(train$age)
train$educ_scaled<-scale(train$educ)
train$income_scaled<-scale(train$income)
train$region_scaled<-scale(train$region)
train$st_scaled<-scale(train$st)
train$ss_scaled<-scale(train$ss)
train = subset(train, select = -c(anger, anxiety,ideology, gender, age, educ, income, region, st, ss))
describe(train)
test$anger_scaled<-scale(test$anger)
test$anxiety_scaled<-scale(test$anxiety)
test$ideology_scaled<-scale(test$ideology)
test$gender_scaled<-scale(test$gender)
test$age_scaled<-scale(test$age)
test$educ_scaled<-scale(test$educ)
test$income_scaled<-scale(test$income)
test$region_scaled<-scale(test$region)
test$st_scaled<-scale(test$st)
test$ss_scaled<-scale(test$ss)
test = subset(test, select = -c(anger, anxiety,ideology, gender, age, educ, income, region, st, ss))
describe(test)
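A stricter alternative is to standardize the test set with the training means and standard deviations, so that no test information enters the transform. A minimal sketch, assuming hypothetical raw copies train_raw and test_raw were kept before the in-place scaling above:
# Reuse the training means/sds on the test set (train_raw/test_raw are
# assumed unscaled copies saved before the scaling above)
cols <- c("anger", "anxiety", "ideology", "gender", "age",
          "educ", "income", "region", "st", "ss")
mu <- sapply(train_raw[cols], mean)
sdv <- sapply(train_raw[cols], sd)
test_scaled <- as.data.frame(Map(function(x, m, s) (x - m) / s,
                                 test_raw[cols], mu, sdv))
names(test_scaled) <- paste0(cols, "_scaled")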
Principal Component Analysis reduces dimensionality through unsupervised learning. The goal is for the model to explain as much variation as possible by transforming a large set of variables into a smaller one that still contains most of the information in the large set:
train.pca <- prcomp(train[,1:10])
train.pca
## Standard deviations (1, .., p=10):
## [1] 1.4255319 1.2077033 1.1529020 1.0887612 0.9731955 0.8938451 0.8922933
## [8] 0.8441184 0.7324568 0.4510464
##
## Rotation (n x k) = (10 x 10):
## PC1 PC2 PC3 PC4 PC5
## anger_scaled -0.25583327 0.57400924 -0.218816336 0.01419926 0.13902518
## anxiety_scaled -0.26163825 0.58754240 -0.112973270 -0.10224780 0.22324146
## ideology_scaled 0.18730345 -0.13699372 -0.460041912 0.36959192 0.16250685
## gender_scaled 0.11918834 -0.09918661 0.547144849 -0.20349712 0.29557019
## age_scaled -0.25644739 0.11560798 0.503856074 -0.04362025 -0.18781140
## educ_scaled 0.24571311 0.05528468 0.003909389 -0.45788526 0.61382793
## income_scaled 0.17087265 -0.20222358 -0.348418899 -0.50472856 0.02881812
## region_scaled 0.06003516 -0.06929543 0.183675363 0.57769944 0.58003623
## st_scaled -0.57538926 -0.34135434 -0.088042282 -0.06086302 0.19860545
## ss_scaled -0.57165693 -0.34360070 -0.107612108 -0.08706893 0.16819793
## PC6 PC7 PC8 PC9
## anger_scaled -0.102424733 -0.215839558 0.073590176 0.689284240
## anxiety_scaled -0.007255479 -0.030576195 0.172903418 -0.694421526
## ideology_scaled -0.434023289 0.374473811 0.488566733 -0.002866363
## gender_scaled -0.543400395 -0.392750484 0.307983359 0.007919557
## age_scaled 0.244165044 0.538362771 0.505317276 0.158149055
## educ_scaled 0.027212835 0.500412493 -0.288059340 0.123776087
## income_scaled 0.435959716 -0.286391627 0.528033751 0.044457474
## region_scaled 0.489857141 -0.188509564 0.088777883 0.014324610
## st_scaled 0.001721200 0.010591295 -0.077848971 0.008858990
## ss_scaled -0.124158628 0.008914722 0.007234805 -0.001805096
## PC10
## anger_scaled 0.006834423
## anxiety_scaled 0.013440128
## ideology_scaled 0.064748705
## gender_scaled 0.057986670
## age_scaled 0.003875763
## educ_scaled -0.025648413
## income_scaled 0.007340747
## region_scaled -0.064000244
## st_scaled 0.703740331
## ss_scaled -0.701530164
summary(train.pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 1.4255 1.2077 1.1529 1.0888 0.97320 0.8938 0.89229
## Proportion of Variance 0.2032 0.1459 0.1329 0.1185 0.09471 0.0799 0.07962
## Cumulative Proportion 0.2032 0.3491 0.4820 0.6005 0.69524 0.7751 0.85475
## PC8 PC9 PC10
## Standard deviation 0.84412 0.73246 0.45105
## Proportion of Variance 0.07125 0.05365 0.02034
## Cumulative Proportion 0.92601 0.97966 1.00000
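As an aside, prcomp is numerically the singular value decomposition of the centered data matrix, which gives a quick sanity check of the output above:
# prcomp is the SVD of the column-centered data (columns are already sd-1 here)
X <- scale(as.matrix(train[,1:10]), center = TRUE, scale = FALSE)
s <- svd(X)
# s$v matches train.pca$rotation up to column signs, and
# s$d / sqrt(nrow(X) - 1) matches train.pca$sdev
max(abs(abs(s$v) - abs(train.pca$rotation)))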
Visualize how much of the variation each principal component explains:
plot(train.pca)
biplot(train.pca)
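The proportions behind these plots can also be computed directly from the component standard deviations:
# Variance explained per component and the running total
var_explained <- train.pca$sdev^2 / sum(train.pca$sdev^2)
round(var_explained, 3)
round(cumsum(var_explained), 3) # matches the Cumulative Proportion row above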
Reassess correlations using the principal components (which are uncorrelated by construction):
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.0.3
## corrplot 0.84 loaded
cor1 <- cor(train.pca$x, method="pearson")
corrplot::corrplot(cor1, method= "color", order = "hclust", tl.pos = 'n')
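Since the scores produced by prcomp are orthogonal by construction, the plot should show essentially zero off-diagonal correlation; a one-line check:
# Largest absolute off-diagonal correlation among the PC scores (should be ~0)
max(abs(cor1[upper.tri(cor1)]))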
Save the principal component scores as a separate data frame to use for model building:
pcs <- as.data.frame(train.pca$x)
data2 <- cbind(train, pcs)
data2 <- subset(data2, select = -c(anger_scaled, anxiety_scaled, ideology_scaled, gender_scaled, age_scaled, educ_scaled,
income_scaled, region_scaled, st_scaled))
data2
The optimal number of principal components is 3.
Run a regression model using the first 3 principal components:
lmodel <- lm(ss_scaled ~ PC1 + PC2 + PC3, data = data2)
summary(lmodel)
##
## Call:
## lm(formula = ss_scaled ~ PC1 + PC2 + PC3, data = data2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.31237 -0.23538 0.04115 0.22582 1.31479
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.956e-17 2.041e-02 0.00 1
## PC1 -5.717e-01 1.434e-02 -39.87 < 2e-16 ***
## PC2 -3.436e-01 1.693e-02 -20.30 < 2e-16 ***
## PC3 -1.076e-01 1.773e-02 -6.07 3.29e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3868 on 355 degrees of freedom
## Multiple R-squared: 0.8517, Adjusted R-squared: 0.8504
## F-statistic: 679.5 on 3 and 355 DF, p-value: < 2.2e-16
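To interpret the PCR fit in terms of the original standardized inputs, the three PC coefficients can be mapped back through the loading matrix; a minimal sketch using the objects above:
# beta_original = rotation[, 1:3] %*% beta_PC (coefficients on scaled inputs)
beta_pc <- coef(lmodel)[c("PC1", "PC2", "PC3")]
beta_orig <- train.pca$rotation[, 1:3] %*% beta_pc
round(beta_orig, 3)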
Compare to a simple least squares regression
lmod <- lm(ss_scaled ~ ., data = train)
summary(lmod)
##
## Call:
## lm(formula = ss_scaled ~ ., data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.41925 -0.27647 0.06367 0.26180 2.72794
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8.984e-17 3.238e-02 0.000 1.0000
## anger_scaled 1.230e-02 3.668e-02 0.335 0.7375
## anxiety_scaled 1.834e-02 3.673e-02 0.499 0.6179
## ideology_scaled 4.701e-02 3.391e-02 1.386 0.1666
## gender_scaled 3.810e-02 3.362e-02 1.133 0.2580
## age_scaled 2.448e-02 3.421e-02 0.716 0.4748
## educ_scaled -5.381e-02 3.342e-02 -1.610 0.1083
## income_scaled 7.737e-03 3.378e-02 0.229 0.8190
## region_scaled -6.915e-02 3.292e-02 -2.101 0.0364 *
## st_scaled 7.847e-01 3.336e-02 23.522 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6135 on 349 degrees of freedom
## Multiple R-squared: 0.6331, Adjusted R-squared: 0.6236
## F-statistic: 66.9 on 9 and 349 DF, p-value: < 2.2e-16
Use forward selection to find the most significant features in the training dataset:
library(MASS)
library(olsrr)
## Warning: package 'olsrr' was built under R version 4.0.3
## Registered S3 methods overwritten by 'car':
## method from
## influence.merMod lme4
## cooks.distance.influence.merMod lme4
## dfbeta.influence.merMod lme4
## dfbetas.influence.merMod lme4
##
## Attaching package: 'olsrr'
## The following object is masked from 'package:MASS':
##
## cement
## The following object is masked from 'package:faraway':
##
## hsb
## The following object is masked from 'package:datasets':
##
## rivers
model <- lm(ss_scaled ~ ., data = train)
k <- ols_step_forward_p(model)
ols_step_forward_p(model)
##
## Selection Summary
## -------------------------------------------------------------------------------
## Variable Adj.
## Step Entered R-Square R-Square C(p) AIC RMSE
## -------------------------------------------------------------------------------
## 1 st_scaled 0.6226 0.6216 3.9338 673.9481 0.6152
## 2 region_scaled 0.6269 0.6248 1.8961 671.8867 0.6126
## 3 educ_scaled 0.6294 0.6263 1.4768 671.4311 0.6113
## 4 ideology_scaled 0.6306 0.6264 2.3834 672.3158 0.6112
## 5 gender_scaled 0.6318 0.6266 3.2080 673.1128 0.6111
## -------------------------------------------------------------------------------
plot(k)
Repeat for stepwise selection
k2 <- ols_step_both_p(model)
ols_step_both_p(model)
##
## Stepwise Selection Summary
## -----------------------------------------------------------------------------------------
## Added/ Adj.
## Step Variable Removed R-Square R-Square C(p) AIC RMSE
## -----------------------------------------------------------------------------------------
## 1 st_scaled addition 0.623 0.622 3.9340 673.9481 0.6152
## 2 region_scaled addition 0.627 0.625 1.8960 671.8867 0.6126
## -----------------------------------------------------------------------------------------
plot(k2)
summary(k2)
## Length Class Mode
## orders 2 -none- character
## method 2 -none- character
## steps 1 -none- numeric
## predictors 2 -none- character
## rsquare 2 -none- numeric
## aic 2 -none- numeric
## sbc 2 -none- numeric
## sbic 2 -none- numeric
## adjr 2 -none- numeric
## rmse 2 -none- numeric
## mallows_cp 2 -none- numeric
## indvar 9 -none- character
## betas 5 -none- numeric
## lbetas 2 -none- numeric
## pvalues 5 -none- numeric
## beta_pval 4 data.frame list
## model 12 lm list
View all possible models
library(olsrr)
k3 <- ols_step_all_possible(model)
k3
plot(k3)
Using these variable selection methods, formally write out each model to compare model accuracy:
# PCR: regress sharing on the first three principal components
lmodpcr <- lm(train$ss_scaled ~ train.pca$x[,1:3])
# Simple least squares on all predictors
lmod <- lm(ss_scaled ~ ., data = train)
# Forward selection (note: st_scaled, which entered first in the selection
# summary above, is omitted here; this explains the model's weak fit below)
forward <- lm(ss_scaled ~ region_scaled + educ_scaled + ideology_scaled + gender_scaled, data = train)
# Stepwise selection
stepwise <- lm(ss_scaled ~ st_scaled + region_scaled, data = train)
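Before cross-validating, a quick in-sample comparison of the four fits (lower AIC is better, penalizing for model size):
# In-sample comparison only; the cross-validation below is the real test
AIC(lmodpcr, lmod, forward, stepwise)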
Perform cross validation on these 4 models
library(caret)
## Loading required package: lattice
##
## Attaching package: 'lattice'
## The following object is masked from 'package:faraway':
##
## melanoma
set.seed(13245)
train.control <- trainControl(method = "repeatedcv", number = 10,repeats=20)
model_leastsquares <- train(ss_scaled ~ ., data = train, method = "lm",
trControl = train.control)
model_pcr <- train(ss_scaled ~ PC1 + PC2 + PC3, data = data2, method = "lm",
trControl = train.control)
model_forward <- train(ss_scaled ~ region_scaled + educ_scaled + ideology_scaled + gender_scaled, data = train, method = "lm",
trControl = train.control)
model_stepwise <- train(ss_scaled ~ st_scaled + region_scaled, data = train, method = "lm",
trControl = train.control)
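caret can also compare the four resampling distributions side by side rather than printing each model separately; a short sketch using the fitted objects above:
# Collect the 10-fold x 20-repeat resamples and summarize RMSE / R-squared
cv_results <- resamples(list(ols = model_leastsquares, pcr = model_pcr,
                             forward = model_forward, stepwise = model_stepwise))
summary(cv_results)
bwplot(cv_results, metric = "RMSE") # lattice boxplots of the RMSE spread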
print(model_leastsquares)
## Linear Regression
##
## 359 samples
## 9 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 20 times)
## Summary of sample sizes: 323, 323, 324, 323, 323, 322, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 0.6134204 0.6174199 0.4311321
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
print(model_pcr)
## Linear Regression
##
## 359 samples
## 3 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 20 times)
## Summary of sample sizes: 323, 324, 324, 323, 324, 323, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 0.3847179 0.850367 0.293216
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
print(model_forward)
## Linear Regression
##
## 359 samples
## 4 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 20 times)
## Summary of sample sizes: 322, 325, 323, 324, 322, 323, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 0.9848961 0.04987417 0.7874916
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
print(model_stepwise)
## Linear Regression
##
## 359 samples
## 2 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 20 times)
## Summary of sample sizes: 323, 324, 322, 324, 323, 323, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 0.6053524 0.6288003 0.4233915
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
Validate models on test data set
Set up PCA validation
# Carry over the first 154 rows of PC scores to match the test-set size
testpca <- cbind(data2[1:154,])
testpca
# Loadings of the first three components (kept for reference; the train()
# call below uses the PC1-PC3 columns of testpca, not these vectors)
PC1 <- train.pca$rotation[,1]
PC2 <- train.pca$rotation[,2]
PC3 <- train.pca$rotation[,3]
View(testpca)
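Note that data2[1:154,] reuses the first 154 rows of the training scores rather than projecting the held-out rows. A stricter setup projects the test set through the training rotation with predict(); a minimal sketch, assuming the scaled test columns match those used to fit train.pca:
# Project held-out observations onto the training principal axes
test_scores <- as.data.frame(predict(train.pca, newdata = test))
testpca_strict <- cbind(test_scores[, 1:3], ss_scaled = test$ss_scaled)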
Now fit each model on the test dataset with the same cross-validation control:
model_leastsquares_test <- train(ss_scaled ~ ., data = test, method = "lm", trControl = train.control)
model_PCA_test <- model_pcr_test <- train(ss_scaled ~ PC1 + PC2 + PC3, data = testpca,
method = "lm",
trControl = train.control)
model_forward_test <- train(ss_scaled ~ region_scaled + educ_scaled + ideology_scaled + gender_scaled, data = test, method = "lm", trControl = train.control)
model_stepwise_test <- train(ss_scaled ~ st_scaled + region_scaled, data = test, method = "lm",
trControl = train.control)
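An alternative check is to score the training-fit models on the untouched test rows instead of refitting with cross-validation; a sketch with caret's RMSE and R2 helpers, using the stepwise model as an example:
# Predict the test rows with the model fit on train, then measure the error
pred <- predict(stepwise, newdata = test)
RMSE(pred, test$ss_scaled)
R2(pred, test$ss_scaled)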
model_leastsquares_test
## Linear Regression
##
## 154 samples
## 9 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 20 times)
## Summary of sample sizes: 139, 139, 140, 138, 138, 139, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 0.6152499 0.6245287 0.4369182
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
model_PCA_test
## Linear Regression
##
## 154 samples
## 3 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 20 times)
## Summary of sample sizes: 139, 138, 138, 139, 139, 139, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 0.3604114 0.8871406 0.2763031
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
model_forward_test
## Linear Regression
##
## 154 samples
## 4 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 20 times)
## Summary of sample sizes: 138, 138, 138, 140, 138, 140, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 0.9988511 0.07279995 0.7535439
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
model_stepwise_test
## Linear Regression
##
## 154 samples
## 2 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 20 times)
## Summary of sample sizes: 138, 138, 140, 139, 139, 138, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 0.5980537 0.6459783 0.4289118
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
Based on cross-validation and performance on the test dataset, the principal component regression is the strongest model. It has a low RMSE and MAE (prediction errors) and a high R-squared (fit of the model to the data).
The least squares line usually fits the data best in-sample because it directly minimizes the sum of squared residuals, but here the principal component regression clearly outperforms even the least squares model.
Through PCA I was able to reduce the dimensionality of the model from 10 variables (in ten-dimensional space) to 3 components, which is much more intuitive.
According to my model, a person's likelihood of sharing COVID-19 misinformation could be estimated through a simple survey of:
○ Belief in 12 false COVID19 claims
○ Anger
○ Anxiety
○ Political Ideology
○ Gender
○ Age
○ Education
○ Income
○ Region
(With very low prediction errors)
This project makes me wonder whether people would be safer answering those questions before sharing information about the pandemic. The user experience side of me says no, but in terms of reducing the incorrect information that is potentially harming people, this appears to be an option.
However, this algorithm makes decisions based on the participant's demographic information in order to output a more accurate result.
The alternative is to use the stepwise model, which yields a less accurate result but utilizes only the first and last features listed above (belief in the false claims and region).
Both are feasible strategies, but it would be up to the organization to decide the tradeoff between accuracy and requiring personal information.