library(ggplot2)
library(car)
## Loading required package: carData
# Read the state-level overview table.
# File Name: All_Data_Final - Overview.csv
EV <- read.csv(
  "/Users/emilyluo/Desktop/2022 Fall/DSO 510/PROJECT/final/All_Data_Final-Overview.csv"
)

# Columns 2 through 7 are renamed by position to the names used in the rest of
# the analysis.
names(EV)[2:7] <- c(
  "EV_Registration_Num",
  "Gas_Price",
  "Population",
  "Household_Income",
  "Charging_Stations",
  "Political_Party"
)

# Remove thousands separators (and the dollar sign on income) from the text
# values, then convert each column to a numeric type.
EV[["EV_Registration_Num"]] <- as.numeric(
  gsub(",", "", EV[["EV_Registration_Num"]], fixed = TRUE)
)
EV[["Population"]] <- as.numeric(
  gsub(",", "", EV[["Population"]], fixed = TRUE)
)
income_no_commas <- gsub(",", "", EV[["Household_Income"]], fixed = TRUE)
EV[["Household_Income"]] <- as.numeric(
  gsub("$", "", income_no_commas, fixed = TRUE)
)
EV[["Charging_Stations"]] <- as.integer(
  gsub(",", "", EV[["Charging_Stations"]], fixed = TRUE)
)

# The party indicator is treated as a categorical variable.
EV[["Political_Party"]] <- as.factor(EV[["Political_Party"]])

# Strongly prefer fixed over scientific notation in printed output.
options(scipen = 999)
# Horizontal boxplot of EV registrations, one box per Political_Party level.
# Only x is mapped, so the y axis carries no information; its breaks are
# switched off with `breaks = NULL`. (The value NULL is what ggplot2 documents
# for "no breaks"; the string "NULL" merely happened to match no break.)
# Colours and legend labels are assigned to the factor levels in order.
ggplot(EV, aes(x = EV_Registration_Num, color = Political_Party)) +
  geom_boxplot() +
  scale_y_discrete(breaks = NULL) +
  scale_color_manual(
    values = c("red", "steelblue3"),
    labels = c("Republicans", "Democrats")
  ) +
  labs(title = "Boxplot of EV Registration Number by Democratic/Republican States")
# Gas price against EV registrations, coloured by Political_Party, drawn as
# points joined by a line within each party.
# A state name is printed only for observations above a per-party threshold
# (more than 400000 registrations where Political_Party is 1, more than 25000
# where it is 0); every other point gets an empty label.
# hjust / vjust are fixed label offsets, so they are set as geom_text()
# parameters rather than mapped inside aes().
ggplot(EV, aes(x = Gas_Price, y = EV_Registration_Num, color = Political_Party)) +
  geom_point() +
  geom_line() +
  scale_color_manual(
    values = c("red", "steelblue3"),
    labels = c("Republicans", "Democrats")
  ) +
  geom_text(
    aes(
      label = ifelse(
        (Political_Party == 1 & EV_Registration_Num > 400000) |
          (Political_Party == 0 & EV_Registration_Num > 25000),
        States,
        ""
      )
    ),
    hjust = 1,
    vjust = -1.3
  ) +
  labs(title = "Line Chart between Gas Price and EV Registration Num")
# Exclude the Democratic States Outlier. `%in%` is used rather than `!=` so a
# missing state name cannot become an all-NA row in the subset; this also
# matches the filter style used for EVNEW below.
EV <- EV[!(EV$States %in% "California"), ]
# Exclude the Republicans States Outliers as well; EVNEW is the data set used
# for the trendline plots and the regression models.
EVNEW <- EV[!(EV$States %in% c("Florida", "Texas", "North Carolina")), ]
# Scatter of gas price vs. EV registrations on the outlier-free data, with a
# linear fit per party, shown in one panel per Political_Party level.
# `fill = NA` leaves the confidence ribbon unfilled.
ggplot(
  data = EVNEW,
  mapping = aes(Gas_Price, EV_Registration_Num, colour = Political_Party)
) +
  geom_point() +
  geom_smooth(method = "lm", fill = NA) +
  facet_wrap(facets = vars(Political_Party)) +
  scale_colour_manual(
    labels = c("Republicans", "Democrats"),
    values = c("red", "steelblue3")
  ) +
  ggtitle("Trendline after Excluding the Outliers")
## `geom_smooth()` using formula = 'y ~ x'
# Same scatter and per-party linear fits as the faceted version, but with both
# parties overlaid in a single panel. `fill = NA` leaves the confidence ribbon
# unfilled.
ggplot(
  data = EVNEW,
  mapping = aes(Gas_Price, EV_Registration_Num, colour = Political_Party)
) +
  geom_point() +
  geom_smooth(method = "lm", fill = NA) +
  scale_colour_manual(
    labels = c("Republicans", "Democrats"),
    values = c("red", "steelblue3")
  ) +
  ggtitle("Trendline after Excluding the Outliers")
## `geom_smooth()` using formula = 'y ~ x'
# Correlation matrix of the five numeric columns (positions 2-6 of the table),
# selected here by the names they were given during data preparation.
cor(
  EVNEW[
    c(
      "EV_Registration_Num",
      "Gas_Price",
      "Population",
      "Household_Income",
      "Charging_Stations"
    )
  ]
)
## EV_Registration_Num Gas_Price Population
## EV_Registration_Num 1.0000000 0.30945536 0.74091241
## Gas_Price 0.3094554 1.00000000 -0.08034107
## Population 0.7409124 -0.08034107 1.00000000
## Household_Income 0.4473394 0.45259553 0.07518320
## Charging_Stations 0.8448172 0.13992779 0.79670675
## Household_Income Charging_Stations
## EV_Registration_Num 0.4473394 0.8448172
## Gas_Price 0.4525955 0.1399278
## Population 0.0751832 0.7967068
## Household_Income 1.0000000 0.3704219
## Charging_Stations 0.3704219 1.0000000
We checked the histogram of each numeric independent variable and found
that all of them except Household_Income have skewed distributions. For
better modeling, we therefore take the log of every numeric variable
except Household_Income.
# Distribution checks on the outlier-free data, 20 bins each.
# Population: raw scale, then log scale.
ggplot(data = EVNEW, mapping = aes(Population)) +
  geom_histogram(bins = 20)
ggplot(data = EVNEW, mapping = aes(log(Population))) +
  geom_histogram(bins = 20)

# Charging stations: raw scale, then log scale.
ggplot(data = EVNEW, mapping = aes(Charging_Stations)) +
  geom_histogram(bins = 20)
ggplot(data = EVNEW, mapping = aes(log(Charging_Stations))) +
  geom_histogram(bins = 20)

# Household income: raw scale only.
ggplot(data = EVNEW, mapping = aes(Household_Income)) +
  geom_histogram(bins = 20)
# Baseline regression: log EV registrations on the party indicator, log gas
# price, and their interaction, fitted on the outlier-free data.
l_dummy_s <- lm(
  formula = log(EV_Registration_Num) ~ Political_Party * log(Gas_Price),
  data = EVNEW
)
summary(object = l_dummy_s)
##
## Call:
## lm(formula = log(EV_Registration_Num) ~ Political_Party * log(Gas_Price),
## data = EVNEW)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.0863 -0.8759 0.1276 0.7782 2.0408
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.279 2.638 3.897 0.000344 ***
## Political_Party1 -4.911 3.717 -1.321 0.193599
## log(Gas_Price) -1.931 2.320 -0.832 0.410090
## Political_Party1:log(Gas_Price) 5.571 3.203 1.739 0.089373 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.058 on 42 degrees of freedom
## Multiple R-squared: 0.408, Adjusted R-squared: 0.3657
## F-statistic: 9.647 on 3 and 42 DF, p-value: 0.00005749
# Full regression: the baseline party-by-gas-price terms plus log population,
# household income (unlogged), and log charging stations as controls.
l_dummy_m_9 <- lm(
  formula = log(EV_Registration_Num) ~ Political_Party * log(Gas_Price) +
    log(Population) + Household_Income + log(Charging_Stations),
  data = EVNEW
)
summary(object = l_dummy_m_9)
##
## Call:
## lm(formula = log(EV_Registration_Num) ~ Political_Party * log(Gas_Price) +
## log(Population) + Household_Income + log(Charging_Stations),
## data = EVNEW)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.68411 -0.23758 0.04389 0.21576 0.63935
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -7.01098366 1.70616364 -4.109 0.000197 ***
## Political_Party1 -0.84978919 1.24623359 -0.682 0.499341
## log(Gas_Price) 1.41577633 0.85970521 1.647 0.107632
## log(Population) 0.67310027 0.11460264 5.873 0.000000777 ***
## Household_Income 0.00001261 0.00000585 2.155 0.037355 *
## log(Charging_Stations) 0.50187501 0.11490059 4.368 0.000089797 ***
## Political_Party1:log(Gas_Price) 1.12108726 1.07396286 1.044 0.302969
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3154 on 39 degrees of freedom
## Multiple R-squared: 0.9511, Adjusted R-squared: 0.9436
## F-statistic: 126.5 on 6 and 39 DF, p-value: < 0.00000000000000022
# Variance inflation factors for the full model. The model contains an
# interaction term, so car prints a note suggesting type = 'predictor'.
car::vif(mod = l_dummy_m_9)
## there are higher-order terms (interactions) in this model
## consider setting type = 'predictor'; see ?vif
## Political_Party log(Gas_Price)
## 179.226239 3.453960
## log(Population) Household_Income
## 4.962864 2.165594
## log(Charging_Stations) Political_Party:log(Gas_Price)
## 6.687973 188.540908
# Added-variable plots for the full model.
car::avPlots(model = l_dummy_m_9)