Code Part 1:
#Packages
library(knitr)
library(kableExtra)
library(ggplot2)
library(dplyr)
#Loading the Data into R after Transforming in Excel
FemaleEarnings <- read.csv("C:/Users/edren/OneDrive/Desktop/Data Analysis/R Working Directory/Final Project Assignment/Datasets/DataSets/Female Earnings Transformed.csv")
#Clean up
names(FemaleEarnings) <- c("Year", "Female.Earnings")
#Adding a Number to year
FemaleEarnings <- FemaleEarnings %>%
arrange(Year) %>%
mutate(YearNum = row_number())
#Initial look at the data
head(FemaleEarnings)
tail(FemaleEarnings)
TableEarnings1 <- rbind(head(FemaleEarnings), "...", tail(FemaleEarnings))
kable(TableEarnings1, row.names = F, caption = "Table 1.1: US Census data regarding Median earnings for Total Female Workers from 1967 - 2021. The Years are numbered in order to not skew the analysis later.") %>%
kable_styling(bootstrap_options = "striped") %>%
row_spec(0, bold = T, font_size = 15)
#Creating the initial regression model
Reg.Model1 <- lm(Female.Earnings~YearNum, data = FemaleEarnings)
#Plotting the intial data
#linearity check 1
ggplot(FemaleEarnings, aes(x = Year, y = Female.Earnings)) +
geom_smooth(method = "lm", se = F, color = "blue") +
geom_point(color="red")+
theme_minimal(base_size = 12) +
theme(axis.title.x = element_text(size = 14, vjust = -4, hjust = 0.43),
axis.title.y = element_text(size = 14, vjust = 6, hjust = 0.6),
plot.title = element_text(size = 16, vjust = 6),
plot.margin = unit(c(1,1,1,1), "cm")) +
labs(title = expression(underline("Median Female Earnings in the US from 1967-2021"))) +
ylab("Median Female Earnings ($)") +
xlab("Time in Years")
#Figure 1.1: ...
#Looks linear
#Coefficient Table
Model1Table <- Reg.Model1$coefficients
kable(t(Model1Table), caption = "Table 1.2: Linear Regression Model 1. The intercept (constant) β0 and the slope (coefficient) β1, which are used with the predictor X to achieve the response variable Y.", col.names = c("β0 (Intercept)", "β1 (Slope)")) %>%
kable_styling(bootstrap_options = "striped") %>%
row_spec(0, bold = T, font_size = 15)
#Checking assumptions are met
#Homoscedasticity check 1
par(mfrow = c(2,2))
plot(Reg.Model1)
par(mfrow = c(1,1))
#Figure 1.2:...
#QQ Plot tells us the normality of the residuals, a straight line indicates that the residuals are normally distributed.
#Residuals vs Fitted: Shows the residuals which is y - predicted y as a function of the fitted values.
#Residuals should bounce randomly around the 0 line and should be roughly equal around the estimated regression line.
#Residuals somewhat a pattern in this case, so we will transform the data to see if this makes a difference.
#log10 Transformation
Reg.Model2 <- lm(log10(Female.Earnings)~(YearNum), data = FemaleEarnings)
#Table For Coefficients
Model2Table <- Reg.Model2$coefficients
kable(t(Model2Table), caption = "Table 1.3: Linear Regression Model 2 with log10() transformation. The intercept (constant) β0 and the slope (coefficient) β1, which are used with the predictor X to achieve the response variable Y.", col.names = c("β0 (Intercept)", "β1 (Slope)")) %>%
kable_styling(bootstrap_options = "striped") %>%
row_spec(0, bold = T, font_size = 15)
#Homoscedasticity Check 2
#Plotting Model 2
par(mfrow = c(2,2))
plot(Reg.Model2)
par(mfrow = c(1,1))
#Figure 1.3:...
#Residuals vs Fitted, After the log10() transformation there is less dispersion and residuals form a U shape pattern, showing a non linear relationship
##The QQ Residuals do not follow a straight line which shows lack normality of residuals.
#One cannot say that all the assumptions of linear regression are met as the data points are not randomly spread around the line.
#Adding another variable to our linear regression model.
#Inflation.
#Since we are looking at how income has increased over the years, inflation will be a good variable to add to our model as it directly has an effect on income.
InflationData <- read.csv("C:/Users/edren/OneDrive/Desktop/Data Analysis/R Working Directory/Final Project Assignment/Datasets/DataSets/Inflation.csv")
FemaleEarningsNew <- cbind(FemaleEarnings, InflationData$Inflation)
names(FemaleEarningsNew) <- c("Year", "Female.Earnings", "YearNum", "Inflation")
#Scale Both Year Num and Inflation in order to create a model.
YearNumScaled <- scale(FemaleEarningsNew$YearNum, center = T, scale = T)
InflationScaled <- scale(FemaleEarningsNew$Inflation, center = T, scale = T)
FemaleEarningsNew$YearScaled <- scale(FemaleEarningsNew$YearNum, center = T, scale = T)
FemaleEarningsNew$InflationScaled <- scale(FemaleEarningsNew$Inflation, center = T, scale = T)
#Linear Regression Model 3
Reg.Model3 <- lm(Female.Earnings ~ YearScaled + InflationScaled, data = FemaleEarningsNew)
Model3Table <- Reg.Model3$coefficients
kable(t(Model3Table), caption = "Table 1.4: Linear Regression Model 3 with new predictor Inflation (x2) . The intercept (constant) β0 and the slope (coefficient) β1 + β2, which are used with the predictor x1 + x2 to achieve the response variable Y.", col.names = c("β0 (Intercept)", "β1 (Slope for x1)", "β2 (Slope for x2)")) %>%
kable_styling(bootstrap_options = "striped") %>%
row_spec(0, bold = T, font_size = 15)
#Plotting Model 3
par(mfrow = c(2,2))
plot(Reg.Model3)
par(mfrow = c(1,1))
#Figure 1.4:...
#Checking Multiple R Squared and Adjusted R Squared of the two models
summary(Reg.Model1)$r.squared
summary(Reg.Model1)$adj.r.squared
Model1rSquared<- cbind.data.frame(summary(Reg.Model1)$r.squared, summary(Reg.Model1)$adj.r.squared)
names(Model1rSquared) <- c("MultipleR^2", "AdjustedR^2")
##
Model3rSquared<- cbind.data.frame(summary(Reg.Model3)$r.squared, summary(Reg.Model3)$adj.r.squared)
names(Model3rSquared) <- c("MultipleR^2", "AdjustedR^2")
##
RSquaredTable <-rbind(Model1rSquared, Model3rSquared)
row.names(RSquaredTable) <- c("MultipleR^2", "AdjustedR^2")
kable(t(RSquaredTable), caption = "Table 1.5: Comparing the Multiple R Sqaured and Adjusted R Squared of Model 1 and 2 to determine whcih is best fit for our data.", col.names = c("Model 1", "Model 2", "Model 3"), table.attr = "style='width:80%;'") %>%
kable_styling(bootstrap_options = "striped") %>%
row_spec(0, bold = T, font_size = 15)
#Adding Inflation to our model -> Model3, we get an increase in MultipleR^2 and AdjustedR^2 squared.
#Main indicator for goodness of fit of introducing a new indicator to our model is AdjustedR^2. As it explaines how much variability is explained in the model.
#Can use anova to determine whether Inflation should be added to our model or not.
anova(Reg.Model1, Reg.Model3)
#Df = 1, more complex model has one additional parameter
#P-value 0.2 > 0.05, means we should reject Model 3 and continue with model 1
#Summary of regression model 1
summary(Reg.Model1)
#P - Value - Upper Tail test
p_value <- pt(q = 41.22, df = 53, lower.tail=FALSE)
p_value
#Significant increase in Female earnings by change in Time in Years.
#Plotting Residuals
FemaleEarnings$predicted <- predict(Reg.Model1)
FemaleEarnings$residuals <- residuals(Reg.Model1)
ggplot(FemaleEarnings, aes(x = Year, y = Female.Earnings)) +
geom_smooth(method = "lm", se = F, color = "blue") +
geom_point(aes(color = abs(residuals), size = abs(residuals))) +
scale_color_continuous(low = "blue", high = "red") +
guides(color = F, size = F) +
geom_segment(aes(xend = Year, yend = predicted), alpha = .2) +
theme_minimal(base_size = 12) +
theme(axis.title.x = element_text(size = 14, vjust = -4, hjust = 0.43),
axis.title.y = element_text(size = 14, vjust = 6, hjust = 0.6),
plot.title = element_text(size = 16, vjust = 6),
plot.margin = unit(c(1,1,1,1), "cm")) +
labs(title = expression(underline("Median Female Earnings ($) in the US from 1967-2021"))) +
ylab("Median Female Earnings ($)") +
xlab("Time in Years")
#Figure 1.5: Median Female Earnings plotted against time in Years, with line of best fit and Residuals plotted to the predicted values.
#We can predict median female earnings in the US using the coefficients...
#And using the Year number as X1
Model1Table <- Reg.Model1$coefficients
kable(t(Model1Table), caption = "Table 1.2: Linear Regression Model 1. The intercept (constant) β0 and the slope (coefficient) β1, which are used with the predictor X to achieve the response variable Y.", col.names = c("β0 (Intercept)", "β1 (Slope)")) %>%
kable_styling(bootstrap_options = "striped") %>%
row_spec(0, bold = T, font_size = 15)
# Y(Predicted Median Female Earnings) = 15383.08 + 406.6405 * x1
#Yet this number does not account for inflation, unemployment, other socioeconomic and economic factors.
Code Part 2:
#Packages
library(ggplot2)
library(ggpubr)
library(knitr)
library(kableExtra)
library(Rcmdr)
library(dplyr)
#Data
MaleFemaleEarnings <- read.csv("C:/Users/edren/OneDrive/Desktop/Data Analysis/R Working Directory/Final Project Assignment/Datasets/DataSets/MFEarnings.csv")
names(MaleFemaleEarnings) <- c("Year", "Gender", "Median.Income")
#Initial Look at our data
head(MaleFemaleEarnings)
tail(MaleFemaleEarnings)
TableEarnings2 <- rbind(head(MaleFemaleEarnings), "...", tail(MaleFemaleEarnings))
kable(TableEarnings2,
caption = "Table 2.1: Initial look at how our data is laid out. Providing Income data for both Genders in the US from 1967 - 2022. The Income in this dataset accounts for inflation and is in 2022 Dollars ($).") %>%
kable_styling(bootstrap_options = "striped") %>%
row_spec(0, bold = T, font_size = 15)
#Summary Stats
#Summary Statistics by Gender
ByGender <-group_by(MaleFemaleEarnings, Gender) %>%
summarise(
count = n(),
median = median(Median.Income, na.rm = T),
IQR = IQR(Median.Income, na.rm = T)
)
kable(ByGender,caption = "Table 2.2: Summary Statistics for the seperate Genders in the US from 1967-2021. The count = number of years in the study, median income for each, and the interquartile range.") %>%
kable_styling(bootstrap_options = "striped") %>%
row_spec(0, bold = T, font_size = 15)
#InterQuartile Range is very high for females meaning there is a high range where most of our values lie.
#Plots to check for normal distribution, for all median income data
par(mfrow=c(1,3))
{with(MaleFemaleEarnings, Hist(Median.Income, scale="percent", ylab= "Percent", main= "Histogram for Income", breaks="Sturges",col="lightgray"))
abline(v = mean(MaleFemaleEarnings$Median.Income), col = "red", lwd = 3)
abline(v = median(MaleFemaleEarnings$Median.Income), col = "yellow", lwd = 3)
legend("topright", c("Mean", "Median"), col = c("red", "yellow"), lwd = 3)}
densityPlot( ~ Median.Income, data = MaleFemaleEarnings, main = "Density Plot for Income", bw=bw.SJ, adjust=1, kernel=dnorm, method="adaptive")
with(MaleFemaleEarnings, qqPlot(Median.Income, dist="norm", main = "QQ Plot for Income", id=list(method="y", n=2, labels=rownames(MaleFemaleEarnings))))
par(mfrow=c(1,1))
#Figure 2.1 ...Positive skew in Histogram and density plot.
#Shapiro Wilk normality test
shapiro.test(MaleFemaleEarnings$Median.Income)
#Normality is not seen here as p-value < 0.05 - Significant deviation from the normal distribution
#Levene test for equal variances
leveneTest(Median.Income~Gender, MaleFemaleEarnings)
#Equal variances not seen here as p-value < 0.05 - Significant difference between the variances
#Continue with non-parametric version of the test.
#Wilcoxon test
wilcox.test(Median.Income ~ Gender, data = MaleFemaleEarnings, alternative = "two.sided", paired = F)
#P-value < 0.05 reject the null and there is a significant difference in median income
#visualisation
ggplot(data = MaleFemaleEarnings, aes(x=Gender, y=Median.Income, color = Gender)) +
geom_boxplot(linewidth = 0.8, width = 0.8) +
scale_fill_manual(values = c("#0000FF", "#FF0000")) +
theme_minimal(base_size = 12) +
theme(axis.title.x = element_text(size = 14, vjust = -5),
axis.title.y = element_text(size = 14, vjust = 5),
plot.title = element_text(size = 16, vjust = 5),
plot.margin = unit(c(1,1,1,1), "cm")) +
xlab("Gender") +
ylab("Median Income ($)") +
labs(title = expression(underline("Median Income($) by Gender in the US 1967 - 2021")))
#Figure 2.2 ...
#Effect size
wilcox_effsize(MaleFemaleEarnings, Median.Income ~ Gender, paired = F)
#Large magnitude, effsize = 0.862.
Code Part 3:
#Loading Packages
library(knitr)
library(kableExtra)
library(psych)
library(car)
library(ggplot2)
library(multcomp)
library(ggeffects)
library(phia)
library(effects)
#Importing the data
GenderIncomeDegree <- read.csv("C:/Users/edren/OneDrive/Desktop/Data Analysis/R Working Directory/Final Project Assignment/Datasets/DataSets/GenderIncomeDegree.csv")
names(GenderIncomeDegree) <- c("Year", "Gender", "Median.Income", "Degree")
#checking the structure of the data
str(GenderIncomeDegree)
GenderIncomeDegree$Gender <- as.factor(GenderIncomeDegree$Gender)
GenderIncomeDegree$Degree <- as.factor(GenderIncomeDegree$Degree)
GenderIncomeDegree$Median.Income <- as.integer(GenderIncomeDegree$Median.Income)
str(GenderIncomeDegree)
#Initial look at the data
head(GenderIncomeDegree)
head(GenderIncomeDegree[GenderIncomeDegree$Degree == "Masters",])
TableIncomeDegree <- rbind(head(GenderIncomeDegree,4), "...", head(GenderIncomeDegree[GenderIncomeDegree$Degree == "Masters",],2), tail(GenderIncomeDegree[GenderIncomeDegree$Degree == "Masters",],2), "...", tail(GenderIncomeDegree,4))
kable(TableIncomeDegree,
caption = "Table 3.1: Initial look at how our data is laid out. Providing Income data for both Genders and 3 different degree types in the US from 1991 - 2022. The Income in this dataset accounts for inflation and is in 2022 Dollars ($).") %>%
kable_styling(bootstrap_options = "striped") %>%
row_spec(0, bold = T, font_size = 15)
#Summary Statistics Two Way
SummaryStatsTable <- describeBy(GenderIncomeDegree$Median.Income, group = GenderIncomeDegree$Gender : GenderIncomeDegree$Degree, mat = T)
KableStatsTable <- as.data.frame(rbind((SummaryStatsTable$group1),(SummaryStatsTable$n),(SummaryStatsTable$mean),(SummaryStatsTable$sd),(SummaryStatsTable$median),(SummaryStatsTable$min),(SummaryStatsTable$max),(SummaryStatsTable$range),(SummaryStatsTable$skew),(SummaryStatsTable$kurtosis),(SummaryStatsTable$se)))
names(KableStatsTable) <- c("F:Bachelors","F:Doctorate","F:Masters", "M:Bachelors","M:Doctorate","M:Masters")
KableStatsTable <- KableStatsTable[-1, ]
KableStatsTable2 <- as.data.frame(t(KableStatsTable))
names(KableStatsTable2) <- c("n", "mean", "sd", "median", "min", "max", "range", "skew", "kurtosis", "se")
#changing some types
KableStatsTable2$n <- as.integer(KableStatsTable2$n)
KableStatsTable2$mean <- as.numeric(KableStatsTable2$mean)
KableStatsTable2$sd <- as.numeric(KableStatsTable2$sd)
KableStatsTable2$median <- as.numeric(KableStatsTable2$median)
KableStatsTable2$min <- as.numeric(KableStatsTable2$min)
KableStatsTable2$max <- as.numeric(KableStatsTable2$max)
KableStatsTable2$range <- as.numeric(KableStatsTable2$range)
KableStatsTable2$skew <- as.numeric(KableStatsTable2$skew)
KableStatsTable2$kurtosis <- as.numeric(KableStatsTable2$kurtosis)
KableStatsTable2$se <- as.numeric(KableStatsTable2$se)
#Kable Statistics table
kable(KableStatsTable2, caption = "Table 3.2: Summary Statistics of the median income data for both Genders and 3 degree types in the US from 1991-2021.") %>%
kable_styling(bootstrap_options = "striped", full_width = F) %>%
row_spec(0, bold = T, font_size = 15)
#Anova Model - Two way
TwoWayM1 <- aov(Median.Income ~ Gender * Degree, data = GenderIncomeDegree)
TwoWayM1
#Diagnostic Checks
#homoscedasticity
par(mfrow = c(2,2))
plot(TwoWayM1)
par(mfrow = c(1,1))
#Figure 3.1:...
#No more than 3 times smallest, looks fine here
#Normal Distribution of residuals
Res_Epsilon <- TwoWayM1$residuals
shapiro.test(Res_Epsilon)
ggplot(GenderIncomeDegree, aes(x = Res_Epsilon)) +
geom_histogram(aes(y=..density..),
colour = 1, fill = "white", bins = 14) +
geom_density(lwd = 1, colour = 4, fill = 4, alpha = 0.25) +
theme_minimal(base_size = 12) +
theme(axis.title.x = element_text(size = 14, vjust = -5),
axis.title.y = element_text(size = 14, vjust = 5, hjust = 0.6),
plot.title = element_text(size = 16, vjust = 5),
plot.margin = unit(c(1,1,1,1), "cm")) +
xlab("Residuals") +
ylab("Density") +
labs(title = expression(underline("Histogram of Residuals | Two Way ANOVA Model 1")))
#Figure 3.2
#Residuals are normally distributed
#Analyzing the data
anova(TwoWayM1)
#Interaction between variables
plot(allEffects(TwoWayM1), ylab={"Median Income ($)"},main={"Effect Plot of Median Income ($)"}, lwd = 2, lty = 5, multiline = T, rug = T, colors = c("Red", "DeepSkyBlue"))
#Figure 3.3:...
#Another
TwoWayPlot <- ggeffect(TwoWayM1, terms = c("Gender", "Degree"))
plot(TwoWayPlot) +
scale_x_continuous(labels = c("Female", "Male"), breaks = c(1,2)) +
theme_minimal(base_size = 12) +
theme(axis.title.x = element_text(size = 14, vjust = -4),
axis.title.y = element_text(size = 14, vjust = 6),
plot.title = element_text(size = 16, vjust = 6),
plot.margin = unit(c(1,1,1,1), "cm")) +
labs(title = expression(underline("Predicted Values of Median Income ($)"))) +
ylab("Median Income ($)") +
xlab("Gender")
#Figure 3.4:...
#Tukey HSD
#Test 1 - Post Hoc test for differences between Genders within Degrees
t1 <- testInteractions(TwoWayM1, pairwise = "Gender", fixed = "Degree", adjustment = "holm")
t1$`Pr(>F)` <- as.numeric(t1$`Pr(>F)`)
t1$`Pr(>F)` <- formatC(t1$`Pr(>F)`, format = "e", digits = 2)
kable(t1, caption = "Table 3.3: Pairwise tests to test for interaction between Genders within Degree type (adjustment = Holms test).") %>%
kable_styling(bootstrap_options = "striped", full_width = F) %>%
row_spec(0, bold = T, font_size = 15)
#Test 2 - Post Hoc test for differences between Degrees within Genders
t2 <- testInteractions(TwoWayM1, pairwise = "Degree", fixed = "Gender", adjustment = "holm")
t2$`Pr(>F)` <- as.numeric(t2$`Pr(>F)`)
t2$`Pr(>F)` <- formatC(t2$`Pr(>F)`, format = "e", digits = 2)
kable(t2, caption = "Table 3.4: Pairwise tests to test for interaction between Degree type within Genders (adjustment = Holms test).") %>%
kable_styling(bootstrap_options = "striped", full_width = F) %>%
row_spec(0, bold = T, font_size = 15)
Code Part 4:
#Packages
library(kableExtra)
library(dplyr)
library(knitr)
library(ggplot2)
library(corrplot)
library(vcd)
#Data
EmpStatusGender <- read.csv("C:/Users/edren/OneDrive/Desktop/Data Analysis/R Working Directory/Final Project Assignment/Datasets/DataSets/EmpStatusGender_25andOverBachelorsOrHigher.csv", stringsAsFactors=TRUE)
#Clean
names(EmpStatusGender) <- c("Year", "Q", "Emp.Status", "Gender", "Total.Count")
#Initial Look at our Data
head(EmpStatusGender)
tail(EmpStatusGender)
TableEmpGender1 <- rbind(head(EmpStatusGender), "...", tail(EmpStatusGender))
TableEmpGender1
kable(TableEmpGender1, row.names = F, caption = "Table 4.1: BLS Beta Labs data regarding Total Employed/Unemployed from 2015-2021 (Q1-Q4) by Gender. The count of this data is in thousands (K)") %>%
kable_styling(bootstrap_options = "striped") %>%
row_spec(0, bold = T, font_size = 15)
#Select Random Year and Q
selected_year <- sample(unique(EmpStatusGender$Year), 1)
selected_quarter <- sample(unique(EmpStatusGender$Q), 1)
RandomSample <- EmpStatusGender %>%
filter(Year == selected_year, Q == selected_quarter)
#Looking at our Sample
kable(RandomSample, row.names = F, caption = "Table 4.2: Random Year and Q selected to generate a Sample from our original data from BLS Beta Labs data. Showing Total Employed/Unemployed by Gender. The count of this data is in thousands (K)") %>%
kable_styling(bootstrap_options = "striped", full_width = F) %>%
row_spec(0, bold = T, font_size = 15)
#Contingency Table
GroupedData <- xtabs(Total.Count ~ Emp.Status + Gender, data = RandomSample)
ContingencyTable <- as.data.frame.matrix(GroupedData)
kable(ContingencyTable, caption = "Table 4.3: ") %>%
kable_styling(bootstrap_options = "striped", full_width = F) %>%
row_spec(0, bold = T, font_size = 15)
#Initial visualization
ggplot(data = RandomSample, aes(x = Gender, y = Total.Count, fill = Emp.Status)) +
geom_bar(stat = "identity", position = position_dodge()) +
ggtitle("Attrition | Job Level - Observed Values") +
scale_fill_brewer(name = "Employment", palette = "Dark2") +
theme_minimal(base_size = 12) +
theme(axis.title.x = element_text(size = 14, vjust = -5),
axis.title.y = element_text(size = 14, vjust = 5, hjust = 0.6),
plot.title = element_text(size = 16, vjust = 5),
plot.margin = unit(c(1,1,1,1), "cm")) +
xlab("Gender") +
ylab("Frequency (in thousands - k)") +
labs(title = expression(underline("Counts of Employment by Gender in the US")))
#Figure 4.1:.. **NOT USED**
#Test
Test1 <- chisq.test(ContingencyTable, correct = FALSE)
Test1
#N > 40, not using Yates Correction
##Expected Values
kable(round(Test1$expected,3), caption = "Table 4.4: Expected Values of our random sample of the population data. The count of this data is in thousands (K)") %>%
kable_styling(bootstrap_options = "striped", full_width = F) %>%
row_spec(0, bold = T, font_size = 15) %>%
add_header_above(c("Employment" = 1, "Expected Values" = 2))
#More than 20% of expected values are not less than 5. won't use fisher
#Table for Discussion
#Observed Values, Expected, Chi Squared Components in 1 table
merged_tables <- cbind(Test1$observed, round(Test1$expected,3), round(Test1$residuals^2,4))
kable(merged_tables, caption = "Table 4.5: Observed Values, Expected Values and the Chi-Square Components of our random sample of the population data. The count of this data is in thousands (K)") %>%
kable_styling(bootstrap_options = "striped", full_width = F) %>%
row_spec(0, bold = T, font_size = 15) %>%
add_header_above(c(" " = 1, "Observed Values" = 2, "Expected Values" = 2, "Chi-Squared Components" = 2))
#Test output
Test1
#Visualisation... For Discussion. Corrplot
association_plot <- assoc(GroupedData, shade = T, las = 3, main = "Association Plot of Gender and Employment Status")
#Figure: 4.2: Association Plot showing Pearson residuals for our Chi Squared test of the association between Gender and Employment Status.
# A positive association between Gender and Employment status is indicated with a bar above the dotted line,
# A negative association between Gender and Employment status is indicated with a bar below the dotted line.
# The strength of the association is indicated by the size of the bar.
---
title: "Project Code"
output: html_notebook
---


```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE,
                      results = "hide",
                      eval = F)
knitr::opts_chunk$set()
```



#### Code Part 1:

```{r echo=TRUE, message=FALSE, warning=FALSE, results="hide", eval = F}
#Packages
library(knitr)
library(kableExtra)
library(ggplot2)
library(dplyr)

#Loading the Data into R after Transforming in Excel

FemaleEarnings <- read.csv("C:/Users/edren/OneDrive/Desktop/Data Analysis/R Working Directory/Final Project Assignment/Datasets/DataSets/Female Earnings Transformed.csv")



#Clean up

names(FemaleEarnings) <- c("Year", "Female.Earnings")

#Adding a Number to year

FemaleEarnings <- FemaleEarnings %>%
  arrange(Year) %>%
  mutate(YearNum = row_number())



#Initial look at the data

head(FemaleEarnings)

tail(FemaleEarnings)

TableEarnings1 <- rbind(head(FemaleEarnings), "...", tail(FemaleEarnings))

kable(TableEarnings1, row.names = F, caption = "Table 1.1: US Census data regarding Median earnings for Total Female Workers from 1967 - 2021. The Years are numbered in order to not skew the analysis later.") %>%
  kable_styling(bootstrap_options = "striped") %>%
  row_spec(0, bold = T, font_size = 15)




#Creating the initial regression model

Reg.Model1 <- lm(Female.Earnings~YearNum, data = FemaleEarnings)




#Plotting the intial data

#linearity check 1

ggplot(FemaleEarnings, aes(x = Year, y = Female.Earnings)) +
  geom_smooth(method = "lm", se = F, color = "blue") +
  geom_point(color="red")+
  theme_minimal(base_size = 12) +
  theme(axis.title.x = element_text(size = 14,  vjust = -4, hjust = 0.43),
        axis.title.y = element_text(size = 14, vjust = 6, hjust = 0.6),
        plot.title = element_text(size = 16, vjust = 6),
        plot.margin = unit(c(1,1,1,1), "cm")) +
  labs(title = expression(underline("Median Female Earnings in the US from 1967-2021"))) +
  ylab("Median Female Earnings ($)") +
  xlab("Time in Years")
#Figure 1.1: ...


#Looks linear


#Coefficient Table

Model1Table <- Reg.Model1$coefficients

kable(t(Model1Table), caption = "Table 1.2: Linear Regression Model 1. The intercept (constant) β0 and the slope (coefficient) β1, which are used with the predictor X to achieve the response variable Y.", col.names = c("β0 (Intercept)", "β1 (Slope)")) %>%
  kable_styling(bootstrap_options = "striped") %>%
  row_spec(0, bold = T, font_size = 15)

#Checking assumptions are met



#Homoscedasticity check 1

par(mfrow = c(2,2))

plot(Reg.Model1)

par(mfrow = c(1,1))

#Figure 1.2:...

#QQ Plot tells us the normality of the residuals, a straight line indicates that the residuals are normally distributed.

#Residuals vs Fitted: Shows the residuals which is y - predicted y as a function of the fitted values.
#Residuals should bounce randomly around the 0 line and should be roughly equal around the estimated regression line.
#Residuals somewhat a pattern in this case, so we will transform the data to see if this makes a difference.




#log10 Transformation

Reg.Model2 <- lm(log10(Female.Earnings)~(YearNum), data = FemaleEarnings)


#Table For Coefficients

Model2Table <- Reg.Model2$coefficients

kable(t(Model2Table), caption = "Table 1.3: Linear Regression Model 2 with log10() transformation. The intercept (constant) β0 and the slope (coefficient) β1, which are used with the predictor X to achieve the response variable Y.", col.names = c("β0 (Intercept)", "β1 (Slope)")) %>%
  kable_styling(bootstrap_options = "striped") %>%
  row_spec(0, bold = T, font_size = 15)


#Homoscedasticity Check 2

#Plotting Model 2

par(mfrow = c(2,2))

plot(Reg.Model2)

par(mfrow = c(1,1))

#Figure 1.3:...


#Residuals vs Fitted, After the log10() transformation there is less dispersion and residuals form a U shape pattern, showing a non linear relationship
##The QQ Residuals do not follow a straight line which shows lack normality of residuals.

#One cannot say that all the assumptions of linear regression are met as the data points are not randomly spread around the line.



#Adding another variable to our linear regression model.
#Inflation.

#Since we are looking at how income has increased over the years, inflation will be a good variable to add to our model as it directly has an effect on income.
InflationData <- read.csv("C:/Users/edren/OneDrive/Desktop/Data Analysis/R Working Directory/Final Project Assignment/Datasets/DataSets/Inflation.csv")

FemaleEarningsNew <- cbind(FemaleEarnings, InflationData$Inflation)

names(FemaleEarningsNew) <- c("Year", "Female.Earnings", "YearNum", "Inflation")
 



#Scale Both Year Num and Inflation in order to create a model.

YearNumScaled <- scale(FemaleEarningsNew$YearNum, center = T, scale = T)

InflationScaled <- scale(FemaleEarningsNew$Inflation, center = T, scale = T)

FemaleEarningsNew$YearScaled <- scale(FemaleEarningsNew$YearNum, center = T, scale = T)

FemaleEarningsNew$InflationScaled <- scale(FemaleEarningsNew$Inflation, center = T, scale = T)




#Linear Regression Model 3

Reg.Model3 <- lm(Female.Earnings ~ YearScaled + InflationScaled, data = FemaleEarningsNew)


Model3Table <- Reg.Model3$coefficients

kable(t(Model3Table), caption = "Table 1.4: Linear Regression Model 3 with new predictor Inflation (x2) . The intercept (constant) β0 and the slope (coefficient) β1 + β2, which are used with the predictor x1 + x2 to achieve the response variable Y.", col.names = c("β0 (Intercept)", "β1 (Slope for x1)",  "β2 (Slope for x2)")) %>%
  kable_styling(bootstrap_options = "striped") %>%
  row_spec(0, bold = T, font_size = 15)

#Plotting Model 3

par(mfrow = c(2,2))

plot(Reg.Model3)

par(mfrow = c(1,1))

#Figure 1.4:...





#Checking Multiple R Squared and Adjusted R Squared of the two models

summary(Reg.Model1)$r.squared
summary(Reg.Model1)$adj.r.squared

Model1rSquared<- cbind.data.frame(summary(Reg.Model1)$r.squared, summary(Reg.Model1)$adj.r.squared)

names(Model1rSquared) <- c("MultipleR^2", "AdjustedR^2")

##

Model3rSquared<- cbind.data.frame(summary(Reg.Model3)$r.squared, summary(Reg.Model3)$adj.r.squared)

names(Model3rSquared) <- c("MultipleR^2", "AdjustedR^2")

##

RSquaredTable <-rbind(Model1rSquared, Model3rSquared)

row.names(RSquaredTable) <- c("MultipleR^2", "AdjustedR^2")

kable(t(RSquaredTable), caption = "Table 1.5: Comparing the Multiple R Sqaured and Adjusted R Squared of Model 1 and 2 to determine whcih is best fit for our data.", col.names = c("Model 1", "Model 2", "Model 3"),  table.attr = "style='width:80%;'") %>%
  kable_styling(bootstrap_options = "striped") %>%
  row_spec(0, bold = T, font_size = 15)

#Adding Inflation to our model -> Model3, we get an increase in MultipleR^2 and AdjustedR^2 squared.
#Main indicator for goodness of fit of introducing a new indicator to our model is AdjustedR^2. As it explaines how much variability is explained in the model.




#Can use anova to determine whether Inflation should be added to our model or not.

anova(Reg.Model1, Reg.Model3)

#Df = 1, more complex model has one additional parameter
#P-value 0.2 > 0.05, means we should reject Model 3 and continue with model 1





#Summary of regression model 1

summary(Reg.Model1)



#P - Value - Upper Tail test

p_value <- pt(q = 41.22, df = 53, lower.tail=FALSE)

p_value

#Significant increase in Female earnings by change in Time in Years.





#Plotting Residuals

FemaleEarnings$predicted <- predict(Reg.Model1)

FemaleEarnings$residuals <- residuals(Reg.Model1)

ggplot(FemaleEarnings, aes(x = Year, y = Female.Earnings)) +
  geom_smooth(method = "lm", se = F, color = "blue") +
  geom_point(aes(color = abs(residuals), size = abs(residuals))) +
  scale_color_continuous(low = "blue", high = "red") +
  guides(color = F, size = F) +
  geom_segment(aes(xend = Year, yend = predicted), alpha = .2) +
  theme_minimal(base_size = 12) +
  theme(axis.title.x = element_text(size = 14,  vjust = -4, hjust = 0.43),
        axis.title.y = element_text(size = 14, vjust = 6, hjust = 0.6),
        plot.title = element_text(size = 16, vjust = 6),
        plot.margin = unit(c(1,1,1,1), "cm")) +
  labs(title = expression(underline("Median Female Earnings ($) in the US from 1967-2021"))) +
  ylab("Median Female Earnings ($)") +
  xlab("Time in Years")
#Figure 1.5: Median Female Earnings plotted against time in Years, with line of best fit and Residuals plotted to the predicted values.




#We can predict median female earnings in the US using the coefficients...
#And using the Year number as X1

Model1Table <- Reg.Model1$coefficients

kable(t(Model1Table), caption = "Table 1.2: Linear Regression Model 1. The intercept (constant) β0 and the slope (coefficient) β1, which are used with the predictor X to achieve the response variable Y.", col.names = c("β0 (Intercept)", "β1 (Slope)")) %>%
  kable_styling(bootstrap_options = "striped") %>%
  row_spec(0, bold = T, font_size = 15)

# Y(Predicted Median Female Earnings) = 15383.08 + 406.6405 * x1

#Yet this number does not account for inflation, unemployment, other socioeconomic and economic factors.


```

#### Code Part 2:

```{r echo=TRUE, message=FALSE, warning=FALSE, results="hide", eval = F}
#Packages
library(ggplot2)
library(ggpubr)
library(knitr)
library(kableExtra)
library(Rcmdr)
library(dplyr)

#Data
MaleFemaleEarnings <- read.csv("C:/Users/edren/OneDrive/Desktop/Data Analysis/R Working Directory/Final Project Assignment/Datasets/DataSets/MFEarnings.csv")

names(MaleFemaleEarnings) <- c("Year", "Gender", "Median.Income")

#Initial Look at our data

head(MaleFemaleEarnings)

tail(MaleFemaleEarnings)

TableEarnings2 <- rbind(head(MaleFemaleEarnings), "...", tail(MaleFemaleEarnings))

kable(TableEarnings2,
      caption = "Table 2.1: Initial look at how our data is laid out. Providing Income data for both Genders in the US from 1967 - 2022. The Income in this dataset accounts for inflation and is in 2022 Dollars ($).") %>%
  kable_styling(bootstrap_options = "striped") %>%
  row_spec(0, bold = T, font_size = 15)




#Summary Stats

#Summary Statistics by Gender

ByGender <-group_by(MaleFemaleEarnings, Gender) %>%
  summarise(
    count = n(),
    median = median(Median.Income, na.rm = T),
    IQR = IQR(Median.Income, na.rm = T)
  )

kable(ByGender,caption =  "Table 2.2: Summary Statistics for the seperate Genders in the US from 1967-2021. The count = number of years in the study, median income for each, and the interquartile range.") %>%
  kable_styling(bootstrap_options = "striped") %>%
  row_spec(0, bold = T, font_size = 15)

#InterQuartile Range is very high for females meaning there is a high range where most of our values lie.






#Plots to check for normal distribution, for all median income data

par(mfrow=c(1,3))

{with(MaleFemaleEarnings, Hist(Median.Income, scale="percent", ylab= "Percent", main= "Histogram for Income", breaks="Sturges",col="lightgray"))
  abline(v = mean(MaleFemaleEarnings$Median.Income), col = "red", lwd = 3)
  abline(v = median(MaleFemaleEarnings$Median.Income), col = "yellow", lwd = 3)
  legend("topright", c("Mean", "Median"), col = c("red", "yellow"), lwd = 3)}

densityPlot( ~ Median.Income, data = MaleFemaleEarnings, main = "Density Plot for Income", bw=bw.SJ, adjust=1, kernel=dnorm, method="adaptive")

with(MaleFemaleEarnings, qqPlot(Median.Income, dist="norm", main = "QQ Plot for Income", id=list(method="y", n=2, labels=rownames(MaleFemaleEarnings))))

par(mfrow=c(1,1))

#Figure 2.1 ...Positive skew in Histogram and density plot. 




#Shapiro Wilk normality test

shapiro.test(MaleFemaleEarnings$Median.Income)

#Normality is not seen here as p-value < 0.05 - Significant deviation from the normal distribution




#Levene test for equal variances

leveneTest(Median.Income~Gender, MaleFemaleEarnings)

#Equal variances not seen here as p-value < 0.05 - Significant difference between the variances




#Continue with non-parametric version of the test.
#Wilcoxon test

wilcox.test(Median.Income ~ Gender, data = MaleFemaleEarnings, alternative = "two.sided", paired = F)

#P-value < 0.05 reject the null and there is a significant difference in median income







#visualisation
ggplot(data = MaleFemaleEarnings, aes(x=Gender, y=Median.Income, color = Gender)) +
  geom_boxplot(linewidth = 0.8, width = 0.8) +
  scale_fill_manual(values = c("#0000FF", "#FF0000")) +
  theme_minimal(base_size = 12) +
  theme(axis.title.x = element_text(size = 14,  vjust = -5),
        axis.title.y = element_text(size = 14, vjust = 5),
        plot.title = element_text(size = 16, vjust = 5),
        plot.margin = unit(c(1,1,1,1), "cm")) +
  xlab("Gender") +
  ylab("Median Income ($)") +
  labs(title = expression(underline("Median Income($) by Gender in the US 1967 - 2021")))
#Figure 2.2 ...



#Effect size

wilcox_effsize(MaleFemaleEarnings, Median.Income ~ Gender, paired = F)

#Large magnitude, effsize = 0.862.


```

#### Code Part 3:

```{r echo=TRUE, message=FALSE, warning=FALSE, results="hide", eval = F}
#Loading Packages
library(knitr)
library(kableExtra)
library(psych)
library(car)
library(ggplot2)
library(multcomp)
library(ggeffects)
library(phia)
library(effects)

#Importing the data
GenderIncomeDegree <- read.csv("C:/Users/edren/OneDrive/Desktop/Data Analysis/R Working Directory/Final Project Assignment/Datasets/DataSets/GenderIncomeDegree.csv")

names(GenderIncomeDegree) <- c("Year", "Gender", "Median.Income", "Degree")



#checking the structure of the data

str(GenderIncomeDegree)

GenderIncomeDegree$Gender <- as.factor(GenderIncomeDegree$Gender)

GenderIncomeDegree$Degree <- as.factor(GenderIncomeDegree$Degree)

GenderIncomeDegree$Median.Income <- as.integer(GenderIncomeDegree$Median.Income)

str(GenderIncomeDegree)




#Initial look at the data

head(GenderIncomeDegree)

head(GenderIncomeDegree[GenderIncomeDegree$Degree == "Masters",])

TableIncomeDegree <- rbind(head(GenderIncomeDegree,4), "...", head(GenderIncomeDegree[GenderIncomeDegree$Degree == "Masters",],2), tail(GenderIncomeDegree[GenderIncomeDegree$Degree == "Masters",],2), "...", tail(GenderIncomeDegree,4))

kable(TableIncomeDegree,
      caption = "Table 3.1: Initial look at how our data is laid out. Providing Income data for both Genders and 3 different degree types in the US from 1991 - 2022. The Income in this dataset accounts for inflation and is in 2022 Dollars ($).") %>%
  kable_styling(bootstrap_options = "striped") %>%
  row_spec(0, bold = T, font_size = 15)



#Summary Statistics Two Way

SummaryStatsTable <- describeBy(GenderIncomeDegree$Median.Income, group = GenderIncomeDegree$Gender : GenderIncomeDegree$Degree, mat = T)

KableStatsTable <- as.data.frame(rbind((SummaryStatsTable$group1),(SummaryStatsTable$n),(SummaryStatsTable$mean),(SummaryStatsTable$sd),(SummaryStatsTable$median),(SummaryStatsTable$min),(SummaryStatsTable$max),(SummaryStatsTable$range),(SummaryStatsTable$skew),(SummaryStatsTable$kurtosis),(SummaryStatsTable$se)))

names(KableStatsTable) <- c("F:Bachelors","F:Doctorate","F:Masters", "M:Bachelors","M:Doctorate","M:Masters")

KableStatsTable <- KableStatsTable[-1, ]

KableStatsTable2 <- as.data.frame(t(KableStatsTable))

names(KableStatsTable2) <- c("n", "mean", "sd", "median", "min", "max", "range", "skew", "kurtosis", "se")



#changing some types

KableStatsTable2$n <- as.integer(KableStatsTable2$n)

KableStatsTable2$mean <- as.numeric(KableStatsTable2$mean)

KableStatsTable2$sd <- as.numeric(KableStatsTable2$sd)

KableStatsTable2$median <- as.numeric(KableStatsTable2$median)

KableStatsTable2$min <- as.numeric(KableStatsTable2$min)

KableStatsTable2$max <- as.numeric(KableStatsTable2$max)

KableStatsTable2$range <- as.numeric(KableStatsTable2$range)

KableStatsTable2$skew <- as.numeric(KableStatsTable2$skew)

KableStatsTable2$kurtosis <- as.numeric(KableStatsTable2$kurtosis)

KableStatsTable2$se <- as.numeric(KableStatsTable2$se)



#Kable Statistics table

kable(KableStatsTable2, caption = "Table 3.2: Summary Statistics of the median income data for both Genders and 3 degree types in the US from 1991-2021.") %>%
  kable_styling(bootstrap_options = "striped", full_width = F) %>%
  row_spec(0, bold = T, font_size = 15)


#Anova Model - Two way

TwoWayM1 <- aov(Median.Income ~ Gender * Degree, data = GenderIncomeDegree)

TwoWayM1



#Diagnostic Checks

#homoscedasticity 

par(mfrow = c(2,2))

plot(TwoWayM1)

par(mfrow = c(1,1))
#Figure 3.1:...


#No more than 3 times smallest, looks fine here



#Normal Distribution of residuals

Res_Epsilon <- TwoWayM1$residuals

shapiro.test(Res_Epsilon)

ggplot(GenderIncomeDegree, aes(x = Res_Epsilon)) +
  geom_histogram(aes(y=..density..),
                 colour = 1, fill = "white", bins = 14) +
  geom_density(lwd = 1, colour = 4, fill = 4, alpha = 0.25) +
  theme_minimal(base_size = 12) +
  theme(axis.title.x = element_text(size = 14,  vjust = -5),
        axis.title.y = element_text(size = 14, vjust = 5, hjust = 0.6),
        plot.title = element_text(size = 16, vjust = 5),
        plot.margin = unit(c(1,1,1,1), "cm")) +
  xlab("Residuals") +
  ylab("Density") +
  labs(title = expression(underline("Histogram of Residuals | Two Way ANOVA Model 1")))
#Figure 3.2

#Residuals are normally distributed




#Analyzing the data

anova(TwoWayM1)






#Interaction between variables

plot(allEffects(TwoWayM1), ylab={"Median Income ($)"},main={"Effect Plot of Median Income ($)"}, lwd = 2, lty = 5, multiline = T, rug = T, colors = c("Red", "DeepSkyBlue"))
#Figure 3.3:...
  

#Another

TwoWayPlot <- ggeffect(TwoWayM1, terms = c("Gender", "Degree"))

plot(TwoWayPlot) +
  scale_x_continuous(labels = c("Female", "Male"), breaks = c(1,2)) +
  theme_minimal(base_size = 12) +
  theme(axis.title.x = element_text(size = 14,  vjust = -4),
        axis.title.y = element_text(size = 14, vjust = 6),
        plot.title = element_text(size = 16, vjust = 6),
        plot.margin = unit(c(1,1,1,1), "cm")) +
  labs(title = expression(underline("Predicted Values of Median Income ($)"))) +
  ylab("Median Income ($)") +
  xlab("Gender")
#Figure 3.4:...





#Tukey HSD

#Test 1 - Post Hoc test for differences between Genders within Degrees

t1 <- testInteractions(TwoWayM1, pairwise = "Gender", fixed = "Degree", adjustment = "holm")

t1$`Pr(>F)` <- as.numeric(t1$`Pr(>F)`)

t1$`Pr(>F)` <- formatC(t1$`Pr(>F)`, format = "e", digits = 2)

kable(t1, caption = "Table 3.3: Pairwise tests to test for interaction between Genders within Degree type (adjustment = Holms test).") %>%
  kable_styling(bootstrap_options = "striped", full_width = F) %>%
  row_spec(0, bold = T, font_size = 15)



#Test 2 - Post Hoc test for differences between Degrees within Genders

t2 <- testInteractions(TwoWayM1, pairwise = "Degree", fixed = "Gender", adjustment = "holm")

t2$`Pr(>F)` <- as.numeric(t2$`Pr(>F)`)

t2$`Pr(>F)` <- formatC(t2$`Pr(>F)`, format = "e", digits = 2)

kable(t2, caption = "Table 3.4: Pairwise tests to test for interaction between Degree type within Genders (adjustment = Holms test).") %>%
  kable_styling(bootstrap_options = "striped", full_width = F) %>%
  row_spec(0, bold = T, font_size = 15)


```

#### Code Part 4:

```{r echo=TRUE, message=FALSE, warning=FALSE, results="hide", eval = F}
#Packages
library(kableExtra)
library(dplyr)
library(knitr)
library(ggplot2)
library(corrplot)
library(vcd)


#Data
EmpStatusGender <- read.csv("C:/Users/edren/OneDrive/Desktop/Data Analysis/R Working Directory/Final Project Assignment/Datasets/DataSets/EmpStatusGender_25andOverBachelorsOrHigher.csv", stringsAsFactors=TRUE)

#Clean

names(EmpStatusGender) <- c("Year", "Q", "Emp.Status", "Gender", "Total.Count")


#Initial Look at our Data

head(EmpStatusGender)

tail(EmpStatusGender)

TableEmpGender1 <- rbind(head(EmpStatusGender), "...", tail(EmpStatusGender))

TableEmpGender1

kable(TableEmpGender1, row.names = F, caption = "Table 4.1: BLS Beta Labs data regarding Total Employed/Unemployed from 2015-2021 (Q1-Q4) by Gender. The count of this data is in thousands (K)") %>%
  kable_styling(bootstrap_options = "striped") %>%
  row_spec(0, bold = T, font_size = 15)


#Select Random Year and Q

selected_year <- sample(unique(EmpStatusGender$Year), 1)

selected_quarter <- sample(unique(EmpStatusGender$Q), 1)

RandomSample <- EmpStatusGender %>%
  filter(Year == selected_year, Q == selected_quarter)



#Looking at our Sample

kable(RandomSample, row.names = F, caption = "Table 4.2: Random Year and Q selected to generate a Sample from our original data from BLS Beta Labs data. Showing Total Employed/Unemployed by Gender. The count of this data is in thousands (K)") %>%
  kable_styling(bootstrap_options = "striped", full_width = F) %>%
  row_spec(0, bold = T, font_size = 15)



#Contingency Table

GroupedData <- xtabs(Total.Count ~ Emp.Status + Gender, data = RandomSample)

ContingencyTable <- as.data.frame.matrix(GroupedData)

kable(ContingencyTable, caption = "Table 4.3: ") %>%
  kable_styling(bootstrap_options = "striped", full_width = F) %>%
  row_spec(0, bold = T, font_size = 15)




#Initial visualization

ggplot(data = RandomSample, aes(x = Gender, y = Total.Count, fill = Emp.Status)) +
  geom_bar(stat = "identity", position = position_dodge()) +
  ggtitle("Attrition | Job Level - Observed Values") +
  scale_fill_brewer(name = "Employment", palette = "Dark2") +
  theme_minimal(base_size = 12) +
  theme(axis.title.x = element_text(size = 14,  vjust = -5),
        axis.title.y = element_text(size = 14, vjust = 5, hjust = 0.6),
        plot.title = element_text(size = 16, vjust = 5),
        plot.margin = unit(c(1,1,1,1), "cm")) +
  xlab("Gender") +
  ylab("Frequency (in thousands - k)") +
  labs(title = expression(underline("Counts of Employment by Gender in the US")))
#Figure 4.1:.. **NOT USED**


#Test

Test1 <- chisq.test(ContingencyTable, correct = FALSE)

Test1

#N > 40, not using Yates Correction



##Expected Values

kable(round(Test1$expected,3), caption = "Table 4.4: Expected Values of our random sample of the population data. The count of this data is in thousands (K)") %>%
  kable_styling(bootstrap_options = "striped", full_width = F) %>%
  row_spec(0, bold = T, font_size = 15) %>%
  add_header_above(c("Employment" = 1, "Expected Values" = 2))

#More than 20% of expected values are not less than 5. won't use fisher




#Table for Discussion

#Observed Values, Expected, Chi Squared Components in 1 table

merged_tables <- cbind(Test1$observed, round(Test1$expected,3), round(Test1$residuals^2,4))

kable(merged_tables, caption = "Table 4.5: Observed Values, Expected Values and the Chi-Square Components of our random sample of the population data. The count of this data is in thousands (K)") %>%
  kable_styling(bootstrap_options = "striped", full_width = F) %>%
  row_spec(0, bold = T, font_size = 15) %>%
  add_header_above(c(" " = 1, "Observed Values" = 2, "Expected Values" = 2, "Chi-Squared Components" = 2))


#Test output

Test1



#Visualisation... For Discussion. Corrplot


association_plot <- assoc(GroupedData, shade = T, las = 3, main = "Association Plot of Gender and Employment Status")


#Figure: 4.2: Association Plot showing Pearson residuals for our Chi Squared test of the association between Gender and Employment Status. 
# A positive association between Gender and Employment status is indicated with a bar above the dotted line, 
# A negative association between Gender and Employment status is indicated with a bar below the dotted line.
# The strength of the association is indicated by the size of the bar.

```
