Houcheng Li, Nan Xu, Aisha Shigna Nadukkandy
Packages used for the analysis:
library(ggplot2)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ lubridate 1.9.2 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(reshape2)
##
## Attaching package: 'reshape2'
##
## The following object is masked from 'package:tidyr':
##
## smiths
library(gridExtra)
##
## Attaching package: 'gridExtra'
##
## The following object is masked from 'package:dplyr':
##
## combine
library(dplyr)
Dataset 1:
income_distribution <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-02-09/income_distribution.csv')
## Rows: 2916 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): race, income_bracket
## dbl (7): year, number, income_median, income_med_moe, income_mean, income_me...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
income_distribution
## # A tibble: 2,916 × 9
## year race number income_median income_med_moe income_mean income_mean_moe
## <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2019 All Ra… 1.28e8 68703 904 98088 1042
## 2 2019 All Ra… 1.28e8 68703 904 98088 1042
## 3 2019 All Ra… 1.28e8 68703 904 98088 1042
## 4 2019 All Ra… 1.28e8 68703 904 98088 1042
## 5 2019 All Ra… 1.28e8 68703 904 98088 1042
## 6 2019 All Ra… 1.28e8 68703 904 98088 1042
## 7 2019 All Ra… 1.28e8 68703 904 98088 1042
## 8 2019 All Ra… 1.28e8 68703 904 98088 1042
## 9 2019 All Ra… 1.28e8 68703 904 98088 1042
## 10 2018 All Ra… 1.29e8 64324 704 91652 914
## # ℹ 2,906 more rows
## # ℹ 2 more variables: income_bracket <chr>, income_distribution <dbl>
extract <- income_distribution[income_distribution$race == "All Races",]
data1 <- extract[,c(1,4,6)]
data1 <- data1[!duplicated(data1$year),]
data1$year <- as.factor(data1$year)
data2 <- melt(data1, id.vars = "year")
head(data2)
## year variable value
## 1 2019 income_median 68703
## 2 2018 income_median 64324
## 3 2017 income_median 64007
## 4 2016 income_median 62898
## 5 2015 income_median 60987
## 6 2014 income_median 58001
ggplot(data2, aes(x=factor(year), y=value, colour=variable, group=variable)) +
geom_line(size=2) +
theme_minimal() +
scale_color_manual(values = c("income_median" = "blue", "income_mean" = "red")) +
labs(x = "Year", y = "Income", title = "Change in income over the years for all races combined") +
theme(plot.title = element_text(colour = 'black', family = 'Georgia', size = 16, hjust = 0.5),
plot.subtitle = element_text(colour = 'black', family = 'Georgia', size = 12),
axis.title = element_text(colour = 'black', family = 'Georgia', size = 16),
axis.text = element_text(colour = 'black', family = 'Georgia', size = 7, angle = 45),
legend.text = element_text(colour = 'black', family = 'Georgia', size = 10),
legend.position = 'top',
legend.title = element_blank(),
plot.background = element_rect(fill = 'grey60'),
panel.background = element_rect(fill = 'grey60', colour = 'grey60'))
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
data3 <- extract[,c(1,8,9)]
colour=c("#DC143C","#0000FF","#20B2AA","#FFA500","#9370DB","#98FB98","#F08080","#1E90FF","#7CFC00","#FFFF00",
"#808000","#FF00FF","#FA8072","#7B68EE","#9400D3","#800080","#A0522D","#D2B48C","#D2691E","#87CEEB","#40E0D0","#5F9EA0",
"#FF1493","#0000CD","#008B8B","#FFE4B5","#8A2BE2","#228B22","#E9967A","#4682B4","#32CD32","#F0E68C","#FFFFE0","#EE82EE",
"#FF6347","#6A5ACD","#9932CC","#8B008B","#8B4513","#DEB887")
data3$income_bracket <- factor(data3$income_bracket,levels=unique(data3$income_bracket))
head(data3)
## # A tibble: 6 × 3
## year income_bracket income_distribution
## <dbl> <fct> <dbl>
## 1 2019 Under $15,000 9.1
## 2 2019 $15,000 to $24,999 8
## 3 2019 $25,000 to $34,999 8.3
## 4 2019 $35,000 to $49,999 11.7
## 5 2019 $50,000 to $74,999 16.5
## 6 2019 $75,000 to $99,999 12.3
ggplot(data3,aes(x=year, y=income_distribution,fill=factor(income_bracket))) +
geom_col(position = "fill", width = 0.6) +
labs(x = "Year",y = "Bracket") +
scale_fill_manual(values = colour) +
labs(x = "Year", y = "Income Distribution", title = "CIncome distribution between different salary groups in US") +
theme_minimal() +
theme(plot.title = element_text(colour = 'black', family = 'Georgia', size = 16, hjust = 0.5),
plot.subtitle = element_text(colour = 'black', family = 'Georgia', size = 12),
axis.title = element_text(colour = 'black', family = 'Georgia', size = 16),
axis.text = element_text(colour = 'black', family = 'Georgia', size = 7, angle = 45),
legend.text = element_text(colour = 'black', family = 'Georgia', size = 6),
legend.position = 'top',
legend.title = element_blank(),
plot.background = element_rect(fill = 'grey60'),
panel.background = element_rect(fill = 'grey60', colour = 'grey60'))
race_exclusion <- c("Asian Alone or in Combination", "Black Alone or in Combination")
filtered_data_race <- income_distribution %>%
filter(!race %in% race_exclusion) %>%
group_by(race, year) %>%
summarise(MeanIncome = mean(income_mean, na.rm = TRUE))
## `summarise()` has grouped output by 'race'. You can override using the
## `.groups` argument.
custom_colors <- c("red", "blue", "orange", "green", "violet", "yellow")
ggplot(filtered_data_race, aes(x = year, y = MeanIncome, color = race)) +
geom_line(aes(linewidth = 1.5), na.rm = TRUE) +
geom_point() +
labs(x = "Year", y = "Mean Income", title = "Trends in Mean income over the years (1967 - 2019)") +
scale_color_manual(values = custom_colors) +
theme_minimal() +
theme(plot.title = element_text(colour = 'black', family = 'Georgia', size = 16, hjust = 0.5),
plot.subtitle = element_text(colour = 'black', family = 'Georgia', size = 12),
axis.title = element_text(colour = 'black', family = 'Georgia', size = 16),
axis.text = element_text(colour = 'black', family = 'Georgia', size = 14),
legend.text = element_text(colour = 'black', family = 'Georgia', size = 10),
legend.position = 'top',
legend.title = element_blank(),
plot.background = element_rect(fill = 'grey60'),
panel.background = element_rect(fill = 'grey60', colour = 'grey60'))
## Warning: Removed 1 rows containing missing values (`geom_point()`).
To analyse thsi we used two datssets, that provides data about the family wealth by race/year/measure normalized to 2016, and the amount of money people own after their retirement.
race_wealth <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-02-09/race_wealth.csv')
## Rows: 96 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): type, race
## dbl (2): year, wealth_family
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
retirement <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-02-09/retirement.csv')
## Rows: 30 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): race
## dbl (2): year, retirement
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Graphs:
wealthperrace <- race_wealth %>%
group_by(race, year) %>%
summarise(wealth = mean(wealth_family, na.rm= TRUE))
## `summarise()` has grouped output by 'race'. You can override using the
## `.groups` argument.
wealth_plot <- ggplot(wealthperrace, aes(x = year, y = wealth, color = race)) +
geom_line(aes(linewidth = 1), na.rm = TRUE) +
geom_point(na.rm = TRUE) +
labs(x = "Year", y = "Distribution of wealth per race", title = "Distribution of average family wealth by race (1963-2016)") +
scale_y_continuous(labels = scales::dollar_format(prefix = "$")) +
scale_color_manual(values = c("Black" = "blue", "Hispanic" = "green", "White" = "red")) +
theme_minimal() +
theme(plot.title = element_text(colour = 'black', family = 'Georgia', size = 16, hjust = 0.5),
plot.subtitle = element_text(colour = 'black', family = 'Georgia', size = 18),
axis.title = element_text(colour = 'black', family = 'Georgia', size = 14),
axis.text = element_text(colour = 'black', family = 'Georgia', size = 14),
legend.text = element_text(colour = 'black', family = 'Georgia', size = 16),
legend.position = 'top',
legend.title = element_blank(),
plot.background = element_rect(fill = 'grey60'),
panel.background = element_rect(fill = 'grey60', colour = 'grey60'))
wealth_retirement <- retirement %>%
group_by(race, year) %>%
summarise(wealth_ret = mean(retirement, na.rm= TRUE))
## `summarise()` has grouped output by 'race'. You can override using the
## `.groups` argument.
ret_plot <- ggplot(wealth_retirement, aes(x = year, y = wealth_ret, color = race)) +
geom_line(aes(linewidth = 1), na.rm = TRUE) +
geom_point(na.rm = TRUE) +
labs(x = "Year", y = "Retirement wealth", title = "Distribution of retirement wealth by race (1963-2016)") +
scale_y_continuous(labels = scales::dollar_format(prefix = "$")) +
scale_color_manual(values = c("Black" = "blue", "Hispanic" = "green", "White" = "red")) +
theme_minimal() +
theme(plot.title = element_text(colour = 'black', family = 'Georgia', size = 16, hjust = 0.5),
plot.subtitle = element_text(colour = 'black', family = 'Georgia', size = 18),
axis.title = element_text(colour = 'black', family = 'Georgia', size = 14),
axis.text = element_text(colour = 'black', family = 'Georgia', size = 14),
legend.text = element_text(colour = 'black', family = 'Georgia', size = 16),
legend.position = 'top',
legend.title = element_blank(),
plot.background = element_rect(fill = 'grey60'),
panel.background = element_rect(fill = 'grey60', colour = 'grey60'))
combined_plots <- grid.arrange(wealth_plot, ret_plot, ncol = 2)
print(combined_plots)
## TableGrob (1 x 2) "arrange": 2 grobs
## z cells name grob
## 1 1 (1-1,1-1) arrange gtable[layout]
## 2 2 (1-1,2-2) arrange gtable[layout]