R Markdown

Wealth Inequality in United States

Houcheng Li, Nan Xu, Aisha Shigna Nadukkandy

Packages used for the analysis:

library(ggplot2)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ lubridate 1.9.2     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(reshape2)
## 
## Attaching package: 'reshape2'
## 
## The following object is masked from 'package:tidyr':
## 
##     smiths
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     combine
library(dplyr)

Dataset 1:

income_distribution <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-02-09/income_distribution.csv')
## Rows: 2916 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): race, income_bracket
## dbl (7): year, number, income_median, income_med_moe, income_mean, income_me...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
income_distribution
## # A tibble: 2,916 × 9
##     year race    number income_median income_med_moe income_mean income_mean_moe
##    <dbl> <chr>    <dbl>         <dbl>          <dbl>       <dbl>           <dbl>
##  1  2019 All Ra… 1.28e8         68703            904       98088            1042
##  2  2019 All Ra… 1.28e8         68703            904       98088            1042
##  3  2019 All Ra… 1.28e8         68703            904       98088            1042
##  4  2019 All Ra… 1.28e8         68703            904       98088            1042
##  5  2019 All Ra… 1.28e8         68703            904       98088            1042
##  6  2019 All Ra… 1.28e8         68703            904       98088            1042
##  7  2019 All Ra… 1.28e8         68703            904       98088            1042
##  8  2019 All Ra… 1.28e8         68703            904       98088            1042
##  9  2019 All Ra… 1.28e8         68703            904       98088            1042
## 10  2018 All Ra… 1.29e8         64324            704       91652             914
## # ℹ 2,906 more rows
## # ℹ 2 more variables: income_bracket <chr>, income_distribution <dbl>
extract <- income_distribution[income_distribution$race == "All Races",]
data1 <- extract[,c(1,4,6)]
data1 <- data1[!duplicated(data1$year),]
data1$year <- as.factor(data1$year)
data2 <- melt(data1, id.vars = "year")
head(data2)
##   year      variable value
## 1 2019 income_median 68703
## 2 2018 income_median 64324
## 3 2017 income_median 64007
## 4 2016 income_median 62898
## 5 2015 income_median 60987
## 6 2014 income_median 58001
ggplot(data2, aes(x=factor(year), y=value, colour=variable, group=variable)) + 
  geom_line(size=2) +
  theme_minimal() +
  scale_color_manual(values = c("income_median" = "blue", "income_mean" = "red")) +
  labs(x = "Year", y = "Income", title = "Change in income over the years for all races combined") +
  theme(plot.title = element_text(colour = 'black', family = 'Georgia', size = 16, hjust = 0.5),
        plot.subtitle = element_text(colour = 'black', family = 'Georgia', size = 12),
        axis.title = element_text(colour = 'black', family = 'Georgia', size = 16),
        axis.text = element_text(colour = 'black', family = 'Georgia', size = 7, angle = 45),
        legend.text = element_text(colour = 'black', family = 'Georgia', size = 10),
        legend.position = 'top',
        legend.title = element_blank(),
        plot.background = element_rect(fill = 'grey60'),
        panel.background = element_rect(fill = 'grey60', colour = 'grey60'))
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

data3 <- extract[,c(1,8,9)]
colour=c("#DC143C","#0000FF","#20B2AA","#FFA500","#9370DB","#98FB98","#F08080","#1E90FF","#7CFC00","#FFFF00",  
         "#808000","#FF00FF","#FA8072","#7B68EE","#9400D3","#800080","#A0522D","#D2B48C","#D2691E","#87CEEB","#40E0D0","#5F9EA0",
         "#FF1493","#0000CD","#008B8B","#FFE4B5","#8A2BE2","#228B22","#E9967A","#4682B4","#32CD32","#F0E68C","#FFFFE0","#EE82EE",
         "#FF6347","#6A5ACD","#9932CC","#8B008B","#8B4513","#DEB887")
data3$income_bracket <- factor(data3$income_bracket,levels=unique(data3$income_bracket))
head(data3)
## # A tibble: 6 × 3
##    year income_bracket     income_distribution
##   <dbl> <fct>                            <dbl>
## 1  2019 Under $15,000                      9.1
## 2  2019 $15,000 to $24,999                 8  
## 3  2019 $25,000 to $34,999                 8.3
## 4  2019 $35,000 to $49,999                11.7
## 5  2019 $50,000 to $74,999                16.5
## 6  2019 $75,000 to $99,999                12.3
ggplot(data3,aes(x=year, y=income_distribution,fill=factor(income_bracket))) + 
  geom_col(position = "fill", width = 0.6) +
  labs(x = "Year",y = "Bracket") +
  scale_fill_manual(values = colour) +
  labs(x = "Year", y = "Income Distribution", title = "CIncome distribution between different salary groups in US") +
  theme_minimal() +
  theme(plot.title = element_text(colour = 'black', family = 'Georgia', size = 16, hjust = 0.5),
        plot.subtitle = element_text(colour = 'black', family = 'Georgia', size = 12),
        axis.title = element_text(colour = 'black', family = 'Georgia', size = 16),
        axis.text = element_text(colour = 'black', family = 'Georgia', size = 7, angle = 45),
        legend.text = element_text(colour = 'black', family = 'Georgia', size = 6),
        legend.position = 'top',
        legend.title = element_blank(),
        plot.background = element_rect(fill = 'grey60'),
        panel.background = element_rect(fill = 'grey60', colour = 'grey60'))

race_exclusion <- c("Asian Alone or in Combination", "Black Alone or in Combination")

filtered_data_race <- income_distribution %>%
  filter(!race %in% race_exclusion) %>%
  group_by(race, year) %>%
  summarise(MeanIncome = mean(income_mean, na.rm = TRUE))
## `summarise()` has grouped output by 'race'. You can override using the
## `.groups` argument.
custom_colors <- c("red", "blue", "orange", "green", "violet", "yellow")


ggplot(filtered_data_race, aes(x = year, y = MeanIncome, color = race)) +
  geom_line(aes(linewidth = 1.5), na.rm = TRUE) +  
  geom_point() +
  labs(x = "Year", y = "Mean Income", title = "Trends in Mean income over the years (1967 - 2019)") +
  scale_color_manual(values = custom_colors) +
  theme_minimal() +
  theme(plot.title = element_text(colour = 'black', family = 'Georgia', size = 16, hjust = 0.5),
        plot.subtitle = element_text(colour = 'black', family = 'Georgia', size = 12),
        axis.title = element_text(colour = 'black', family = 'Georgia', size = 16),
        axis.text = element_text(colour = 'black', family = 'Georgia', size = 14),
        legend.text = element_text(colour = 'black', family = 'Georgia', size = 10),
        legend.position = 'top',
        legend.title = element_blank(),
        plot.background = element_rect(fill = 'grey60'),
        panel.background = element_rect(fill = 'grey60', colour = 'grey60'))
## Warning: Removed 1 rows containing missing values (`geom_point()`).

Income share among races in US.

###upload data###
income_aggregate <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-02-09/income_aggregate.csv')
## Rows: 1854 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): race, income_quintile
## dbl (3): year, number, income_share
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
###check race count###
count(income_aggregate,race)
## # A tibble: 8 × 2
##   race                              n
##   <chr>                         <int>
## 1 All Races                       318
## 2 Asian Alone                     108
## 3 Asian Alone or in Combination   108
## 4 Black Alone                     318
## 5 Black Alone or in Combination   108
## 6 Hispanic                        288
## 7 White Alone                     318
## 8 White, Not Hispanic             288
###check quintile count###
count(income_aggregate,income_quintile)
## # A tibble: 6 × 2
##   income_quintile     n
##   <chr>           <int>
## 1 Fourth            309
## 2 Highest           309
## 3 Lowest            309
## 4 Second            309
## 5 Third             309
## 6 Top 5%            309
#####pretreatment for this dataset###
df1 <- income_aggregate %>% na.omit() %>% 
  subset(year>=2000 & income_quintile!="Top 5%")
count(df1,income_quintile)
## # A tibble: 5 × 2
##   income_quintile     n
##   <chr>           <int>
## 1 Fourth            154
## 2 Highest           154
## 3 Lowest            154
## 4 Second            154
## 5 Third             154
#####reorder income_quintile###
df1$income_quintile=factor(df1$income_quintile,c("Highest","Second","Third","Forth","Lowest"))
#####ggplot###
ggplot(df1,aes(year,income_share,color=income_quintile))+
  geom_line(linewidth=0.5)+facet_grid(.~race)+
  labs(title = "The income share of different races in 21th century ", xlab="Year",ylab="Income share",color="Classification")+
  scale_x_continuous(breaks = c(2000,2010,2019))+theme_bw()

Is Racial disparities cause wealth disparities in US?

To analyse thsi we used two datssets, that provides data about the family wealth by race/year/measure normalized to 2016, and the amount of money people own after their retirement.

race_wealth <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-02-09/race_wealth.csv')
## Rows: 96 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): type, race
## dbl (2): year, wealth_family
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
retirement <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-02-09/retirement.csv')
## Rows: 30 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): race
## dbl (2): year, retirement
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Graphs:

wealthperrace <- race_wealth %>% 
  group_by(race, year) %>% 
  summarise(wealth = mean(wealth_family, na.rm= TRUE))
## `summarise()` has grouped output by 'race'. You can override using the
## `.groups` argument.
wealth_plot <- ggplot(wealthperrace, aes(x = year, y = wealth, color = race)) +
  geom_line(aes(linewidth = 1), na.rm = TRUE) +
  geom_point(na.rm = TRUE) +
  labs(x = "Year", y = "Distribution of wealth per race", title = "Distribution of average family wealth by race (1963-2016)") +
  scale_y_continuous(labels = scales::dollar_format(prefix = "$")) +
  scale_color_manual(values = c("Black" = "blue", "Hispanic" = "green", "White" = "red")) +
  theme_minimal() +
  theme(plot.title = element_text(colour = 'black', family = 'Georgia', size = 16, hjust = 0.5),
        plot.subtitle = element_text(colour = 'black', family = 'Georgia', size = 18),
        axis.title = element_text(colour = 'black', family = 'Georgia', size = 14),
        axis.text = element_text(colour = 'black', family = 'Georgia', size = 14),
        legend.text = element_text(colour = 'black', family = 'Georgia', size = 16),
        legend.position = 'top',
        legend.title = element_blank(),
        plot.background = element_rect(fill = 'grey60'),
        panel.background = element_rect(fill = 'grey60', colour = 'grey60'))

wealth_retirement <-  retirement %>% 
  group_by(race, year) %>% 
  summarise(wealth_ret = mean(retirement, na.rm= TRUE))
## `summarise()` has grouped output by 'race'. You can override using the
## `.groups` argument.
ret_plot <- ggplot(wealth_retirement, aes(x = year, y = wealth_ret, color = race)) +
  geom_line(aes(linewidth = 1), na.rm = TRUE) +
  geom_point(na.rm = TRUE) +
  labs(x = "Year", y = "Retirement wealth", title = "Distribution of retirement wealth by race (1963-2016)") +
  scale_y_continuous(labels = scales::dollar_format(prefix = "$")) +
  scale_color_manual(values = c("Black" = "blue", "Hispanic" = "green", "White" = "red")) +
  theme_minimal() +
  theme(plot.title = element_text(colour = 'black', family = 'Georgia', size = 16, hjust = 0.5),
        plot.subtitle = element_text(colour = 'black', family = 'Georgia', size = 18),
        axis.title = element_text(colour = 'black', family = 'Georgia', size = 14),
        axis.text = element_text(colour = 'black', family = 'Georgia', size = 14),
        legend.text = element_text(colour = 'black', family = 'Georgia', size = 16),
        legend.position = 'top',
        legend.title = element_blank(),
        plot.background = element_rect(fill = 'grey60'),
        panel.background = element_rect(fill = 'grey60', colour = 'grey60'))

combined_plots <- grid.arrange(wealth_plot, ret_plot, ncol = 2)

print(combined_plots)
## TableGrob (1 x 2) "arrange": 2 grobs
##   z     cells    name           grob
## 1 1 (1-1,1-1) arrange gtable[layout]
## 2 2 (1-1,2-2) arrange gtable[layout]