Advanced R Course

Baby names (22-03-2022)

Baby Names is a dataset of baby names for the USA and New Zealand. For each year from 1880 to 2017, it holds the number of children of each sex given each name.

# Print a compact overview of babynames: its class, dimensions, and the type
# and first values of every column.
str(babynames)

## spec_tbl_df [1,924,665 × 5] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ year: num [1:1924665] 1880 1880 1880 1880 1880 1880 1880 1880 1880 1880 ...
##  $ sex : chr [1:1924665] "F" "F" "F" "F" ...
##  $ name: chr [1:1924665] "Mary" "Anna" "Emma" "Elizabeth" ...
##  $ n   : num [1:1924665] 7065 2604 2003 1939 1746 ...
##  $ prop: num [1:1924665] 0.0724 0.0267 0.0205 0.0199 0.0179 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   year = col_double(),
##   ..   sex = col_character(),
##   ..   name = col_character(),
##   ..   n = col_double(),
##   ..   prop = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>

I added a new variable holding the initial letter of each name in order to plot the proportion of initial letters.

# Histogram of the number of rows (names) per year bin, in thousands, with the
# rescaled n_world series overlaid as a line.
ggplot(babynames, aes(x = year)) +
  # after_stat(count) is the supported replacement for the deprecated
  # ..count.. notation; x is inherited from the plot-level aes. The number of
  # bins is deliberately left at the stat_bin() default.
  geom_histogram(
    aes(y = after_stat(count) / 1e3),
    fill = "#002546", alpha = 0.3, colour = "#002546"
  ) +
  # NOTE(review): n_world is not created in this section - presumably a column
  # of babynames; the division by 5e7 looks like a rescaling onto the
  # histogram's y range. Confirm both.
  geom_line(aes(x = year, y = n_world / 5e7), inherit.aes = FALSE) +
  labs(
    title = "Total number of names per year",
    caption = "source: tidytuesday - Baby names",
    x = "Year",
    y = "Total number (in thousands)"
  ) +
  theme(
    axis.line.x = element_line(color = "black"),
    axis.text = element_text(size = 18, family = "AUPassata_Rg"),
    plot.title = element_text(size = 25, hjust = 0.5, family = "AUPassata_Bold"),
    text = element_text(size = 20, family = "AUPassata_Rg")
  )

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## # A tibble: 53 × 2
## # Groups:   name [53]
##    name            n
##    <chr>       <int>
##  1 Mary           88
##  2 Helen          30
##  3 Anna           25
##  4 Emma           21
##  5 Dorothy        20
##  6 Jessica        19
##  7 Linda          17
##  8 Jennifer       17
##  9 Barbara        15
## 10 Emily          15
## 11 Ashley         14
## 12 Patricia       13
## 13 Lisa           12
## 14 Betty          10
## 15 Margaret        9
## 16 Amanda          9
## 17 Susan           8
## 18 Michelle        7
## 19 Madison         7
## 20 Isabella        6
## 21 Olivia          6
## 22 Sophia          6
## 23 Elizabeth       5
## 24 Amy             5
## 25 Hannah          5
## 26 Kimberly        4
## 27 Melissa         4
## 28 Deborah         3
## 29 Brittany        3
## 30 Shirley         2
## 31 Ava             2
## 32 Ruth            1
## 33 Debra           1
## 34 Karen           1
## 35 Heather         1
## 36 Sarah           1
## 37 Samantha        1
## 38 Alexis          1
## 39 John           80
## 40 James          80
## 41 Michael        58
## 42 William        49
## 43 Robert         36
## 44 Christopher    24
## 45 Matthew        21
## 46 Jacob          18
## 47 David          16
## 48 Jason           8
## 49 Joshua          5
## 50 Ethan           5
## 51 Noah            5
## 52 Liam            5
## 53 Mason           4

In 2003, Emma was used in the TV show “Friends” as the name for Rachel and Ross’s baby.

# Rows of pop_babynames for the name "Emma", reduced to year, name and
# proportion.
pop_babynames %>%
  filter(name == "Emma") %>%
  select(year, name, prop)

## # A tibble: 21 × 3
## # Groups:   year [21]
##     year name     prop
##    <dbl> <chr>   <dbl>
##  1  1881 Emma  0.0206 
##  2  1882 Emma  0.0199 
##  3  1883 Emma  0.0197 
##  4  1884 Emma  0.0188 
##  5  1885 Emma  0.0192 
##  6  1886 Emma  0.0180 
##  7  2003 Emma  0.0113 
##  8  2004 Emma  0.0107 
##  9  2005 Emma  0.0100 
## 10  2006 Emma  0.00915
## # … with 11 more rows

# Scatter plot of the yearly proportion of babies named "Emma", with a red
# reference line at 2003.
babynames %>%
  filter(name == "Emma") %>%
  ggplot(aes(x = year, y = prop)) +
  geom_point() +
  # A constant position is passed as a parameter, not through aes(): mapping it
  # inside aes() would draw one identical line for every row of the data.
  geom_vline(xintercept = 2003, colour = "red") +
  labs(
    title = "Emma",
    caption = "source: tidytuesday - Baby names",
    x = "Year",
    y = "Popularity"
  ) +
  theme(
    axis.line.x = element_line(color = "black"),
    axis.text = element_text(size = 18, family = "AUPassata_Rg"),
    plot.title = element_text(size = 25, hjust = 0.5, family = "AUPassata_Bold"),
    text = element_text(size = 20, family = "AUPassata_Rg")
  )

In 1964, the film The Americanization of Emily was released in theatres.

# Line plot of the yearly proportion of babies named "Emily", with a red
# reference line at 1964.
babynames %>%
  filter(name == "Emily") %>%
  # babynames has one row per year and sex, so the lines are grouped by sex;
  # without the grouping geom_line() would join the rows of both sexes into a
  # single zig-zag path.
  ggplot(aes(x = year, y = prop, group = sex)) +
  geom_line() +
  # A constant position is passed as a parameter, not through aes(): mapping it
  # inside aes() would draw one identical line for every row of the data.
  geom_vline(xintercept = 1964, colour = "red") +
  labs(
    title = "Emily",
    caption = "source: tidytuesday - Baby names",
    x = "Year",
    y = "Popularity"
  ) +
  theme(
    axis.line.x = element_line(color = "black"),
    axis.text = element_text(size = 18, family = "AUPassata_Rg"),
    plot.title = element_text(size = 25, hjust = 0.5, family = "AUPassata_Bold"),
    text = element_text(size = 20, family = "AUPassata_Rg")
  )

Advanced R Course

Jette Steinbach

2022-09-23

Baby names (22-03-2022)