We chose to investigate 2022 data from Tanya Shapiro about the Eurovision Song Contest: https://github.com/tashapiro/eurovision-contest.
# Packages
library(tidyr)
library(dplyr)
library(ggplot2)
# Set theme for all following graphs
theme_set(theme_bw())
# Read in two datasets and create dataframes
# tuesdata <- tidytuesdayR::tt_load('2022-05-17')
# eurovision <- tuesdata[["eurovision"]]
# votes <- tuesdata[["eurovision-votes"]]
# save(eurovision, votes, file = "/Users/au734169/Documents/01_PhD/Advanced_R/eurovision.rda")
load(file = "/Users/au734169/Documents/01_PhD/Advanced_R/eurovision.rda")
# Explore data
knitr::kable(votes[1:10,])
year | semi_final | edition | jury_or_televoting | from_country | to_country | points | duplicate |
---|---|---|---|---|---|---|---|
1975 | f | 1975f | J | Belgium | Belgium | 0 | x |
1975 | f | 1975f | J | Belgium | Finland | 0 | NA |
1975 | f | 1975f | J | Belgium | France | 2 | NA |
1975 | f | 1975f | J | Belgium | Germany | 0 | NA |
1975 | f | 1975f | J | Belgium | Ireland | 12 | NA |
1975 | f | 1975f | J | Belgium | Israel | 1 | NA |
1975 | f | 1975f | J | Belgium | Italy | 6 | NA |
1975 | f | 1975f | J | Belgium | Luxembourg | 0 | NA |
1975 | f | 1975f | J | Belgium | Malta | 7 | NA |
1975 | f | 1975f | J | Belgium | Monaco | 0 | NA |
knitr::kable(eurovision[1:10,])
# str(votes)
# str(eurovision)
Question: Who won the Eurovision Song Contest each year? Was there always one winner?
# Who are the winners of the contest each year?
winners <- eurovision %>%
filter(winner == TRUE & section %in% c("final", "grand-final")) %>%
select(year, host_country, section, winner, artist, artist_country, total_points) %>%
arrange(year) # %>%
# nrow()
# There are 69 rows in the dataset.
knitr::kable(winners[1:10,])
year | host_country | section | winner | artist | artist_country | total_points |
---|---|---|---|---|---|---|
1956 | Switzerland | final | TRUE | Lys Assia | Switzerland | NA |
1957 | Germany | final | TRUE | Corry Brokken | Netherlands | 31 |
1958 | Netherlands | final | TRUE | André Claveau | France | 27 |
1959 | France | final | TRUE | Teddy Scholten | Netherlands | 21 |
1960 | United Kingdom | final | TRUE | Jacqueline Boyer | France | 32 |
1961 | France | final | TRUE | Jean-Claude Pascal | Luxembourg | 31 |
1962 | Luxembourg | final | TRUE | Isabelle Aubret | France | 26 |
1963 | United Kingdom | final | TRUE | Grethe and Jørgen Ingmann | Denmark | 42 |
1964 | Denmark | final | TRUE | Gigliola Cinquetti | Italy | 49 |
1965 | Italy | final | TRUE | France Gall | Luxembourg | 32 |
# Who are the winning countries?
ggplot(data = winners,
aes(x = forcats::fct_infreq(artist_country))) +
geom_bar() +
theme(axis.text.x = element_text(angle = 90)) +
labs(title = "Number of wins per country",
x = "Country",
y = "Number of wins")
# How many distinct years are present in the dataset?
eurovision %>%
distinct(year) %>%
nrow()
## [1] 67
# There are 67 distinct years in the dataset.
# Are there any years with 0 winners?
winners %>%
distinct(year) %>%
pull(year) %>% # creates a vector from a column
setdiff(c(1956:2022), .) # get all rows in x that are not in y
## [1] 2020
# There was no winner in 2020 because the contest had to be stopped due to COVID-19 :(
# Are there any years with >1 winner?
winners %>%
group_by(year) %>%
filter(n()>1) %>%
knitr::kable(.)
year | host_country | section | winner | artist | artist_country | total_points |
---|---|---|---|---|---|---|
1969 | Spain | final | TRUE | Frida Boccara | France | 18 |
1969 | Spain | final | TRUE | Lenny Kuhr | Netherlands | 18 |
1969 | Spain | final | TRUE | Lulu | United Kingdom | 18 |
1969 | Spain | final | TRUE | Salomé | Spain | 18 |
# Yes, 1969 had 4 winners.
Question: How many points are being distributed each year?
# Calculate the total number of points for each year
points <- eurovision %>%
group_by(year) %>%
filter(section %in% c("final", "grand-final")) %>%
summarize(total_points_year = sum(total_points, na.rm = TRUE))
# Get the number of countries participating by year
countries <- eurovision %>%
group_by(year) %>%
summarize(n_countries_year = n_distinct(artist_country))
# Join datasets
by_year <- full_join(points, countries, by = "year") %>%
mutate(points_per_country = total_points_year/n_countries_year)
# Plot total number of points
ggplot(data = by_year,
aes(x = year,
y = total_points_year,
size = n_countries_year)) +
geom_point() +
labs(title = "Eurovision: Total points to distribute each year",
x = "Year",
y = "Total points",
size = "Participating countries")
ggplot(data = by_year,
aes(x = year,
y = points_per_country,
color = n_countries_year)) +
geom_point() +
labs(title = "Eurovision: Points to distribute per country each year",
x = "Year",
y = "Points to distribute per country",
col = "Participating countries")
# Why were no points assigned in 1956?
eurovision %>%
filter(year == 1956) %>%
select(event, artist, artist_country, section, total_points, rank, winner) %>%
knitr::kable(.)
event | artist | artist_country | section | total_points | rank | winner |
---|---|---|---|---|---|---|
Lugano 1956 | Tonina Torielli | Italy | final | NA | 2 | FALSE |
Lugano 1956 | Michèle Arnaud | Luxembourg | final | NA | 2 | FALSE |
Lugano 1956 | Dany Dauberson | France | final | NA | 2 | FALSE |
Lugano 1956 | Freddy Quinn | Germany | final | NA | 2 | FALSE |
Lugano 1956 | Mony Marc | Belgium | final | NA | 2 | FALSE |
Lugano 1956 | Lys Assia | Switzerland | final | NA | 1 | TRUE |
Lugano 1956 | Corry Brokken | Netherlands | final | NA | 2 | FALSE |
Lugano 1956 | Franca Raimondi | Italy | final | NA | 2 | FALSE |
Lugano 1956 | Michèle Arnaud | Luxembourg | final | NA | 2 | FALSE |
Lugano 1956 | Mathé Altéry | France | final | NA | 2 | FALSE |
Lugano 1956 | Walter Andreas Schwarz | Germany | final | NA | 2 | FALSE |
Lugano 1956 | Fud Leclerc | Belgium | final | NA | 2 | FALSE |
Lugano 1956 | Lys Assia | Switzerland | final | NA | 2 | FALSE |
Lugano 1956 | Jetty Paerl | Netherlands | final | NA | 2 | FALSE |
# There was no points, only rank = 1 for the winner and rank = 2 for the rest.
# Why were no points assigned in 2020?
eurovision %>%
filter(year == 2020) %>%
select(event, artist, artist_country, section, total_points, rank, winner) %>%
knitr::kable(.)
event | artist | artist_country | section | total_points | rank | winner |
---|---|---|---|---|---|---|
Rotterdam 2020 | Ana Soklič | Slovenia | first-semi-final | NA | NA | FALSE |
Rotterdam 2020 | Ulrikke | Norway | first-semi-final | NA | NA | FALSE |
Rotterdam 2020 | Eden Alene | Israel | first-semi-final | NA | NA | FALSE |
Rotterdam 2020 | Sandro | Cyprus | first-semi-final | NA | NA | FALSE |
Rotterdam 2020 | Efendi | Azerbaijan | first-semi-final | NA | NA | FALSE |
Rotterdam 2020 | Go_A | Ukraine | first-semi-final | NA | NA | FALSE |
Rotterdam 2020 | Hooverphonic | Belgium | first-semi-final | NA | NA | FALSE |
Rotterdam 2020 | Little Big | Russia | first-semi-final | NA | NA | FALSE |
Rotterdam 2020 | Vasil | North Macedonia | first-semi-final | NA | NA | FALSE |
Rotterdam 2020 | The Roop | Lithuania | first-semi-final | NA | NA | FALSE |
Rotterdam 2020 | Montaigne | Australia | first-semi-final | NA | NA | FALSE |
Rotterdam 2020 | Lesley Roy | Ireland | first-semi-final | NA | NA | FALSE |
Rotterdam 2020 | Damir Kedžo | Croatia | first-semi-final | NA | NA | FALSE |
Rotterdam 2020 | The Mamas | Sweden | first-semi-final | NA | NA | FALSE |
Rotterdam 2020 | VAL | Belarus | first-semi-final | NA | NA | FALSE |
Rotterdam 2020 | Destiny | Malta | first-semi-final | NA | NA | FALSE |
Rotterdam 2020 | ROXEN | Romania | first-semi-final | NA | NA | FALSE |
Rotterdam 2020 | Benny Cristo | Czech Republic | second-semi-final | NA | NA | FALSE |
Rotterdam 2020 | Natalia Gordienko | Moldova | second-semi-final | NA | NA | FALSE |
Rotterdam 2020 | Stefania | Greece | second-semi-final | NA | NA | FALSE |
Rotterdam 2020 | Elisa | Portugal | second-semi-final | NA | NA | FALSE |
Rotterdam 2020 | Daði og Gagnamagnið | Iceland | second-semi-final | NA | NA | FALSE |
Rotterdam 2020 | Alicja | Poland | second-semi-final | NA | NA | FALSE |
Rotterdam 2020 | Vincent Bueno | Austria | second-semi-final | NA | NA | FALSE |
Rotterdam 2020 | Aksel | Finland | second-semi-final | NA | NA | FALSE |
Rotterdam 2020 | Hurricane | Serbia | second-semi-final | NA | NA | FALSE |
Rotterdam 2020 | Ben & Tan | Denmark | second-semi-final | NA | NA | FALSE |
Rotterdam 2020 | Uku Suviste | Estonia | second-semi-final | NA | NA | FALSE |
Rotterdam 2020 | Athena Manoukian | Armenia | second-semi-final | NA | NA | FALSE |
Rotterdam 2020 | Gjon’s Tears | Switzerland | second-semi-final | NA | NA | FALSE |
Rotterdam 2020 | VICTORIA | Bulgaria | second-semi-final | NA | NA | FALSE |
Rotterdam 2020 | Samanta Tīna | Latvia | second-semi-final | NA | NA | FALSE |
Rotterdam 2020 | Tornike Kipiani | Georgia | second-semi-final | NA | NA | FALSE |
Rotterdam 2020 | Arilena Ara | Albania | second-semi-final | NA | NA | FALSE |
Rotterdam 2020 | Senhit | San Marino | second-semi-final | NA | NA | FALSE |
Rotterdam 2020 | Tom Leeb | France | grand-final | NA | NA | FALSE |
Rotterdam 2020 | Blas Cantó | Spain | grand-final | NA | NA | FALSE |
Rotterdam 2020 | Jeangu Macrooy | Netherlands | grand-final | NA | NA | FALSE |
Rotterdam 2020 | James Newman | United Kingdom | grand-final | NA | NA | FALSE |
Rotterdam 2020 | Ben Dolic | Germany | grand-final | NA | NA | FALSE |
Rotterdam 2020 | Diodato | Italy | grand-final | NA | NA | FALSE |
# Eurovision was cancelled and no points were assigned because of COVID-19 :(
Question: What proportion of the total points distributed that year did the winner of the final get?
# What proportion of points did the winners get?
winners_points <- left_join(winners, by_year, by = "year") %>%
mutate(total_points_prop = round((total_points / total_points_year) * 100, 1))
# Plot results
ggplot(data = winners_points, aes(x=year, y=total_points_prop, fill = artist_country)) +
geom_bar(stat = "identity") +
labs(title = "Percent of points received by Eurovision winner by year",
x = "Year",
y = "Points received by winner (%)",
fill = "Country of winner")
Question: Which countries are sending votes to each other?
# Create from-to variable
votes$from_to <- paste(votes$from_country, votes$to_country)
# Get the sum of votes
sum_votes <- votes %>%
filter(is.na(duplicate)) %>%
select(from_to, points) %>%
group_by(from_to) %>%
summarise(sum_points = sum(points)) %>%
separate(from_to, c("From", "To"))
# Plot flow of points within countries that received at least 250 points
ggplot(data = filter(sum_votes, sum_points > 250),
aes(x = From,
y = To)) +
geom_raster(aes(fill = sum_points)) +
scale_fill_viridis_c(direction = -1) +
labs(title = "Eurovision point distribution over the years 1956 - 2022 (>250 points)",
x = "From country",
y = "To country",
fill = "Total points \n from country \n to country") +
theme(axis.text.x = element_text(angle = 90))
# Plot flow of points within countries that received at least 400 points
ggplot(data = filter(sum_votes, sum_points > 400),
aes(x = From,
y = To)) +
geom_raster(aes(fill = sum_points)) +
scale_fill_viridis_c(direction = -1) +
labs(title = "Eurovision point distribution over the years 1956 - 2022 (>400 points)",
x = "From country",
y = "To country",
fill = "Total points \n from country \n to country") +
theme(axis.text.x = element_text(angle = 90))