Estimation of ancestry proportions. Make sure to match summary statistics
using snp_match()
(and to reverse frequencies correspondingly).
snp_ancestry_summary(freq, info_freq_ref, projection, correction)
Vector of frequencies from which to estimate ancestry proportions.
A data frame (or matrix) with the set of frequencies to be used as reference (one population per column).
Matrix of "loadings" for each variant/PC to be used to project allele frequencies.
Coefficients to correct for shrinkage when projecting.
vector of coefficients representing the ancestry proportions.
if (FALSE) {
# GWAS summary statistics for Epilepsy (supposedly in EUR+EAS+AFR)
gz <- runonce::download_file(
"http://www.epigad.org/gwas_ilae2018_16loci/all_epilepsy_METAL.gz",
dir = "tmp-data")
readLines(gz, n = 3)
library(dplyr)
sumstats <- bigreadr::fread2(
gz, select = c("CHR", "BP", "Allele2", "Allele1", "Freq1"),
col.names = c("chr", "pos", "a0", "a1", "freq")
) %>%
mutate_at(3:4, toupper)
# It is a good idea to filter for similar per-variant N (when available..)
all_freq <- bigreadr::fread2(
runonce::download_file("https://figshare.com/ndownloader/files/31620968",
dir = "tmp-data", fname = "ref_freqs.csv.gz"))
projection <- bigreadr::fread2(
runonce::download_file("https://figshare.com/ndownloader/files/31620953",
dir = "tmp-data", fname = "projection.csv.gz"))
matched <- snp_match(
mutate(sumstats, chr = as.integer(chr), beta = 1),
all_freq[1:5],
return_flip_and_rev = TRUE
) %>%
mutate(freq = ifelse(`_REV_`, 1 - freq, freq))
res <- snp_ancestry_summary(
freq = matched$freq,
info_freq_ref = all_freq[matched$`_NUM_ID_`, -(1:5)],
projection = projection[matched$`_NUM_ID_`, -(1:5)],
correction = c(1, 1, 1, 1.008, 1.021, 1.034, 1.052, 1.074, 1.099,
1.123, 1.15, 1.195, 1.256, 1.321, 1.382, 1.443)
)
# Some ancestry groups are very close to each other, and should be merged
group <- colnames(all_freq)[-(1:5)]
group[group %in% c("Scandinavia", "United Kingdom", "Ireland")] <- "Europe (North West)"
group[group %in% c("Europe (South East)", "Europe (North East)")] <- "Europe (East)"
tapply(res, factor(group, unique(group)), sum)
}