Read Data
This is the notebook developed in class. It shows how genotype
frequencies can be determined using R.
We want to read a dataset with genotype and phenotype
information.
s_gen_phen_data <- "https://charlotte-ngs.github.io/lbgfs2022/data/p1_mrk_one_locus.csv"
library(readr)
tbl_gen_phen <- read_delim(file = s_gen_phen_data,
delim = ";",
col_types = cols(
ID = col_integer(),
Genotype = col_integer(),
Phenotype = col_double()
))
head(tbl_gen_phen)
Descriptive statistics
library(dplyr)
tbl_gen_phen %>%
group_by(Genotype) %>%
summarise(geno_count = n()) %>%
summarise(geno_freq = geno_count / sum(geno_count))
NA
Genotypic Values
Regression of phenotypes on Genotypes using homozygous genotypes
tbl_hom_gen <- tbl_gen_phen %>%
filter(Genotype < 3)
head(tbl_hom_gen)
tbl_hom_gen %>%
group_by(Genotype) %>%
summarise(geno_count = n())
Regression model, with or without intercept? Compute the mean
phenotypic value for all animals with Genotype \(0\)
tbl_hom_gen0 <- tbl_hom_gen %>%
filter(Genotype < 1)
mean(tbl_hom_gen0$Phenotype)
[1] 0.1384745
Plot the data
library(ggplot2)
p_g0 <- ggplot(data = tbl_hom_gen, mapping = aes(x = Genotype, y = Phenotype)) +
geom_point()
p_g0
From the plot and from the mean of the animals with genotype \(0\), it becomes clear that we need to fit a
regression with intercept
lm_geno_val <- lm(Phenotype ~ Genotype, data = tbl_hom_gen)
(sry_geno_val <- summary(lm_geno_val))
Call:
lm(formula = Phenotype ~ Genotype, data = tbl_hom_gen)
Residuals:
Min 1Q Median 3Q Max
-3.1200 -0.6895 0.0200 0.7068 3.8270
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.13847 0.17801 0.778 0.437
Genotype 0.04542 0.08966 0.507 0.613
Residual standard error: 1.007 on 2182 degrees of freedom
Multiple R-squared: 0.0001176, Adjusted R-squared: -0.0003407
F-statistic: 0.2566 on 1 and 2182 DF, p-value: 0.6125
Add the regression line to the plot
p_g0_line <- p_g0 +
geom_abline(slope = sry_geno_val$coefficients["Genotype", "Estimate"],
intercept = sry_geno_val$coefficients["(Intercept)", "Estimate"])
p_g0_line
The genotypic value \(a\) is
(n_geno_a <- sry_geno_val$coefficients["Genotype", "Estimate"])
[1] 0.04541911
The genotypic values for heterozygous animals
tbl_het_gen <- tbl_gen_phen %>%
filter(Genotype > 2)
head(tbl_het_gen)
n_geno_d <- mean(tbl_het_gen$Phenotype) - sry_geno_val$coefficients["(Intercept)", "Estimate"] - n_geno_a
n_geno_d
[1] -0.05784937
tbl_het_gen$Genotype <- 1L
head(tbl_het_gen)
tbl_gen_all <- bind_rows(tbl_hom_gen, tbl_het_gen)
head(tbl_gen_all)
p_g_all <- ggplot(data = tbl_gen_all, mapping = aes(x=Genotype, y=Phenotype))+
geom_point()+
geom_abline(slope = sry_geno_val$coefficients["Genotype", "Estimate"],
intercept = sry_geno_val$coefficients["(Intercept)", "Estimate"])
p_g_all
Now that the genotypic values \(a\)
and \(d\) are computed, we can do the
transformations of the phenotypic values such that the three genotypes
have the correct genotypic value
tbl_trans_gen <- tbl_gen_all
tbl_trans_gen$Phenotype <- tbl_trans_gen$Phenotype - sry_geno_val$coefficients["(Intercept)", "Estimate"] - n_geno_a
To check the result of the transformation, we can compute the means
of the transformed phenotypic values
tbl_trans_gen %>%
group_by(Genotype) %>%
summarise(mean_phen = mean(Phenotype))
LS0tCnRpdGxlOiAiRXN0aW1hdGUgU2luZ2xlIExvY3VzIEJyZWVkaW5nIFZhbHVlcyBmcm9tIERhdGEiCmRhdGU6ICIyMDIyLTEwLTA3IgpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sKLS0tCgpgYGB7ciBzZXR1cCwgaW5jbHVkZT1GQUxTRX0Ka25pdHI6Om9wdHNfY2h1bmskc2V0KGVjaG8gPSBUUlVFKQpgYGAKCiMjIERpc2NsYWltZXIKQnJlZWRpbmcgdmFsdWVzIGFyZSBlc3RpbWF0ZWQgZnJvbSBkYXRhCgoKIyMgUmVhZCBEYXRhClRoaXMgaXMgdGhlIG5vdGVib29rIGRldmVsb3BlZCBpbiBjbGFzcy4gSXQgc2hvd3MgaG93IGdlbm90eXBlIGZyZXF1ZW5jaWVzIGNhbiBiZSBkZXRlcm1pbmVkIHVzaW5nIFIuCgpXZSB3YW50IHRvIHJlYWQgYSBkYXRhc2V0IHdpdGggZ2Vub3R5cGUgYW5kIHBoZW5vdHlwZSBpbmZvcm1hdGlvbi4gCgpgYGB7cn0Kc19nZW5fcGhlbl9kYXRhIDwtICJodHRwczovL2NoYXJsb3R0ZS1uZ3MuZ2l0aHViLmlvL2xiZ2ZzMjAyMi9kYXRhL3AxX21ya19vbmVfbG9jdXMuY3N2IgpsaWJyYXJ5KHJlYWRyKQp0YmxfZ2VuX3BoZW4gPC0gcmVhZF9kZWxpbShmaWxlID0gc19nZW5fcGhlbl9kYXRhLAogICAgICAgICAgICAgICAgICAgICAgICAgICBkZWxpbSA9ICI7IiwKICAgICAgICAgICAgICAgICAgICAgICAgICAgY29sX3R5cGVzID0gY29scygKICAgICAgICAgICAgICAgICAgICAgICAgICAgICBJRCA9IGNvbF9pbnRlZ2VyKCksCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgR2Vub3R5cGUgPSBjb2xfaW50ZWdlcigpLAogICAgICAgICAgICAgICAgICAgICAgICAgICAgIFBoZW5vdHlwZSA9IGNvbF9kb3VibGUoKQogICAgICAgICAgICAgICAgICAgICAgICAgICApKQpoZWFkKHRibF9nZW5fcGhlbikKYGBgCgoKRGVzY3JpcHRpdmUgc3RhdGlzdGljcwoKYGBge3J9CmxpYnJhcnkoZHBseXIpCnRibF9nZW5fcGhlbiAlPiUKICBncm91cF9ieShHZW5vdHlwZSkgJT4lCiAgc3VtbWFyaXNlKGdlbm9fY291bnQgPSBuKCkpICU+JQogIHN1bW1hcmlzZShnZW5vX2ZyZXEgPSBnZW5vX2NvdW50IC8gc3VtKGdlbm9fY291bnQpKQoKYGBgCgoKIyMgR2Vub3R5cGljIFZhbHVlcwpSZWdyZXNzaW9uIG9mIHBoZW5vdHlwZXMgb24gR2Vub3R5cGVzIHVzaW5nIGhvbW96eWdvdXMgZ2Vub3R5cGVzCgpgYGB7cn0KdGJsX2hvbV9nZW4gPC0gdGJsX2dlbl9waGVuICU+JQogIGZpbHRlcihHZW5vdHlwZSA8IDMpCmhlYWQodGJsX2hvbV9nZW4pCmBgYAoKYGBge3J9CnRibF9ob21fZ2VuICU+JQogIGdyb3VwX2J5KEdlbm90eXBlKSAlPiUKICBzdW1tYXJpc2UoZ2Vub19jb3VudCA9IG4oKSkKYGBgCgpSZWdyZXNzaW9uIG1vZGVsLCB3aXRoIG9yIHdpdGhvdXQgaW50ZXJjZXB0PyBDb21wdXRlIHRoZSBtZWFuIHBoZW5vdHlwaWMgdmFsdWUgZm9yIGFsbCBhbmltYWxzIHdpdGggR2Vub3R5cGUgJDAkCgpgYGB7cn0KdGJsX2hvbV9nZW4wIDwtIHRibF9ob21fZ2VuICU+JQogIGZpbHRlcihHZW5vdHlwZSA8IDEpCm1lYW4odGJsX2hvbV9nZW4wJFBoZW5vdHlwZSkKYGBgCgpQbG90IHRoZSBkYXRhCgpgYGB7cn0KbGlicmFyeShnZ3Bsb3QyKQpwX2cwIDwtIGdncGxvdChkYXRhID0gdGJsX2hvbV9nZW4sIG1hcHBpbmcgPSBhZXMoeCA9IEdlbm90eXBlLCB5ID0gUGhlbm90eXBlKSkgKwogIGdlb21fcG9pbnQoKQpwX2cwCmBgYAoKCkZyb20gdGhlIHBsb3QgYW5kIGZyb20gdGhlIG1lYW4gb2YgdGhlIGFuaW1hbHMgd2l0aCBnZW5vdHlwZSAkMCQsIGl0IGJlY29tZXMgY2xlYXIgdGhhdCB3ZSBuZWVkIHRvIGZpdCBhIHJlZ3Jlc3Npb24gd2l0aCBpbnRlcmNlcHQKCmBgYHtyfQpsbV9nZW5vX3ZhbCA8LSBsbShQaGVub3R5cGUgfiBHZW5vdHlwZSwgZGF0YSA9IHRibF9ob21fZ2VuKQooc3J5X2dlbm9fdmFsIDwtIHN1bW1hcnkobG1fZ2Vub192YWwpKQpgYGAKCkFkZCB0aGUgcmVncmVzc2lvbiBsaW5lIHRvIHRoZSBwbG90CgpgYGB7cn0KcF9nMF9saW5lIDwtIHBfZzAgKyAKICBnZW9tX2FibGluZShzbG9wZSA9IHNyeV9nZW5vX3ZhbCRjb2VmZmljaWVudHNbIkdlbm90eXBlIiwgIkVzdGltYXRlIl0sCiAgICAgICAgICAgICAgaW50ZXJjZXB0ID0gc3J5X2dlbm9fdmFsJGNvZWZmaWNpZW50c1siKEludGVyY2VwdCkiLCAiRXN0aW1hdGUiXSkKcF9nMF9saW5lCmBgYAoKVGhlIGdlbm90eXBpYyB2YWx1ZSAkYSQgaXMKCmBgYHtyfQoobl9nZW5vX2EgPC0gc3J5X2dlbm9fdmFsJGNvZWZmaWNpZW50c1siR2Vub3R5cGUiLCAiRXN0aW1hdGUiXSkKYGBgCgpUaGUgZ2Vub3R5cGljIHZhbHVlcyBmb3IgaGV0ZXJvenlnb3VzIGFuaW1hbHMKCmBgYHtyfQp0YmxfaGV0X2dlbiA8LSB0YmxfZ2VuX3BoZW4gJT4lCiAgZmlsdGVyKEdlbm90eXBlID4gMikKaGVhZCh0YmxfaGV0X2dlbikKYGBgCgpgYGB7cn0Kbl9nZW5vX2QgPC0gbWVhbih0YmxfaGV0X2dlbiRQaGVub3R5cGUpIC0gc3J5X2dlbm9fdmFsJGNvZWZmaWNpZW50c1siKEludGVyY2VwdCkiLCAiRXN0aW1hdGUiXSAtIG5fZ2Vub19hCm5fZ2Vub19kCmBgYAoKCmBgYHtyfQp0YmxfaGV0X2dlbiRHZW5vdHlwZSA8LSAxTApoZWFkKHRibF9oZXRfZ2VuKQpgYGAKCmBgYHtyfQp0YmxfZ2VuX2FsbCA8LSBiaW5kX3Jvd3ModGJsX2hvbV9nZW4sIHRibF9oZXRfZ2VuKQpoZWFkKHRibF9nZW5fYWxsKQpgYGAKCmBgYHtyfQpwX2dfYWxsIDwtIGdncGxvdChkYXRhID0gdGJsX2dlbl9hbGwsIG1hcHBpbmcgPSBhZXMoeD1HZW5vdHlwZSwgIHk9UGhlbm90eXBlKSkrCiAgZ2VvbV9wb2ludCgpKyAKICBnZW9tX2FibGluZShzbG9wZSA9IHNyeV9nZW5vX3ZhbCRjb2VmZmljaWVudHNbIkdlbm90eXBlIiwgIkVzdGltYXRlIl0sCiAgICAgICAgICAgICAgaW50ZXJjZXB0ID0gc3J5X2dlbm9fdmFsJGNvZWZmaWNpZW50c1siKEludGVyY2VwdCkiLCAiRXN0aW1hdGUiXSkKcF9nX2FsbAoKYGBgCgoKTm93IHRoYXQgdGhlIGdlbm90eXBpYyB2YWx1ZXMgJGEkIGFuZCAkZCQgYXJlIGNvbXB1dGVkLCB3ZSBjYW4gZG8gdGhlIHRyYW5zZm9ybWF0aW9ucyBvZiB0aGUgcGhlbm90eXBpYyB2YWx1ZXMgc3VjaCB0aGF0IHRoZSB0aHJlZSBnZW5vdHlwZXMgaGF2ZSB0aGUgY29ycmVjdCBnZW5vdHlwaWMgdmFsdWUKCmBgYHtyfQp0YmxfdHJhbnNfZ2VuIDwtIHRibF9nZW5fYWxsCnRibF90cmFuc19nZW4kUGhlbm90eXBlIDwtIHRibF90cmFuc19nZW4kUGhlbm90eXBlIC0gc3J5X2dlbm9fdmFsJGNvZWZmaWNpZW50c1siKEludGVyY2VwdCkiLCAiRXN0aW1hdGUiXSAtIG5fZ2Vub19hCmBgYAoKVG8gY2hlY2sgdGhlIHJlc3VsdCBvZiB0aGUgdHJhbnNmb3JtYXRpb24sIHdlIGNhbiBjb21wdXRlIHRoZSBtZWFucyBvZiB0aGUgdHJhbnNmb3JtZWQgcGhlbm90eXBpYyB2YWx1ZXMKCmBgYHtyfQp0YmxfdHJhbnNfZ2VuICU+JQogIGdyb3VwX2J5KEdlbm90eXBlKSAlPiUKICBzdW1tYXJpc2UobWVhbl9waGVuID0gbWVhbihQaGVub3R5cGUpKQpgYGAKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCg==