First of all, we are going to generate plots to show the composition of different types of variants.
Freebayes produces 4 different types of variants using TYPE= in the info field. The four types are mnp, ins, del, and complex. We can extract the counts of the diffent types using a program called egrep
egrep -c "TYPE=mnp" filtered.vcf
Use the same command to get the number of ins, del and complex types. Verify the sum of the types equals the total number of SNPs in the file.
The number of SNPs, MNPs, INS, DELs, and COMPLEX in the list of filtered variants by Freebayes (QUAL > 60).
slices <- c(8539874, 137816, 407312, 470210, 324538)
labels <- c("SNP", "MNP", "INS", "DEL", "COMPLEX")
pct <- round(slices/sum(slices)*100, digits=2)
labels <- paste(labels, pct)
labels <- paste(labels, "%", sep="")
pie(slices, labels=labels, col=rainbow(length(labels)), main="Different types of variants")
variants <- round(slices/sum(slices), digits=2) # turn the raw counts into proportions
names(variants) <- c("SNP", "MNP", "INS", "DEL", "COMPLEX")
barplot(variants, col="red", main="Different types of variants", xlab="Type of variant", ylab="Percentage")
Now do the same thing but for the snpeff annotations, use the key search phrases in grep “prime_UTR”, “inframe_insertion”, “frameshift”, “intergenic”, “intron”, “missense”, “splice_”, “stop_lost”, “synonymous”, "_gene_variant" [combined upstream/downstream gene variants].
What is a circos plot?
First, install RCircos if haven’t done it already
#source("http://bioconductor.org/biocLite.R")
#biocLite("RCircos")
#biocLite("IdeoViz")
library(IdeoViz)
library(RCircos)
Download cytoband ideogram data from UCSC using package IdeoViz.
ideo <- getIdeo("equCab2")
Set up RCircos core components. One may plot both to the inside and outside of the ideogram track. Today, we are going to only plot tracks in the inside of the ideogram.
chr.exclude <- NULL
cyto.info <- ideo
tracks.inside <- 10
tracks.outside <- 0
RCircos.Set.Core.Components(cyto.info, chr.exclude, tracks.inside, tracks.outside)
##
## RCircos.Core.Components initialized.
## Type ?RCircos.Reset.Plot.Parameters to see how to modify the core components.
# plot ideogram
RCircos.Set.Plot.Area()
par(mai=c(0.25, 0.25, 0.25, 0.25));
plot.new();
plot.window(c(-2.5,2.5), c(-2.5, 2.5));
RCircos.Chromosome.Ideogram.Plot()
# plot gene labels, there are limits setup by default for the number of genes that can be plotted for each chromosome based on the size of the chromosomes. The parameter can be changed if necessary. However, plotting too many genes will have the risk of having gene symbols overlapping one another.
# First download the gene annotation files 'ensGene.txt' and 'ensemblToGeneName.txt' from UCSC http://hgdownload.cse.ucsc.edu/goldenPath/equCab2/database/
genes <- read.table(file="https://raw.githubusercontent.com/ucdavis-bioinformatics-training/2018-August-Variant-Analysis-Workshop/master/friday/Variant-Analysis-by-R/files/ensGene.txt", sep="\t", header=F, stringsAsFactors=F)
anno <- read.table(file="https://raw.githubusercontent.com/ucdavis-bioinformatics-training/2018-August-Variant-Analysis-Workshop/master/friday/Variant-Analysis-by-R/files/ensemblToGeneName.txt", sep="\t", header=F, stringsAsFactors=F)
idx <- match(genes$V2, anno$V1)
genes$anno <- unlist(lapply(1:dim(genes)[1], function(x){if (is.na(idx[x]) == "TRUE") {genes$V2[x]} else {anno$V2[idx[x]]}}))
# write out the gene label anntation file
write.table(genes[,c("V3","V5","V6","anno")], file="gene.label", sep="\t", row.names=F, col.names=F, quote=F)
gene.label.data <- read.table(file="https://raw.githubusercontent.com/ucdavis-bioinformatics-training/2018-August-Variant-Analysis-Workshop/master/friday/Variant-Analysis-by-R/files/gene.label", sep="\t", header=F)
# The format of the gene information:
head(gene.label.data)
## V1 V2 V3 V4
## 1 chr1 11192 15975 SYCE1
## 2 chr1 29518 67141 ENSECAT00000018774.1
## 3 chr1 29524 71177 ENSECAT00000018811.1
## 4 chr1 132226 143850 CYP2E1
## 5 chr1 171785 172706 ENSECAT00000004254.1
## 6 chr1 183830 184742 ENSECAT00000004598.1
# The column names of gene information should be set as following.
colnames(gene.label.data) <- c("Chromosome", "chromStart", "chromEnd", "Gene")
# Create a subset of gene list. first remove genes that do not have gene symbols, because ENSEMBL IDs are too long in character and will use up too much space for demonstration purpose. The list of genes would be the genes of interest. In the class, we will randomly select 30 genes.
tmp.list <- gene.label.data[-grep("ENSECAT", gene.label.data$Gene),]
idx <- sample(1:dim(tmp.list)[1], 30, replace=F)
gene.list <- tmp.list[idx,]
name.col <- 4
side <- "in"
track.num <- 1
RCircos.Gene.Connector.Plot(gene.list, track.num, side)
## There is unsupported chromosome names in ideogram
## and chromosomes are sorted in alphabetical order.
track.num <- 2
RCircos.Gene.Name.Plot(gene.list, name.col, track.num, side)
## There is unsupported chromosome names in ideogram
## and chromosomes are sorted in alphabetical order.
# It's possible that some genes are not plotted because of the limit of plotting for each chromosome.
# check plot parameters, if needed, they may be changed.
RCircos.Get.Gene.Name.Plot.Parameters()
## chromosomes maxLabels startLoc endLoc labelWidth
## 1 chr1 12 1 6195 500
## 2 chr2 8 6495 10523 500
## 3 chr3 7 10823 14806 500
## 4 chr4 7 15106 18725 500
## 5 chr5 6 19025 22348 500
## 6 chr6 5 22648 25471 500
## 7 chr7 6 25771 29056 500
## 8 chr8 6 29356 32491 500
## 9 chr9 5 32791 35577 500
## 10 chrM 0 35877 35877 500
## 11 chrX 8 36177 40315 500
## 12 chr10 5 40615 43414 500
## 13 chr11 4 43714 45758 500
## 14 chr12 2 46058 47161 500
## 15 chr13 2 47461 48880 500
## 16 chr14 6 49180 52310 500
## 17 chr15 6 52610 55662 500
## 18 chr16 5 55962 58875 500
## 19 chr17 5 59175 61866 500
## 20 chr18 5 62166 64917 500
## 21 chr19 3 65217 67217 500
## 22 chr20 4 67517 69655 500
## 23 chr21 3 69955 71880 500
## 24 chr22 3 72180 73844 500
## 25 chr23 3 74144 76002 500
## 26 chr24 3 76302 77860 500
## 27 chr25 2 78160 79478 500
## 28 chr26 2 79778 81174 500
## 29 chr27 2 81474 82806 500
## 30 chr28 3 83106 84645 500
## 31 chr29 2 84945 86067 500
## 32 chr30 2 86367 87370 500
## 33 chr31 1 87670 88502 500
## 34 chrUn 7 88802 92718 500
# add extra tracks -- Histogram plot of CNV data. Any extra track will depend on the types of data one wants to include in the plot. The required information is the location of what will be plotted in the genome: chromosome number, start and stop position.
gene.list$CNV <- floor(runif(n=30, min=1, max=7))
data.col <- 5 # the CNV column
track.num <- 5
side <- "in"
RCircos.Histogram.Plot(gene.list, data.col, track.num, side, is.sorted=FALSE, min.value=-2)
# add extra tracks -- Scatter plot of RNASeq results
gene.expr <- gene.list
colnames(gene.expr)
## [1] "Chromosome" "chromStart" "chromEnd" "Gene" "CNV"
colnames(gene.expr) <- c("chromosome", "start", "stop", "gene.name", "CNV")
# generate random logFC data of 30 in length between -3 and 3
gene.expr$logFC <- runif(30, -3, 3)
# first three columns are required for plotting and one more column that corresponds to the data to plot
head(gene.expr)
## chromosome start stop gene.name CNV logFC
## 17806 chr28 7861299 8045131 PTPRQ 2 -1.8363523
## 2350 chr1 183656393 183657423 MGAT2 3 2.7323136
## 11608 chr2 1400343 1436278 MYSM1 1 1.8502918
## 22042 chr5 41683609 41688935 TTC24 6 0.6990219
## 13799 chr20 40903703 40935396 FOXP4 4 -2.8410517
## 27468 chrUn 2034526 2037248 MRPL40 1 -1.7550689
data.col <- 6
track.num <- 6
side <- "in"
# "by.fold" is a zero or positive number. If it's positive, then any data point with a value >= by.fold will be plotted as red color; any data point with a value <= -by.fold will be plotted as blue color; otherwise, data point will be plotted in black color.
by.fold <- 1.5
# plot scatter plot
RCircos.Scatter.Plot(gene.expr, data.col, track.num, side, by.fold, is.sorted=FALSE)
# add extra tracks -- Line plot of Coverage results
genome.cov <- gene.label.data[-grep("ENSECAT", gene.label.data$Gene),]
colnames(genome.cov) <- c("chromosome", "start", "stop", "gene.name")
genome.cov$logCOV <- rnorm(dim(genome.cov)[1], 0, 0.9)
data.col <- 5
track.num <- 7
side <- "in"
RCircos.Line.Plot(genome.cov, data.col, track.num, side, is.sorted=FALSE)
# add most inside track -- link lines and ribbons.
# generate random translocation variants data
link.data <- data.frame(Chromosome=character(), chromStart=integer(), chromEnd=integer(), Chromosome.1=character(), chromStart.1=integer(), chromEnd.1=integer(), stringsAsFactors=F)
for (i in 1:15) {
n.rand <- floor(runif(1, 1,34))
chrom <- ideo$chrom[n.rand]
str <- floor(runif(1, ideo$chromStart[n.rand], ideo$chromEnd[n.rand]))
ed <- floor(runif(1, ideo$chromStart[n.rand], ideo$chromEnd[n.rand]))
n.rand <- floor(runif(1, 1, 34))
chrom.1 <- ideo$chrom[n.rand]
str.1 <- floor(runif(1, ideo$chromStart[n.rand], ideo$chromEnd[n.rand]))
ed.1 <- str.1
if (ed < str) {
tmp <- ed
ed <- str
str <- tmp
}
link.data <- rbind(link.data, data.frame(Chromsome=chrom, chromStart=str, chromEnd=ed, Chromosome.1=chrom.1, chromStart.1=str.1, chromEnd.1=ed.1))
i <- i + 1
}
track.num <- 9
# plot link lines
RCircos.Link.Plot(link.data, track.num, TRUE, is.sorted=FALSE)
# ribbon data
ribbon.data <- link.data
colnames(ribbon.data) <- c("chromA", "chromStartA", "chromEndA", "chromB", "chromStartB", "chromEndB")
RCircos.Ribbon.Plot(ribbon.data=ribbon.data, track.num=9, by.chromosome=FALSE, twist=FALSE, is.sorted=FALSE)
Install R package “qqman” if it hasn’t been done. The develop version of the package can be installed from the source on github.
#biocLite("devtools")
library(devtools)
install_github("stephenturner/qqman")
library(qqman)
# using the example data from qqman package
head(gwasResults)
## SNP CHR BP P
## 1 rs1 1 1 0.9148060
## 2 rs2 1 2 0.9370754
## 3 rs3 1 3 0.2861395
## 4 rs4 1 4 0.8304476
## 5 rs5 1 5 0.6417455
## 6 rs6 1 6 0.5190959
manhattan(gwasResults)
# use colors for chromosomes
manhattan(gwasResults, col=c("red", "blue", "green"))
# change default horizontal line position
manhattan(gwasResults, suggestiveline=-log10(1e-6), genomewideline=-log10(1e-8), col=c("red", "blue", "green"))