UCD Bioinformatics Core Workshop

Home
CLI
Introduction to the Command Line 1
Introduction to the Command Line 2
Introduction to the Command Line 3
Extra
BashCrawl
Challenge & Homework Solutions
Software Installation
Installation using make and cmake
Conda
Introduction to R
Introduction to R Day 1
Introduction to R Day 2
Introduction to R Day 3
Challenge solutions
Alignment
Local Alignment
Python
Intro
Basic Data Types
Flow Control, Functions, Files, Errors
Writing Programs
Biopython and bamnostic/pysam
Next Steps
Bioinformatics File Types
RNA-Seq
Data Reduction
Files and Filetypes
Prepare dataset
Preprocessing raw data
Indexing a Genome
Alignment with Star
Generating counts tables
Data Analysis
Prepare R for data analysis
Annotation from BioMart
Differential Expression Analysis
Pathway Analysis

There is more than one way to solve each of these challenges, and not all solutions for each challenge are provided.

mtcars

# using only functions already seen
mtcars_avg <- colSums(mtcars) / nrow(mtcars)
mtcars2 <- rbind(mtcars, mtcars_avg)
rownames(mtcars2) <- c(rownames(mtcars), "avg")
mtcars2$hp.gt.100 <- mtcars2$hp > 100
mtcars2

# simplified
mtcars3 <- rbind(mtcars, "avg" = colMeans(mtcars))
mtcars3$hp.gt.100 <- mtcars3$hp > 100
mtcars3

iris

table(iris[iris$Sepal.Length > 6, "Species"])

# a data frame with two columns: species and sepal + petal
data.frame(Species = iris$Species,
           Length.Sum = iris$Sepal.Length + iris$Petal.Length)

min max normalization

# make small version of data2 to test formula
data3 <- data2[c(1:3),c(1:3)]
data3
minMaxNormalize <- function(x) {(x - min(x)) / (max(x) - min(x))}
t(apply(data3, 1, minMaxNormalize))

log2-fold change

# add one to all values to avoid taking log of 0 and division by zero
data4 <- t(apply(data3, 1, minMaxNormalize)) + 1
data4
log2FoldChange <- function(a,b) {log2(a / b)}
# for all genes between any two samples
mapply(log2FoldChange, data4[,"C61"], data4[,"C62"])
# for all genes between one sample and all other samples
apply(data4[,-1], 2, function(b) {log2FoldChange(data4[,1], b)})

# or more generally
lfcSample <- function(X, c) {
  lfc = log2FoldChange(X[,c], X[,-c])
  colnames(lfc) = paste("lfc", colnames(X)[c], colnames(lfc), sep = ".")
  lfc}

lfcSample(data4, 1)
# all genes, all sample comparisons
## with a for loop
for (i in 1:ncol(data4)) {
  print(lfcSample(data4, i))
}
## with an apply function
lapply(seq(dim(data4)[2]), function(c) {lfcSample(data4, c)})

Visualization Challenge

# get data and subset
gapminder = read.csv("https://github.com/ucdavis-bioinformatics-training/2023-February-Introduction-to-R-for-Bioinformatics/raw/main/R/gapminder.csv")

gapminder_1982 = gapminder[gapminder$year == 1982,]

# simple scatter plot

plot(gapminder_1982$gdpPercap, gapminder_1982$lifeExp, log="x", xlab="GDP per capita", ylab="Life Expectancy")

# scatter plot with color and legend

mycol <- c(Asia = "tomato", Europe = "chocolate4", Africa = "dodgerblue2", 
  Americas = "darkgoldenrod1", Oceania = "green4"
)
plot(gapminder_1982$gdpPercap, gapminder_1982$lifeExp, log="x", col=mycol[gapminder_1982$continent], xlab="GDP per capita", ylab="Life Expectancy")

legend("bottomright", fill=mycol, legend=names(mycol), cex=0.7)

# scatter plot with color, scaled point size, and legend

mycol <- c(Asia = "tomato", Europe = "chocolate4", Africa = "dodgerblue2", 
  Americas = "darkgoldenrod1", Oceania = "green4"
)

mycex <- function(x, minsize, maxsize){
  xs = sqrt(x)
  x_scaled = (xs - min(xs))/(max(xs) - min(xs))
  return(minsize + x_scaled * (maxsize - minsize))
}

plot(gapminder_1982$gdpPercap, gapminder_1982$lifeExp, log="x", col=mycol[gapminder_1982$continent], cex=mycex(gapminder_1982$pop, 0.2, 10), xlab="GDP per capita", ylab="Life Expectancy")
legend("bottomright", fill=mycol, legend=names(mycol), cex=0.7)

☰ Menu

BIS 180L

mtcars

iris

min max normalization

log2-fold change

Visualization Challenge