Home
Introduction and Lectures
Intro to the Workshop and Core
What is Bioinformatics?
Experimental Design and Cost Estimation
Introduction to Command-Line and the Cluster
Logging in and Transferring Files
Intro to Command-Line
Advanced Command-Line (extra)
Running jobs on the Cluster and using modules
Intro to R and Rstudio
Getting Started
Intro to R
Prepare Data in R (extra)
Data in R (extra)
dbcAmplicons
dbcAmplicons Installing Software
dbcAmplicons - Amplicons talk
dbcAmplicons - Bioinformatics talk
Dataset and Metadata
dbcAmplicons - Data processing
dbcAmplicons w/Dada2
Coming soon
Microbial Community Analysis in R
Prepare MCA Analysis
MCA Analysis in phyloseq
Support
Cheat Sheets
Software and Links
Scripts
ETC
Closing thoughts
Workshop Photos
Github
Biocore website

Using the Phyloseq package

The phyloseq package is fast becoming a good way a managing micobial community data, filtering and visualizing that data and performing analysis such as ordination. Along with the standard R environment and packages vegan and vegetarian you can perform virtually any analysis. Today we will

Load data straight from dbcAmplicons (biom file)
Filter out Phylum
Filter out additional Taxa
Filter out samples
Graphical Summaries
Ordination
Differential Abundances

Load our libraries

library(phyloseq)
library(biomformat)
library(ggplot2)
library(gridExtra)
library(vegan)

## Loading required package: permute

## Loading required package: lattice

## This is vegan 2.5-6

library(edgeR)

## Loading required package: limma

Read in the dataset, biom file generated from dbcAmplicons pipeline

First read in the dataset, see what the objects look like. Our Biom file, produces 3 tables: otu_table, taxa_table, sample_data. Look at the head of each. Get the sample names and tax ranks, finally view the phyloseq object. Lets draw a first bar plot.

s16sV3V5 = import_biom(BIOMfilename = "16sV3V5.biom", parseFunction = parse_taxonomy_default)

# this changes the columns names to kingdon through genus
colnames(tax_table(s16sV3V5)) <- c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus")

head(otu_table(s16sV3V5))

## OTU Table:          [6 taxa and 48 samples]
##                      taxa are rows
##            sample1 sample10 sample11 sample12 sample13 sample14 sample15
## Taxa_00000    2144     2303     3235     2895     2584     2465     2907
## Taxa_00001       0        0        0        0        0        0        0
## Taxa_00002      29       18       40       36       33       42       37
## Taxa_00003      14       18       16       16       36       21       32
## Taxa_00004     142      138      214      136       88      132      148
## Taxa_00005       3       14       15       16        7       10        7
##            sample16 sample17 sample18 sample19 sample2 sample20 sample21
## Taxa_00000     2651     2751     3022     2904    3034     2547     2908
## Taxa_00001        0        0        0        0       0        0        0
## Taxa_00002       35       42       25       40      19       43       34
## Taxa_00003       23       19       12       16      37       25       13
## Taxa_00004      140      141      131      109     140      227      227
## Taxa_00005        6        6        4        8      12        3        9
##            sample22 sample23 sample24 sample25 sample26 sample27 sample28
## Taxa_00000     1946     2790     2146     2291     2046     2283     2292
## Taxa_00001        0        0        0        0        0        0        0
## Taxa_00002       24       32       33       23       23       21       26
## Taxa_00003       14       15       20       15       16       11       20
## Taxa_00004       90      161      105      135      104       93       99
## Taxa_00005       10        3        3       10        3       13        4
##            sample29 sample3 sample30 sample31 sample32 sample33 sample34
## Taxa_00000     2349    2483     2126     2336     2576     2478     1909
## Taxa_00001        0       0        0        0        0        0        0
## Taxa_00002       34      22       27       33       33       26       28
## Taxa_00003       27      21       14       18       23       12       16
## Taxa_00004      137     163      188      107      200       99      176
## Taxa_00005        1       5        7        8       12        6        5
##            sample35 sample36 sample37 sample38 sample39 sample4 sample40
## Taxa_00000     2666     2672     2605     2403     2552    2795     2073
## Taxa_00001        0        0        0        0        0       0        0
## Taxa_00002       36       28       19       34       28      39       28
## Taxa_00003       30       28       19       15       31      17       22
## Taxa_00004      319      202      271      203      170     133       89
## Taxa_00005        2        8        3        0        6       7        8
##            sample41 sample42 sample43 sample44 sample45 sample46 sample47
## Taxa_00000     2519     3647     3352     3559     3270     2337     2835
## Taxa_00001        0        0        0        0        0        0        0
## Taxa_00002       30       44       34       41       29       28       21
## Taxa_00003       14       39       21       28       31       17       28
## Taxa_00004      160      212      222      284      236      178      187
## Taxa_00005        2       10        8        4        5       10        8
##            sample48 sample5 sample6 sample7 sample8 sample9
## Taxa_00000     2258    2214    3315    3078    2848    3107
## Taxa_00001        0       0       0       1       0       0
## Taxa_00002       29      28      27      35      32      35
## Taxa_00003       25      26      33      27      34      24
## Taxa_00004      146     195     185     399     310     241
## Taxa_00005        6       3      12       4       3       2

head(sample_data(s16sV3V5))

##           primers Replicate   Treatment Timepoint
## sample1  16S_V3V5         4 ABC_Control        T1
## sample10 16S_V3V5         1     Control        T2
## sample11 16S_V3V5         2     Control        T2
## sample12 16S_V3V5         3     Control        T2
## sample13 16S_V3V5         4     Control        T2
## sample14 16S_V3V5         1  Condition1        T2

head(tax_table(s16sV3V5))

## Taxonomy Table:     [6 taxa by 6 taxonomic ranks]:
##            Kingdom       Phylum            
## Taxa_00000 "d__Bacteria" NA                
## Taxa_00001 "d__Bacteria" "p__Acetothermia" 
## Taxa_00002 "d__Bacteria" "p__Acidobacteria"
## Taxa_00003 "d__Bacteria" "p__Acidobacteria"
## Taxa_00004 "d__Bacteria" "p__Acidobacteria"
## Taxa_00005 "d__Bacteria" "p__Acidobacteria"
##            Class                                  
## Taxa_00000 NA                                     
## Taxa_00001 "c__Acetothermia_genera_incertae_sedis"
## Taxa_00002 NA                                     
## Taxa_00003 "c__Acidobacteria_Gp1"                 
## Taxa_00004 "c__Acidobacteria_Gp10"                
## Taxa_00005 "c__Acidobacteria_Gp11"                
##            Order                                  
## Taxa_00000 NA                                     
## Taxa_00001 "o__Acetothermia_genera_incertae_sedis"
## Taxa_00002 NA                                     
## Taxa_00003 NA                                     
## Taxa_00004 "o__Gp10"                              
## Taxa_00005 "o__Gp11"                              
##            Family                                 
## Taxa_00000 NA                                     
## Taxa_00001 "f__Acetothermia_genera_incertae_sedis"
## Taxa_00002 NA                                     
## Taxa_00003 NA                                     
## Taxa_00004 "f__Gp10"                              
## Taxa_00005 "f__Gp11"                              
##            Genus                                  
## Taxa_00000 NA                                     
## Taxa_00001 "g__Acetothermia_genera_incertae_sedis"
## Taxa_00002 NA                                     
## Taxa_00003 NA                                     
## Taxa_00004 "g__Gp10"                              
## Taxa_00005 "g__Gp11"

rank_names(s16sV3V5)

## [1] "Kingdom" "Phylum"  "Class"   "Order"   "Family"  "Genus"

sample_variables(s16sV3V5)

## [1] "primers"   "Replicate" "Treatment" "Timepoint"

s16sV3V5

## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 1344 taxa and 48 samples ]
## sample_data() Sample Data:       [ 48 samples by 4 sample variables ]
## tax_table()   Taxonomy Table:    [ 1344 taxa by 6 taxonomic ranks ]

plot_bar(s16sV3V5, fill = "Phylum") + theme(legend.position="bottom" ) +  scale_fill_manual(values = rainbow(length(unique(tax_table(s16sV3V5)[,"Phylum"]))-1))

Filtering our dataset

First lets remove of the feature with ambiguous phylum annotation.

s16sV3V5 <- subset_taxa(s16sV3V5, !is.na(Phylum) & !Phylum %in% c("", "uncharacterized"))
s16sV3V5

## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 1342 taxa and 48 samples ]
## sample_data() Sample Data:       [ 48 samples by 4 sample variables ]
## tax_table()   Taxonomy Table:    [ 1342 taxa by 6 taxonomic ranks ]

Lets generate a prevelance table (number of samples each taxa occurs in) for each taxa.

prevelancedf = apply(X = otu_table(s16sV3V5),
                 MARGIN = 1,
                 FUN = function(x){sum(x > 0)})
# Add taxonomy and total read counts to this data.frame
prevelancedf = data.frame(Prevalence = prevelancedf,
                      TotalAbundance = taxa_sums(s16sV3V5),
                      tax_table(s16sV3V5))
colnames(prevelancedf) <- c("Prevalence", "TotalAbundance", colnames(tax_table(s16sV3V5)))

prevelancedf[1:10,]

##            Prevalence TotalAbundance     Kingdom           Phylum
## Taxa_00001          1              1 d__Bacteria  p__Acetothermia
## Taxa_00002         48           1483 d__Bacteria p__Acidobacteria
## Taxa_00003         48           1049 d__Bacteria p__Acidobacteria
## Taxa_00004         48           8312 d__Bacteria p__Acidobacteria
## Taxa_00005         47            321 d__Bacteria p__Acidobacteria
## Taxa_00006         23             34 d__Bacteria p__Acidobacteria
## Taxa_00007          8             13 d__Bacteria p__Acidobacteria
## Taxa_00008         48           1356 d__Bacteria p__Acidobacteria
## Taxa_00009         48           7213 d__Bacteria p__Acidobacteria
## Taxa_00010         48           2285 d__Bacteria p__Acidobacteria
##                                            Class
## Taxa_00001 c__Acetothermia_genera_incertae_sedis
## Taxa_00002                                  <NA>
## Taxa_00003                  c__Acidobacteria_Gp1
## Taxa_00004                 c__Acidobacteria_Gp10
## Taxa_00005                 c__Acidobacteria_Gp11
## Taxa_00006                 c__Acidobacteria_Gp12
## Taxa_00007                 c__Acidobacteria_Gp13
## Taxa_00008                 c__Acidobacteria_Gp15
## Taxa_00009                 c__Acidobacteria_Gp16
## Taxa_00010                 c__Acidobacteria_Gp17
##                                            Order
## Taxa_00001 o__Acetothermia_genera_incertae_sedis
## Taxa_00002                                  <NA>
## Taxa_00003                                  <NA>
## Taxa_00004                               o__Gp10
## Taxa_00005                               o__Gp11
## Taxa_00006                               o__Gp12
## Taxa_00007                               o__Gp13
## Taxa_00008                               o__Gp15
## Taxa_00009                               o__Gp16
## Taxa_00010                               o__Gp17
##                                           Family
## Taxa_00001 f__Acetothermia_genera_incertae_sedis
## Taxa_00002                                  <NA>
## Taxa_00003                                  <NA>
## Taxa_00004                               f__Gp10
## Taxa_00005                               f__Gp11
## Taxa_00006                               f__Gp12
## Taxa_00007                               f__Gp13
## Taxa_00008                               f__Gp15
## Taxa_00009                               f__Gp16
## Taxa_00010                               f__Gp17
##                                            Genus
## Taxa_00001 g__Acetothermia_genera_incertae_sedis
## Taxa_00002                                  <NA>
## Taxa_00003                                  <NA>
## Taxa_00004                               g__Gp10
## Taxa_00005                               g__Gp11
## Taxa_00006                               g__Gp12
## Taxa_00007                               g__Gp13
## Taxa_00008                               g__Gp15
## Taxa_00009                               g__Gp16
## Taxa_00010                               g__Gp17

Whole phylum filtering

Now lets investigate low prevelance/abundance phylum and subset them out.

summary_prevalence <- plyr::ddply(prevelancedf, "Phylum", function(df1){
  data.frame(mean_prevalence=mean(df1$Prevalence),total_abundance=sum(df1$TotalAbundance,na.rm = T),stringsAsFactors = F)
})
summary_prevalence

##                            Phylum mean_prevalence total_abundance
## 1                 p__Acetothermia         1.00000               1
## 2                p__Acidobacteria        35.72973          521716
## 3               p__Actinobacteria        23.36910          171632
## 4                    p__Aquificae        15.00000              72
## 5              p__Armatimonadetes        33.57143            2966
## 6                 p__Atribacteria         8.00000              10
## 7                p__Bacteroidetes        24.49618          451903
## 8                         p__BRC1        48.00000             451
## 9     p__candidate division WPS-1        48.00000           40707
## 10    p__Candidatus Calescamantes         1.00000               1
## 11 p__Candidatus Saccharibacteria        48.00000           14731
## 12                  p__Chlamydiae        20.50000             514
## 13                 p__Chloroflexi        33.51724           54899
## 14   p__Cyanobacteria/Chloroplast        23.84615            3738
## 15         p__Deinococcus-Thermus         3.75000              19
## 16               p__Elusimicrobia        25.33333             208
## 17               p__Fibrobacteres        25.00000             496
## 18                  p__Firmicutes        13.02542           67371
## 19                p__Fusobacteria         1.00000               2
## 20            p__Gemmatimonadetes        48.00000          174765
## 21             p__Hydrogenedentes        33.00000              90
## 22             p__Ignavibacteriae        31.33333             447
## 23             p__Latescibacteria        48.00000             471
## 24                 p__Nitrospirae        25.00000           25979
## 25               p__Parcubacteria        48.00000             477
## 26              p__Planctomycetes        37.86364           19036
## 27                p__Poribacteria         9.00000              13
## 28              p__Proteobacteria        25.16216         1090361
## 29                p__Spirochaetes        14.12500             424
## 30                         p__SR1        14.00000              24
## 31               p__Synergistetes         3.25000              13
## 32                 p__Tenericutes         7.50000              57
## 33       p__Thermodesulfobacteria         9.00000              14
## 34                 p__Thermotogae         1.00000               2
## 35             p__Verrucomicrobia        33.40909           43057

Using the table above, determine the phyla to filter

sum(summary_prevalence$total_abundance)*0.001

## [1] 2686.667

table(summary_prevalence$total_abundance/sum(summary_prevalence$total_abundance) >= 0.001)

## 
## FALSE  TRUE 
##    21    14

keepPhyla <- summary_prevalence$Phylum[summary_prevalence$total_abundance/sum(summary_prevalence$total_abundance) >= 0.001]

s16sV3V5.1 = subset_taxa(s16sV3V5, Phylum %in% keepPhyla)
summary_prevalence <- summary_prevalence[summary_prevalence$Phylum %in% keepPhyla,]
summary_prevalence

##                            Phylum mean_prevalence total_abundance
## 2                p__Acidobacteria        35.72973          521716
## 3               p__Actinobacteria        23.36910          171632
## 5              p__Armatimonadetes        33.57143            2966
## 7                p__Bacteroidetes        24.49618          451903
## 9     p__candidate division WPS-1        48.00000           40707
## 11 p__Candidatus Saccharibacteria        48.00000           14731
## 13                 p__Chloroflexi        33.51724           54899
## 14   p__Cyanobacteria/Chloroplast        23.84615            3738
## 18                  p__Firmicutes        13.02542           67371
## 20            p__Gemmatimonadetes        48.00000          174765
## 24                 p__Nitrospirae        25.00000           25979
## 26              p__Planctomycetes        37.86364           19036
## 28              p__Proteobacteria        25.16216         1090361
## 35             p__Verrucomicrobia        33.40909           43057

s16sV3V5.1

## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 1290 taxa and 48 samples ]
## sample_data() Sample Data:       [ 48 samples by 4 sample variables ]
## tax_table()   Taxonomy Table:    [ 1290 taxa by 6 taxonomic ranks ]

Individual Taxa Filtering

Subset to the remaining phyla by prevelance.

prevelancedf1 = subset(prevelancedf, Phylum %in% get_taxa_unique(s16sV3V5.1, taxonomic.rank = "Phylum"))
ggplot(prevelancedf1, aes(TotalAbundance,Prevalence / nsamples(s16sV3V5.1),color=Phylum)) +
  # Include a guess for parameter
  geom_hline(yintercept = 0.10, alpha = 0.5, linetype = 2) + geom_point(size = 2, alpha = 0.7) +
  scale_x_log10() +  xlab("Total Abundance") + ylab("Prevalence [Frac. Samples]") +
  facet_wrap(~Phylum) + theme(legend.position="none")

Sometimes you see a clear break, however we aren’t seeing one here. In this case I’m moslty interested in those organisms consistantly present in the dataset, so I’m removing all taxa present in less than 50% of samples.

#  Define prevalence threshold as 10% of total samples ~ set of replicates
prevalenceThreshold = 0.10 * nsamples(s16sV3V5.1)
prevalenceThreshold

## [1] 4.8

# Execute prevalence filter, using `prune_taxa()` function
keepTaxa = rownames(prevelancedf1)[(prevelancedf1$Prevalence >= prevalenceThreshold)]
length(keepTaxa)

## [1] 932

s16sV3V5.2 = prune_taxa(keepTaxa, s16sV3V5.1)
s16sV3V5.2

## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 932 taxa and 48 samples ]
## sample_data() Sample Data:       [ 48 samples by 4 sample variables ]
## tax_table()   Taxonomy Table:    [ 932 taxa by 6 taxonomic ranks ]

Agglomerate taxa at the Genus level (combine all with the same name) keeping all taxa without genus level assignment

length(get_taxa_unique(s16sV3V5.2, taxonomic.rank = "Genus"))

## [1] 759

s16sV3V5.3 = tax_glom(s16sV3V5.2, "Genus", NArm = FALSE)
s16sV3V5.3

## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 932 taxa and 48 samples ]
## sample_data() Sample Data:       [ 48 samples by 4 sample variables ]
## tax_table()   Taxonomy Table:    [ 932 taxa by 6 taxonomic ranks ]

## out of curiosity how many "reads" does this leave us at???
sum(colSums(otu_table(s16sV3V5.3)))

## [1] 2682008

Now lets filter out samples (outliers and low performing samples)

Do some simple ordination looking for outlier samples, first we variance stabilize the data with a log transform, the perform PCoA using bray’s distances

logt  = transform_sample_counts(s16sV3V5.3, function(x) log(1 + x) )
out.pcoa.logt <- ordinate(logt, method = "MDS", distance = "bray")
evals <- out.pcoa.logt$values$Eigenvalues
plot_ordination(logt, out.pcoa.logt, type = "samples",
                color = "Treatment", shape = "Timepoint") + labs(col = "Treatment") +
                coord_fixed(sqrt(evals[2] / evals[1]))

You could also use the MDS method of ordination here, edit the code to do so. Can also edit the distance method used to jaccard, jsd, euclidean. Play with changing those parameters

#Can view the distance method options with
?distanceMethodList

# can veiw the oridinate methods with
?ordinate

Show taxa proportions per sample (quickplot)

grid.arrange(nrow = 3,
qplot(as(otu_table(logt),"matrix")[, "sample1"], geom = "histogram", bins=30) +
  xlab("Relative abundance"),

qplot(as(otu_table(logt),"matrix")[, "sample34"], geom = "histogram", bins=30) +
  xlab("Relative abundance"),

qplot(as(otu_table(logt),"matrix")[, "sample44"], geom = "histogram", bins=30) +
  xlab("Relative abundance")
)

# if you needed to remove candidate outliers, can use the below to remove sample Slashpile18
#s16sV3V5.pruned <- prune_samples(sample_names(s16sV3V5.3) != c("sample1","sample2"), s16sV3V5.3)

Look for low perfroming samples

qplot(colSums(otu_table(s16sV3V5.3)),bins=30) + xlab("Logged counts-per-sample")

s16sV3V5.4 <- prune_samples(sample_sums(s16sV3V5.3)>=10000, s16sV3V5.3)
s16sV3V5.4

## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 932 taxa and 48 samples ]
## sample_data() Sample Data:       [ 48 samples by 4 sample variables ]
## tax_table()   Taxonomy Table:    [ 932 taxa by 6 taxonomic ranks ]

Investigate transformations. We transform microbiome count data to account for differences in library size, variance, scale, etc.

RLE - is the scaling factor method proposed by Anders and Huber (2010). We call it “relative log expression”, as median library is calculated from the geometric mean of all columns and the median ratio of each sample to the median library is taken as the scale factor.

## for Firmictures
plot_abundance = function(physeq, meta, title = "",
			     Facet = "Order", Color = "Order"){
  # Arbitrary subset, based on Phylum, for plotting
  p1f = subset_taxa(physeq, Phylum %in% c("p__Firmicutes"))
  mphyseq = psmelt(p1f)
  mphyseq <- subset(mphyseq, Abundance > 0)
  ggplot(data = mphyseq, mapping = aes_string(x = meta,y = "Abundance",
                                 color = Color, fill = Color)) +
    geom_violin(fill = NA) +
    geom_point(size = 1, alpha = 0.3,
                position = position_jitter(width = 0.3)) +
    facet_wrap(facets = Facet) + scale_y_log10()+
    theme(legend.position="none")
}

# transform counts into "relative abundances"
s16sV3V5.4ra = transform_sample_counts(s16sV3V5.4, function(x){x / sum(x)})

# transform counts into "hellinger standardized counts"
s16sV3V5.4hell <- s16sV3V5.4
otu_table(s16sV3V5.4hell) <- otu_table(decostand(otu_table(s16sV3V5.4hell), method = "hellinger"), taxa_are_rows=TRUE)

# RLE counts
s16sV3V5.4RLE <- s16sV3V5.4
RLE_normalization <- function(phyloseq){
  prior.count = 1
  count_scale = median(sample_sums(phyloseq))
  m = as(otu_table(phyloseq), "matrix")
  d = DGEList(counts=m, remove.zeros = FALSE)
  z = calcNormFactors(d, method="RLE")
  y <- as.matrix(z)
  lib.size <- z$samples$lib.size * z$samples$norm.factors
  ## rescale to median sample count
  out <- round(count_scale * sweep(y,MARGIN=2, STATS=lib.size,FUN = "/"))
  dimnames(out) <- dimnames(y)
  out
}
otu_table(s16sV3V5.4RLE) <- otu_table(RLE_normalization(s16sV3V5.4), taxa_are_rows=TRUE)
s16sV3V5.4logRLE = transform_sample_counts(s16sV3V5.4RLE, function(x){ log2(x +1)})

plotOriginal = plot_abundance(s16sV3V5.4, "Treatment", title="original")
plotRelative = plot_abundance(s16sV3V5.4ra, "Treatment", title="relative")
plotHellinger = plot_abundance(s16sV3V5.4hell, "Treatment", title="Hellinger")
plotLogRLE = plot_abundance(s16sV3V5.4logRLE, "Treatment", title="Log")
# Combine each plot into one graphic.
grid.arrange(nrow = 4, plotOriginal, plotRelative, plotHellinger, plotLogRLE)

[Normalization and microbial differential abundance strategies depend upon data characteristics] (https://microbiomejournal.biomedcentral.com/articles/10.1186/s40168-017-0237-y)

plot_richness(s16sV3V5.4RLE, measures=c("Observed","Chao1"))

## Warning: Removed 48 rows containing missing values (geom_errorbar).

plot_richness(s16sV3V5.4RLE, x = "Treatment", color="Timepoint", measures=c("Chao1", "Shannon"))

## Warning: Removed 48 rows containing missing values (geom_errorbar).

# Other Richness measures, "Observed", "Chao1", "ACE", "Shannon", "Simpson", "InvSimpson", "Fisher" try some of these others.

er <- estimate_richness(s16sV3V5.4RLE, measures=c("Chao1", "Shannon"))

res.aov <- aov(er$Shannon ~ Treatment + Timepoint, data = as(sample_data(s16sV3V5.4RLE),"data.frame"))
# Summary of the analysis
summary(res.aov)

##             Df Sum Sq Mean Sq F value   Pr(>F)    
## Treatment    5 0.1092 0.02185   8.072 2.26e-05 ***
## Timepoint    1 0.2078 0.20776  76.761 6.13e-11 ***
## Residuals   41 0.1110 0.00271                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Graphical Summaries

# Subset dataset by phylum
s16sV3V5.4RLE_acidob = subset_taxa(s16sV3V5.4RLE, Phylum=="p__Acidobacteria")
title = "plot_bar; Acidobacteria-only"
plot_bar(s16sV3V5.4RLE_acidob, "Treatment", "Abundance", "Family", title=title)

prop  = transform_sample_counts(s16sV3V5.4, function(x) x / sum(x) )
keepTaxa <- ((apply(otu_table(prop) >= 0.005,1,sum,na.rm=TRUE) > 2) | (apply(otu_table(prop) >= 0.05, 1, sum,na.rm=TRUE) > 0))
table(keepTaxa)

## keepTaxa
## FALSE  TRUE 
##   872    60

s16sV3V5.4RLE_trim <- prune_taxa(keepTaxa,s16sV3V5.4RLE)

plot_heatmap(s16sV3V5.4RLE_trim, "PCoA", distance="bray", sample.label="Treatment", taxa.label="Genus", low="#FFFFCC", high="#000033", na.value="white")

plot_net(s16sV3V5.4RLE_trim, maxdist=0.4, color="Treatment", shape="Timepoint")

hell.tip.labels <- as(get_variable(s16sV3V5.4RLE, "Treatment"), "character")
# This is the actual hierarchical clustering call, specifying average-linkage clustering
d <- distance(s16sV3V5.4RLE_trim, method="bray", type="samples")
RLE.hclust     <- hclust(d, method="average")
plot(RLE.hclust)

#Lets write out a plot
pdf("My_dendro.pdf", width=7, height=7, pointsize=8)
plot(RLE.hclust)
dev.off()

## quartz_off_screen 
##                 2

png("My_dendro.png", width = 7, height = 7, res=300, units = "in")
plot(RLE.hclust)
dev.off()

## quartz_off_screen 
##                 2

Ordination

v4.RLE.ord <- ordinate(s16sV3V5.4RLE_trim, "NMDS", "bray")

## Square root transformation
## Wisconsin double standardization
## Run 0 stress 0.09763903 
## Run 1 stress 0.09764394 
## ... Procrustes: rmse 0.0007765653  max resid 0.003487272 
## ... Similar to previous best
## Run 2 stress 0.09756916 
## ... New best solution
## ... Procrustes: rmse 0.006211429  max resid 0.03783506 
## Run 3 stress 0.1183573 
## Run 4 stress 0.09763896 
## ... Procrustes: rmse 0.006244267  max resid 0.03801658 
## Run 5 stress 0.09764042 
## ... Procrustes: rmse 0.006300215  max resid 0.03833707 
## Run 6 stress 0.1183573 
## Run 7 stress 0.09764753 
## ... Procrustes: rmse 0.006422066  max resid 0.03872499 
## Run 8 stress 0.116453 
## Run 9 stress 0.09756901 
## ... New best solution
## ... Procrustes: rmse 0.0004025685  max resid 0.001596649 
## ... Similar to previous best
## Run 10 stress 0.116453 
## Run 11 stress 0.09756958 
## ... Procrustes: rmse 0.0001505171  max resid 0.0007177248 
## ... Similar to previous best
## Run 12 stress 0.114047 
## Run 13 stress 0.1183522 
## Run 14 stress 0.1286158 
## Run 15 stress 0.118443 
## Run 16 stress 0.1183573 
## Run 17 stress 0.1184438 
## Run 18 stress 0.09789638 
## ... Procrustes: rmse 0.008618325  max resid 0.04188829 
## Run 19 stress 0.09756889 
## ... New best solution
## ... Procrustes: rmse 0.0003496162  max resid 0.001412773 
## ... Similar to previous best
## Run 20 stress 0.1183573 
## *** Solution reached

p1 = plot_ordination(s16sV3V5.4RLE_trim, v4.RLE.ord, type="taxa", color="Phylum", title="taxa")
print(p1)

p1 + facet_wrap(~Phylum, 5)

p2 = plot_ordination(s16sV3V5.4RLE_trim, v4.RLE.ord, type="samples", color="Timepoint", shape="Treatment")
p2

p2 + geom_polygon(aes(fill=Treatment)) + geom_point(size=5) + ggtitle("samples")

p2

p2 = plot_ordination(s16sV3V5.4RLE_trim, v4.RLE.ord, type="biplot", color="Timepoint", shape="Treatment") +
                     scale_shape_manual(values=1:7)
p2

write.table(otu_table(s16sV3V5.4RLE_trim), file = "RLE_stand_results_otu.txt",sep="\t")

Now try doing oridination with other transformations, such as relative abundance, log. Also looks and see if you can find any trends in the variable Dist_from_edge.

Differential Abundances

For differential abundances we use RNAseq pipeline EdgeR and limma voom.

m = as(otu_table(s16sV3V5.4), "matrix")
# Define gene annotations (`genes`) as tax_table
taxonomy = tax_table(s16sV3V5.4, errorIfNULL=FALSE)
if( !is.null(taxonomy) ){
  taxonomy = data.frame(as(taxonomy, "matrix"))
}
# Now turn into a DGEList
d = DGEList(counts=m, genes=taxonomy, remove.zeros = TRUE)

## reapply filter
prop  = transform_sample_counts(s16sV3V5.4, function(x) x / sum(x) )
keepTaxa <- ((apply(otu_table(prop) >= 0.005,1,sum,na.rm=TRUE) > 2) | (apply(otu_table(prop) >= 0.05, 1, sum,na.rm=TRUE) > 0))
table(keepTaxa)

## keepTaxa
## FALSE  TRUE 
##   872    60

d <- d[keepTaxa,]


# Calculate the normalization factors
z = calcNormFactors(d, method="RLE")
# Check for division by zero inside `calcNormFactors`
if( !all(is.finite(z$samples$norm.factors)) ){
  stop("Something wrong with edgeR::calcNormFactors on this data,
       non-finite $norm.factors, consider changing `method` argument")
}

plotMDS(z, col = as.numeric(factor(sample_data(s16sV3V5.4)$Treatment)), labels = sample_names(s16sV3V5.4), cex=0.5)

# Creat a model based on Treatment and depth
mm <- model.matrix( ~ Treatment + Timepoint, data=data.frame(as(sample_data(s16sV3V5.4),"matrix"))) # specify model with no intercept for easier contrasts
mm

##          (Intercept) TreatmentABC_Condition2 TreatmentABC_Control
## sample1            1                       0                    1
## sample10           1                       0                    0
## sample11           1                       0                    0
## sample12           1                       0                    0
## sample13           1                       0                    0
## sample14           1                       0                    0
## sample15           1                       0                    0
## sample16           1                       0                    0
## sample17           1                       0                    0
## sample18           1                       0                    0
## sample19           1                       0                    0
## sample2            1                       0                    0
## sample20           1                       0                    0
## sample21           1                       0                    0
## sample22           1                       0                    1
## sample23           1                       0                    1
## sample24           1                       0                    1
## sample25           1                       0                    1
## sample26           1                       0                    0
## sample27           1                       0                    0
## sample28           1                       0                    0
## sample29           1                       0                    0
## sample3            1                       0                    0
## sample30           1                       1                    0
## sample31           1                       1                    0
## sample32           1                       1                    0
## sample33           1                       1                    0
## sample34           1                       0                    0
## sample35           1                       0                    0
## sample36           1                       0                    0
## sample37           1                       0                    0
## sample38           1                       0                    0
## sample39           1                       0                    0
## sample4            1                       0                    0
## sample40           1                       0                    0
## sample41           1                       0                    0
## sample42           1                       0                    0
## sample43           1                       0                    0
## sample44           1                       0                    0
## sample45           1                       0                    0
## sample46           1                       0                    1
## sample47           1                       0                    1
## sample48           1                       0                    1
## sample5            1                       0                    0
## sample6            1                       1                    0
## sample7            1                       1                    0
## sample8            1                       1                    0
## sample9            1                       1                    0
##          TreatmentCondition1 TreatmentCondition2 TreatmentControl
## sample1                    0                   0                0
## sample10                   0                   0                1
## sample11                   0                   0                1
## sample12                   0                   0                1
## sample13                   0                   0                1
## sample14                   1                   0                0
## sample15                   1                   0                0
## sample16                   1                   0                0
## sample17                   1                   0                0
## sample18                   0                   1                0
## sample19                   0                   1                0
## sample2                    0                   0                0
## sample20                   0                   1                0
## sample21                   0                   1                0
## sample22                   0                   0                0
## sample23                   0                   0                0
## sample24                   0                   0                0
## sample25                   0                   0                0
## sample26                   0                   0                0
## sample27                   0                   0                0
## sample28                   0                   0                0
## sample29                   0                   0                0
## sample3                    0                   0                0
## sample30                   0                   0                0
## sample31                   0                   0                0
## sample32                   0                   0                0
## sample33                   0                   0                0
## sample34                   0                   0                1
## sample35                   0                   0                1
## sample36                   0                   0                1
## sample37                   0                   0                1
## sample38                   1                   0                0
## sample39                   1                   0                0
## sample4                    0                   0                0
## sample40                   1                   0                0
## sample41                   1                   0                0
## sample42                   0                   1                0
## sample43                   0                   1                0
## sample44                   0                   1                0
## sample45                   0                   1                0
## sample46                   0                   0                0
## sample47                   0                   0                0
## sample48                   0                   0                0
## sample5                    0                   0                0
## sample6                    0                   0                0
## sample7                    0                   0                0
## sample8                    0                   0                0
## sample9                    0                   0                0
##          TimepointT2
## sample1            0
## sample10           1
## sample11           1
## sample12           1
## sample13           1
## sample14           1
## sample15           1
## sample16           1
## sample17           1
## sample18           1
## sample19           1
## sample2            0
## sample20           1
## sample21           1
## sample22           1
## sample23           1
## sample24           1
## sample25           1
## sample26           1
## sample27           1
## sample28           1
## sample29           1
## sample3            0
## sample30           1
## sample31           1
## sample32           1
## sample33           1
## sample34           0
## sample35           0
## sample36           0
## sample37           0
## sample38           0
## sample39           0
## sample4            0
## sample40           0
## sample41           0
## sample42           0
## sample43           0
## sample44           0
## sample45           0
## sample46           0
## sample47           0
## sample48           0
## sample5            0
## sample6            0
## sample7            0
## sample8            0
## sample9            0
## attr(,"assign")
## [1] 0 1 1 1 1 1 2
## attr(,"contrasts")
## attr(,"contrasts")$Treatment
## [1] "contr.treatment"
## 
## attr(,"contrasts")$Timepoint
## [1] "contr.treatment"

y <- voom(d, mm, plot = T)

fit <- lmFit(y, mm)
head(coef(fit))

##            (Intercept) TreatmentABC_Condition2 TreatmentABC_Control
## Taxa_00004    11.42021              0.56154953          0.247251333
## Taxa_00028    14.43708              0.09255719          0.004571592
## Taxa_00031    13.57321              0.04341135          0.200015067
## Taxa_00032    11.90593              0.06808778          0.201142692
## Taxa_00033    13.78071             -0.01281357         -0.053679539
## Taxa_00036    16.58393             -0.01195907          0.010459057
##            TreatmentCondition1 TreatmentCondition2 TreatmentControl
## Taxa_00004           0.2225633          0.33218576        0.6937022
## Taxa_00028           0.2066517          0.18367808        0.2417164
## Taxa_00031           0.1464268          0.22360593        0.1845047
## Taxa_00032           0.4280512          0.42465403        0.4160526
## Taxa_00033           0.2197528          0.45398019        0.3521470
## Taxa_00036           0.1600282         -0.03887503        0.1554179
##            TimepointT2
## Taxa_00004 -0.47590364
## Taxa_00028  0.31347126
## Taxa_00031 -0.25043174
## Taxa_00032 -0.23676668
## Taxa_00033  0.06211118
## Taxa_00036  0.24498846

# single contrast comparing Timepoint 5 - 20
contr <- makeContrasts(TimpointT2vT1 = "TimepointT2",
                       levels = colnames(coef(fit)))

## Warning in makeContrasts(TimpointT2vT1 = "TimepointT2", levels =
## colnames(coef(fit))): Renaming (Intercept) to Intercept

tmp <- contrasts.fit(fit, contr)

## Warning in contrasts.fit(fit, contr): row names of contrasts don't match
## col names of coefficients

tmp <- eBayes(tmp)
tmp2 <- topTable(tmp, coef=1, sort.by = "P", n = Inf)
tmp2$Taxa <- rownames(tmp2)
tmp2 <- tmp2[,c("Taxa","logFC","AveExpr","P.Value","adj.P.Val")]
length(which(tmp2$adj.P.Val < 0.05)) # number of Differentially abundant taxa

## [1] 48

# 48
sigtab = cbind(as(tmp2, "data.frame"), as(tax_table(s16sV3V5.4)[rownames(tmp2), ], "matrix"))

One last plot

theme_set(theme_bw())
scale_fill_discrete <- function(palname = "Set1", ...) {
    scale_fill_brewer(palette = palname, ...)
}
sigtabgen = subset(sigtab, !is.na(Genus))
# Phylum order
x = tapply(sigtabgen$logFC, sigtabgen$Phylum, function(x) max(x))
x = sort(x, TRUE)
sigtabgen$Phylum = factor(as.character(sigtabgen$Phylum), levels = names(x))
# Genus order
x = tapply(sigtabgen$logFC, sigtabgen$Genus, function(x) max(x))
x = sort(x, TRUE)
sigtabgen$Genus = factor(as.character(sigtabgen$Genus), levels = names(x))
ggplot(sigtabgen, aes(x = Genus, y = logFC, color = Phylum)) + geom_point(size=3) +
  theme(axis.text.x = element_text(angle = -90, hjust = 0, vjust = 0.5))

☰ Menu

Sept. 2019 Microbial Community Analysis Workshop