All data used to produce this document are available on this repository: https://forgemia.inra.fr/migale/metabarfood/. The code used is available in the code chunks.

Introduction

This report contains complementary analyses performed for the METABARFOOD project.

Questions
  • Is there a correlation between abundance and amplicon length?
  • Analysis of β-diversity

Correlation abundance and amplicon length

We want to check if there is a correlation between abundance and amplicon length. We look at DNA and PCR samples separately, by plotting a linear regression.

amplicons <- c("ITS1","ITS2", "D1D2", "RPB2")
types <- c("ADN","PCR")

for(marker in c("ITS1","ITS2")){
  cat("  \n##",marker, " \n\n")
  for(type in types){
    if(type=="ADN"){
      cat("  \nDNA", " \n\n")
    }else{
      cat("  \n",type, " \n\n")  
    }
    eco <- "MEAT"
    if(!file.exists(paste(eco,marker,type,"dada2_frogs_physeq.rds",sep = "/"))){
      biom <- paste(eco,marker,type,"dada2_frogs.biom",sep = "/")
      physeq <- import_frogs(biom, taxMethod = "blast")
      saveRDS(physeq,paste(eco,marker,type,"dada2_frogs_physeq.rds",sep = "/"))  
    }else{
      physeq <- readRDS(paste(eco,marker,type,"dada2_frogs_physeq.rds",sep = "/"))
    }
    physeqr  = transform_sample_counts(physeq, function(x) x / sum(x) )
    physeqfr = filter_taxa(physeqr, function(x) sum(x) > .00005, TRUE)
    
    tib <- psmelt(physeqfr) %>% as_tibble() %>% mutate(Length = nchar(OTU))
    p1 <- ggplotRegression(lm(Abundance ~ Length, data = tib), eco, marker, type)
    
    eco <- "CHEESE"
    if(!file.exists(paste(eco,marker,type,"dada2_frogs_physeq.rds",sep = "/"))){
      biom <- paste(eco,marker,type,"dada2_frogs.biom",sep = "/")
      physeq <- import_frogs(biom, taxMethod = "blast")
      saveRDS(physeq,paste(eco,marker,type,"dada2_frogs_physeq.rds",sep = "/"))  
    }else{
      physeq <- readRDS(paste(eco,marker,type,"dada2_frogs_physeq.rds",sep = "/"))
    }
    physeqr  = transform_sample_counts(physeq, function(x) x / sum(x) )
    physeqfr = filter_taxa(physeqr, function(x) sum(x) > .00005, TRUE)
    
    tib <- psmelt(physeqfr) %>% as_tibble() %>% mutate(Length = nchar(OTU))
    p2 <- ggplotRegression(lm(Abundance ~ Length, data = tib), eco, marker, type)
    
    eco <- "BREAD"
    if(!file.exists(paste(eco,marker,type,"dada2_frogs_physeq.rds",sep = "/"))){
      biom <- paste(eco,marker,type,"dada2_frogs.biom",sep = "/")
      physeq <- import_frogs(biom, taxMethod = "blast")
      saveRDS(physeq,paste(eco,marker,type,"dada2_frogs_physeq.rds",sep = "/"))  
    }else{
      physeq <- readRDS(paste(eco,marker,type,"dada2_frogs_physeq.rds",sep = "/"))
    }
    physeqr  = transform_sample_counts(physeq, function(x) x / sum(x) )
    physeqfr = filter_taxa(physeqr, function(x) sum(x) > .00005, TRUE)
    
    tib <- psmelt(physeqfr) %>% as_tibble() %>% mutate(Length = nchar(OTU))
    p3 <- ggplotRegression(lm(Abundance ~ Length, data = tib), eco, marker, type)
    
    eco <- "WINE"
    if(!file.exists(paste(eco,marker,type,"dada2_frogs_physeq.rds",sep = "/"))){
      biom <- paste(eco,marker,type,"dada2_frogs.biom",sep = "/")
      physeq <- import_frogs(biom, taxMethod = "blast")
      saveRDS(physeq,paste(eco,marker,type,"dada2_frogs_physeq.rds",sep = "/"))  
    }else{
      physeq <- readRDS(paste(eco,marker,type,"dada2_frogs_physeq.rds",sep = "/"))
    }
    physeqr  = transform_sample_counts(physeq, function(x) x / sum(x) )
    physeqfr = filter_taxa(physeqr, function(x) sum(x) > .00005, TRUE)
    
    tib <- psmelt(physeqfr) %>% as_tibble() %>% mutate(Length = nchar(OTU))
    p4 <- ggplotRegression(lm(Abundance ~ Length, data = tib), eco, marker, type)
    
    print(ggarrange(p1, p2, p3, p4, ncol=2, nrow=2, common.legend = TRUE, legend="right"))
    cat("\n\n")
  }
}

ITS1

DNA

PCR

ITS2

DNA

PCR

Marker comparison based on β-diversity

Rarefaction

data <- readRDS("data/all_samples.rds")
data
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 2566 taxa and 119 samples ]
## sample_data() Sample Data:       [ 119 samples by 4 sample variables ]
## tax_table()   Taxonomy Table:    [ 2566 taxa by 7 taxonomic ranks ]
sample_variables(data)
## [1] "Eco"      "SampleID" "Marker"   "Type"
nsamples(data)
## [1] 119

For each marker, only samples having >1000 reads were kept for the analysis.

data_ITS1 <- subset_samples(data, Marker == "ITS1")
data_ITS1
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 2566 taxa and 35 samples ]
## sample_data() Sample Data:       [ 35 samples by 4 sample variables ]
## tax_table()   Taxonomy Table:    [ 2566 taxa by 7 taxonomic ranks ]
min(sample_sums(data_ITS1))
## [1] 266
data_ITS1_rare <- rarefy_even_depth(data_ITS1, sample.size = min(sample_sums(data_ITS1)), rngseed = 1234)
data_ITS1_rare
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 460 taxa and 35 samples ]
## sample_data() Sample Data:       [ 35 samples by 4 sample variables ]
## tax_table()   Taxonomy Table:    [ 460 taxa by 7 taxonomic ranks ]

The final object ITS1 contains 34 samples.

data_ITS2 <- subset_samples(data, Marker == "ITS2")
data_ITS2
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 2566 taxa and 35 samples ]
## sample_data() Sample Data:       [ 35 samples by 4 sample variables ]
## tax_table()   Taxonomy Table:    [ 2566 taxa by 7 taxonomic ranks ]
min(sample_sums(data_ITS2))
## [1] 153
data_ITS2_rare <- rarefy_even_depth(data_ITS2, sample.size = min(sample_sums(data_ITS2)), rngseed = 1234)
data_ITS2_rare
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 322 taxa and 35 samples ]
## sample_data() Sample Data:       [ 35 samples by 4 sample variables ]
## tax_table()   Taxonomy Table:    [ 322 taxa by 7 taxonomic ranks ]

The final object ITS1 contains 33 samples.

data_D1D2 <- subset_samples(data, Marker == "D1D2")
data_D1D2
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 2566 taxa and 29 samples ]
## sample_data() Sample Data:       [ 29 samples by 4 sample variables ]
## tax_table()   Taxonomy Table:    [ 2566 taxa by 7 taxonomic ranks ]
data_D1D2 <- subset_samples(data_D1D2, sample_names(data_D1D2) != "F4SR2_MEAT_D1D2")
min(sample_sums(data_D1D2))
## [1] 1085
data_D1D2_rare <- rarefy_even_depth(data_D1D2, sample.size = min(sample_sums(data_D1D2)), rngseed = 1234)
data_D1D2_rare
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 664 taxa and 28 samples ]
## sample_data() Sample Data:       [ 28 samples by 4 sample variables ]
## tax_table()   Taxonomy Table:    [ 664 taxa by 7 taxonomic ranks ]

The final object D1D2 contains only 21 samples.

data_RPB2 <- subset_samples(data, Marker == "RPB2")
data_RPB2
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 2566 taxa and 20 samples ]
## sample_data() Sample Data:       [ 20 samples by 4 sample variables ]
## tax_table()   Taxonomy Table:    [ 2566 taxa by 7 taxonomic ranks ]
min(sample_sums(data_RPB2))
## [1] 14
data_RPB2_rare <- rarefy_even_depth(data_RPB2, sample.size = min(sample_sums(data_RPB2)), rngseed = 1234)
data_RPB2_rare
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 24 taxa and 20 samples ]
## sample_data() Sample Data:       [ 20 samples by 4 sample variables ]
## tax_table()   Taxonomy Table:    [ 24 taxa by 7 taxonomic ranks ]

For RPB2, only CHEESE samples and one sample from WINE have enough sequencing depth. So we stopped analyzing this marker.

β-diversity

ITS1

dist_bc_ITS1 <- distance(data_ITS1_rare, method = "bray")
PCoA_bc_ITS1 <- ordinate(data_ITS1_rare, method = "PCoA", distance = "bray")
pITS1 <- plot_ordination(data_ITS1_rare,ordination = PCoA_bc_ITS1, color = "Eco")
pITS1 + ggtitle("ITS1")

adonis2(formula = dist_bc_ITS1~Eco, data = data.frame(sample_data(data_ITS1_rare)))
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
## 
## adonis2(formula = dist_bc_ITS1 ~ Eco, data = data.frame(sample_data(data_ITS1_rare)))
##          Df SumOfSqs      R2      F Pr(>F)    
## Eco       3   8.3985 0.57276 13.853  0.001 ***
## Residual 31   6.2648 0.42724                  
## Total    34  14.6634 1.00000                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

ITS2

dist_bc_ITS2 <- distance(data_ITS2_rare, method = "bray")
PCoA_bc_ITS2 <- ordinate(data_ITS2_rare, method = "PCoA", distance = "bray")
pITS2 <- plot_ordination(data_ITS2_rare,ordination = PCoA_bc_ITS2, color = "Eco")
pITS2 + ggtitle("ITS2")

adonis2(formula = dist_bc_ITS2~Eco, data = data.frame(sample_data(data_ITS2_rare)))
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
## 
## adonis2(formula = dist_bc_ITS2 ~ Eco, data = data.frame(sample_data(data_ITS2_rare)))
##          Df SumOfSqs      R2      F Pr(>F)    
## Eco       3   8.6562 0.59332 15.075  0.001 ***
## Residual 31   5.9333 0.40668                  
## Total    34  14.5895 1.00000                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

D1D2

dist_bc_D1D2 <- distance(data_D1D2_rare, method = "bray")
PCoA_bc_D1D2 <- ordinate(data_D1D2_rare, method = "PCoA", distance = "bray")
pD1D2 <- plot_ordination(data_D1D2_rare,ordination = PCoA_bc_D1D2, color = "Eco")
pD1D2 + ggtitle("D1/D2")

adonis2(formula = dist_bc_D1D2~Eco, data = data.frame(sample_data(data_D1D2_rare)))
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
## 
## adonis2(formula = dist_bc_D1D2 ~ Eco, data = data.frame(sample_data(data_D1D2_rare)))
##          Df SumOfSqs      R2      F Pr(>F)    
## Eco       3   7.3066 0.65087 14.914  0.001 ***
## Residual 24   3.9194 0.34913                  
## Total    27  11.2260 1.00000                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Conclusion : in terms of capacity to discriminate samples based on their ecosystem of origin we can rank the markers as follow: D1/D2>ITS2>ITS1. However, given the low number of samples with exploitable data obtained with D1/D2 on real samples (21 out of 36), we should not recommend this marker. ITS1 and ITS2 exhibited similar results with a little advantage for ITS2.