All data used to produce this document are available on this repository: https://forgemia.inra.fr/migale/metabarfood/. The code used is available in the code chunks.
source("functions.R")


plot_divergence <- function(dataset, divergence_file){
  p1<-ggplot(as_tibble(read.table(divergence_file, sep="\t", header=TRUE)) %>% mutate(SampleID=paste0(sample,"-",its)) %>% filter(tool == "DADA2_FROGS") %>% mutate(its=replace(its, its=="D1D2", "D1/D2")) %>% mutate(sample=str_replace(sample,"ADN","DNA")) %>% mutate(its=factor(its, levels=c("ITS1","ITS2","D1/D2","RPB2"))) , aes(x=sample, color=data, y=Divergence, fill=SampleID))  + 
    geom_boxplot() + theme(plot.title = element_text(hjust = 0.5), plot.subtitle = element_text(hjust = 0.5), legend.position = "none",axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
    #stat_summary(fun = "median", geom = "point", aes(color = tool)) +
    #scale_fill_manual(values=method_palette_wo_real, aesthetics = c("colour", "fill")) +
    #stat_compare_means(ref.group = "FROGS", method = "wilcox.test", label="p.signif" ) +
    ggtitle("Divergence (%) at species level", subtitle = "") +
    scale_y_continuous(expand = expansion(0, 0)) + 
      scale_x_discrete(expand = expansion(0, 0)) #+
    #expand_limits(y = 0)
  return(p1)
}


plot_metrics <- function(dataset, divergence_file){
  p2<-ggplot(as_tibble(read.table(divergence_file, sep="\t", header=TRUE)) %>% mutate(SampleID=paste0(sample,"-",its)) %>% filter(tool == "DADA2_FROGS") %>% mutate(its=replace(its, its=="D1D2", "D1/D2")) %>% mutate(sample=str_replace(sample,"ADN","DNA")) %>% mutate(data=str_replace(data,"ADN","DNA")), aes(x=sample, color=data,y=False.Negative, fill=SampleID)) + facet_grid(~its)+
    geom_boxplot() + guides(fill = FALSE) +
    scale_y_continuous(expand = expansion(0, 0)) + 
      scale_x_discrete(expand = expansion(0, 0)) +
    theme(axis.title.x=element_blank(),axis.text.x=element_blank(),axis.ticks.x=element_blank(), legend.position = "none") #+
    #expand_limits(y = 0)
  p3<-ggplot(as_tibble(read.table(divergence_file, sep="\t", header=TRUE)) %>% mutate(SampleID=paste0(sample,"-",its)) %>% filter(tool == "DADA2_FROGS") %>% mutate(its=replace(its, its=="D1D2", "D1/D2")) %>% mutate(sample=str_replace(sample,"ADN","DNA")), aes(x=sample, color=data, y=False.Positive, fill=SampleID)) + facet_grid(~its)+
    geom_boxplot() + guides(fill = FALSE) +
    scale_y_continuous(expand = expansion(0, 0)) + 
      scale_x_discrete(expand = expansion(0, 0)) +
    theme(axis.title.x=element_blank(),axis.text.x=element_blank(),axis.ticks.x=element_blank(), legend.position = "none") #+
    #expand_limits(y = 0)
  p4<-ggplot(as_tibble(read.table(divergence_file, sep="\t", header=TRUE)) %>% mutate(SampleID=paste0(sample,"-",its)) %>% filter(tool == "DADA2_FROGS") %>% mutate(its=replace(its, its=="D1D2", "D1/D2")) %>% mutate(sample=str_replace(sample,"ADN","DNA")), aes(x=sample, color=data, y=True.Positive, fill=SampleID)) + facet_grid(~its)+
    geom_boxplot() + guides(fill = FALSE) +
    scale_y_continuous(expand = expansion(0, 0)) + 
      scale_x_discrete(expand = expansion(0, 0)) +
    theme(axis.title.x=element_blank(),axis.text.x=element_blank(),axis.ticks.x=element_blank(), legend.position = "none") #+
    #expand_limits(y = 0)
  p <- ggarrange(p2, p3, p4, ncol=3, nrow=1, common.legend = TRUE, legend="right")
  p <- annotate_figure(p,
                       top = text_grob("Metrics at species level")
  )
  return(p)
}


plot_precision_and_recall <- function(dataset, divergence_file){
  infos <- as_tibble(read.table(divergence_file, sep="\t", header=TRUE)) %>% mutate(SampleID=paste0(sample,"-",its)) %>% filter(tool == "DADA2_FROGS") %>% mutate(its=replace(its, its=="D1D2", "D1/D2")) %>% mutate(sample=str_replace(sample,"ADN","DNA")) %>% mutate(data=str_replace(data,"ADN","DNA")) %>% mutate(Precision = (True.Positive/(True.Positive + False.Positive))) %>% mutate(Recall.Rate = (True.Positive/(True.Positive + False.Negative)))
  p1<-ggplot(infos, aes(x=sample, color=data, y=Recall.Rate, fill=SampleID)) + facet_grid(~its)+
    geom_boxplot() + guides(fill = FALSE) + theme(plot.title = element_text(hjust = 0.5), plot.subtitle = element_text(hjust = 0.5), legend.position = "none", axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) #+
    #theme(axis.title.x=element_blank(),axis.text.x=element_blank(),axis.ticks.x=element_blank()) #+
    #expand_limits(y = 0)
  p2<-ggplot(infos, aes(x=sample, color=data, y=Precision, fill=SampleID)) + facet_grid(~its)+
    geom_boxplot() + theme(plot.title = element_text(hjust = 0.5), plot.subtitle = element_text(hjust = 0.5), legend.position = "none", axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))# +
    #theme(axis.title.x=element_blank(),axis.text.x=element_blank(),axis.ticks.x=element_blank()) #+
    #expand_limits(y = 0)
  return(ggarrange(p1, p2, ncol=2, nrow=1, common.legend = TRUE, legend="right"))
  
}

metabarfood <- TRUE
amplicons <- c("ITS1","ITS2", "D1D2", "RPB2")
types <- c("ADN","PCR")
absent_color <- "#F6CFB8"
absent_color <- "black"
partially_color <- "#b1c5c4"
partially_color <- "#DAF7A6"

perfect_color <- "#cce5ff"
perfect_color <- "#86a5a4"
perfect_color <- "#cce5ff"
#tools <- c("DADA2_FROGS", "FROGS", "USEARCH", "DADA2-pe", "DADA2-se", "QIIME-se", "QIIME-pe")
tools <- c("DADA2_FROGS")
dedicated_palette <- c("#cccc00", "#8eadac", "#000000", "#56B4E9", "#8a6479", "#8a8964", "#ad8e9f", "#adac8e")
method_palette <- setNames(
         object = dedicated_palette, 
         nm = c("EXPECTED", "FROGS", "DADA2_FROGS", "USEARCH", "DADA2-se", "QIIME-se", "DADA2-pe", "QIIME-pe")
       )
dedicated_palette_wo_real <- c("#8eadac", "#000000", "#56B4E9", "#8a6479", "#8a8964", "#ad8e9f", "#adac8e")
method_palette_wo_real <- setNames(
           object = dedicated_palette_wo_real, 
           nm = c("FROGS", "DADA2_FROGS", "USEARCH", "DADA2-se", "QIIME-se", "DADA2-pe", "QIIME-pe")
         )

Introduction

This report aims at comparing amplicons (ITS1, ITS2, D1/D2 and RPB2) on different metrics.

Questions
  • Which marker, among ITS1, ITS2, D1/D2 and RPB2 is the best to detect fungal diversity in mock communities?
  • How different are DNA and PCR samples?

Synthesis

if(file.exists("data/all_heatmap.rds")){
  all_hmp <- readRDS("data/all_heatmap.rds")
  draw(all_hmp, annotation_legend_side = "bottom", row_title = "", width = unit(20, "cm"),
       column_title_gp = gpar(fontsize = 16), ht_gap = unit(0.2, "cm"), annotation_legend_list = list(
       Legend(at = 1, title = "", labels=c("Absent (0)"), legend_gp = gpar(fill=absent_color)),
       Legend(at = 2, title = "", labels=c("Partially reconstructed (0.75)"), legend_gp = gpar(fill=partially_color)),
       Legend(at = 3, title = "", labels=c("Perfect (1)"), legend_gp = gpar(fill=perfect_color))
       )
       )
}

Heatmap of expected species for each mock community and barcode marker using the DADA2_FROGS approach. Dark green represents perfectly reconstructed sequences (score of 1), light green partially reconstructed sequences (score of 0.75) and orange species that are missing (0). Density of results are indicated for each line, representing the ability of markers to be efficient.

Details for each ecosystem

MEAT

sp <- "MEAT"
divergence_file <- file.path(sp,"divergence.tsv")
tool <- "DADA2_FROGS"
#cat("  \n###",tool, " \n\n")
htmp_list <- c()
htmp_list_presence <- c()
min <- 0
max <- 0
num <- 0
for(its in amplicons){
  for(type in types){
    path <- file.path(sp,its,type)
    mat_file <- file.path(path,paste0(sp,"_",its,"_",type,"_abundmat.rds"))
    mat_presence_file <- file.path(path,paste0(sp,"_",its,"_",type,"_presencemat.rds"))
    mat <- readRDS(mat_file)
    mat_presence <- readRDS(mat_presence_file)
    submat <- mat %>% as_tibble() %>% select(tool) %>% as.matrix()
    submat_presence <- mat_presence %>% as_tibble() %>% select(tool) %>% as.matrix() 
    rownames(submat) <- rownames(mat)
    rownames(submat_presence) <- rownames(mat_presence)
    submat[submat==0] <- NA
    #submat_presence[submat_presence==0] <- NA
    max <- max(max,max(submat,na.rm = TRUE))
    its_refac <- its
    if(its_refac=="D1D2"){
      its_refac<-"D1/D2"
    }
    type_refac <- type
    if(type_refac=="ADN"){
      type_refac<-"DNA"
    }
    colnames(submat) <- paste0(its_refac," ",type_refac)
    colnames(submat_presence) <- paste0(its_refac," ",type_refac)
    ht <- build_heatmap_one_tool(submat, min, max)
    ht_presence <- build_heatmap_one_tool_presence(submat_presence, 0, 1)
    htmp_list <- c(htmp_list,ht)
    htmp_list_presence <- c(htmp_list_presence,ht_presence)
    if(num>0){
      meat_mp <- cbind(meat_mp,submat_presence)
    }else{
      meat_mp <- submat_presence
    }
    num <- num+1
  }
}
meat_hmp <- build_heatmap_one_tool_presence_amplicons(t(meat_mp), 0, 1, sp)
ha <- rowAnnotation(bar = anno_density(t(meat_mp), 
                                       #type = "violin",
                                       #gp = gpar(fill = "#fa995f"),
                                       border = FALSE,
                                       axis_param = list(gp = gpar(fontsize = 5), at = c(0,0.75, 1), labels = c("0","0.75", "1"))
                                       ),
                    width = unit(1.5, "cm"), 
                    show_annotation_name = FALSE, 
                    border = FALSE)
meat_hmp <- meat_hmp + ha

l <- c()
for(i in htmp_list){
  l <- l + i  
}

l_presence <- c()
for(i in htmp_list_presence){
  l_presence <- l_presence + i  
}
cat("  \n### Abundance \n\n")

Abundance

draw(l, row_title = "Expected species",
    column_title_gp = gpar(fontsize = 16), ht_gap = unit(0.1, "cm"), annotation_legend_list = list(
    Legend(at = 1, title = "", labels=c("Absent"), legend_gp = gpar(fill=absent_color))
  ))

cat(" \n")
meat_presence <- l_presence
cat("  \n### Reconstruction \n\n")

Reconstruction

draw(l_presence, row_title = "Expected species", 
      
     annotation_legend_list = list(
     Legend(at = 1, title = "", labels=c("Absent (0)"), legend_gp = gpar(fill=absent_color)),
     Legend(at = 2, title = "", labels=c("Partially reconstructed (0.75)"), legend_gp = gpar(fill=partially_color)),
     Legend(at = 3, title = "", labels=c("Perfect (1)"), legend_gp = gpar(fill=perfect_color))
     )
     )

 cat(" \n")
cat("  \n###"," Divergence, Precion, Recall rate", " \n")

Divergence, Precion, Recall rate

print(plot_divergence(type, divergence_file) + facet_grid(~its))  

cat(" \n")
print(plot_metrics(type, divergence_file))

cat(" \n")
print(plot_precision_and_recall(type, divergence_file))

cat(" \n")

CHEESE

sp <- "CHEESE"
divergence_file <- file.path(sp,"divergence.tsv")
tool <- "DADA2_FROGS"
#cat("  \n###",tool, " \n\n")
htmp_list <- c()
htmp_list_presence <- c()
min <- 0
max <- 0
num <- 0
for(its in amplicons){
  for(type in types){
    path <- file.path(sp,its,type)
    mat_file <- file.path(path,paste0(sp,"_",its,"_",type,"_abundmat.rds"))
    mat_presence_file <- file.path(path,paste0(sp,"_",its,"_",type,"_presencemat.rds"))
    mat <- readRDS(mat_file)
    mat_presence <- readRDS(mat_presence_file)
    submat <- mat %>% as_tibble() %>% select(tool) %>% as.matrix()
    submat_presence <- mat_presence %>% as_tibble() %>% select(tool) %>% as.matrix() 
    rownames(submat) <- rownames(mat)
    rownames(submat_presence) <- rownames(mat_presence)
    submat[submat==0] <- NA
    #submat_presence[submat_presence==0] <- NA
    max <- max(max,max(submat,na.rm = TRUE))
    its_refac <- its
    if(its_refac=="D1D2"){
      its_refac<-"D1/D2"
    }
    type_refac <- type
    if(type_refac=="ADN"){
      type_refac<-"DNA"
    }
    colnames(submat) <- paste0(its_refac," ",type_refac)
    colnames(submat_presence) <- paste0(its_refac," ",type_refac)
    ht <- build_heatmap_one_tool(submat, min, max)
    ht_presence <- build_heatmap_one_tool_presence(submat_presence, 0, 1)
    htmp_list <- c(htmp_list,ht)
    htmp_list_presence <- c(htmp_list_presence,ht_presence)
    if(num>0){
      cheese_mp <- cbind(cheese_mp,submat_presence)
    }else{
      cheese_mp <- submat_presence
    }
    num <- num+1
  }
}
cheese_hmp <- build_heatmap_one_tool_presence_amplicons(t(cheese_mp), 0, 1, sp)
ha <- rowAnnotation(bar = anno_density(t(cheese_mp), 
                                       #type = "violin",
                                       #gp = gpar(fill = "#fa995f"),
                                       border = FALSE,
                                       axis_param = list(gp = gpar(fontsize = 5), at = c(0,0.75, 1), labels = c("0","0.75", "1"))
                                       ),
                    width = unit(1.5, "cm"), 
                    show_annotation_name = FALSE, 
                    border = FALSE)
cheese_hmp <- cheese_hmp + ha

l <- c()
for(i in htmp_list){
  l <- l + i  
}

l_presence <- c()
for(i in htmp_list_presence){
  l_presence <- l_presence + i  
}
cat("  \n### Abundance \n\n")

Abundance

draw(l, row_title = "Expected species",
    column_title_gp = gpar(fontsize = 16), ht_gap = unit(0.1, "cm"), annotation_legend_list = list(
    Legend(at = 1, title = "", labels=c("Absent"), legend_gp = gpar(fill=absent_color))
  ))

cat(" \n")
cheese_presence <- l_presence
cat("  \n### Reconstruction \n\n")

Reconstruction

draw(l_presence, row_title = "Expected species", 
      
     annotation_legend_list = list(
     Legend(at = 1, title = "", labels=c("Absent (0)"), legend_gp = gpar(fill=absent_color)),
     Legend(at = 2, title = "", labels=c("Partially reconstructed (0.75)"), legend_gp = gpar(fill=partially_color)),
     Legend(at = 3, title = "", labels=c("Perfect (1)"), legend_gp = gpar(fill=perfect_color))
     )
     )

 cat(" \n")
cat("  \n###"," Divergence, Precion, Recall rate", " \n")

Divergence, Precion, Recall rate

print(plot_divergence(type, divergence_file) + facet_grid(~its))  

cat(" \n")
print(plot_metrics(type, divergence_file))

cat(" \n")
print(plot_precision_and_recall(type, divergence_file))

cat(" \n")

BREAD

sp <- "BREAD"
divergence_file <- file.path(sp,"divergence.tsv")
tool <- "DADA2_FROGS"
#cat("  \n###",tool, " \n\n")
htmp_list <- c()
htmp_list_presence <- c()
min <- 0
max <- 0
num <- 0
for(its in amplicons){
  for(type in types){
    path <- file.path(sp,its,type)
    mat_file <- file.path(path,paste0(sp,"_",its,"_",type,"_abundmat.rds"))
    mat_presence_file <- file.path(path,paste0(sp,"_",its,"_",type,"_presencemat.rds"))
    mat <- readRDS(mat_file)
    mat_presence <- readRDS(mat_presence_file)
    submat <- mat %>% as_tibble() %>% select(tool) %>% as.matrix()
    submat_presence <- mat_presence %>% as_tibble() %>% select(tool) %>% as.matrix() 
    rownames(submat) <- rownames(mat)
    rownames(submat_presence) <- rownames(mat_presence)
    submat[submat==0] <- NA
    #submat_presence[submat_presence==0] <- NA
    max <- max(max,max(submat,na.rm = TRUE))
    its_refac <- its
    if(its_refac=="D1D2"){
      its_refac<-"D1/D2"
    }
    type_refac <- type
    if(type_refac=="ADN"){
      type_refac<-"DNA"
    }
    colnames(submat) <- paste0(its_refac," ",type_refac)
    colnames(submat_presence) <- paste0(its_refac," ",type_refac)
    ht <- build_heatmap_one_tool(submat, min, max)
    ht_presence <- build_heatmap_one_tool_presence(submat_presence, 0, 1)
    htmp_list <- c(htmp_list,ht)
    htmp_list_presence <- c(htmp_list_presence,ht_presence)
    if(num>0){
      bread_mp <- cbind(bread_mp,submat_presence)
    }else{
      bread_mp <- submat_presence
    }
    num <- num+1
  }
}
bread_hmp <- build_heatmap_one_tool_presence_amplicons(t(bread_mp), 0, 1, sp)
ha <- rowAnnotation(bar = anno_density(t(bread_mp), 
                                       #type = "violin",
                                       #gp = gpar(fill = "#fa995f"),
                                       border = FALSE,
                                       axis_param = list(gp = gpar(fontsize = 5), at = c(0,0.75, 1), labels = c("0","0.75", "1"))
                                       ),
                    width = unit(1.5, "cm"), 
                    show_annotation_name = FALSE, 
                    border = FALSE)
bread_hmp <- bread_hmp + ha

l <- c()
for(i in htmp_list){
  l <- l + i  
}

l_presence <- c()
for(i in htmp_list_presence){
  l_presence <- l_presence + i  
}
cat("  \n### Abundance \n\n")

Abundance

draw(l, row_title = "Expected species",
    column_title_gp = gpar(fontsize = 16), ht_gap = unit(0.1, "cm"), annotation_legend_list = list(
    Legend(at = 1, title = "", labels=c("Absent"), legend_gp = gpar(fill=absent_color))
  ))

cat(" \n")
bread_presence <- l_presence
cat("  \n### Reconstruction \n\n")

Reconstruction

draw(l_presence, row_title = "Expected species", 
      
     annotation_legend_list = list(
     Legend(at = 1, title = "", labels=c("Absent (0)"), legend_gp = gpar(fill=absent_color)),
     Legend(at = 2, title = "", labels=c("Partially reconstructed (0.75)"), legend_gp = gpar(fill=partially_color)),
     Legend(at = 3, title = "", labels=c("Perfect (1)"), legend_gp = gpar(fill=perfect_color))
     )
     )

 cat(" \n")
cat("  \n###"," Divergence, Precion, Recall rate", " \n")

Divergence, Precion, Recall rate

print(plot_divergence(type, divergence_file) + facet_grid(~its))  

cat(" \n")
print(plot_metrics(type, divergence_file))

cat(" \n")
print(plot_precision_and_recall(type, divergence_file))

cat(" \n")

WINE

sp <- "WINE"
divergence_file <- file.path(sp,"divergence.tsv")
tool <- "DADA2_FROGS"
#cat("  \n###",tool, " \n\n")
htmp_list <- c()
htmp_list_presence <- c()
min <- 0
max <- 0
num <- 0
for(its in amplicons){
  for(type in types){
    path <- file.path(sp,its,type)
    mat_file <- file.path(path,paste0(sp,"_",its,"_",type,"_abundmat.rds"))
    mat_presence_file <- file.path(path,paste0(sp,"_",its,"_",type,"_presencemat.rds"))
    mat <- readRDS(mat_file)
    mat_presence <- readRDS(mat_presence_file)
    submat <- mat %>% as_tibble() %>% select(tool) %>% as.matrix()
    submat_presence <- mat_presence %>% as_tibble() %>% select(tool) %>% as.matrix() 
    rownames(submat) <- rownames(mat)
    rownames(submat_presence) <- rownames(mat_presence)
    submat[submat==0] <- NA
    #submat_presence[submat_presence==0] <- NA
    max <- max(max,max(submat,na.rm = TRUE))
    its_refac <- its
    if(its_refac=="D1D2"){
      its_refac<-"D1/D2"
    }
    type_refac <- type
    if(type_refac=="ADN"){
      type_refac<-"DNA"
    }
    colnames(submat) <- paste0(its_refac," ",type_refac)
    colnames(submat_presence) <- paste0(its_refac," ",type_refac)
    ht <- build_heatmap_one_tool(submat, min, max)
    ht_presence <- build_heatmap_one_tool_presence(submat_presence, 0, 1)
    htmp_list <- c(htmp_list,ht)
    htmp_list_presence <- c(htmp_list_presence,ht_presence)
    if(num>0){
      wine_mp <- cbind(wine_mp,submat_presence)
    }else{
      wine_mp <- submat_presence
    }
    num <- num+1
  }
}
wine_hmp <- build_heatmap_one_tool_presence_amplicons(t(wine_mp), 0, 1, sp)
ha <- rowAnnotation(bar = anno_density(t(wine_mp), 
                                       #type = "violin",
                                       #gp = gpar(fill = "#fa995f"),
                                       border = FALSE,
                                       axis_param = list(gp = gpar(fontsize = 5), at = c(0,0.75, 1), labels = c("0","0.75", "1"))
                                       ),
                    width = unit(1.5, "cm"), 
                    show_annotation_name = FALSE, 
                    border = FALSE)
wine_hmp <- wine_hmp + ha

l <- c()
for(i in htmp_list){
  l <- l + i  
}

l_presence <- c()
for(i in htmp_list_presence){
  l_presence <- l_presence + i  
}
cat("  \n### Abundance \n\n")

Abundance

draw(l, row_title = "Expected species",
    column_title_gp = gpar(fontsize = 16), ht_gap = unit(0.1, "cm"), annotation_legend_list = list(
    Legend(at = 1, title = "", labels=c("Absent"), legend_gp = gpar(fill=absent_color))
  ))

cat(" \n")
wine_presence <- l_presence
cat("  \n### Reconstruction \n\n")

Reconstruction

draw(l_presence, row_title = "Expected species", 
      
     annotation_legend_list = list(
     Legend(at = 1, title = "", labels=c("Absent (0)"), legend_gp = gpar(fill=absent_color)),
     Legend(at = 2, title = "", labels=c("Partially reconstructed (0.75)"), legend_gp = gpar(fill=partially_color)),
     Legend(at = 3, title = "", labels=c("Perfect (1)"), legend_gp = gpar(fill=perfect_color))
     )
     )

 cat(" \n")
cat("  \n###"," Divergence, Precion, Recall rate", " \n")

Divergence, Precion, Recall rate

print(plot_divergence(type, divergence_file) + facet_grid(~its))  

cat(" \n")
print(plot_metrics(type, divergence_file))

cat(" \n")
print(plot_precision_and_recall(type, divergence_file))

cat(" \n")
if(!file.exists("data/all_heatmap.rds")){
all_hmp <- meat_hmp + bread_hmp + cheese_hmp + wine_hmp
draw(all_hmp, row_title = "", width = unit(20, "cm"),
       column_title_gp = gpar(fontsize = 16), ht_gap = unit(0.2, "cm"), annotation_legend_list = list(
       Legend(at = 1, title = "", labels=c("Absent (0)"), legend_gp = gpar(fill=absent_color)),
       Legend(at = 2, title = "", labels=c("Partially reconstructed (0.75)"), legend_gp = gpar(fill=partially_color)),
       Legend(at = 3, title = "", labels=c("Perfect (1)"), legend_gp = gpar(fill=perfect_color))
       )
       )
saveRDS(all_hmp,"data/all_heatmap.rds")  
}
divergence_file <- file.path("data/divergence_all_ecos.tsv")
divergence_file

a <- as_tibble(read.table(divergence_file, sep="\t", header=TRUE)) %>% mutate(SampleID=paste0(sample,"-",its)) %>% filter(tool == "DADA2_FROGS") %>% mutate(its=replace(its, its=="D1D2", "D1/D2")) %>% mutate(sample=str_replace(sample,"ADN","DNA"), SampleID=str_replace(SampleID,"ADN","DNA"),  data=str_replace(data,"ADN","DNA"), SampleID=str_replace(SampleID,"D1D2","D1/D2")) %>% mutate(its=factor(its, levels=c("ITS1","ITS2","D1/D2","RPB2"))) %>% mutate(SAMPLEID = paste0(Ecosystem,"-",data))

p <- ggplot(a, aes(x=SAMPLEID, y=Divergence)) + 
  geom_boxplot() + facet_grid(Ecosystem + its ~ data,scales="free_x", margin=TRUE)
p

ggplot(a, aes(x=SampleID, color=data, y=Divergence)) +
    geom_boxplot() +
  stat_summary(fun = "median", geom = "point", aes(color = data)) + theme(plot.title = element_text(hjust = 0.5), plot.subtitle = element_text(hjust = 0.5), legend.position = "right")+ facet_grid(~Ecosystem+its, scales = "free_x") + theme(axis.title.x=element_blank(),axis.text.x=element_blank(),axis.ticks.x=element_blank())
create_frogs_object <- function(path, marker, type, file, multifile){
  biom <- file.path(path,marker,type,file)
  physeq <- import_frogs(biom, taxMethod = "blast")
  metadata_file <- file.path(path,marker,type,"metadata.tsv")
  metadata <- read.table(metadata_file, row.names=1, header=TRUE, sep="\t", stringsAsFactors = FALSE)

  sample_data(physeq) <- metadata
  multiaff_file <- file.path(path,marker,type,multifile)
  physeq <- update_frogs_physeq_with_multi(physeq, multiaff_file)
  physeq  = transform_sample_counts(physeq, function(x) x / sum(x) )
  sample_names(physeq) <- glue::glue(paste("{sample_names(physeq)}","frogs",marker, sep="_"))
  sample_data(physeq)$method <- "DADA2_FROGS"
  sample_data(physeq)$Type <- type
  sample_data(physeq)$Marker <- marker
  physeq_aglom <- phyloseq.extended::fast_tax_glom(physeq, taxrank = "Species")
  #physeq_aglom <- prune_taxa(tax_table(physeq_aglom)[,7] %in% references, physeq_aglom)
  taxa_names(physeq_aglom) <- cbind(tax_table(physeq_aglom)[,7])
  return(physeq_aglom)
}

update_frogs_physeq_with_multi <- function(physeq, multiaff_file){

  infos <- readr::read_tsv(file = multiaff_file) %>%
    as_tibble() %>%
    distinct(`#observation_name`, .keep_all = T) %>%
    select(`#observation_name`, blast_taxonomy) %>%
    separate(blast_taxonomy, ";(?=[\\S])", into = c("Kingdom","Phylum","Class","Order","Family","Genus","Species")) %>%
    #mutate(across(Kingdom:Species, ~ stringr::str_remove(.x, ".:"))) %>%
    tibble::column_to_rownames(var = "#observation_name") %>%
    as("matrix")

  good <- tax_table(subset_taxa(physeq, Species != "Multi-affiliation" & Genus != "Multi-affiliation" & Family != "Multi-affiliation" & Order != "Multi-affiliation" & Class != "Multi-affiliation" & Phylum != "Multi-affiliation" & Kingdom != "Multi-affiliation" )) %>% as("matrix")
  #na <- tax_table(subset_taxa(physeq, is.na(Species))) %>%  as("matrix")
  all_infos <- unique(rbind(infos,good))#,na) # case where two species are identical but are multi-affiliated
  tax_table(physeq) <- all_infos

  return(physeq)
}


keep_only_interest_species <- function(physeq, true_species){
   tax_tab <- tax_table(physeq) %>% as("matrix")
   tax_tab <- cbind(tax_tab[ , 1:7],
                    Status = ifelse(tax_tab[ , "Species"] %in% true_species, "true species", "artefact"))
   tax_table(physeq) <- tax_tab
   return(physeq)
}


path <- "MEAT/ITS1/ADN"
metadata_file <- file.path(path,"metadata.tsv")
path <- "MEAT"
metadata <- read.table(metadata_file, row.names=1, header=TRUE, sep="\t", stringsAsFactors = FALSE)
real_biom <- file.path("MEAT/ITS1/ADN/real.biom")
real <- create_phyloseq_object(real_biom, "real", metadata, "blast")
taxa_names(real) <- cbind(tax_table(real)[,7])
true_species <- as.character(tax_table(real)[ , "Species"])
tax_tab <- tax_table(real) %>% as("matrix")
tax_tab <- cbind(tax_tab[ , 1:7],
                 Status = ifelse(tax_tab[ , "Species"] %in% true_species, "true species", "artefact"))
tax_table(real) <- tax_tab




biomfile <- "dada2_frogs.biom"
multiafffile <- "dada2_frogs_multiaff.tsv"
frogs_its1_adn <- create_frogs_object(path,"ITS1", "ADN", biomfile, multiafffile)
frogs_its1_adn <- keep_only_interest_species(frogs_its1_adn, true_species)
frogs_its1_pcr <- create_frogs_object(path,"ITS1", "PCR", biomfile, multiafffile)
frogs_its1_pcr <- keep_only_interest_species(frogs_its1_pcr, true_species)
frogs_its2_adn <- create_frogs_object(path,"ITS2", "ADN", biomfile, multiafffile)
frogs_its2_adn <- keep_only_interest_species(frogs_its2_adn, true_species)
frogs_its2_pcr <- create_frogs_object(path,"ITS2", "PCR", biomfile, multiafffile)
frogs_its2_pcr <- keep_only_interest_species(frogs_its2_pcr, true_species)
frogs_d1d2_adn <- create_frogs_object(path,"D1D2", "ADN", biomfile, multiafffile)
frogs_d1d2_adn <- keep_only_interest_species(frogs_d1d2_adn, true_species)
frogs_d1d2_pcr <- create_frogs_object(path,"D1D2", "PCR", biomfile, multiafffile)
frogs_d1d2_pcr <- keep_only_interest_species(frogs_d1d2_pcr, true_species)
frogs_rpb2_adn <- create_frogs_object(path,"RPB2", "ADN", biomfile, multiafffile)
frogs_rpb2_adn <- keep_only_interest_species(frogs_rpb2_adn, true_species)
frogs_rpb2_pcr <- create_frogs_object(path,"RPB2", "PCR", biomfile, multiafffile)
frogs_rpb2_pcr <- keep_only_interest_species(frogs_rpb2_pcr, true_species)

sample_data(frogs_its1_adn)$Type <- "DNA"
sample_data(frogs_its2_adn)$Type <- "DNA"
sample_data(frogs_d1d2_adn)$Type <- "DNA"
sample_data(frogs_rpb2_adn)$Type <- "DNA"

all <- merge_phyloseq(real, frogs_its1_adn, frogs_its1_pcr, frogs_its2_adn, frogs_its2_pcr, frogs_d1d2_adn, frogs_d1d2_pcr, frogs_rpb2_adn, frogs_rpb2_pcr)
sample_data(all)$Type <- factor(sample_data(all)$Type, levels=c("DNA","PCR"))
sample_data(all)$Marker <- factor(sample_data(all)$Marker, levels=c("ITS1","ITS2","D1D2","RPB2"))
Method <- get_variable(all, "method")


p <- plot_heatmap(
  all,
  "NMDS", "bray", "Type", taxa.label = "Species",
  low="#66CCFF", high="#000033", na.value="white", title = paste0("Relative abundances with ",tool), sample.order = row.names(sample_data(all)), taxa.order = sort(true_species, decreasing=TRUE))
p <- p + facet_grid(~Marker, scales = "free") + guides(fill=guide_legend(title="White: 0"))

cat("\n")
print(p)
cat("\n")