All data used to produce this document are available on this repository: https://forgemia.inra.fr/migale/metabarfood/. The code used is available in the code chunks.

Introduction

This report gives results of the benchmark performed for the METABARFOOD project.

Questions
  • Which tool performs better with mock communities?
  • Which species are missing?
  • Are species sequences perfectly reconstructed?

Bioinformatics

We used FROGS (Bernard et al., 2021, https://doi.org/10.1093/bib/bbab318), USEARCH (Edgar, 2010, https://doi.org/10.1093/bioinformatics/btq461), QIIME2 (Bolyen et al., 2019, https://doi.org/10.1038/s41587-019-0209-9) and DADA2 (Callahan et al., 2016, https://doi.org/10.1038/nmeth.3869) following their own guidelines, and a custom combination of DADA2 and FROGS that we call DADA2_FROGS.

For the taxonomic affiliation of the OTUs/ASVs/ZOTUs built by each tool, we used a databank constituted of the mock communities sequences. No additional sequence was added to avoid bringing noise that we did not want to assess in this analysis.

For each tool, command lines are available below.

DADA2_FROGS

  • dada2_frogs.sh
#!/bin/bash

export PYTHONPATH=/save/frogs/src/FROGS_dev/lib:$PYTHONPATH
export PATH=/save/frogs/src/FROGS_dev/libexec/:$PATH
export PATH=/save/frogs/src/FROGS_dev/app/:$PATH

SAVE_DIR="/work/frogsfungi/FungiPubli/Real_mock/METABARFOOD/MEAT/results/DADA2_FROGS"
INPUT_DIR="/work/frogsfungi/FungiPubli/Real_mock/METABARFOOD/MEAT/"
SCRIPT_DIR="/work/frogsfungi/FungiPubli/Real_mock/METABARFOOD/MEAT/scripts/"
REFERENCE_DIR="/work/frogsfungi/FungiPubli/Real_mock/METABARFOOD/MEAT/REFERENCES/"
TYPE="ADN PCR"
ITS="ITS1 ITS2"
LONG="D1D2 RPB2"
CPU=8

declare -A FIVE_PRIM_PRIMERS=([ITS1]=CTTGGTCATTTAGAGGAAGTAA [ITS2]=GCATCGATGAAGAACGCAGC [D1D2]=GCATATCAATAAGCGGAGGAAAAG [RPB2]=TGGGGYATGGTNTGYCCYGC)
declare -A THREE_PRIM_PRIMERS=([ITS1]=GCATCGATGAAGAACGCAGC [ITS2]=GCAWAWCAAWAAGCGGAGGA [D1D2]=CCGTCTTGAAACACGGACC [RPB2]=CBTTCCCYGAYCAYAAYCARTC)


for its in $ITS
do
    for type in $TYPE
    do
        mkdir -p "$SAVE_DIR/$its/$type/"
        cd "$SAVE_DIR/$its/$type/"
        mkdir -p LOG
        # launch DADA2
        FIVE_PRIM=${FIVE_PRIM_PRIMERS[$its]}
        THREE_PRIM=${THREE_PRIM_PRIMERS[$its]}
        echo "module load bioinfo/cutadapt-2.10 system/R-3.6.1 libraries/proj-4.9.3 libraries/gdal-2.3.0 system/Python-3.7.4 && $SCRIPT_DIR/dada2_frogs.R --five-prim-primer $FIVE_PRIM --three-prim-primer $THREE_PRIM --input $INPUT_DIR/$its/$type/ --output $SAVE_DIR/$its/$type/ --reference $REFERENCE_DIR/MEAT_${its}_GGtax_dada2.fasta --cpus $CPU" > "dada2_frogs-$its-$type.sh"
        echo "remove_chimera.py --input-fasta dada2_frogs_temp.fasta --input-biom dada2_frogs_temp.biom --nb-cpus $CPU --out-abundance remove_chimera.biom --non-chimera remove_chimera.fasta --log-file remove_chimera.log --summary remove_chimera.html" >> "dada2_frogs-$its-$type.sh"
        echo "otu_filters.py --min-abundance 0.00005 --nb-cpus $CPU --input-fasta remove_chimera.fasta --input-biom remove_chimera.biom --output-biom filters.biom --output-fasta filters.fasta --log-file filters.log --summary filters.html" >> "dada2_frogs-$its-$type.sh"
        echo "itsx.py --nb-cpus $CPU --region $its --check-its-only --input-fasta filters.fasta --input-biom filters.biom --out-fasta itsx.fasta --out-abundance itsx.biom --summary itsx.html --log-file itsx.log " >> "dada2_frogs-$its-$type.sh"
        echo "affiliation_OTU.py --nb-cpus $CPU --reference $REFERENCE_DIR/MEAT_${its}_GGtax.fasta --input-biom itsx.biom --input-fasta itsx.fasta --output-biom affiliation.biom --summary affiliation.html --log-file affiliation.log " >> "dada2_frogs-$its-$type.sh"
        echo "affiliation_filters.py --output-biom frogs.biom --output-fasta frogs.fasta --summary affiliation_filters.html --log-file affiliation_filters.log --min-blast-coverage 0.9 --min-blast-identity 0.9 --delete --input-biom affiliation.biom --input-fasta itsx.fasta " >> "dada2_frogs-$its-$type.sh"
        echo "biom_to_tsv.py --input-biom frogs.biom --input-fasta frogs.fasta --output-tsv frogs.tsv --output-multi-affi multiaff.tsv --log-file biom_to_tsv.log " >> "dada2_frogs-$its-$type.sh"
        
        sbatch --mem=32G -J dada2_${its}_${type} --cpus-per-task=$CPU -o LOG/%x.out -e LOG/%x.err --wrap="sh dada2_frogs-$its-$type.sh"
        cd -
    done
done 

for long in $LONG
do
    for type in $TYPE
    do
        mkdir -p "$SAVE_DIR/$long/$type/"
        cd "$SAVE_DIR/$long/$type/"
        mkdir -p LOG
        # launch DADA2
        FIVE_PRIM=${FIVE_PRIM_PRIMERS[$long]}
        THREE_PRIM=${THREE_PRIM_PRIMERS[$long]}
        echo "module load bioinfo/cutadapt-2.10 system/R-3.6.1 libraries/proj-4.9.3 libraries/gdal-2.3.0 system/Python-3.7.4 && $SCRIPT_DIR/dada2_frogs.R --five-prim-primer $FIVE_PRIM --three-prim-primer $THREE_PRIM --input $INPUT_DIR/$long/$type/ --output $SAVE_DIR/$long/$type/ --reference $REFERENCE_DIR/MEAT_${long}_GGtax_dada2.fasta --cpus $CPU" > "dada2_frogs-$long-$type.sh"
        echo "remove_chimera.py --input-fasta dada2_frogs_temp.fasta --input-biom dada2_frogs_temp.biom --nb-cpus $CPU --out-abundance remove_chimera.biom --non-chimera remove_chimera.fasta --log-file remove_chimera.log --summary remove_chimera.html" >> "dada2_frogs-$long-$type.sh"
        echo "otu_filters.py --min-abundance 0.00005 --nb-cpus $CPU --input-fasta remove_chimera.fasta --input-biom remove_chimera.biom --output-biom filters.biom --output-fasta filters.fasta --log-file filters.log --summary filters.html" >> "dada2_frogs-$long-$type.sh"
        echo "affiliation_OTU.py --nb-cpus $CPU --reference $REFERENCE_DIR/MEAT_${long}_GGtax.fasta --input-biom filters.biom --input-fasta filters.fasta --output-biom affiliation.biom --summary affiliation.html --log-file affiliation.log " >> "dada2_frogs-$long-$type.sh"
        echo "affiliation_filters.py --output-biom frogs.biom --output-fasta frogs.fasta --summary affiliation_filters.html --log-file affiliation_filters.log --min-blast-coverage 0.9 --min-blast-identity 0.9 --delete --input-biom affiliation.biom --input-fasta filters.fasta " >> "dada2_frogs-$long-$type.sh"
        echo "biom_to_tsv.py --input-biom frogs.biom --input-fasta frogs.fasta --output-tsv frogs.tsv --output-multi-affi multiaff.tsv --log-file biom_to_tsv.log " >> "dada2_frogs-$long-$type.sh"
        
        sbatch --mem=32G -J dada2_${long}_${type} --cpus-per-task=$CPU -o LOG/%x.out -e LOG/%x.err --wrap="sh dada2_frogs-$long-$type.sh"
        cd -
    done
done 
  • dada2_frogs.R
#!/usr/bin/env Rscript

library(Matrix, lib.loc="~/R/x86_64-pc-linux-gnu-library/3.6")
library(argparse)
library(ShortRead)
library(Biostrings)
library(tidyverse)
library(phyloseq)
library(dada2)
library(ShortRead)
library(Biostrings)
library(biomformat)

## Get arguments from bash script
args <- commandArgs(TRUE)
parser <- ArgumentParser()

parser$add_argument("-i", "--input", type="character", default=NULL,
                    help="dataset file name", metavar="character")
parser$add_argument("-o", "--output", type="character", default="out.txt",
                    help="output file name", metavar="character")
parser$add_argument("--reference", type="character", default=NULL,
                    help="reference database", metavar="character")
parser$add_argument("--cpus", type="integer", default=1,
                    help="number of cpus", metavar="N")      
parser$add_argument("--five-prim-primer", type="character", default=NULL,
                    help="forward primer sequence")
parser$add_argument("--three-prim-primer", type="character", default=NULL,
                    help="reverse primer sequence")

args <- parser$parse_args()

FWD <- args$five_prim_primer  ## CHANGE ME to your forward primer sequence
REV <- args$three_prim_primer  ## CHANGE ME...

input_dir <- args$input
output_dir <- args$output

## Input files
fnFs <- sort(list.files(input_dir, pattern = "_R1.fastq.gz", full.names = TRUE))
fnRs <- sort(list.files(input_dir, pattern = "_R2.fastq.gz", full.names = TRUE))

# Cutadapt
allOrients <- function(primer) {
  # Create all orientations of the input sequence
  require(Biostrings)
  dna <- DNAString(primer)  # The Biostrings works w/ DNAString objects rather than character vectors
  orients <- c(Forward = dna, Complement = complement(dna), Reverse = reverse(dna), 
               RevComp = reverseComplement(dna))
  return(sapply(orients, toString))  # Convert back to character vector
}
FWD.orients <- allOrients(FWD)
REV.orients <- allOrients(REV)

primerHits <- function(primer, fn) {
  # Counts number of reads in which the primer is found
  nhits <- vcountPattern(primer, sread(readFastq(fn)), fixed = FALSE)
  return(sum(nhits > 0))
}
rbind(FWD.ForwardReads = sapply(FWD.orients, primerHits, fn = fnFs[[1]]), 
      FWD.ReverseReads = sapply(FWD.orients, primerHits, fn = fnRs[[1]]), 
      REV.ForwardReads = sapply(REV.orients, primerHits, fn = fnFs[[1]]), 
      REV.ReverseReads = sapply(REV.orients, primerHits, fn = fnRs[[1]]))

cutadapt <- "/usr/local/bioinfo/src/cutadapt/cutadapt-2.10/cutadapt-2.10_venv/bin/cutadapt"
path.cut <- file.path(output_dir, "cutadapt")
if(!dir.exists(path.cut)) dir.create(path.cut)
fnFs.cut <- file.path(path.cut, basename(fnFs))
fnRs.cut <- file.path(path.cut, basename(fnRs))

FWD.RC <- dada2:::rc(FWD)
REV.RC <- dada2:::rc(REV)
# Trim FWD and the reverse-complement of REV off of R1 (forward reads)
R1.flags <- paste("-g", FWD, "-a", REV) 
# Trim REV and the reverse-complement of FWD off of R2 (reverse reads)
R2.flags <- paste("-G", REV.RC, "-A", FWD.RC) 
# Run Cutadapt
for(i in seq_along(fnFs)) {
  system2(cutadapt, args = c(R1.flags, R2.flags, "-n", 2, # -n 2 required to remove FWD and REV from reads
                             "-o", fnFs.cut[i], "-p", fnRs.cut[i], # output files
                             fnFs[i], fnRs[i])) # input files
}

rbind(FWD.ForwardReads = sapply(FWD.orients, primerHits, fn = fnFs.cut[[1]]), 
      FWD.ReverseReads = sapply(FWD.orients, primerHits, fn = fnRs.cut[[1]]), 
      REV.ForwardReads = sapply(REV.orients, primerHits, fn = fnFs.cut[[1]]), 
      REV.ReverseReads = sapply(REV.orients, primerHits, fn = fnRs.cut[[1]]))

# Remove sequences with N
path.N <- file.path(output_dir, "filtered")
filtFs <- file.path(output_dir, "filtered", basename(fnFs)) # Put N-filterd files in filtN/ subdirectory
filtRs <- file.path(output_dir, "filtered", basename(fnRs))
out <- filterAndTrim(fnFs.cut, filtFs, fnRs.cut, filtRs, maxN = 0, maxEE = c(2, 2), 
                     truncQ = 2, minLen = 50, rm.phix = TRUE, compress = TRUE, multithread = 16)

get.sample.name <- function(fname) paste(strsplit(basename(fname), "_R")[[1]][1],collapse="_")
sample.names <- unname(sapply(fnFs, get.sample.name))

# Dada
dadaFs <- dada(filtFs, selfConsist=TRUE, pool=TRUE, multithread = args$cpus)
dadaRs <- dada(filtRs, selfConsist=TRUE, pool=TRUE, multithread = args$cpus)

names(dadaFs) <- sample.names
names(dadaRs) <- sample.names

# Deal with pairs
mergers <- mergePairs(dadaFs, filtFs, dadaRs, filtRs, returnRejects = TRUE, trimOverhang = TRUE, verbose = TRUE, maxMismatch = 0)
concats <- mergePairs(dadaFs, filtFs, dadaRs, filtRs, justConcatenate=TRUE,verbose=TRUE)
repair_merger <- function(merger, concat) {
  merger[!merger$accept, ] <- concat[!merger$accept, ]
  merger$sequence <- unlist(merger$sequence)
  merger
}
repaired_mergers <- purrr::map2(mergers, concats, repair_merger)
seqtab <- makeSequenceTable(repaired_mergers)


colnames(seqtab) <- gsub("NNNNNNNNNN", "NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN", colnames(seqtab))
asv_seqs <- colnames(seqtab)
asv_headers <- paste(">",colnames(seqtab),sep="")
asv_headers <- if_else(grepl("NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN", asv_headers), paste0(">",asv_seqs, "_FROGS_combined"), asv_headers)
asv_fasta <- c(rbind(asv_headers, asv_seqs))
write(asv_fasta, file.path(output_dir,"dada2_frogs_temp.fasta"))

asv_tab <- t(seqtab)
colnames(seqtab) <- if_else(grepl("NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN", colnames(seqtab)), paste0(colnames(seqtab), "_FROGS_combined"), colnames(seqtab))


row.names(asv_tab) <- sub(">", "", asv_headers)
write.table(asv_tab, file.path(output_dir,"ASVs_counts.txt"), sep="\t", quote=F)

st.biom <- make_biom(t(seqtab))
write_biom(st.biom, file.path(output_dir,"dada2_frogs_temp.biom"))

FROGS

  • FROGS.sh
#!/bin/bash

export PYTHONPATH=/save/frogs/src/FROGS_dev/lib:$PYTHONPATH
export PATH=/save/frogs/src/FROGS_dev/libexec/:$PATH
export PATH=/save/frogs/src/FROGS_dev/app/:$PATH

SAVE_DIR="/work/frogsfungi/FungiPubli/Real_mock/METABARFOOD/MEAT/results/FROGS"
INPUT_DIR="/work/frogsfungi/FungiPubli/Real_mock/METABARFOOD/MEAT/"
REFERENCE_DIR="/work/frogsfungi/FungiPubli/Real_mock/METABARFOOD/MEAT/REFERENCES/"
TYPE="ADN PCR"
ITS="ITS1 ITS2"
LONG="D1D2 RPB2"
CPU=8

declare -A FIVE_PRIM_PRIMERS=([ITS1]=CTTGGTCATTTAGAGGAAGTAA [ITS2]=GCATCGATGAAGAACGCAGC [D1D2]=GCATATCAATAAGCGGAGGAAAAG [RPB2]=TGGGGYATGGTNTGYCCYGC)
declare -A THREE_PRIM_PRIMERS=([ITS1]=GCATCGATGAAGAACGCAGC [ITS2]=GCAWAWCAAWAAGCGGAGGA [D1D2]=CCGTCTTGAAACACGGACC [RPB2]=CBTTCCCYGAYCAYAAYCARTC)

for its in $ITS
do
    FIVE_PRIM=${FIVE_PRIM_PRIMERS[$its]}
    THREE_PRIM=${THREE_PRIM_PRIMERS[$its]}
    for type in $TYPE
    do
        # Create output directory for each dataset
        ARCHIVE_FILE="$INPUT_DIR/$its/$type/Archive.tar.gz"
        mkdir -p "$SAVE_DIR/$its/$type/"
        cd "$SAVE_DIR/$its/$type/"
        mkdir -p LOG
        # Preprocess
        echo "preprocess.py illumina --input-archive $ARCHIVE_FILE --keep-unmerged --merge-software pear --min-amplicon-size 50 --max-amplicon-size 490 --five-prim-primer $FIVE_PRIM --three-prim-primer $THREE_PRIM --R1-size 250 --R2-size 250 --nb-cpus $CPU --output-dereplicated preprocess.fasta --output-count preprocess.tsv --summary preprocess.html --log-file preprocess.log" >> "frogs-$its-$type.sh"
        # Clustering
        echo "clustering.py --input-fasta preprocess.fasta --input-count preprocess.tsv --fastidious --distance 1 --nb-cpus $CPU --output-biom clustering.biom --output-fasta clustering.fasta --log-file clustering.log" >> "frogs-$its-$type.sh"
        echo "remove_chimera.py --input-fasta clustering.fasta --input-biom clustering.biom --nb-cpus $CPU --out-abundance remove_chimera.biom --non-chimera remove_chimera.fasta --log-file remove_chimera.log --summary remove_chimera.html" >> "frogs-$its-$type.sh"
        echo "otu_filters.py --min-abundance 0.00005 --nb-cpus $CPU --input-fasta remove_chimera.fasta --input-biom remove_chimera.biom --output-biom filters.biom --output-fasta filters.fasta --log-file filters.log --summary filters.html" >> "frogs-$its-$type.sh"
        echo "itsx.py --nb-cpus $CPU --region $its --check-its-only --input-fasta filters.fasta --input-biom filters.biom --out-fasta itsx.fasta --out-abundance itsx.biom --summary itsx.html --log-file itsx.log " >> "frogs-$its-$type.sh"
        echo "affiliation_OTU.py --nb-cpus $CPU --reference $REFERENCE_DIR/MEAT_${its}_GGtax.fasta --input-biom itsx.biom --input-fasta itsx.fasta --output-biom affiliation.biom --summary affiliation.html --log-file affiliation.log " >> "frogs-$its-$type.sh"
        echo "affiliation_filters.py --output-biom frogs.biom --output-fasta frogs.fasta --summary affiliation_filters.html --log-file affiliation_filters.log --min-blast-coverage 0.9 --min-blast-identity 0.9 --delete --input-biom affiliation.biom --input-fasta itsx.fasta " >> "frogs-$its-$type.sh"
        echo "biom_to_tsv.py --input-biom frogs.biom --input-fasta frogs.fasta --output-tsv frogs.tsv --output-multi-affi multiaff.tsv --log-file biom_to_tsv.log " >> "frogs-$its-$type.sh"
        # launch FROGS
        sbatch -J frogs_${its}_${type} --cpus-per-task=$CPU -o LOG/%x.out -e LOG/%x.err --wrap="sh frogs-$its-$type.sh"
        cd -
    done
done 

for long in $LONG
do
    FIVE_PRIM=${FIVE_PRIM_PRIMERS[$long]}
    THREE_PRIM=${THREE_PRIM_PRIMERS[$long]}
    for type in $TYPE
    do
        # Create output directory for each dataset
        ARCHIVE_FILE="$INPUT_DIR/$long/$type/Archive.tar.gz"
        mkdir -p "$SAVE_DIR/$long/$type/"
        cd "$SAVE_DIR/$long/$type/"
        mkdir -p LOG
        # Preprocess
        echo "preprocess.py illumina --input-archive $ARCHIVE_FILE --keep-unmerged --merge-software pear --min-amplicon-size 500 --max-amplicon-size 1000 --five-prim-primer $FIVE_PRIM --three-prim-primer $THREE_PRIM --R1-size 250 --R2-size 250 --nb-cpus $CPU --output-dereplicated preprocess.fasta --output-count preprocess.tsv --summary preprocess.html --log-file preprocess.log" >> "frogs-$long-$type.sh"
        # Clustering
        echo "clustering.py --input-fasta preprocess.fasta --input-count preprocess.tsv --fastidious --distance 1 --nb-cpus $CPU --output-biom clustering.biom --output-fasta clustering.fasta --log-file clustering.log" >> "frogs-$long-$type.sh"
        echo "remove_chimera.py --input-fasta clustering.fasta --input-biom clustering.biom --nb-cpus $CPU --out-abundance remove_chimera.biom --non-chimera remove_chimera.fasta --log-file remove_chimera.log --summary remove_chimera.html" >> "frogs-$long-$type.sh"
        echo "otu_filters.py --min-abundance 0.00005 --nb-cpus $CPU --input-fasta remove_chimera.fasta --input-biom remove_chimera.biom --output-biom filters.biom --output-fasta filters.fasta --log-file filters.log --summary filters.html" >> "frogs-$long-$type.sh"
        echo "affiliation_OTU.py --nb-cpus $CPU --reference $REFERENCE_DIR/MEAT_${long}_GGtax.fasta --input-biom filters.biom --input-fasta filters.fasta --output-biom affiliation.biom --summary affiliation.html --log-file affiliation.log " >> "frogs-$long-$type.sh"
        echo "affiliation_filters.py --output-biom frogs.biom --output-fasta frogs.fasta --summary affiliation_filters.html --log-file affiliation_filters.log --min-blast-identity 0.9 --delete --input-biom affiliation.biom --input-fasta filters.fasta " >> "frogs-$long-$type.sh"
        echo "biom_to_tsv.py --input-biom frogs.biom --input-fasta frogs.fasta --output-tsv frogs.tsv --output-multi-affi multiaff.tsv --log-file biom_to_tsv.log " >> "frogs-$long-$type.sh"
        # launch FROGS
        sbatch -J frogs_${long}_${type} --cpus-per-task=$CPU -o LOG/%x.out -e LOG/%x.err --wrap="sh frogs-$long-$type.sh"
        cd -
    done
done 

USEARCH

  • usearch.sh
#!/bin/sh

READS_DIR=$1
REF=$2
OUT_DIR=$3
CPU=$4
FW_PRIMER=$5
RV_PRIMER=$6

mkdir -p $OUT_DIR/usearch/log

export PATH=/save/frogs/src/FROGS_dev/app:$PATH

module load bioinfo/usearch11.0.667
module load system/Python-3.7.4

echo -e ">fw_prim\n"$FW_PRIMER > $OUT_DIR/usearch/single_fw_prim.fa
echo -e ">fw_prim\n"$FW_PRIMER"\n>rv_prim\n"$RV_PRIMER > $OUT_DIR/usearch/paired_prim.fa

###############################################################################################
# Merge all R1/R2 file pairs
# Add sample name to read labels (-relabel @ option)
# Pool samples together into raw.fq
# keep R1 unmerged reads

mkdir $OUT_DIR/data

for R1 in $READS_DIR/*_R1.fastq.gz
do
R2=`echo $R1 |sed 's/_R1.fastq.gz/_R2.fastq.gz/'`
sample=`basename $R1 |sed 's/_R1.fastq.gz//'`

gzip -dc $R1 > $OUT_DIR/data/`basename $R1`
gzip -dc $R2 > $OUT_DIR/data/`basename $R2`

R1=$OUT_DIR/data/`basename $R1`
R2=$OUT_DIR/data/`basename $R2`

usearch -fastq_mergepairs $R1 -reverse $R2  \
    -sample $sample -threads $CPU \
    -fastqout $OUT_DIR/usearch/${sample}_raw_paired.fq -fastqout_notmerged_fwd $OUT_DIR/usearch/${sample}_raw_unpaired_fw.fq 2>&1 | sed -e 's/\r/\n/g' > $OUT_DIR/usearch/log/${sample}_fastq_mergepairs.log

awk -v S=$sample '{c++; if(c==1){print $0";sample="S";"}else{print $0; if(c==4){c=0}}}' $OUT_DIR/usearch/${sample}_raw_unpaired_fw.fq > $OUT_DIR/usearch/${sample}_raw_unpaired_fw_rename.fq

#rm $OUT_DIR/usearch/${sample}_raw_unpaired_fw.fq

usearch -search_oligodb $OUT_DIR/usearch/${sample}_raw_paired.fq -threads $CPU -db $OUT_DIR/usearch/paired_prim.fa -strand plus -userout $OUT_DIR/usearch/${sample}_paired_primer.tsv -userfields query+target+qlo+qhi 2>&1 | sed -e 's/\r/\n/g' > $OUT_DIR/usearch/log/${sample}_paired_search_oligodb.log

/home/mbernard/save/tools/bin/bioawk -c fastx -v T=$OUT_DIR/usearch/${sample}_paired_primer.tsv 'BEGIN{while(getline<T>0){if($2=="fw_prim"){FW[$1]=$4}else{RV[$1]=$3}}}{
    start=1
    l=length($seq)
    if(FW[$name] != ""){
        start=FW[$name]+1
    }
    if(RV[$name] != ""){
        l = RV[$name] - 1
    }
    l = l - start + 1
    print "@"$name"\n"substr($seq,start,l)"\n+\n"substr($qual,start,l)
}' $OUT_DIR/usearch/${sample}_raw_paired.fq > $OUT_DIR/usearch/${sample}_raw_paired_truncated.fq

usearch -search_oligodb $OUT_DIR/usearch/${sample}_raw_unpaired_fw_rename.fq -threads $CPU -db $OUT_DIR/usearch/single_fw_prim.fa -strand plus -userout $OUT_DIR/usearch/${sample}_single_fw_primer.tsv -userfields query+target+qlo+qhi 2>&1 | sed -e 's/\r/\n/g' > $OUT_DIR/usearch/log/${sample}_single_fw_search_oligodb.log

/home/mbernard/save/tools/bin/bioawk -c fastx -v T=$OUT_DIR/usearch/${sample}_single_fw_primer.tsv 'BEGIN{while(getline<T>0){FW[$1]=$4}}{
    if(FW[$name] != ""){
        start=FW[$name]+1
        l = length($seq)-start+1
        print "@"$name"\n"substr($seq,start,l)"\n+\n"substr($qual,start,l)
    }else{
        print "@"$name"\n"$seq"\n+\n"$qual
    }
}' $OUT_DIR/usearch/${sample}_raw_unpaired_fw_rename.fq > $OUT_DIR/usearch/${sample}_raw_unpaired_fw_rename_truncated.fq

done

#~ rm -r $OUT_DIR/data

# concat R1R2 merged and R1 unmerged reads
cat $OUT_DIR/usearch/*raw_paired_truncated.fq $OUT_DIR/usearch/*_raw_unpaired_fw_rename_truncated.fq > $OUT_DIR/usearch/amplicon.fq
#rm $OUT_DIR/usearch/*raw_paired.fq $OUT_DIR/usearch/*raw_unpaired_fw_rename.fq

###############################################################################################
# Amplicon sequence quality filter
#       Discard reads with > E expected errors per base: https://www.drive5.com/usearch/manual/exp_errs.html
#       For example, with the fastq_maxee_rate option set to 0.01, then 
#       a read of length 100 will be discarded if the expected errors is >1, 
#       and a read of length 1,000 will be discarded if the expected errors is >10.

usearch -fastq_filter $OUT_DIR/usearch/amplicon.fq \
    -fastq_maxee 1.0 -relabel Filt -threads $CPU \
    -fastaout $OUT_DIR/usearch/amplicon_filtered.fa 2>&1 | sed -e 's/\r/\n/g' > $OUT_DIR/usearch/log/fastq_filter.log

###############################################################################################
# Find unique read sequences, store abundances, and sort by abundance
usearch -fastx_uniques $OUT_DIR/usearch/amplicon_filtered.fa \
    -sizeout -relabel Uniq -threads $CPU \
    -fastaout $OUT_DIR/usearch/amplicon_uniques.fa 2>&1 | sed -e 's/\r/\n/g' > $OUT_DIR/usearch/log/fastx_uniques.log

###############################################################################################
# Run UNOISE algorithm to get denoised sequences (ZOTUs)
# The -minsize option specifies the minimum abundance (size= annotation). Default is 8. 
# Input sequences with lower abundances are discarded.

usearch -unoise3 $OUT_DIR/usearch/amplicon_uniques.fa \
    -zotus $OUT_DIR/usearch/zotu.fa 2>&1 | sed -e 's/\r/\n/g' > $OUT_DIR/usearch/log/unoise3.log

# make ZOTU table
usearch -otutab $OUT_DIR/usearch/amplicon.fq \
    -threads $CPU \
    -otus $OUT_DIR/usearch/zotu.fa -otutabout $OUT_DIR/usearch/zotus.txt 2>&1 | sed -e 's/\r/\n/g' > $OUT_DIR/usearch/log/otutab.log


awk -F "\t" 'BEGIN{OFS="\t"}{if($1=="#OTU ID"){$1="observation_name\tobservation_sum"}else{s=0; for (i=2;i<= NF; i++){s+=$i} ; $1=$1"\t"s} print $0}' $OUT_DIR/usearch/zotus.txt > $OUT_DIR/usearch/zotus_reformat.txt
# keep Totu sequence that are present in zotu table
/home/mbernard/save/tools/bin/bioawk -c fastx -v Z=$OUT_DIR/usearch/zotus_reformat.txt 'BEGIN{while(getline<Z>0){if($1!="observation_name"){tab[$1]=1}}}{if(tab[$name]==1){print ">"$name"\n"$seq}}' $OUT_DIR/usearch/zotu.fa > $OUT_DIR/usearch.fa

tsv_to_biom.py -t $OUT_DIR/usearch/zotus_reformat.txt -b $OUT_DIR/usearch/zotus.biom

###############################################################################################
# perform affiliation using Sintax
# If the -sintax_cutoff option is given then predictions are written a second time after applying the confidence 
# threshold, keeping only ranks with high enough confidence. On V4 reads, using a cutoff of 0.8 gives predictions 
# with similar accuracy to RDP at 80% bootstrap cutoff.

usearch -sintax $OUT_DIR/usearch.fa \
    -db $REF -strand both -threads $CPU \
    -tabbedout $OUT_DIR/usearch/zotus_tax.txt -sintax_cutoff 0.8 2>&1 | sed -e 's/\r/\n/g' > $OUT_DIR/usearch/log/sintax.log

cat $OUT_DIR/usearch/zotus_tax.txt | awk '{c++; if(c==1){print "#observation_name\tboostrapped_taxonomy\tstrand\ttaxonomy"}; gsub(",",";",$0) ; print $0}' > $OUT_DIR/usearch/zotus_tax_reformat.txt
# ==> supprimer le cutoff de bootstrap ?

# create final biom
/usr/local/bioinfo/src/Miniconda/Miniconda3-4.4.10/envs/qiime2-2019.4/bin/biom add-metadata -i $OUT_DIR/usearch/zotus.biom \
  --observation-metadata-fp  $OUT_DIR/usearch/zotus_tax_reformat.txt \
  -o $OUT_DIR/usearch.biom --output-as-json \
  --sc-separated boostrapped_taxonomy,taxonomy 

# create abundance table in TSV format
biom_to_tsv.py -b $OUT_DIR/usearch.biom -f $OUT_DIR/usearch.fa -t $OUT_DIR/usearch.tsv

QIIME-PE

  • qiime-pe.sh
#!/usr/bin/sh

# FROGS/app directory need to be in the PATH

module load system/Miniconda3-4.4.10
module load bioinfo/qiime2-2019.4

MANIFEST=$1


OUT_DIR=`dirname $MANIFEST`
WORK_DIR=$OUT_DIR/qiime_workdir
mkdir -p $WORK_DIR

DB_PREFIX=$2
DB_REF_SEQ=$DB_PREFIX.seq.qza
DB_REF_CLASSIFIER=$DB_PREFIX.classifier.qza

P1=$3
P2=$4

### import
qiime tools import \
  --type 'SampleData[PairedEndSequencesWithQuality]' \
  --input-path $MANIFEST \
  --output-path $WORK_DIR/pe-demux.qza \
  --input-format PairedEndFastqManifestPhred33V2

qiime demux summarize \
  --i-data $WORK_DIR/pe-demux.qza \
  --o-visualization $WORK_DIR/pe-demux.qzv


 
# trimming primers
JOINING_INPUT=$WORK_DIR/pe-demux.qza
if [[ $P1 != "" ]]
then
qiime cutadapt trim-paired \
  --i-demultiplexed-sequences $WORK_DIR/pe-demux.qza \
  --p-front-f $P1 \
  --p-front-r $P2 \
  --o-trimmed-sequences $WORK_DIR/pe-demux-trimmed.qza

qiime demux summarize \
  --i-data $WORK_DIR/pe-demux-trimmed.qza \
  --o-visualization $WORK_DIR/pe-demux-trimmed.qzv
  
JOINING_INTPUT=$WORK_DIR/pe-demux-trimmed.qza
fi  

echo "qiime cutadapt trim-paired --i-demultiplexed-sequences $WORK_DIR/pe-demux.qza --p-front-f $P1 --p-front-r $P2 --o-trimmed-sequences $WORK_DIR/pe-demux-trimmed.qza"

########################################################################
### clustering pipeline : open reference clustering paired end

# The features produced by clustering methods are known as operational taxonomic units (OTUs), which is Esperanto for suboptimal, imprecise rubbish

# join pair

# paramètres par defaut:
#   --p-minlen 1
#   --p-maxns (optional)
#   --p-allowmergestagger False
#   --p-minovlen 10
#   --p-maxdiffs 10
#   --p-minmergelen (optional)
#   --p-maxmergelen (optional)
#   --p-maxee (optional)
#   --p-qmin 0
#   --p-qminout 0
#   --p-qmax 41
#   --p-qmaxout 41

qiime vsearch join-pairs \
  --i-demultiplexed-seqs $JOINING_INTPUT \
  --p-allowmergestagger --p-minovlen 10 --p-minmergelen 20 --p-maxmergelen 490 \
  --o-joined-sequences $WORK_DIR/pe-join.qza
  
qiime demux summarize \
  --i-data $WORK_DIR/pe-join.qza \
  --o-visualization $WORK_DIR/pe-join.qzv

# dereplication

qiime vsearch dereplicate-sequences \
  --i-sequences $WORK_DIR/pe-join.qza \
  --o-dereplicated-table $WORK_DIR/pe-join-derepTable.qza \
  --o-dereplicated-sequences $WORK_DIR/pe-join-derepSeqs.qza

qiime feature-table summarize \
  --i-table $WORK_DIR/pe-join-derepTable.qza \
  --o-visualization $WORK_DIR/pe-join-table-or-99.qzv # à renommer en pe-join-derepTable.qzv


# clustering

#   Given a feature table and the associated feature sequences, cluster the
#   features against a reference database based on user-specified percent
#   identity threshold of their sequences. Any sequences that don't match are
#   then clustered de novo. This is not a general-purpose clustering method,
#   but rather is intended to be used for clustering the results of quality-
#   filtering/dereplication methods, such as DADA2, or for re-clustering a
#   FeatureTable at a lower percent identity than it was originally clustered
#   at. When a group of features in the input table are clustered into a
#   single feature, the frequency of that single feature in a given sample is
#   the sum of the frequencies of the features that were clustered in that
#   sample. Feature identifiers will be inherited from the centroid feature of
#   each cluster. For features that match a reference sequence, the centroid
#   feature is that reference sequence, so its identifier will become the
#   feature identifier. The clustered_sequences result will contain feature
#   representative sequences that are derived from the sequences input for all
#   features in clustered_table. This will always be the most abundant
#   sequence in the cluster. The new_reference_sequences result will contain
#   the entire reference database, plus feature representative sequences for
#   any de novo features. This is intended to be used as a reference database
#   in subsequent iterations of cluster_features_open_reference, if
#   applicable. See the vsearch documentation for details on how sequence
#   clustering is performed.

# default param : --p-strand plus

qiime vsearch cluster-features-open-reference \
  --i-table $WORK_DIR/pe-join-derepTable.qza \
  --i-sequences $WORK_DIR/pe-join-derepSeqs.qza \
  --i-reference-sequences $DB_REF_SEQ \
  --p-perc-identity 0.99 \
  --o-clustered-table $WORK_DIR/pe-clusterTable-or-99.qza \
  --o-clustered-sequences $WORK_DIR/pe-clusterSeqs-or-99.qza \
  --o-new-reference-sequences $WORK_DIR/new-ref-seqs-or-99.qza

qiime feature-table summarize \
  --i-table $WORK_DIR/pe-clusterTable-or-99.qza \
  --o-visualization $WORK_DIR/pe-clusterTable-or-99.qzv

# suppression des chimères

#  Apply the vsearch uchime_denovo method to identify chimeric feature
#  sequences. The results of this method can be used to filter chimeric
#  features from the corresponding feature table. For additional details,
#  please refer to the vsearch documentation.

# paramètres par défaut (défaut de vsearch, mais manque abskew : min abundance ratio of parent vs chimera (2.0)):
#   --p-dn 1.4
#   --p-mindiffs 3
#   --p-mindiv 0.8
#   --p-minh 0.28
#   --p-xn 8.0

qiime vsearch uchime-denovo \
  --i-table $WORK_DIR/pe-clusterTable-or-99.qza \
  --i-sequences $WORK_DIR/pe-clusterSeqs-or-99.qza \
  --output-dir $WORK_DIR/pe-uchime-dn-out

qiime feature-table filter-features \
  --i-table $WORK_DIR/pe-clusterTable-or-99.qza \
  --m-metadata-file $WORK_DIR/pe-uchime-dn-out/nonchimeras.qza \
  --o-filtered-table $WORK_DIR/pe-nonchimeric-wo-borderline-table.qza

qiime feature-table summarize \
  --i-table $WORK_DIR/pe-nonchimeric-wo-borderline-table.qza \
  --o-visualization $WORK_DIR/pe-nonchimeric-wo-borderline-table.qzv

qiime feature-table filter-seqs \
  --i-data $WORK_DIR/pe-clusterSeqs-or-99.qza \
  --m-metadata-file $WORK_DIR/pe-uchime-dn-out/nonchimeras.qza \
  --o-filtered-data $WORK_DIR/pe-nonchimeric-wo-borderline-seqs.qza

# filter single singleton

#  q2-vsearch implements three different OTU clustering strategies: de novo, closed reference, and open reference. 
#  All should be preceded by basic quality-score-based filtering and followed by chimera filtering and aggressive 
#  OTU filtering (the treacherous trio, a.k.a. the Bokulich method). ==> 0.00005% 
#  ==> problème impossible de filtrer sur une fréquence de façon automatique
#  ==> Filtre sur les single singleton

qiime feature-table filter-features \
  --i-table $WORK_DIR/pe-nonchimeric-wo-borderline-table.qza \
  --p-min-frequency 2 \
  --o-filtered-table $WORK_DIR/pe-nonchimeric-wo-borderline-noSingle-table.qza

qiime feature-table summarize \
  --i-table $WORK_DIR/pe-nonchimeric-wo-borderline-noSingle-table.qza \
  --o-visualization $WORK_DIR/pe-nonchimeric-wo-borderline-noSingle-table.qzv

qiime feature-table filter-seqs \
  --i-data $WORK_DIR/pe-nonchimeric-wo-borderline-seqs.qza \
  --i-table $WORK_DIR/pe-nonchimeric-wo-borderline-noSingle-table.qza \
  --o-filtered-data $WORK_DIR/pe-nonchimeric-wo-borderline-noSingle-seqs.qza

# affiliation

qiime feature-classifier classify-sklearn \
  --i-classifier $DB_REF_CLASSIFIER \
  --i-reads $WORK_DIR/pe-nonchimeric-wo-borderline-noSingle-seqs.qza \
  --o-classification $WORK_DIR/pe-nonchimeric-wo-borderline-noSingle-tax.qza 

qiime metadata tabulate \
  --m-input-file $WORK_DIR/pe-nonchimeric-wo-borderline-noSingle-tax.qza \
  --o-visualization $WORK_DIR/pe-nonchimeric-wo-borderline-noSingle-tax.qzv

#~ # export biom
qiime tools export \
  --input-path $WORK_DIR/pe-nonchimeric-wo-borderline-noSingle-table.qza \
  --output-path $WORK_DIR/export

qiime tools export \
  --input-path $WORK_DIR/pe-nonchimeric-wo-borderline-noSingle-tax.qza \
  --output-path $WORK_DIR/export

# create final biom
sed -i 's/Feature/#Feature/' $WORK_DIR/export/taxonomy.tsv
biom add-metadata -i $WORK_DIR/export/feature-table.biom \
  --observation-metadata-fp  $WORK_DIR/export/taxonomy.tsv \
  -o $OUT_DIR/pe-qiime.biom \
  --output-as-json

# export fasta
qiime tools export \
  --input-path $WORK_DIR/pe-nonchimeric-wo-borderline-noSingle-seqs.qza  \
  --output-path $WORK_DIR/export

mv $WORK_DIR/export/dna-sequences.fasta $OUT_DIR/pe-qiime.fasta

# create abundance table in TSV format
biom_to_tsv.py -b $OUT_DIR/pe-qiime.biom -f $OUT_DIR/pe-qiime.fasta -t $OUT_DIR/pe-qiime.tsv

QIIME-se

  • qiime-se.sh
#!/usr/bin/sh

# FROGS/app directory need to be in the PATH

module load system/Miniconda3-4.4.10
module load bioinfo/qiime2-2019.4

MANIFEST=$1

OUT_DIR=`dirname $MANIFEST`
WORK_DIR=$OUT_DIR/qiime_workdir
mkdir -p $WORK_DIR

DB_PREFIX=$2
DB_REF_SEQ=$DB_PREFIX.seq.qza
DB_REF_CLASSIFIER=$DB_PREFIX.classifier.qza

CPU=$3

P1=$4

### import
qiime tools import \
  --type 'SampleData[SequencesWithQuality]' \
  --input-path $MANIFEST \
  --output-path $WORK_DIR/se-demux.qza \
  --input-format SingleEndFastqManifestPhred33V2

qiime demux summarize \
  --i-data $WORK_DIR/se-demux.qza \
  --o-visualization $WORK_DIR/se-demux.qzv
  

# trimming primers
JOINING_INPUT=$WORK_DIR/se-demux.qzv
if [[ $P1 != "" ]]
then
qiime cutadapt trim-single \
  --i-demultiplexed-sequences $WORK_DIR/se-demux.qza \
  --p-front $P1 \
  --o-trimmed-sequences $WORK_DIR/se-demux-trimmed.qza

qiime demux summarize \
  --i-data $WORK_DIR/se-demux-trimmed.qza \
  --o-visualization $WORK_DIR/se-demux-trimmed.qzv
  
JOINING_INTPUT=$WORK_DIR/se-demux-trimmed.qza
fi
  
########################################################################
### clustering pipeline : open reference clustering single end

# The features produced by clustering methods are known as operational taxonomic units (OTUs), which is Esperanto for suboptimal, imprecise rubbish
  
qiime vsearch dereplicate-sequences \
  --i-sequences $JOINING_INTPUT \
  --o-dereplicated-table $WORK_DIR/se-derepTable.qza \
  --o-dereplicated-sequences $WORK_DIR/se-derepSeqs.qza

qiime feature-table summarize \
  --i-table $WORK_DIR/se-derepTable.qza \
  --o-visualization $WORK_DIR/se-derepTable.qzv
  
# clustering

#   Given a feature table and the associated feature sequences, cluster the
#   features against a reference database based on user-specified percent
#   identity threshold of their sequences. Any sequences that don't match are
#   then clustered de novo. This is not a general-purpose clustering method,
#   but rather is intended to be used for clustering the results of quality-
#   filtering/dereplication methods, such as DADA2, or for re-clustering a
#   FeatureTable at a lower percent identity than it was originally clustered
#   at. When a group of features in the input table are clustered into a
#   single feature, the frequency of that single feature in a given sample is
#   the sum of the frequencies of the features that were clustered in that
#   sample. Feature identifiers will be inherited from the centroid feature of
#   each cluster. For features that match a reference sequence, the centroid
#   feature is that reference sequence, so its identifier will become the
#   feature identifier. The clustered_sequences result will contain feature
#   representative sequences that are derived from the sequences input for all
#   features in clustered_table. This will always be the most abundant
#   sequence in the cluster. The new_reference_sequences result will contain
#   the entire reference database, plus feature representative sequences for
#   any de novo features. This is intended to be used as a reference database
#   in subsequent iterations of cluster_features_open_reference, if
#   applicable. See the vsearch documentation for details on how sequence
#   clustering is performed.

# default param : --p-strand plus

qiime vsearch cluster-features-open-reference \
  --i-table $WORK_DIR/se-derepTable.qza \
  --i-sequences $WORK_DIR/se-derepSeqs.qza \
  --i-reference-sequences $DB_REF_SEQ \
  --p-perc-identity 0.99 --p-threads $CPU \
  --o-clustered-table $WORK_DIR/se-clusterTable-or-99.qza \
  --o-clustered-sequences $WORK_DIR/se-clusterSeqs-or-99.qza \
  --o-new-reference-sequences $WORK_DIR/new-ref-seqs-or-99.qza

qiime feature-table summarize \
  --i-table $WORK_DIR/se-clusterTable-or-99.qza \
  --o-visualization $WORK_DIR/se-clusterTable-or-99.qzv

# suppression des chimères

#  Apply the vsearch uchime_denovo method to identify chimeric feature
#  sequences. The results of this method can be used to filter chimeric
#  features from the corresponding feature table. For additional details,
#  please refer to the vsearch documentation.

# paramètres par défaut (défaut de vsearch, mais manque abskew : min abundance ratio of parent vs chimera (2.0)):
#   --p-dn 1.4
#   --p-mindiffs 3
#   --p-mindiv 0.8
#   --p-minh 0.28
#   --p-xn 8.0

qiime vsearch uchime-denovo \
  --i-table $WORK_DIR/se-clusterTable-or-99.qza \
  --i-sequences $WORK_DIR/se-clusterSeqs-or-99.qza \
  --output-dir $WORK_DIR/se-uchime-dn-out

qiime feature-table filter-features \
  --i-table $WORK_DIR/se-clusterTable-or-99.qza \
  --m-metadata-file $WORK_DIR/se-uchime-dn-out/nonchimeras.qza \
  --o-filtered-table $WORK_DIR/se-nonchimeric-wo-borderline-table.qza

qiime feature-table summarize \
  --i-table $WORK_DIR/se-nonchimeric-wo-borderline-table.qza \
  --o-visualization $WORK_DIR/se-nonchimeric-wo-borderline-table.qzv

qiime feature-table filter-seqs \
  --i-data $WORK_DIR/se-clusterSeqs-or-99.qza \
  --m-metadata-file $WORK_DIR/se-uchime-dn-out/nonchimeras.qza \
  --o-filtered-data $WORK_DIR/se-nonchimeric-wo-borderline-seqs.qza

# filter single singleton

#  q2-vsearch implements three different OTU clustering strategies: de novo, closed reference, and open reference. 
#  All should be preceded by basic quality-score-based filtering and followed by chimera filtering and aggressive 
#  OTU filtering (the treacherous trio, a.k.a. the Bokulich method). ==> 0.00005% 
#  ==> problème impossible de filtrer sur une fréquence de façon automatique
#  ==> Filtre sur les single singleton

qiime feature-table filter-features \
  --i-table $WORK_DIR/se-nonchimeric-wo-borderline-table.qza \
  --p-min-frequency 2 \
  --o-filtered-table $WORK_DIR/se-nonchimeric-wo-borderline-noSingle-table.qza

qiime feature-table summarize \
  --i-table $WORK_DIR/se-nonchimeric-wo-borderline-noSingle-table.qza \
  --o-visualization $WORK_DIR/se-nonchimeric-wo-borderline-noSingle-table.qzv

qiime feature-table filter-seqs \
  --i-data $WORK_DIR/se-nonchimeric-wo-borderline-seqs.qza \
  --i-table $WORK_DIR/se-nonchimeric-wo-borderline-noSingle-table.qza \
  --o-filtered-data $WORK_DIR/se-nonchimeric-wo-borderline-noSingle-seqs.qza

# affiliation

qiime feature-classifier classify-sklearn \
  --i-classifier $DB_REF_CLASSIFIER \
  --i-reads $WORK_DIR/se-nonchimeric-wo-borderline-noSingle-seqs.qza \
  --o-classification $WORK_DIR/se-nonchimeric-wo-borderline-noSingle-tax.qza 

qiime metadata tabulate \
  --m-input-file $WORK_DIR/se-nonchimeric-wo-borderline-noSingle-tax.qza \
  --o-visualization $WORK_DIR/se-nonchimeric-wo-borderline-noSingle-tax.qzv

# export biom
qiime tools export \
  --input-path $WORK_DIR/se-nonchimeric-wo-borderline-noSingle-table.qza \
  --output-path $WORK_DIR/export

qiime tools export \
  --input-path $WORK_DIR/se-nonchimeric-wo-borderline-noSingle-tax.qza \
  --output-path $WORK_DIR/export

# create final biom
sed -i 's/Feature/#Feature/' $WORK_DIR/export/taxonomy.tsv
biom add-metadata -i $WORK_DIR/export/feature-table.biom \
  --observation-metadata-fp  $WORK_DIR/export/taxonomy.tsv \
  -o $OUT_DIR/se-qiime.biom \
  --output-as-json

# export fasta
qiime tools export \
  --input-path $WORK_DIR/se-nonchimeric-wo-borderline-noSingle-seqs.qza  \
  --output-path $WORK_DIR/export

mv $WORK_DIR/export/dna-sequences.fasta $OUT_DIR/se-qiime.fasta

# create abundance table in TSV format
biom_to_tsv.py -b $OUT_DIR/se-qiime.biom -f $OUT_DIR/se-qiime.fasta -t $OUT_DIR/se-qiime.tsv

DADA2-PE

  • dada2.sh
#!/bin/bash

#export PYTHONPATH=/save/frogs/src/FROGS_dev/lib:$PYTHONPATH
#export PATH=/save/frogs/src/FROGS_dev/libexec/:$PATH
#export PATH=/save/frogs/src/FROGS_dev/app/:$PATH

SAVE_DIR="/work/frogsfungi/FungiPubli/Real_mock/METABARFOOD/MEAT/results/DADA2"
INPUT_DIR="/work/frogsfungi/FungiPubli/Real_mock/METABARFOOD/MEAT/"
SCRIPT_DIR="/work/frogsfungi/FungiPubli/Real_mock/METABARFOOD/MEAT/scripts/"
#REFERENCE="/work/frogsfungi/FungiPubli/JeuTestDuMoment/Unite_ref/Unite_7.1_clean_certae_refs_for_DADA2.fasta"
REFERENCE_DIR="/work/frogsfungi/FungiPubli/Real_mock/METABARFOOD/MEAT/REFERENCES/"
TYPE="ADN PCR"
ITS="ITS1 ITS2 D1D2 RPB2"
CPU=8

declare -A FIVE_PRIM_PRIMERS=([ITS1]=CTTGGTCATTTAGAGGAAGTAA [ITS2]=GCATCGATGAAGAACGCAGC [D1D2]=GCATATCAATAAGCGGAGGAAAAG [RPB2]=TGGGGYATGGTNTGYCCYGC)
declare -A THREE_PRIM_PRIMERS=([ITS1]=GCATCGATGAAGAACGCAGC [ITS2]=GCAWAWCAAWAAGCGGAGGA [D1D2]=CCGTCTTGAAACACGGACC [RPB2]=CBTTCCCYGAYCAYAAYCARTC)


for its in $ITS
do
    for type in $TYPE
    do
        mkdir -p "$SAVE_DIR/$its/$type/"
        cd "$SAVE_DIR/$its/$type/"
        mkdir -p LOG
        # launch DADA2
        FIVE_PRIM=${FIVE_PRIM_PRIMERS[$its]}
        THREE_PRIM=${THREE_PRIM_PRIMERS[$its]}
        echo "module load bioinfo/cutadapt-2.10 system/R-3.6.1 libraries/proj-4.9.3 libraries/gdal-2.3.0 system/Python-3.7.4 && $SCRIPT_DIR/dada2.R --five-prim-primer $FIVE_PRIM --three-prim-primer $THREE_PRIM --input $INPUT_DIR/$its/$type/ --output $SAVE_DIR/$its/$type/ --reference $REFERENCE_DIR/MEAT_${its}_GGtax_dada2.fasta --cpus $CPU" > "dada2-$its-$type.sh"
        sbatch --mem=32G -J dada2_${its}_${type} --cpus-per-task=$CPU -o LOG/%x.out -e LOG/%x.err --wrap="sh dada2-$its-$type.sh"
        cd -
    done
done 
  • dada2.R
#!/usr/bin/env Rscript

library(argparse)
library(ShortRead)
library(Biostrings)
library(tidyverse)
library(phyloseq)
library(dada2)
library(ShortRead)
library(Biostrings)

## Get arguments from bash script
args <- commandArgs(TRUE)
parser <- ArgumentParser()

parser$add_argument("-i", "--input", type="character", default=NULL,
                    help="dataset file name", metavar="character")
parser$add_argument("-o", "--output", type="character", default="out.txt",
                    help="output file name", metavar="character")
parser$add_argument("--reference", type="character", default=NULL,
                    help="reference database", metavar="character")
parser$add_argument("--cpus", type="integer", default=1,
                    help="number of cpus", metavar="N")      
parser$add_argument("--five-prim-primer", type="character", default=NULL,
                    help="forward primer sequence")
parser$add_argument("--three-prim-primer", type="character", default=NULL,
                    help="reverse primer sequence")

args <- parser$parse_args()

FWD <- args$five_prim_primer  ## CHANGE ME to your forward primer sequence
REV <- args$three_prim_primer  ## CHANGE ME...

input_dir <- args$input
output_dir <- args$output

## Input files
fnFs <- sort(list.files(input_dir, pattern = "_R1.fastq.gz", full.names = TRUE))
fnRs <- sort(list.files(input_dir, pattern = "_R2.fastq.gz", full.names = TRUE))

# Cutadapt
allOrients <- function(primer) {
  # Create all orientations of the input sequence
  require(Biostrings)
  dna <- DNAString(primer)  # The Biostrings works w/ DNAString objects rather than character vectors
  orients <- c(Forward = dna, Complement = complement(dna), Reverse = reverse(dna), 
               RevComp = reverseComplement(dna))
  return(sapply(orients, toString))  # Convert back to character vector
}
FWD.orients <- allOrients(FWD)
REV.orients <- allOrients(REV)

primerHits <- function(primer, fn) {
  # Counts number of reads in which the primer is found
  nhits <- vcountPattern(primer, sread(readFastq(fn)), fixed = FALSE)
  return(sum(nhits > 0))
}
rbind(FWD.ForwardReads = sapply(FWD.orients, primerHits, fn = fnFs[[1]]), 
      FWD.ReverseReads = sapply(FWD.orients, primerHits, fn = fnRs[[1]]), 
      REV.ForwardReads = sapply(REV.orients, primerHits, fn = fnFs[[1]]), 
      REV.ReverseReads = sapply(REV.orients, primerHits, fn = fnRs[[1]]))

cutadapt <- "/usr/local/bioinfo/src/cutadapt/cutadapt-2.10/cutadapt-2.10_venv/bin/cutadapt"
path.cut <- file.path(output_dir, "cutadapt")
if(!dir.exists(path.cut)) dir.create(path.cut)
fnFs.cut <- file.path(path.cut, basename(fnFs))
fnRs.cut <- file.path(path.cut, basename(fnRs))

FWD.RC <- dada2:::rc(FWD)
REV.RC <- dada2:::rc(REV)
# Trim FWD and the reverse-complement of REV off of R1 (forward reads)
R1.flags <- paste("-g", FWD, "-a", REV) 
# Trim REV and the reverse-complement of FWD off of R2 (reverse reads)
R2.flags <- paste("-G", REV.RC, "-A", FWD.RC) 
# Run Cutadapt
for(i in seq_along(fnFs)) {
  system2(cutadapt, args = c(R1.flags, R2.flags, "-n", 2, # -n 2 required to remove FWD and REV from reads
                             "-o", fnFs.cut[i], "-p", fnRs.cut[i], # output files
                             fnFs[i], fnRs[i])) # input files
}

rbind(FWD.ForwardReads = sapply(FWD.orients, primerHits, fn = fnFs.cut[[1]]), 
      FWD.ReverseReads = sapply(FWD.orients, primerHits, fn = fnRs.cut[[1]]), 
      REV.ForwardReads = sapply(REV.orients, primerHits, fn = fnFs.cut[[1]]), 
      REV.ReverseReads = sapply(REV.orients, primerHits, fn = fnRs.cut[[1]]))

# Remove sequences with N
path.N <- file.path(output_dir, "filtered")
filtFs <- file.path(output_dir, "filtered", basename(fnFs)) # Put N-filterd files in filtN/ subdirectory
filtRs <- file.path(output_dir, "filtered", basename(fnRs))
out <- filterAndTrim(fnFs.cut, filtFs, fnRs.cut, filtRs, maxN = 0, maxEE = c(2, 2), 
                     truncQ = 2, minLen = 50, rm.phix = TRUE, compress = TRUE, multithread = 16)

get.sample.name <- function(fname) paste(strsplit(basename(fname), "_R")[[1]][1],collapse="_")
sample.names <- unname(sapply(fnFs, get.sample.name))

# Learn errors
#errF <- learnErrors(filtFs, multithread = args$cpus)
#errR <- learnErrors(filtRs, multithread = args$cpus)

# Dada
dadaFs <- dada(filtFs, selfConsist=TRUE, pool=TRUE, multithread = args$cpus)
dadaRs <- dada(filtRs, selfConsist=TRUE, pool=TRUE, multithread = args$cpus)
#dadaFs <- dada(filtFs, err=errF, multithread = args$cpus)
#dadaRs <- dada(filtRs, err=errR, multithread = args$cpus)

names(dadaFs) <- sample.names
names(dadaRs) <- sample.names

# Dereplication
#derepFs <- derepFastq(filtFs, verbose = TRUE)
#derepRs <- derepFastq(filtRs, verbose = TRUE)
#names(derepFs) <- sample.names
#names(derepRs) <- sample.names

# Deal with pairs
mergers <- mergePairs(dadaFs, filtFs, dadaRs, filtRs)

seqtab <- makeSequenceTable(mergers)
seqtab.nochim <- removeBimeraDenovo(seqtab, multithread=args$cpus)

taxa <- assignTaxonomy(seqtab.nochim, args$reference, multithread = args$cpus)
saveRDS(taxa,file.path(output_dir,"taxa.rds"))

taxa <- taxa[,c(1,2,3,4,5,6,7)]
colnames(taxa) <- c("Kingdom", "Phylum", "Class", "Order" ,  "Family" , "Genus" ,  "Species")


### Track reads
getN <- function(x) sum(getUniques(x))
track <- cbind(out, sapply(dadaFs, getN), sapply(dadaRs, getN), sapply(mergers, getN), rowSums(seqtab.nochim))
# If processing a single sample, remove the sapply calls: e.g. replace sapply(dadaFs, getN) with getN(dadaFs)
colnames(track) <- c("input", "filtered", "denoisedF", "denoisedR", "merged", "nonchim")
rownames(track) <- sample.names
print(track)
###


asv_seqs <- colnames(seqtab.nochim)

biomtable <- t(seqtab.nochim)
b <- as_tibble(biomtable)
obs_sum <- rowSums(b)
b$observation_sum <- obs_sum
b$observation_name <- paste0("ASV", 1:nrow(b))
b$seed_sequence <- asv_seqs

taxonomy <- as_tibble(taxa) %>% unite("#taxonomy", 1:7 , sep=";", remove = FALSE)

b$"#taxonomy" <- taxonomy$"#taxonomy"

col_order <- c("#taxonomy", "seed_sequence",    "observation_name", "observation_sum",
 rownames(seqtab.nochim))
b2 <- b[, col_order]

write.table(b2, file.path(output_dir,"dada2.tsv"), sep="\t", quote=F, row.names=F)

tsv_to_biom <- "/save/frogs/src/FROGS_dev/app/tsv_to_biom.py"

system2(tsv_to_biom, args = c("--input-tsv", file.path(output_dir,"dada2.tsv"),
                                     "--output-biom", file.path(output_dir,"dada2.biom"),
                                     "--output-fasta", file.path(output_dir,"dada2.fasta"),
                                     "--log-file", file.path(output_dir,"tsv_to_biom.log")
                                    )
        )

samples.out <- rownames(seqtab.nochim)
ps <- phyloseq(otu_table(seqtab.nochim, taxa_are_rows=FALSE) ,
               tax_table(taxa))
saveRDS(ps, file.path(output_dir,"dada2.rds"))

###############

ids_study <- paste("Cluster", 1:ncol(seqtab.nochim), sep = "_")
asv_headers <- paste(">",ids_study, sep="")
asv_seqs <- colnames(seqtab.nochim)
asv_fasta <- c(rbind(asv_headers, asv_seqs))

taxanomy_fields <- apply(unname(taxa), 1, function(x) paste(x, collapse=";"))
seqtab.nochim_t <- as.data.frame(t(seqtab.nochim))
samples_ids <- colnames(seqtab.nochim_t)
seqtab.nochim_t$ids <- ids_study#rownames(seqtab.nochim_t)
seqtab.nochim_t$"#blast_taxonomy" <- taxanomy_fields
seqtab.nochim_t$"seed_sequence" <- asv_seqs
seqtab.nochim_t$"observation_name" <- ids_study
seqtab.nochim_t$observation_sum <- rowSums(t(seqtab.nochim))
seqtab.nochim_t <- seqtab.nochim_t[, c("#blast_taxonomy", "seed_sequence", "observation_name", "observation_sum", samples_ids)]
write.table(x = rbind(colnames(seqtab.nochim_t), seqtab.nochim_t),
            file = file.path(output_dir, "dada2.tsv"),
            quote = FALSE,
            sep = "\t",
            col.names = FALSE,
            row.names = FALSE)

system2("/save/frogs/src/FROGS_dev/app/tsv_to_biom.py", args=c("--input-tsv", file.path(output_dir, "dada2.tsv"), "--output-biom", file.path(output_dir, "dada2.biom")))

DADA2-SE

  • dada2-se.sh

SAVE_DIR="/work/frogsfungi/FungiPubli/Real_mock/METABARFOOD/MEAT/results/DADA2_SE"
INPUT_DIR="/work/frogsfungi/FungiPubli/Real_mock/METABARFOOD/MEAT/"
SCRIPT_DIR="/work/frogsfungi/FungiPubli/Real_mock/METABARFOOD/MEAT/scripts/"
#REFERENCE="/work/frogsfungi/FungiPubli/JeuTestDuMoment/Unite_ref/Unite_7.1_clean_certae_refs_for_DADA2.fasta"
REFERENCE_DIR="/work/frogsfungi/FungiPubli/Real_mock/METABARFOOD/MEAT/REFERENCES/"
TYPE="ADN PCR"
ITS="ITS1 ITS2"
LONG="D1D2 RPB2"
CPU=8


declare -A FIVE_PRIM_PRIMERS=([ITS1]=CTTGGTCATTTAGAGGAAGTAA [ITS2]=GCATCGATGAAGAACGCAGC [D1D2]=GCATATCAATAAGCGGAGGAAAAG [RPB2]=TGGGGYATGGTNTGYCCYGC)
declare -A THREE_PRIM_PRIMERS=([ITS1]=GCATCGATGAAGAACGCAGC [ITS2]=GCAWAWCAAWAAGCGGAGGA [D1D2]=CCGTCTTGAAACACGGACC [RPB2]=CBTTCCCYGAYCAYAAYCARTC)

for its in $ITS
do
    for type in $TYPE
    do
        mkdir -p "$SAVE_DIR/$its/$type/"
        cd "$SAVE_DIR/$its/$type/"
        mkdir -p LOG
        # launch DADA2
        FIVE_PRIM=${FIVE_PRIM_PRIMERS[$its]}
        THREE_PRIM=${THREE_PRIM_PRIMERS[$its]}
        echo "module load bioinfo/cutadapt-2.10 system/R-3.6.1 libraries/proj-4.9.3 libraries/gdal-2.3.0 system/Python-3.7.4 && $SCRIPT_DIR/dada2_se.R --five-prim-primer $FIVE_PRIM --three-prim-primer $THREE_PRIM --input $INPUT_DIR/$its/$type/ --output $SAVE_DIR/$its/$type/ --reference $REFERENCE_DIR/MEAT_${its}_GGtax_dada2.fasta --cpus $CPU" > "dada2-$its-$type.sh"
        sbatch --mem=32G -J dada2_${its}_${type} --cpus-per-task=$CPU -o LOG/%x.out -e LOG/%x.err --wrap="sh dada2-$its-$type.sh"
        cd -
    done
done 

for long in $LONG
do
    for type in $TYPE
    do
        mkdir -p "$SAVE_DIR/$long/$type/"
        cd "$SAVE_DIR/$long/$type/"
        mkdir -p LOG
        # launch DADA2
        FIVE_PRIM=${FIVE_PRIM_PRIMERS[$long]}
        THREE_PRIM=${THREE_PRIM_PRIMERS[$long]}
        echo "module load bioinfo/cutadapt-2.10 system/R-3.6.1 libraries/proj-4.9.3 libraries/gdal-2.3.0 system/Python-3.7.4 && $SCRIPT_DIR/dada2_se.R --five-prim-primer $FIVE_PRIM --three-prim-primer $THREE_PRIM --input $INPUT_DIR/$long/$type/ --output $SAVE_DIR/$long/$type/ --reference $REFERENCE_DIR/MEAT_${long}_GGtax_dada2.fasta --cpus $CPU" > "dada2-$long-$type.sh"
        sbatch --mem=32G -J dada2_${long}_${type} --cpus-per-task=$CPU -o LOG/%x.out -e LOG/%x.err --wrap="sh dada2-$long-$type.sh"
        cd -
    done
done 
  • dada2-se.R
#!/usr/bin/env Rscript

library(argparse)
library(ShortRead)
library(Biostrings)
library(tidyverse)
library(phyloseq)
library(dada2)

## Get arguments from bash script
args <- commandArgs(TRUE)
parser <- ArgumentParser()

parser$add_argument("-i", "--input", type="character", default=NULL,
                    help="dataset file name", metavar="character")
parser$add_argument("-o", "--output", type="character", default="out.txt",
                    help="output file name", metavar="character")
parser$add_argument("--reference", type="character", default=NULL,
                    help="reference database", metavar="character")
parser$add_argument("--cpus", type="integer", default=1,
                    help="number of cpus", metavar="N")                        
parser$add_argument("--five-prim-primer", type="character", default=NULL,
                    help="forward primer sequence")
parser$add_argument("--three-prim-primer", type="character", default=NULL,
                    help="reverse primer sequence")

args <- parser$parse_args()

FWD <- args$five_prim_primer  ## CHANGE ME to your forward primer sequence
REV <- args$three_prim_primer  ## CHANGE ME...

input_dir <- args$input
output_dir <- args$output

## Input files
fnFs <- sort(list.files(input_dir, pattern = "_R1.fastq.gz", full.names = TRUE))
#fnRs <- sort(list.files(input_dir, pattern = "_R2.fastq.gz", full.names = TRUE))

# Cutadapt
allOrients <- function(primer) {
  # Create all orientations of the input sequence
  require(Biostrings)
  dna <- DNAString(primer)  # The Biostrings works w/ DNAString objects rather than character vectors
  orients <- c(Forward = dna, Complement = complement(dna), Reverse = reverse(dna), 
               RevComp = reverseComplement(dna))
  return(sapply(orients, toString))  # Convert back to character vector
}
FWD.orients <- allOrients(FWD)
REV.orients <- allOrients(REV)

primerHits <- function(primer, fn) {
  # Counts number of reads in which the primer is found
  nhits <- vcountPattern(primer, sread(readFastq(fn)), fixed = FALSE)
  return(sum(nhits > 0))
}
rbind(FWD.ForwardReads = sapply(FWD.orients, primerHits, fn = fnFs[[1]]), 
      #FWD.ReverseReads = sapply(FWD.orients, primerHits, fn = fnRs[[1]]), 
      REV.ForwardReads = sapply(REV.orients, primerHits, fn = fnFs[[1]])) 
      #REV.ReverseReads = sapply(REV.orients, primerHits, fn = fnRs[[1]]))

cutadapt <- "/usr/local/bioinfo/src/cutadapt/cutadapt-2.10/cutadapt-2.10_venv/bin/cutadapt"
path.cut <- file.path(output_dir, "cutadapt")
if(!dir.exists(path.cut)) dir.create(path.cut)
fnFs.cut <- file.path(path.cut, basename(fnFs))
#fnRs.cut <- file.path(path.cut, basename(fnRs))

FWD.RC <- dada2:::rc(FWD)
#REV.RC <- dada2:::rc(REV)
# Trim FWD and the reverse-complement of REV off of R1 (forward reads)
R1.flags <- paste("-g", FWD, "-a", REV) 
# Trim REV and the reverse-complement of FWD off of R2 (reverse reads)
#R2.flags <- paste("-G", REV.RC, "-A", FWD.RC) 
# Run Cutadapt
for(i in seq_along(fnFs)) {
  system2(cutadapt, args = c(R1.flags, "-n", 2, # -n 2 required to remove FWD and REV from reads
                             "-o", fnFs.cut[i], # output files
                             fnFs[i])) # input files
}

rbind(FWD.ForwardReads = sapply(FWD.orients, primerHits, fn = fnFs.cut[[1]]), 
      #FWD.ReverseReads = sapply(FWD.orients, primerHits, fn = fnRs.cut[[1]])) 
      REV.ForwardReads = sapply(REV.orients, primerHits, fn = fnFs.cut[[1]]))#, 
      #REV.ReverseReads = sapply(REV.orients, primerHits, fn = fnRs.cut[[1]]))

# Remove sequences with N
path.N <- file.path(output_dir, "filtered")
filtFs <- file.path(output_dir, "filtered", basename(fnFs)) # Put N-filterd files in filtN/ subdirectory
#filtRs <- file.path(output_dir, "filtered", basename(fnRs))
filterAndTrim(fnFs.cut, filtFs, maxN = 0, maxEE = 2, 
                     truncQ = 2, minLen = 50, rm.phix = TRUE, compress = TRUE, multithread = 16)

get.sample.name <- function(fname) paste(strsplit(basename(fname), "_R")[[1]][1],collapse="_")
sample.names <- unname(sapply(fnFs, get.sample.name))

# Learn errors
#errF <- learnErrors(filtFs, selfConsist=TRUE, pool=TRUE, multithread = args$cpus)
#errR <- learnErrors(filtRs, multithread = args$cpus)

# Dada
dadaFs <- dada(filtFs, selfConsist=TRUE, pool=TRUE, multithread = args$cpus)
#dadaRs <- dada(filtRs, err=errR, multithread = args$cpus)

names(dadaFs) <- sample.names
#names(dadaRs) <- sample.names

# Dereplication
#derepFs <- derepFastq(filtFs, verbose = TRUE)
#derepRs <- derepFastq(filtRs, verbose = TRUE)
#names(derepFs) <- sample.names
#names(derepRs) <- sample.names

# Deal with pairs
#mergers <- mergePairs(dadaFs, derepFs, dadaRs, derepRs)

seqtab <- makeSequenceTable(dadaFs)
seqtab.nochim <- removeBimeraDenovo(seqtab, multithread=args$cpus)

taxa <- assignTaxonomy(seqtab.nochim, args$reference, multithread = args$cpus)
saveRDS(taxa,file.path(output_dir,"taxa.rds"))

taxa <- taxa[,c(1,2,3,4,5,6,7)]
colnames(taxa) <- c("Kingdom", "Phylum", "Class", "Order" ,  "Family" , "Genus" ,  "Species")



asv_seqs <- colnames(seqtab.nochim)

biomtable <- t(seqtab.nochim)
b <- as_tibble(biomtable)
obs_sum <- rowSums(b)
b$observation_sum <- obs_sum
b$observation_name <- paste0("ASV", 1:nrow(b))
b$seed_sequence <- asv_seqs

taxonomy <- as_tibble(taxa) %>% unite("#taxonomy", 1:7 , sep=";", remove = FALSE)

b$"#taxonomy" <- taxonomy$"#taxonomy"

col_order <- c("#taxonomy", "seed_sequence",    "observation_name", "observation_sum",
 rownames(seqtab.nochim))
b2 <- b[, col_order]

write.table(b2, file.path(output_dir,"dada2.tsv"), sep="\t", quote=F, row.names=F)

tsv_to_biom <- "/save/frogs/src/FROGS_dev/app/tsv_to_biom.py"

system2(tsv_to_biom, args = c("--input-tsv", file.path(output_dir,"dada2.tsv"),
                                     "--output-biom", file.path(output_dir,"dada2.biom"),
                                     "--output-fasta", file.path(output_dir,"dada2.fasta"),
                                     "--log-file", file.path(output_dir,"tsv_to_biom.log")
                                    )
        )

samples.out <- rownames(seqtab.nochim)
ps <- phyloseq(otu_table(seqtab.nochim, taxa_are_rows=FALSE) ,
               tax_table(taxa))
saveRDS(ps, file.path(output_dir,"dada2.rds"))

###############

ids_study <- paste("Cluster", 1:ncol(seqtab.nochim), sep = "_")
asv_headers <- paste(">",ids_study, sep="")
asv_seqs <- colnames(seqtab.nochim)
asv_fasta <- c(rbind(asv_headers, asv_seqs))

taxanomy_fields <- apply(unname(taxa), 1, function(x) paste(x, collapse=";"))
seqtab.nochim_t <- as.data.frame(t(seqtab.nochim))
samples_ids <- colnames(seqtab.nochim_t)
seqtab.nochim_t$ids <- ids_study#rownames(seqtab.nochim_t)
seqtab.nochim_t$"#blast_taxonomy" <- taxanomy_fields
seqtab.nochim_t$"seed_sequence" <- asv_seqs
seqtab.nochim_t$"observation_name" <- ids_study
seqtab.nochim_t$observation_sum <- rowSums(t(seqtab.nochim))
seqtab.nochim_t <- seqtab.nochim_t[, c("#blast_taxonomy", "seed_sequence", "observation_name", "observation_sum", samples_ids)]
write.table(x = rbind(colnames(seqtab.nochim_t), seqtab.nochim_t),
            file = file.path(output_dir, "dada2.tsv"),
            quote = FALSE,
            sep = "\t",
            col.names = FALSE,
            row.names = FALSE)

system2("/save/frogs/src/FROGS_dev/app/tsv_to_biom.py", args=c("--input-tsv", file.path(output_dir, "dada2.tsv"), "--output-biom", file.path(output_dir, "dada2.biom")))

Benchmarking

Different metrics were computed to compare methods:

  • the Richness compared to expected
  • the abundance of sequences compared to expected
  • the divergence rate (Divergence), computed as the Bray-Curtis distance between expected and observed abundance profiles at species level
  • the number of false-negative taxa (FN): the number of expected taxa that were not recovered by the method
  • the number of false positive taxa (FP): the number of recovered taxa that were not expected
  • the number of true positive taxa (TP): the number of recovered taxa that were expected

From these metrics we can compute the Precision (TP/(TP+FP)) and the Recall Rate (TP/(TP+FN)) of each method.

Finally, as we knew the exact expected sequences, we computed the number of sequences perfectly reconstructed: OTUs/ASVs/zOTUs whose nucleic sequence is strictly identic to reference sequence. For long sequences that we cannot get the complete sequence with 2x250 bp, 100% of identity of OTU/ASV/zOTU sequence is enough to be considered as perfectly reconstructed.

source("functions.R")

plot_divergence <- function(dataset, divergence_file){
  p1<-ggplot(as_tibble(read.table(divergence_file, sep="\t", header=TRUE)) %>% mutate(SampleID=paste0(sample,"-",its)) %>% filter(tool == "DADA2_FROGS") %>% mutate(its=replace(its, its=="D1D2", "D1/D2")) %>% mutate(sample=str_replace(sample,"ADN","DNA")) %>% mutate(its=factor(its, levels=c("ITS1","ITS2","D1/D2","RPB2"))) , aes(x=sample, color=data, y=Divergence, fill=SampleID))  + 
    geom_boxplot() + theme(plot.title = element_text(hjust = 0.5), plot.subtitle = element_text(hjust = 0.5), legend.position = "none",axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
    #stat_summary(fun = "median", geom = "point", aes(color = tool)) +
    #scale_fill_manual(values=method_palette_wo_real, aesthetics = c("colour", "fill")) +
    #stat_compare_means(ref.group = "FROGS", method = "wilcox.test", label="p.signif" ) +
    ggtitle("Divergence (%) at species level", subtitle = "") +
    scale_y_continuous(expand = expansion(0, 0)) + 
      scale_x_discrete(expand = expansion(0, 0)) #+
    #expand_limits(y = 0)
  return(p1)
}


plot_metrics <- function(dataset, divergence_file){
  p2<-ggplot(as_tibble(read.table(divergence_file, sep="\t", header=TRUE)) %>% mutate(SampleID=paste0(sample,"-",its)) %>% filter(tool == "DADA2_FROGS") %>% mutate(its=replace(its, its=="D1D2", "D1/D2")) %>% mutate(sample=str_replace(sample,"ADN","DNA")) %>% mutate(data=str_replace(data,"ADN","DNA")), aes(x=sample, color=data,y=False.Negative, fill=SampleID)) + facet_grid(~its)+
    geom_boxplot() + guides(fill = FALSE) +
    scale_y_continuous(expand = expansion(0, 0)) + 
      scale_x_discrete(expand = expansion(0, 0)) +
    theme(axis.title.x=element_blank(),axis.text.x=element_blank(),axis.ticks.x=element_blank(), legend.position = "none") #+
    #expand_limits(y = 0)
  p3<-ggplot(as_tibble(read.table(divergence_file, sep="\t", header=TRUE)) %>% mutate(SampleID=paste0(sample,"-",its)) %>% filter(tool == "DADA2_FROGS") %>% mutate(its=replace(its, its=="D1D2", "D1/D2")) %>% mutate(sample=str_replace(sample,"ADN","DNA")), aes(x=sample, color=data, y=False.Positive, fill=SampleID)) + facet_grid(~its)+
    geom_boxplot() + guides(fill = FALSE) +
    scale_y_continuous(expand = expansion(0, 0)) + 
      scale_x_discrete(expand = expansion(0, 0)) +
    theme(axis.title.x=element_blank(),axis.text.x=element_blank(),axis.ticks.x=element_blank(), legend.position = "none") #+
    #expand_limits(y = 0)
  p4<-ggplot(as_tibble(read.table(divergence_file, sep="\t", header=TRUE)) %>% mutate(SampleID=paste0(sample,"-",its)) %>% filter(tool == "DADA2_FROGS") %>% mutate(its=replace(its, its=="D1D2", "D1/D2")) %>% mutate(sample=str_replace(sample,"ADN","DNA")), aes(x=sample, color=data, y=True.Positive, fill=SampleID)) + facet_grid(~its)+
    geom_boxplot() + guides(fill = FALSE) +
    scale_y_continuous(expand = expansion(0, 0)) + 
      scale_x_discrete(expand = expansion(0, 0)) +
    theme(axis.title.x=element_blank(),axis.text.x=element_blank(),axis.ticks.x=element_blank(), legend.position = "none") #+
    #expand_limits(y = 0)
  p <- ggarrange(p2, p3, p4, ncol=3, nrow=1, common.legend = TRUE, legend="right")
  p <- annotate_figure(p,
                       top = text_grob("Metrics at species level")
  )
  return(p)
}


plot_precision_and_recall <- function(dataset, divergence_file){
  infos <- as_tibble(read.table(divergence_file, sep="\t", header=TRUE)) %>% mutate(SampleID=paste0(sample,"-",its)) %>% filter(tool == "DADA2_FROGS") %>% mutate(its=replace(its, its=="D1D2", "D1/D2")) %>% mutate(sample=str_replace(sample,"ADN","DNA")) %>% mutate(data=str_replace(data,"ADN","DNA")) %>% mutate(Precision = (True.Positive/(True.Positive + False.Positive))) %>% mutate(Recall.Rate = (True.Positive/(True.Positive + False.Negative)))
  p1<-ggplot(infos, aes(x=sample, color=data, y=Recall.Rate, fill=SampleID)) + facet_grid(~its)+
    geom_boxplot() + guides(fill = FALSE) + theme(plot.title = element_text(hjust = 0.5), plot.subtitle = element_text(hjust = 0.5), legend.position = "none", axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) #+
    #theme(axis.title.x=element_blank(),axis.text.x=element_blank(),axis.ticks.x=element_blank()) #+
    #expand_limits(y = 0)
  p2<-ggplot(infos, aes(x=sample, color=data, y=Precision, fill=SampleID)) + facet_grid(~its)+
    geom_boxplot() + theme(plot.title = element_text(hjust = 0.5), plot.subtitle = element_text(hjust = 0.5), legend.position = "none", axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))# +
    #theme(axis.title.x=element_blank(),axis.text.x=element_blank(),axis.ticks.x=element_blank()) #+
    #expand_limits(y = 0)
  return(ggarrange(p1, p2, ncol=2, nrow=1, common.legend = TRUE, legend="right"))
  
}

metabarfood <- TRUE
amplicons <- c("ITS1","ITS2", "D1D2", "RPB2")
types <- c("ADN","PCR")
absent_color <- "black"
absent_color <- "#F6CFB8"
partially_color <- "#b1c5c4"
partially_color <- "#DAF7A6"

perfect_color <- "#cce5ff"
perfect_color <- "#86a5a4"
perfect_color <- "#cce5ff"
tools <- c("FROGS", "DADA2_FROGS", "USEARCH", "DADA2-pe", "DADA2-se", "QIIME-se", "QIIME-pe")

dedicated_palette <- c("#cccc00", "#8eadac", "#9a9a9a", "#56B4E9", "#8a6479", "#8a8964", "#ad8e9f", "#adac8e")
method_palette <- setNames(
         object = dedicated_palette, 
         nm = c("EXPECTED", "FROGS", "DADA2_FROGS", "USEARCH", "DADA2-se", "QIIME-se", "DADA2-pe", "QIIME-pe")
       )
dedicated_palette_wo_real <- c("#8eadac", "#9a9a9a", "#56B4E9", "#8a6479", "#8a8964", "#ad8e9f", "#adac8e")
method_palette_wo_real <- setNames(
           object = dedicated_palette_wo_real, 
           nm = c("FROGS", "DADA2_FROGS", "USEARCH", "DADA2-se", "QIIME-se", "DADA2-pe", "QIIME-pe")
         )

Composition of mock communities

This table shows, for each of the 118 strains, the taxonomy, the sequence of the four markers used in this study, and the presence or absence for the four ecosystems.

ggd <- read.delim("REFERENCES/118sp_wo_droso.tsv", sep="\t", header=T, stringsAsFactors = F) 
datatable(ggd, extensions='Buttons', options= list(dom='Bfrtip',buttons = c('excel','csv')))

Synthesis

These results show the agglomerated results of the four ecosystems (bread, cheese, fermented meat and wine)

a <- read.csv2("data/divergence_all_ecos.tsv", sep="\t", stringsAsFactors = F, dec = ".", header = T) %>% as_tibble()

a$Recall.rate <- as.numeric(a$Recall.rate)
a$Precision <- as.numeric(a$Precision)
a$Divergence <- as.numeric(a$Divergence)

means <- aggregate(Recall.rate ~  tool, a, mean)
means$Recall.rate = as.numeric(format(round(means$Recall.rate, 2),nsmall=2))

a$tool <- factor(a$tool, levels=c("FROGS",
                                  "DADA2_FROGS", 
                                  "USEARCH",
                                  "DADA2-se",
                                  "QIIME-se",
                                  "DADA2-pe",
                                  "QIIME-pe"))

# RECALL RATE
p1 <- a %>%
  ggplot(aes(tool,Recall.rate, fill=tool)) +
  geom_boxplot(outlier.size=0.3) +
  scale_fill_manual(values=dedicated_palette_wo_real) +
  geom_jitter(shape=16, position=position_jitter(0.2), size=0.5) +
  theme(axis.title.x=element_blank(),axis.text.x=element_blank(),axis.ticks.x=element_blank())

p1 <- p1 +labs(title="A. Recall rate",x="",y="")
  

# PRECISION
means <- aggregate(Precision ~  tool, a, mean)
means$Precision = as.numeric(format(round(means$Precision, 2),nsmall=2))
p2 <- a %>%
  ggplot(aes(tool,Precision, fill=tool)) +
  geom_boxplot(outlier.size=0.3) +
  scale_fill_manual(values=dedicated_palette_wo_real) +
  geom_jitter(shape=16, position=position_jitter(0.2), size=0.5) +
  theme(axis.title.x=element_blank(),axis.text.x=element_blank(),axis.ticks.x=element_blank())
p2 <- p2 +labs(title="B. Precision",x="",y="")

# DIVERGENCE
means <- aggregate(Divergence ~  tool, a, mean)
means$Divergence = as.numeric(format(round(means$Divergence, 2),nsmall=2))
p3 <- a %>%
  ggplot(aes(tool,Divergence, fill=tool)) +
  geom_boxplot(outlier.size=0.3) +
  scale_fill_manual(values=dedicated_palette_wo_real) +
  geom_jitter(shape=16, position=position_jitter(0.2), size=0.5) +
  theme(axis.title.x=element_blank(),axis.text.x=element_blank(),axis.ticks.x=element_blank())
p3 <- p3 +labs(title="C. Divergence",x="",y="")

b <- read.csv2("data/PR_all_ecos.tsv", sep="\t", stringsAsFactors = F, dec = ".", header = T) %>% as_tibble()

b$Expected <- as.numeric(b$Expected)
b$Found <- as.numeric(b$Found)
b$Perfect <- as.numeric(b$Perfect)

b <- b %>% mutate(Percent.Found = Found/Expected*100)
b <- b %>% mutate(Percent.Perfect = Perfect/Expected*100)

b$tool <- factor(b$tool, levels=c("FROGS",
                                            "DADA2_FROGS", 
                                            "USEARCH",
                                            "DADA2-se",
                                            "QIIME-se",
                                            "DADA2-pe",
                                            "QIIME-pe"))

means <- aggregate(Percent.Found ~  tool, b, mean)
means$Percent.Found = as.numeric(format(round(means$Percent.Found, 2),nsmall=2))
p <- b %>%
  ggplot(aes(tool,Percent.Found, fill=tool)) +
  geom_boxplot(outlier.size=0.3) +
  scale_fill_manual(values=dedicated_palette_wo_real) +
  geom_jitter(shape=16, position=position_jitter(0.2), size=0.5) +
  theme(legend.position = "none") 
p +labs(title="Proportion of species detected",
        y ="", x = "")

#a <- a %>% filter(its == "ITS1" | its == "ITS2")

means <- aggregate(Percent.Perfect ~  tool, b, mean)
means$Percent.Perfect = as.numeric(format(round(means$Percent.Perfect, 2),nsmall=2))
p4 <- b %>%
  ggplot(aes(tool,Percent.Perfect, fill=tool)) +
  geom_boxplot(outlier.size=0.3) +
  scale_fill_manual(values=dedicated_palette_wo_real) +
  geom_jitter(shape=16, position=position_jitter(0.2), size=0.5) +
  theme(axis.title.x=element_blank(),axis.text.x=element_blank(),axis.ticks.x=element_blank())
p4 <- p4 +labs(title="D. % of perfectly constructed sequences",
        x ="", y = "")

b <- b %>% mutate(Percent.Perfect2 = Perfect/Found*100)

means <- aggregate(Percent.Perfect2 ~  tool, b, mean)
means$Percent.Perfect2 = as.numeric(format(round(means$Percent.Perfect2, 2),nsmall=2))
b <- b %>% replace(is.na(.), 0)
p <- b %>% 
  ggplot(aes(tool,Percent.Perfect2, fill=tool)) +
  #geom_violin(scale = "count") +
  geom_boxplot(outlier.size=0.3) +
  scale_fill_manual(values=dedicated_palette_wo_real) +
  geom_jitter(shape=16, position=position_jitter(0.2), size=0.5) +
  theme(legend.position = "none") 
  
p +labs(title="Proportion of perfectly constructed ASVs/OTUs/ZOTUs",
        x ="tools", y = "")

ggarrange(p1, p2, p3, p4, ncol=2, nrow=2, common.legend = TRUE, legend="right")

DADA2_FROGS seems to be the most accurate method! This solution is chosen for further analyis.

Details for each ecosystem

MEAT

sp <- "MEAT"
divergence_file <- file.path(sp,"divergence.tsv")

Expected vs. observed depth (count of sequences)

for(its in amplicons){
  for(type in types){
    #path <- file.path("../benchmark_real",sp,its,type)
    path <- file.path(sp,its,type)
    infos_file <- file.path(path,paste0(sp,"_",its,"_",type,"_infos.rds"))
    if(its=="D1D2"){
      if(type=="ADN"){
        cat("  \n####",sp, " ", "D1/D2", " ", "DNA", " \n\n")
      }else{
        cat("  \n####",sp, " ", "D1/D2", " ", type, " \n\n")
      }
      
    }else{
      if(type=="ADN"){
        cat("  \n####",sp, " ", its, " ", "DNA", " \n\n")  
      }
      else{
        cat("  \n####",sp, " ", its, " ", type, " \n\n")    
      }
    }
    
    if(!(file.exists(infos_file))){
      dada2_se_infos <- read.csv2(file.path(path,"dada2_se.tsv"), sep="\t", header=TRUE) %>% as_tibble() %>% mutate(Length = str_length(seed_sequence), blast_taxonomy = X.blast_taxonomy)
      dada2_pe_infos <- read.csv2(file.path(path,"dada2.tsv"), sep="\t", header=TRUE) %>% as_tibble() %>% mutate(Length = str_length(seed_sequence), blast_taxonomy = X.blast_taxonomy)
      frogs_infos <- read.csv2(file.path(path,"frogs.tsv"), sep="\t", header=TRUE) %>% as_tibble() %>% mutate(Length = str_length(seed_sequence))
      dada2_frogs_infos <- read.csv2(file.path(path,"dada2_frogs.tsv"), sep="\t", header=TRUE) %>% as_tibble() %>% mutate(Length = str_length(seed_sequence))
      
      frogs_multiaff_infos <- readr::read_tsv(file = file.path(sp,its,type,"multiaff.tsv")) %>% as_tibble() %>% separate(blast_taxonomy, ";(?=[\\S])", into = c("Kingdom","Phylum","Class","Order","Family","Genus","Species"), remove=FALSE)
      dada2_frogs_multiaff_infos <- readr::read_tsv(file = file.path(sp,its,type,"dada2_frogs_multiaff.tsv")) %>% as_tibble() %>% separate(blast_taxonomy, ";(?=[\\S])", into = c("Kingdom","Phylum","Class","Order","Family","Genus","Species"), remove=FALSE)
      
      if(file.exists(file.path(path,"se-qiime.tsv"))){
        qiime_se_infos <- read.csv2(file.path(path,"se-qiime.tsv"), sep="\t", header=TRUE, stringsAsFactors = FALSE) %>% as_tibble() %>% mutate(Length = str_length(X.seed_sequence), blast_taxonomy = Taxon, observation_sum = as.integer(observation_sum))
      }
      
      if(file.exists(file.path(path,"pe-qiime.tsv"))){
        qiime_pe_infos <- read.csv2(file.path(path,"pe-qiime.tsv"), sep="\t", header=TRUE, stringsAsFactors = FALSE) %>% as_tibble() %>% mutate(Length = str_length(X.seed_sequence), blast_taxonomy = Taxon, observation_sum = as.integer(observation_sum))
      }
      
      if(file.exists(file.path(path,"usearch.tsv"))){
        usearch_infos <- read.csv2(file.path(path,"usearch.tsv"), sep="\t", header=TRUE, stringsAsFactors = FALSE) %>% as_tibble() %>% mutate(Length = str_length(X.seed_sequence), blast_taxonomy = taxonomy, observation_sum = as.integer(observation_sum))
      }
      
    
    if(its == "ITS1"){
      species <- real_infos_dict$ITS1$Taxonomy
      nb_species <- length(species)
    }else if(its == "ITS2"){
      species <- real_infos_dict$ITS2$Taxonomy
      nb_species <- length(species)
    }else if(its == "D1D2"){
      species <- real_infos_dict$D1D2$Taxonomy
      nb_species <- length(species)
    }else if(its == "RPB2"){
      species <- real_infos_dict$RPB2$Taxonomy
      nb_species <- length(species)
    }
      
    data <- tibble(richeness = c(
      nb_species,
      nrow(frogs_infos),
      nrow(dada2_frogs_infos),
      nrow(usearch_infos),
      nrow(dada2_se_infos),
      nrow(qiime_se_infos),
      nrow(dada2_pe_infos),
      nrow(qiime_pe_infos)
    ),
    tool = c("EXPECTED", 
             "FROGS", 
             "DADA2_FROGS",
             "USEARCH",
             "DADA2_SE",
             "QIIME_SE",
             "DADA2_PE",
             "QIIME_PE"),
    reads = c(
      metadata_infos %>% filter(Ecosystem == sp) %>% filter(Marker == its) %>% filter(str_detect(Sample, paste("^",type,sep=""))) %>% select(Reads) %>% sum,
      frogs_infos %>% select(observation_sum) %>% sum(),
      dada2_frogs_infos %>% select(observation_sum) %>% sum(),
      usearch_infos %>% select(observation_sum) %>% sum(),
      dada2_se_infos %>% select(observation_sum) %>% sum(),
      qiime_se_infos %>% select(observation_sum) %>% sum(),
      dada2_pe_infos %>% select(observation_sum) %>% sum(),
      qiime_pe_infos %>% select(observation_sum) %>% sum()
      
    ))
    data$tool <- factor(data$tool, levels=c("EXPECTED", 
                                            "FROGS", 
                                            "DADA2_FROGS",
                                            "USEARCH",
                                            "DADA2_SE",
                                            "QIIME_SE",
                                            "DADA2_PE",
                                            "QIIME_PE"))
    saveRDS(data, infos_file)
    }
    else{
      data <- readRDS(infos_file)
    }
    print(plot_depth(data))
    cat("\n")
  }
}

MEAT ITS1 DNA

MEAT ITS1 PCR

MEAT ITS2 DNA

MEAT ITS2 PCR

MEAT D1/D2 DNA

MEAT D1/D2 PCR

MEAT RPB2 DNA

MEAT RPB2 PCR

Expected vs. observed Richness (count of OTU, ASV, ZOTU)

for(its in amplicons){
  for(type in types){
    if(its=="D1D2"){
      if(type=="ADN"){
        cat("  \n####",sp, " ", "D1/D2", " ", "DNA", " \n\n")
      }else{
        cat("  \n####",sp, " ", "D1/D2", " ", type, " \n\n")
      }
      
    }else{
      if(type=="ADN"){
        cat("  \n####",sp, " ", its, " ", "DNA", " \n\n")  
      }
      else{
        cat("  \n####",sp, " ", its, " ", type, " \n\n")    
      }
    }
    #path <- file.path("../benchmark_real",sp,its,type)
    path <- file.path(sp,its,type)
    infos_file <- file.path(path,paste0(sp,"_",its,"_",type,"_infos.rds"))
    data <- readRDS(infos_file)
    cat("\n")
    cat("\n")
    print(plot_richeness_graph(data))
    cat("\n")
  }
}

MEAT ITS1 DNA

MEAT ITS1 PCR

MEAT ITS2 DNA

MEAT ITS2 PCR

MEAT D1/D2 DNA

MEAT D1/D2 PCR

MEAT RPB2 DNA

MEAT RPB2 PCR

Metrics calculated in relation to the expected

The results obtained for each tool were compared to what was expected and different metrics were calculated. Affiliations and associated abundances are taken into account.

The metrics are:

  • Divergence (takes into account abundance)
  • False negatives
  • False positives
  • True positives
for(its in amplicons){
  for(type in types){
    if(its=="D1D2"){
      if(type=="ADN"){
        cat("  \n####",sp, " ", "D1/D2", " ", "DNA", " \n\n")
      }else{
        cat("  \n####",sp, " ", "D1/D2", " ", type, " \n\n")
      }
      
    }else{
      if(type=="ADN"){
        cat("  \n####",sp, " ", its, " ", "DNA", " \n\n")  
      }
      else{
        cat("  \n####",sp, " ", its, " ", type, " \n\n")    
      }
    }
    print(plot_divergence_real(type, its, divergence_file))
    cat(" \n")
    cat(" \n")
    print(plot_metrics_real(type, its, divergence_file))
    cat(" \n")
    print(plot_precision_and_recall_real(type, its, divergence_file))
    cat(" \n")
  }
}

MEAT ITS1 DNA

MEAT ITS1 PCR

MEAT ITS2 DNA

MEAT ITS2 PCR

MEAT D1/D2 DNA

MEAT D1/D2 PCR

MEAT RPB2 DNA

MEAT RPB2 PCR

cat("  \n###"," Comparison of markers and types for DADA2_FROGS", " \n")

Comparison of markers and types for DADA2_FROGS

print(plot_divergence(type, divergence_file) + facet_grid(~its))  

cat(" \n")
print(plot_metrics(type, divergence_file))

cat(" \n")
print(plot_precision_and_recall(type, divergence_file))

cat(" \n")

Taxonomies found vs. lost

Tools are clustered using the canberra distance.

for(its in amplicons){
  for(type in types){
    #path <- file.path("../benchmark_real",sp,its,type)
    path <- file.path(sp,its,type)
    mat_file <- file.path(path,paste0(sp,"_",its,"_",type,"_abundmat.rds"))
    if(its=="D1D2"){
      if(type=="ADN"){
        cat("  \n####",sp, " ", "D1/D2", " ", "DNA", " \n\n")
      }else{
        cat("  \n####",sp, " ", "D1/D2", " ", type, " \n\n")
      }
      
    }else{
      if(type=="ADN"){
        cat("  \n####",sp, " ", its, " ", "DNA", " \n\n")  
      }
      else{
        cat("  \n####",sp, " ", its, " ", type, " \n\n")    
      }
    }
    if(!file.exists(mat_file)){
      if(its == "ITS1"){
        species <- real_infos_dict$ITS1$Taxonomy  
      }else if(its == "ITS2"){
        species <- real_infos_dict$ITS2$Taxonomy
      }else if(its == "D1D2"){
        species <- real_infos_dict$D1D2$Taxonomy
      }else if(its == "RPB2"){
        species <- real_infos_dict$RPB2$Taxonomy
      }
      
      nb_expected_species <- length(species)
      species <- c(species,"Others")
      mat = matrix(nrow = nb_expected_species+1, ncol = 8)
      rownames(mat) <- species
      colnames(mat) <- c(tools,"EXPECTED")
      mat[,"EXPECTED"] <- c(rep(1/nb_expected_species,times=nb_expected_species),0)
      
      mat[is.na(mat)] <- 0
      
      dada2_se_infos <- read.csv2(file.path(path,"dada2_se.tsv"), sep="\t", header=TRUE) %>% as_tibble() %>% mutate(Length = str_length(seed_sequence), blast_taxonomy = X.blast_taxonomy)
      dada2_pe_infos <- read.csv2(file.path(path,"dada2.tsv"), sep="\t", header=TRUE) %>% as_tibble() %>% mutate(Length = str_length(seed_sequence), blast_taxonomy = X.blast_taxonomy)
      frogs_infos <- read.csv2(file.path(path,"frogs.tsv"), sep="\t", header=TRUE) %>% as_tibble() %>% mutate(Length = str_length(seed_sequence))
      dada2_frogs_infos <- read.csv2(file.path(path,"dada2_frogs.tsv"), sep="\t", header=TRUE) %>% as_tibble() %>% mutate(Length = str_length(seed_sequence))
      
      frogs_multiaff_infos <- readr::read_tsv(file = file.path(sp,its,type,"multiaff.tsv")) %>% as_tibble() %>% separate(blast_taxonomy, ";(?=[\\S])", into = c("Kingdom","Phylum","Class","Order","Family","Genus","Species"), remove=FALSE)
      dada2_frogs_multiaff_infos <- readr::read_tsv(file = file.path(sp,its,type,"dada2_frogs_multiaff.tsv")) %>% as_tibble() %>% separate(blast_taxonomy, ";(?=[\\S])", into = c("Kingdom","Phylum","Class","Order","Family","Genus","Species"), remove=FALSE)
      
      mat <- fill_abundance_matrix(mat, frogs_infos, "FROGS", multiaff_infos=frogs_multiaff_infos)
      mat <- fill_abundance_matrix(mat, dada2_se_infos, "DADA2-se")
      mat <- fill_abundance_matrix(mat, dada2_pe_infos, "DADA2-pe")
      mat <- fill_abundance_matrix(mat, dada2_frogs_infos, "DADA2_FROGS", multiaff_infos=dada2_frogs_multiaff_infos)
      
      if(file.exists(file.path(path,"se-qiime.tsv"))){
        qiime_se_infos <- read.csv2(file.path(path,"se-qiime.tsv"), sep="\t", header=TRUE, stringsAsFactors = FALSE) %>% as_tibble() %>% mutate(Length = str_length(X.seed_sequence), blast_taxonomy = Taxon, observation_sum = as.integer(observation_sum))
        mat <- fill_abundance_matrix(mat, qiime_se_infos, "QIIME-se")
      }
      
      if(file.exists(file.path(path,"pe-qiime.tsv"))){
        qiime_pe_infos <- read.csv2(file.path(path,"pe-qiime.tsv"), sep="\t", header=TRUE, stringsAsFactors = FALSE) %>% as_tibble() %>% mutate(Length = str_length(X.seed_sequence), blast_taxonomy = Taxon, observation_sum = as.integer(observation_sum))
        mat <- fill_abundance_matrix(mat, qiime_pe_infos, "QIIME-pe")
      }
      
      if(file.exists(file.path(path,"usearch.tsv"))){
        usearch_infos <- read.csv2(file.path(path,"usearch.tsv"), sep="\t", header=TRUE, stringsAsFactors = FALSE) %>% as_tibble() %>% mutate(Length = str_length(X.seed_sequence), blast_taxonomy = taxonomy, observation_sum = as.integer(observation_sum))
        mat <- fill_abundance_matrix(mat, usearch_infos, "USEARCH")  
      }
      
      rownames(mat) <- rownames(mat) %>% as_tibble() %>% separate(value, ";(?=[\\S])", into = c("Kingdom","Phylum","Class","Order","Family","Genus","Species"), remove=FALSE) %>% select(Species) %>% pull()
      rownames(mat)[nrow(mat)] <- "Others"
      saveRDS(mat, mat_file)
    }else{
      mat <- readRDS(mat_file)
    }
    
    if(!metabarfood){
      mat <- subset( mat, select = -DADA2_FROGS )
    }
    
    l <-  build_heatmap_abundance_real(mat)
    h <- l[[1]]
    mat <- l[[2]]
    lgd_list <- l[[3]]
    draw(h, annotation_legend_list = lgd_list)
    cat(" \n")
  }
}

MEAT ITS1 DNA

MEAT ITS1 PCR

MEAT ITS2 DNA

MEAT ITS2 PCR

MEAT D1/D2 DNA

MEAT D1/D2 PCR

MEAT RPB2 DNA

MEAT RPB2 PCR

Reconstruction of the reference

for(its in amplicons){
  for(type in types){
    
    #path <- file.path("../benchmark_real",sp,its,type)
    path <- file.path(sp,its,type)
    mat_file <- file.path(path,paste0(sp,"_",its,"_",type,"_presencemat.rds"))
    
    if(its=="D1D2"){
      if(type=="ADN"){
        cat("  \n####",sp, " ", "D1/D2", " ", "DNA", " \n\n")
      }else{
        cat("  \n####",sp, " ", "D1/D2", " ", type, " \n\n")
      }
      
    }else{
      if(type=="ADN"){
        cat("  \n####",sp, " ", its, " ", "DNA", " \n\n")  
      }
      else{
        cat("  \n####",sp, " ", its, " ", type, " \n\n")    
      }
    }
    if(!file.exists(mat_file)){
      if(its == "ITS1"){
        species <- real_infos_dict$ITS1$Taxonomy
        real_infos <- real_infos_dict$ITS1
      }else if(its == "ITS2"){
        species <- real_infos_dict$ITS2$Taxonomy
        real_infos <- real_infos_dict$ITS2
      }else if(its == "D1D2"){
        species <- real_infos_dict$D1D2$Taxonomy
        real_infos <- real_infos_dict$D1D2
      }else if(its == "RPB2"){
        species <- real_infos_dict$RPB2$Taxonomy
        real_infos <- real_infos_dict$RPB2
      }
    }
    if(!file.exists(mat_file)){
      nb_expected_species <- length(species)
      mat = matrix(nrow = nb_expected_species, ncol = 8)
      rownames(mat) <- species
      colnames(mat) <- c("EXPECTED","FROGS","DADA2_FROGS","USEARCH","DADA2-se","DADA2-pe","QIIME-se","QIIME-pe")
      mat[,"EXPECTED"] <- 1
      
      mat[is.na(mat)] <- 0
      
      dada2_se_infos <- read.csv2(file.path(path,"dada2_se.tsv"), sep="\t", header=TRUE, stringsAsFactors = FALSE) %>% as_tibble() %>% mutate(Length = str_length(seed_sequence), blast_taxonomy = X.blast_taxonomy)
      dada2_pe_infos <- read.csv2(file.path(path,"dada2.tsv"), sep="\t", header=TRUE) %>% as_tibble() %>% mutate(Length = str_length(seed_sequence), blast_taxonomy = X.blast_taxonomy)
      frogs_infos <- read.csv2(file.path(path,"frogs.tsv"), sep="\t", header=TRUE, stringsAsFactors = FALSE) %>% as_tibble() %>% mutate(Length = str_length(seed_sequence))
      dada2_frogs_infos <- read.csv2(file.path(path,"dada2_frogs.tsv"), sep="\t", header=TRUE, stringsAsFactors = FALSE) %>% as_tibble() %>% mutate(Length = str_length(seed_sequence))
      
      frogs_multiaff_infos <- readr::read_tsv(file = file.path(sp,its,type,"multiaff.tsv")) %>% as_tibble() %>% separate(blast_taxonomy, ";(?=[\\S])", into = c("Kingdom","Phylum","Class","Order","Family","Genus","Species"), remove=FALSE) %>% mutate(observation_name = `#observation_name`)
      dada2_frogs_multiaff_infos <- readr::read_tsv(file = file.path(sp,its,type,"dada2_frogs_multiaff.tsv")) %>% as_tibble() %>% separate(blast_taxonomy, ";(?=[\\S])", into = c("Kingdom","Phylum","Class","Order","Family","Genus","Species"), remove=FALSE) %>% mutate(observation_name = `#observation_name`)
      
      mat <- fill_presence_matrix(mat, real_infos, frogs_infos, "FROGS", multiaff_infos=frogs_multiaff_infos)
      mat <- fill_presence_matrix(mat, real_infos, dada2_se_infos, "DADA2-se")
      mat <- fill_presence_matrix(mat, real_infos, dada2_pe_infos, "DADA2-pe")
      mat <- fill_presence_matrix(mat, real_infos, dada2_frogs_infos, "DADA2_FROGS", multiaff_infos=dada2_frogs_multiaff_infos)
      
      if(file.exists(file.path(path,"se-qiime.tsv"))){
        qiime_se_infos <- read.csv2(file.path(path,"se-qiime.tsv"), sep="\t", header=TRUE, stringsAsFactors = FALSE) %>% as_tibble() %>% mutate(Length = str_length(X.seed_sequence), blast_taxonomy = Taxon, observation_sum = as.integer(observation_sum), seed_sequence = X.seed_sequence)
        mat <- fill_presence_matrix(mat, real_infos, qiime_se_infos, "QIIME-se")
      }
      
      if(file.exists(file.path(path,"pe-qiime.tsv"))){
        qiime_pe_infos <- read.csv2(file.path(path,"pe-qiime.tsv"), sep="\t", header=TRUE, stringsAsFactors = FALSE) %>% as_tibble() %>% mutate(Length = str_length(X.seed_sequence), blast_taxonomy = Taxon, observation_sum = as.integer(observation_sum), seed_sequence = X.seed_sequence)
        mat <- fill_presence_matrix(mat, real_infos, qiime_pe_infos, "QIIME-pe")
      }
      
      
      if(file.exists(file.path(path,"usearch.tsv"))){
        usearch_infos <- read.csv2(file.path(path,"usearch.tsv"), sep="\t", header=TRUE, stringsAsFactors = FALSE) %>% as_tibble() %>% mutate(Length = str_length(X.seed_sequence), blast_taxonomy = taxonomy, observation_sum = as.integer(observation_sum), seed_sequence = X.seed_sequence)
        mat <- fill_presence_matrix(mat, real_infos, usearch_infos, "USEARCH")  
      }
      rownames(mat) <- rownames(mat) %>% as_tibble() %>% separate(value, ";(?=[\\S])", into = c("Kingdom","Phylum","Class","Order","Family","Genus","Species"), remove=FALSE) %>% select(Species) %>% pull()
      saveRDS(mat, mat_file)
    }else{
      mat <- readRDS(mat_file)
    }
    
    l <-  build_heatmap_presence_real(mat)
    
    tt <- summarise_all(as_tibble(ceiling(mat)),sum)
    table <- tt %>% kable(caption = "Count of OTU/ASV/ZOTU detected") %>% kable_styling("striped", full_width = F)
    cat(" \n")
    print(table)
    cat(" \n\n\n")
    
    tt <- summarise_all(as_tibble(floor(mat)),sum)
    table <- tt %>% kable(caption = "Count of OTU/ASV/ZOTU perfectly reconstructed") %>% kable_styling("striped", full_width = F)
    cat(" \n")
    print(table)
    cat(" \n\n\n")
    
    h <- l[[1]]
    mat <- l[[2]]
    
    draw(h)
    cat(" \n")
    cat(" \n")
  }
}

MEAT ITS1 DNA

Count of OTU/ASV/ZOTU detected
EXPECTED FROGS DADA2_FROGS USEARCH DADA2-se DADA2-pe QIIME-se QIIME-pe
40 34 35 24 32 30 28 28
Count of OTU/ASV/ZOTU perfectly reconstructed
EXPECTED FROGS DADA2_FROGS USEARCH DADA2-se DADA2-pe QIIME-se QIIME-pe
40 26 30 23 7 25 1 27

MEAT ITS1 PCR

Count of OTU/ASV/ZOTU detected
EXPECTED FROGS DADA2_FROGS USEARCH DADA2-se DADA2-pe QIIME-se QIIME-pe
40 38 38 29 36 34 31 31
Count of OTU/ASV/ZOTU perfectly reconstructed
EXPECTED FROGS DADA2_FROGS USEARCH DADA2-se DADA2-pe QIIME-se QIIME-pe
40 26 33 26 7 28 1 28

MEAT ITS2 DNA

Count of OTU/ASV/ZOTU detected
EXPECTED FROGS DADA2_FROGS USEARCH DADA2-se DADA2-pe QIIME-se QIIME-pe
40 35 36 25 29 28 25 25
Count of OTU/ASV/ZOTU perfectly reconstructed
EXPECTED FROGS DADA2_FROGS USEARCH DADA2-se DADA2-pe QIIME-se QIIME-pe
40 25 29 25 4 28 0 24

MEAT ITS2 PCR

Count of OTU/ASV/ZOTU detected
EXPECTED FROGS DADA2_FROGS USEARCH DADA2-se DADA2-pe QIIME-se QIIME-pe
40 37 36 25 31 30 25 25
Count of OTU/ASV/ZOTU perfectly reconstructed
EXPECTED FROGS DADA2_FROGS USEARCH DADA2-se DADA2-pe QIIME-se QIIME-pe
40 24 30 25 4 30 0 24

MEAT D1/D2 DNA

Count of OTU/ASV/ZOTU detected
EXPECTED FROGS DADA2_FROGS USEARCH DADA2-se DADA2-pe QIIME-se QIIME-pe
40 34 37 23 23 3 24 1
Count of OTU/ASV/ZOTU perfectly reconstructed
EXPECTED FROGS DADA2_FROGS USEARCH DADA2-se DADA2-pe QIIME-se QIIME-pe
40 27 32 0 0 0 0 0

MEAT D1/D2 PCR

Count of OTU/ASV/ZOTU detected
EXPECTED FROGS DADA2_FROGS USEARCH DADA2-se DADA2-pe QIIME-se QIIME-pe
40 35 37 23 24 4 24 1
Count of OTU/ASV/ZOTU perfectly reconstructed
EXPECTED FROGS DADA2_FROGS USEARCH DADA2-se DADA2-pe QIIME-se QIIME-pe
40 28 33 0 0 0 0 0

MEAT RPB2 DNA

Count of OTU/ASV/ZOTU detected
EXPECTED FROGS DADA2_FROGS USEARCH DADA2-se DADA2-pe QIIME-se QIIME-pe
40 29 32 0 34 11 33 4
Count of OTU/ASV/ZOTU perfectly reconstructed
EXPECTED FROGS DADA2_FROGS USEARCH DADA2-se DADA2-pe QIIME-se QIIME-pe
40 13 12 0 0 0 0 0

MEAT RPB2 PCR

Count of OTU/ASV/ZOTU detected
EXPECTED FROGS DADA2_FROGS USEARCH DADA2-se DADA2-pe QIIME-se QIIME-pe
40 33 33 0 34 4 33 7
Count of OTU/ASV/ZOTU perfectly reconstructed
EXPECTED FROGS DADA2_FROGS USEARCH DADA2-se DADA2-pe QIIME-se QIIME-pe
40 15 14 0 0 0 0 0

Abundance comparison according to marker and type (DNA/PCR)

for(tool in tools){
  cat("  \n####",tool, " \n\n")
  htmp_list <- c()
  htmp_list_presence <- c()
  min <- 0
  max <- 0
  for(its in amplicons){
    for(type in types){
      path <- file.path(sp,its,type)
      mat_file <- file.path(path,paste0(sp,"_",its,"_",type,"_abundmat.rds"))
      mat_presence_file <- file.path(path,paste0(sp,"_",its,"_",type,"_presencemat.rds"))
      mat <- readRDS(mat_file)
      mat_presence <- readRDS(mat_presence_file)
      submat <- mat %>% as_tibble() %>% select(tool) %>% as.matrix()
      submat_presence <- mat_presence %>% as_tibble() %>% select(tool) %>% as.matrix()
      rownames(submat) <- rownames(mat)
      rownames(submat_presence) <- rownames(mat_presence)
      submat[submat==0] <- NA
      #submat_presence[submat_presence==0] <- NA
      max <- max(max,max(submat,na.rm = TRUE))
      colnames(submat) <- paste0(its," ",type)
      colnames(submat_presence) <- paste0(its," ",type)
      ht <- build_heatmap_one_tool(submat, min, max)
      ht_presence <- build_heatmap_one_tool_presence(submat_presence, 0, 1)
      htmp_list <- c(htmp_list,ht)
      htmp_list_presence <- c(htmp_list_presence,ht_presence)
    }
  }
  
  l <- c()
  for(i in htmp_list){
    l <- l + i  
  }
  
  l_presence <- c()
  for(i in htmp_list_presence){
    l_presence <- l_presence + i  
  }
  
  draw(l, row_title = "Expected species",
      column_title_gp = gpar(fontsize = 16), ht_gap = unit(0.1, "cm"), annotation_legend_list = list(
      Legend(at = 1, title = "", labels=c("Absent"), legend_gp = gpar(fill=absent_color))
    ))
  cat(" \n")
  
  # draw(l_presence, row_title = "Expected species",
  #     column_title_gp = gpar(fontsize = 16), ht_gap = unit(0.1, "cm"), annotation_legend_list = list(
  #     Legend(at = 1, title = "", labels=c("Absent"), legend_gp = gpar(fill=absent_color)),
  #     Legend(at = 2, title = "", labels=c("Not well reconstructed"), legend_gp = gpar(fill="#b1c5c4")),
  #     Legend(at = 3, title = "", labels=c("Perfect"), legend_gp = gpar(fill="#86a5a4"))
  #     )
  #     )
  # cat(" \n")
}

FROGS

DADA2_FROGS

USEARCH

DADA2-pe

DADA2-se

QIIME-se

QIIME-pe

CHEESE

sp <- "CHEESE"
divergence_file <- file.path(sp,"divergence.tsv")

Expected vs. observed depth (count of sequences)

for(its in amplicons){
  for(type in types){
    path <- file.path(sp,its,type)
    infos_file <- file.path(path,paste0(sp,"_",its,"_",type,"_infos.rds"))
    cat("  \n####",sp, " ", its, " ", type, " \n\n")
    if(!(file.exists(infos_file))){
      dada2_se_infos <- read.csv2(file.path(path,"dada2_se.tsv"), sep="\t", header=TRUE) %>% as_tibble() %>% mutate(Length = str_length(seed_sequence), blast_taxonomy = X.blast_taxonomy)
      dada2_pe_infos <- read.csv2(file.path(path,"dada2.tsv"), sep="\t", header=TRUE) %>% as_tibble() %>% mutate(Length = str_length(seed_sequence), blast_taxonomy = X.blast_taxonomy)
      frogs_infos <- read.csv2(file.path(path,"frogs.tsv"), sep="\t", header=TRUE) %>% as_tibble() %>% mutate(Length = str_length(seed_sequence))
      dada2_frogs_infos <- read.csv2(file.path(path,"dada2_frogs.tsv"), sep="\t", header=TRUE) %>% as_tibble() %>% mutate(Length = str_length(seed_sequence))
      
      frogs_multiaff_infos <- readr::read_tsv(file = file.path(sp,its,type,"multiaff.tsv")) %>% as_tibble() %>% separate(blast_taxonomy, ";(?=[\\S])", into = c("Kingdom","Phylum","Class","Order","Family","Genus","Species"), remove=FALSE)
      dada2_frogs_multiaff_infos <- readr::read_tsv(file = file.path(sp,its,type,"dada2_frogs_multiaff.tsv")) %>% as_tibble() %>% separate(blast_taxonomy, ";(?=[\\S])", into = c("Kingdom","Phylum","Class","Order","Family","Genus","Species"), remove=FALSE)
      
      if(file.exists(file.path(path,"se-qiime.tsv"))){
        qiime_se_infos <- read.csv2(file.path(path,"se-qiime.tsv"), sep="\t", header=TRUE, stringsAsFactors = FALSE) %>% as_tibble() %>% mutate(Length = str_length(X.seed_sequence), blast_taxonomy = Taxon, observation_sum = as.integer(observation_sum))
      }
      
      if(file.exists(file.path(path,"pe-qiime.tsv"))){
        qiime_pe_infos <- read.csv2(file.path(path,"pe-qiime.tsv"), sep="\t", header=TRUE, stringsAsFactors = FALSE) %>% as_tibble() %>% mutate(Length = str_length(X.seed_sequence), blast_taxonomy = Taxon, observation_sum = as.integer(observation_sum))
      }
      
      if(file.exists(file.path(path,"usearch.tsv"))){
        usearch_infos <- read.csv2(file.path(path,"usearch.tsv"), sep="\t", header=TRUE, stringsAsFactors = FALSE) %>% as_tibble() %>% mutate(Length = str_length(X.seed_sequence), blast_taxonomy = taxonomy, observation_sum = as.integer(observation_sum))
      }
      
    
    if(its == "ITS1"){
      species <- real_infos_dict$ITS1$Taxonomy
      nb_species <- length(species)
    }else if(its == "ITS2"){
      species <- real_infos_dict$ITS2$Taxonomy
      nb_species <- length(species)
    }else if(its == "D1D2"){
      species <- real_infos_dict$D1D2$Taxonomy
      nb_species <- length(species)
    }else if(its == "RPB2"){
      species <- real_infos_dict$RPB2$Taxonomy
      nb_species <- length(species)
    }
      
    data <- tibble(richeness = c(
      nb_species,
      nrow(frogs_infos),
      nrow(dada2_frogs_infos),
      nrow(usearch_infos),
      nrow(dada2_se_infos),
      nrow(qiime_se_infos),
      nrow(dada2_pe_infos),
      nrow(qiime_pe_infos)
    ),
    tool = c("EXPECTED", 
             "FROGS", 
             "DADA2_FROGS",
             "USEARCH",
             "DADA2_SE",
             "QIIME_SE",
             "DADA2_PE",
             "QIIME_PE"),
    reads = c(
      metadata_infos %>% filter(Ecosystem == sp) %>% filter(Marker == its) %>% filter(str_detect(Sample, paste("^",type,sep=""))) %>% select(Reads) %>% sum,
      frogs_infos %>% select(observation_sum) %>% sum(),
      dada2_frogs_infos %>% select(observation_sum) %>% sum(),
      usearch_infos %>% select(observation_sum) %>% sum(),
      dada2_se_infos %>% select(observation_sum) %>% sum(),
      qiime_se_infos %>% select(observation_sum) %>% sum(),
      dada2_pe_infos %>% select(observation_sum) %>% sum(),
      qiime_pe_infos %>% select(observation_sum) %>% sum()
      
    ))
    data$tool <- factor(data$tool, levels=c("EXPECTED", 
                                            "FROGS", 
                                            "DADA2_FROGS",
                                            "USEARCH",
                                            "DADA2_SE",
                                            "QIIME_SE",
                                            "DADA2_PE",
                                            "QIIME_PE"))
    saveRDS(data, infos_file)
    }
    else{
      data <- readRDS(infos_file)
    }
    print(plot_depth(data))
    cat("\n")
  }
}

CHEESE ITS1 ADN

CHEESE ITS1 PCR

CHEESE ITS2 ADN

CHEESE ITS2 PCR

CHEESE D1D2 ADN

CHEESE D1D2 PCR

CHEESE RPB2 ADN

CHEESE RPB2 PCR

Expected vs. observed Richness (count of OTU, ASV, ZOTU)

for(its in amplicons){
  for(type in types){
    cat("  \n####",sp, " ", its, " ", type, " \n\n")
    path <- file.path(sp,its,type)
    infos_file <- file.path(path,paste0(sp,"_",its,"_",type,"_infos.rds"))
    data <- readRDS(infos_file)
    cat("\n")
    cat("\n")
    print(plot_richeness_graph(data))
    cat("\n")
  }
}

CHEESE ITS1 ADN

CHEESE ITS1 PCR

CHEESE ITS2 ADN

CHEESE ITS2 PCR

CHEESE D1D2 ADN

CHEESE D1D2 PCR

CHEESE RPB2 ADN

CHEESE RPB2 PCR

Metrics calculated in relation to the expected

The results obtained for each tool were compared to what was expected and different metrics were calculated. Affiliations and associated abundances are taken into account.

The metrics are:

  • Divergence (takes into account abundance)
  • False negatives
  • False positives
  • True positives
for(its in amplicons){
  for(type in types){
    cat("  \n####",sp, " ", its, " ", type, " \n")
    print(plot_divergence_real(type, its, divergence_file))
    cat(" \n")
    cat(" \n")
    print(plot_metrics_real(type, its, divergence_file))
    cat(" \n")
    print(plot_precision_and_recall_real(type, its, divergence_file))
    cat(" \n")
  }
}

CHEESE ITS1 ADN

CHEESE ITS1 PCR

CHEESE ITS2 ADN

CHEESE ITS2 PCR

CHEESE D1D2 ADN

CHEESE D1D2 PCR

CHEESE RPB2 ADN

CHEESE RPB2 PCR

cat("  \n###"," Comparison of markers and types for DADA2_FROGS", " \n")

Comparison of markers and types for DADA2_FROGS

print(plot_divergence(type, divergence_file) + facet_grid(~its))  

cat(" \n")
print(plot_metrics(type, divergence_file))

cat(" \n")
print(plot_precision_and_recall(type, divergence_file))

cat(" \n")

Taxonomies found vs. lost

Tools are clustered using the canberra distance.

for(its in amplicons){
  for(type in types){
    path <- file.path(sp,its,type)
    mat_file <- file.path(path,paste0(sp,"_",its,"_",type,"_abundmat.rds"))
    cat("  \n####",sp, " ", its, " ", type, " \n\n")
    if(!file.exists(mat_file)){
      if(its == "ITS1"){
        species <- real_infos_dict$ITS1$Taxonomy  
      }else if(its == "ITS2"){
        species <- real_infos_dict$ITS2$Taxonomy
      }else if(its == "D1D2"){
        species <- real_infos_dict$D1D2$Taxonomy
      }else if(its == "RPB2"){
        species <- real_infos_dict$RPB2$Taxonomy
      }
      
      nb_expected_species <- length(species)
      species <- c(species,"Others")
      mat = matrix(nrow = nb_expected_species+1, ncol = 8)
      rownames(mat) <- species
      colnames(mat) <- c(tools,"EXPECTED")
      mat[,"EXPECTED"] <- c(rep(1/nb_expected_species,times=nb_expected_species),0)
      
      mat[is.na(mat)] <- 0
      
      dada2_se_infos <- read.csv2(file.path(path,"dada2_se.tsv"), sep="\t", header=TRUE) %>% as_tibble() %>% mutate(Length = str_length(seed_sequence), blast_taxonomy = X.blast_taxonomy)
      dada2_pe_infos <- read.csv2(file.path(path,"dada2.tsv"), sep="\t", header=TRUE) %>% as_tibble() %>% mutate(Length = str_length(seed_sequence), blast_taxonomy = X.blast_taxonomy)
      frogs_infos <- read.csv2(file.path(path,"frogs.tsv"), sep="\t", header=TRUE) %>% as_tibble() %>% mutate(Length = str_length(seed_sequence))
      dada2_frogs_infos <- read.csv2(file.path(path,"dada2_frogs.tsv"), sep="\t", header=TRUE) %>% as_tibble() %>% mutate(Length = str_length(seed_sequence))
      
      frogs_multiaff_infos <- readr::read_tsv(file = file.path(sp,its,type,"multiaff.tsv")) %>% as_tibble() %>% separate(blast_taxonomy, ";(?=[\\S])", into = c("Kingdom","Phylum","Class","Order","Family","Genus","Species"), remove=FALSE)
      dada2_frogs_multiaff_infos <- readr::read_tsv(file = file.path(sp,its,type,"dada2_frogs_multiaff.tsv")) %>% as_tibble() %>% separate(blast_taxonomy, ";(?=[\\S])", into = c("Kingdom","Phylum","Class","Order","Family","Genus","Species"), remove=FALSE)
      
      mat <- fill_abundance_matrix(mat, frogs_infos, "FROGS", multiaff_infos=frogs_multiaff_infos)
      mat <- fill_abundance_matrix(mat, dada2_se_infos, "DADA2-se")
      mat <- fill_abundance_matrix(mat, dada2_pe_infos, "DADA2-pe")
      mat <- fill_abundance_matrix(mat, dada2_frogs_infos, "DADA2_FROGS", multiaff_infos=dada2_frogs_multiaff_infos)
      
      if(file.exists(file.path(path,"se-qiime.tsv"))){
        qiime_se_infos <- read.csv2(file.path(path,"se-qiime.tsv"), sep="\t", header=TRUE, stringsAsFactors = FALSE) %>% as_tibble() %>% mutate(Length = str_length(X.seed_sequence), blast_taxonomy = Taxon, observation_sum = as.integer(observation_sum))
        mat <- fill_abundance_matrix(mat, qiime_se_infos, "QIIME-se")
      }
      
      if(file.exists(file.path(path,"pe-qiime.tsv"))){
        qiime_pe_infos <- read.csv2(file.path(path,"pe-qiime.tsv"), sep="\t", header=TRUE, stringsAsFactors = FALSE) %>% as_tibble() %>% mutate(Length = str_length(X.seed_sequence), blast_taxonomy = Taxon, observation_sum = as.integer(observation_sum))
        mat <- fill_abundance_matrix(mat, qiime_pe_infos, "QIIME-pe")
      }
      
      if(file.exists(file.path(path,"usearch.tsv"))){
        usearch_infos <- read.csv2(file.path(path,"usearch.tsv"), sep="\t", header=TRUE, stringsAsFactors = FALSE) %>% as_tibble() %>% mutate(Length = str_length(X.seed_sequence), blast_taxonomy = taxonomy, observation_sum = as.integer(observation_sum))
        mat <- fill_abundance_matrix(mat, usearch_infos, "USEARCH")  
      }
      
      rownames(mat) <- rownames(mat) %>% as_tibble() %>% separate(value, ";(?=[\\S])", into = c("Kingdom","Phylum","Class","Order","Family","Genus","Species"), remove=FALSE) %>% select(Species) %>% pull()
      rownames(mat)[nrow(mat)] <- "Others"
      saveRDS(mat, mat_file)
    }else{
      mat <- readRDS(mat_file)
    }
    
    if(!metabarfood){
      mat <- subset( mat, select = -DADA2_FROGS )
    }
    
    l <-  build_heatmap_abundance_real(mat)
    h <- l[[1]]
    mat <- l[[2]]
    lgd_list <- l[[3]]
    draw(h, annotation_legend_list = lgd_list)
    cat(" \n")
  }
}

CHEESE ITS1 ADN

CHEESE ITS1 PCR

CHEESE ITS2 ADN

CHEESE ITS2 PCR

CHEESE D1D2 ADN

CHEESE D1D2 PCR

CHEESE RPB2 ADN

CHEESE RPB2 PCR

Reconstruction of the reference

for(its in amplicons){
  for(type in types){
    
    path <- file.path(sp,its,type)
    mat_file <- file.path(path,paste0(sp,"_",its,"_",type,"_presencemat.rds"))
    
    cat("  \n####",sp, " ", its, " ", type, " \n\n")
    if(!file.exists(mat_file)){
      if(its == "ITS1"){
        species <- real_infos_dict$ITS1$Taxonomy
        real_infos <- real_infos_dict$ITS1
      }else if(its == "ITS2"){
        species <- real_infos_dict$ITS2$Taxonomy
        real_infos <- real_infos_dict$ITS2
      }else if(its == "D1D2"){
        species <- real_infos_dict$D1D2$Taxonomy
        real_infos <- real_infos_dict$D1D2
      }else if(its == "RPB2"){
        species <- real_infos_dict$RPB2$Taxonomy
        real_infos <- real_infos_dict$RPB2
      }
    }
    
    if(!file.exists(mat_file)){
      nb_expected_species <- length(species)
      mat = matrix(nrow = nb_expected_species, ncol = 8)
      rownames(mat) <- species
      colnames(mat) <- c("EXPECTED","FROGS","DADA2_FROGS","USEARCH","DADA2-se","DADA2-pe","QIIME-se","QIIME-pe")
      mat[,"EXPECTED"] <- 1
      
      mat[is.na(mat)] <- 0
      
      dada2_se_infos <- read.csv2(file.path(path,"dada2_se.tsv"), sep="\t", header=TRUE, stringsAsFactors = FALSE) %>% as_tibble() %>% mutate(Length = str_length(seed_sequence), blast_taxonomy = X.blast_taxonomy)
      dada2_pe_infos <- read.csv2(file.path(path,"dada2.tsv"), sep="\t", header=TRUE) %>% as_tibble() %>% mutate(Length = str_length(seed_sequence), blast_taxonomy = X.blast_taxonomy)
      frogs_infos <- read.csv2(file.path(path,"frogs.tsv"), sep="\t", header=TRUE, stringsAsFactors = FALSE) %>% as_tibble() %>% mutate(Length = str_length(seed_sequence))
      dada2_frogs_infos <- read.csv2(file.path(path,"dada2_frogs.tsv"), sep="\t", header=TRUE, stringsAsFactors = FALSE) %>% as_tibble() %>% mutate(Length = str_length(seed_sequence))
      
      frogs_multiaff_infos <- readr::read_tsv(file = file.path(sp,its,type,"multiaff.tsv")) %>% as_tibble() %>% separate(blast_taxonomy, ";(?=[\\S])", into = c("Kingdom","Phylum","Class","Order","Family","Genus","Species"), remove=FALSE) %>% mutate(observation_name = `#observation_name`)
      dada2_frogs_multiaff_infos <- readr::read_tsv(file = file.path(sp,its,type,"dada2_frogs_multiaff.tsv")) %>% as_tibble() %>% separate(blast_taxonomy, ";(?=[\\S])", into = c("Kingdom","Phylum","Class","Order","Family","Genus","Species"), remove=FALSE) %>% mutate(observation_name = `#observation_name`)
      
      mat <- fill_presence_matrix(mat, real_infos, frogs_infos, "FROGS", multiaff_infos=frogs_multiaff_infos)
      mat <- fill_presence_matrix(mat, real_infos, dada2_se_infos, "DADA2-se")
      mat <- fill_presence_matrix(mat, real_infos, dada2_pe_infos, "DADA2-pe")
      mat <- fill_presence_matrix(mat, real_infos, dada2_frogs_infos, "DADA2_FROGS", multiaff_infos=dada2_frogs_multiaff_infos)
      
      if(file.exists(file.path(path,"se-qiime.tsv"))){
        qiime_se_infos <- read.csv2(file.path(path,"se-qiime.tsv"), sep="\t", header=TRUE, stringsAsFactors = FALSE) %>% as_tibble() %>% mutate(Length = str_length(X.seed_sequence), blast_taxonomy = Taxon, observation_sum = as.integer(observation_sum), seed_sequence = X.seed_sequence)
        mat <- fill_presence_matrix(mat, real_infos, qiime_se_infos, "QIIME-se")
      }else{
        mat[,"QIIME-se"] <- 0
      }
      
      if(file.exists(file.path(path,"pe-qiime.tsv"))){
        qiime_pe_infos <- read.csv2(file.path(path,"pe-qiime.tsv"), sep="\t", header=TRUE, stringsAsFactors = FALSE) %>% as_tibble() %>% mutate(Length = str_length(X.seed_sequence), blast_taxonomy = Taxon, observation_sum = as.integer(observation_sum), seed_sequence = X.seed_sequence)
        mat <- fill_presence_matrix(mat, real_infos, qiime_pe_infos, "QIIME-pe")
      }else{
        mat[,"QIIME-pe"] <- 0
      }
      
      
      if(file.exists(file.path(path,"usearch.tsv"))){
        usearch_infos <- read.csv2(file.path(path,"usearch.tsv"), sep="\t", header=TRUE, stringsAsFactors = FALSE) %>% as_tibble() %>% mutate(Length = str_length(X.seed_sequence), blast_taxonomy = taxonomy, observation_sum = as.integer(observation_sum), seed_sequence = X.seed_sequence)
        mat <- fill_presence_matrix(mat, real_infos, usearch_infos, "USEARCH")  
      }else{
        mat[,"USEARCH"] <- 0
      }
      rownames(mat) <- rownames(mat) %>% as_tibble() %>% separate(value, ";(?=[\\S])", into = c("Kingdom","Phylum","Class","Order","Family","Genus","Species"), remove=FALSE) %>% select(Species) %>% pull()
      saveRDS(mat, mat_file)
    }else{
      mat <- readRDS(mat_file)
    }
    
    l <-  build_heatmap_presence_real(mat)
    
    tt <- summarise_all(as_tibble(ceiling(mat)),sum)
    table <- tt %>% kable(caption = "Count of OTU/ASV/ZOTU detected") %>% kable_styling("striped", full_width = F)
    cat(" \n")
    print(table)
    cat(" \n\n\n")
    
    tt <- summarise_all(as_tibble(floor(mat)),sum)
    table <- tt %>% kable(caption = "Count of OTU/ASV/ZOTU perfectly reconstructed") %>% kable_styling("striped", full_width = F)
    cat(" \n")
    print(table)
    cat(" \n\n\n")
    
    h <- l[[1]]
    mat <- l[[2]]
    
    draw(h)
    cat(" \n")
    cat(" \n")
  }
}

CHEESE ITS1 ADN

Count of OTU/ASV/ZOTU detected
EXPECTED FROGS DADA2_FROGS USEARCH DADA2-se DADA2-pe QIIME-se QIIME-pe
25 23 23 20 23 24 23 23
Count of OTU/ASV/ZOTU perfectly reconstructed
EXPECTED FROGS DADA2_FROGS USEARCH DADA2-se DADA2-pe QIIME-se QIIME-pe
25 21 22 19 7 21 0 21

CHEESE ITS1 PCR

Count of OTU/ASV/ZOTU detected
EXPECTED FROGS DADA2_FROGS USEARCH DADA2-se DADA2-pe QIIME-se QIIME-pe
25 23 23 21 23 24 23 23
Count of OTU/ASV/ZOTU perfectly reconstructed
EXPECTED FROGS DADA2_FROGS USEARCH DADA2-se DADA2-pe QIIME-se QIIME-pe
25 21 22 20 7 22 0 21

CHEESE ITS2 ADN

Count of OTU/ASV/ZOTU detected
EXPECTED FROGS DADA2_FROGS USEARCH DADA2-se DADA2-pe QIIME-se QIIME-pe
25 24 24 0 24 24 24 24
Count of OTU/ASV/ZOTU perfectly reconstructed
EXPECTED FROGS DADA2_FROGS USEARCH DADA2-se DADA2-pe QIIME-se QIIME-pe
25 23 23 0 4 22 0 22

CHEESE ITS2 PCR

Count of OTU/ASV/ZOTU detected
EXPECTED FROGS DADA2_FROGS USEARCH DADA2-se DADA2-pe QIIME-se QIIME-pe
25 24 24 23 24 24 24 25
Count of OTU/ASV/ZOTU perfectly reconstructed
EXPECTED FROGS DADA2_FROGS USEARCH DADA2-se DADA2-pe QIIME-se QIIME-pe
25 23 23 23 4 23 0 22

CHEESE D1D2 ADN

Count of OTU/ASV/ZOTU detected
EXPECTED FROGS DADA2_FROGS USEARCH DADA2-se DADA2-pe QIIME-se QIIME-pe
25 21 21 20 20 10 19 14
Count of OTU/ASV/ZOTU perfectly reconstructed
EXPECTED FROGS DADA2_FROGS USEARCH DADA2-se DADA2-pe QIIME-se QIIME-pe
25 13 14 0 0 0 0 0

CHEESE D1D2 PCR

Count of OTU/ASV/ZOTU detected
EXPECTED FROGS DADA2_FROGS USEARCH DADA2-se DADA2-pe QIIME-se QIIME-pe
25 20 21 19 20 5 19 7
Count of OTU/ASV/ZOTU perfectly reconstructed
EXPECTED FROGS DADA2_FROGS USEARCH DADA2-se DADA2-pe QIIME-se QIIME-pe
25 13 14 0 0 0 0 0

CHEESE RPB2 ADN

Count of OTU/ASV/ZOTU detected
EXPECTED FROGS DADA2_FROGS USEARCH DADA2-se DADA2-pe QIIME-se QIIME-pe
25 18 17 22 22 8 23 13
Count of OTU/ASV/ZOTU perfectly reconstructed
EXPECTED FROGS DADA2_FROGS USEARCH DADA2-se DADA2-pe QIIME-se QIIME-pe
25 13 12 0 0 0 0 0

CHEESE RPB2 PCR

Count of OTU/ASV/ZOTU detected
EXPECTED FROGS DADA2_FROGS USEARCH DADA2-se DADA2-pe QIIME-se QIIME-pe
25 15 15 0 20 6 21 6
Count of OTU/ASV/ZOTU perfectly reconstructed
EXPECTED FROGS DADA2_FROGS USEARCH DADA2-se DADA2-pe QIIME-se QIIME-pe
25 10 10 0 0 0 0 0

Abundance comparison according to marker and type (DNA/PCR)

for(tool in tools){
  cat("  \n####",tool, " \n\n")
  htmp_list <- c()
  htmp_list_presence <- c()
  min <- 0
  max <- 0
  for(its in amplicons){
    for(type in types){
      path <- file.path(sp,its,type)
      mat_file <- file.path(path,paste0(sp,"_",its,"_",type,"_abundmat.rds"))
      mat_presence_file <- file.path(path,paste0(sp,"_",its,"_",type,"_presencemat.rds"))
      mat <- readRDS(mat_file)
      mat_presence <- readRDS(mat_presence_file)
      submat <- mat %>% as_tibble() %>% select(tool) %>% as.matrix()
      submat_presence <- mat_presence %>% as_tibble() %>% select(tool) %>% as.matrix()
      rownames(submat) <- rownames(mat)
      rownames(submat_presence) <- rownames(mat_presence)
      submat[submat==0] <- NA
      #submat_presence[submat_presence==0] <- NA
      max <- max(max,max(submat,na.rm = TRUE))
      colnames(submat) <- paste0(its," ",type)
      colnames(submat_presence) <- paste0(its," ",type)
      ht <- build_heatmap_one_tool(submat, min, max)
      ht_presence <- build_heatmap_one_tool_presence(submat_presence, 0, 1)
      htmp_list <- c(htmp_list,ht)
      htmp_list_presence <- c(htmp_list_presence,ht_presence)
    }
  }
  
  l <- c()
  for(i in htmp_list){
    l <- l + i  
  }
  
  l_presence <- c()
  for(i in htmp_list_presence){
    l_presence <- l_presence + i  
  }
  
  draw(l, row_title = "Expected species",
      column_title_gp = gpar(fontsize = 16), ht_gap = unit(0.1, "cm"), annotation_legend_list = list(
      Legend(at = 1, title = "", labels=c("Absent"), legend_gp = gpar(fill=absent_color))
    ))
  cat(" \n")
  
  # draw(l_presence, row_title = "Expected species",
  #     column_title_gp = gpar(fontsize = 16), ht_gap = unit(0.1, "cm"), annotation_legend_list = list(
  #     Legend(at = 1, title = "", labels=c("Absent"), legend_gp = gpar(fill=absent_color)),
  #     Legend(at = 2, title = "", labels=c("Not well reconstructed"), legend_gp = gpar(fill="#b1c5c4")),
  #     Legend(at = 3, title = "", labels=c("Perfect"), legend_gp = gpar(fill="#86a5a4"))
  #     )
  #     )
  # cat(" \n")
}

FROGS

DADA2_FROGS

USEARCH

DADA2-pe

DADA2-se

QIIME-se

QIIME-pe

BREAD

sp <- "BREAD"
divergence_file <- file.path(sp,"divergence.tsv")

Expected vs. observed depth (count of sequences)

for(its in amplicons){
  for(type in types){
    path <- file.path(sp,its,type)
    infos_file <- file.path(path,paste0(sp,"_",its,"_",type,"_infos.rds"))
    cat("  \n####",sp, " ", its, " ", type, " \n\n")
    if(!(file.exists(infos_file))){
      dada2_se_infos <- read.csv2(file.path(path,"dada2_se.tsv"), sep="\t", header=TRUE) %>% as_tibble() %>% mutate(Length = str_length(seed_sequence), blast_taxonomy = X.blast_taxonomy)
      dada2_pe_infos <- read.csv2(file.path(path,"dada2.tsv"), sep="\t", header=TRUE) %>% as_tibble() %>% mutate(Length = str_length(seed_sequence), blast_taxonomy = X.blast_taxonomy)
      frogs_infos <- read.csv2(file.path(path,"frogs.tsv"), sep="\t", header=TRUE) %>% as_tibble() %>% mutate(Length = str_length(seed_sequence))
      dada2_frogs_infos <- read.csv2(file.path(path,"dada2_frogs.tsv"), sep="\t", header=TRUE) %>% as_tibble() %>% mutate(Length = str_length(seed_sequence))
      
      frogs_multiaff_infos <- readr::read_tsv(file = file.path(sp,its,type,"multiaff.tsv")) %>% as_tibble() %>% separate(blast_taxonomy, ";(?=[\\S])", into = c("Kingdom","Phylum","Class","Order","Family","Genus","Species"), remove=FALSE)
      dada2_frogs_multiaff_infos <- readr::read_tsv(file = file.path(sp,its,type,"dada2_frogs_multiaff.tsv")) %>% as_tibble() %>% separate(blast_taxonomy, ";(?=[\\S])", into = c("Kingdom","Phylum","Class","Order","Family","Genus","Species"), remove=FALSE)
      
      if(file.exists(file.path(path,"se-qiime.tsv"))){
        qiime_se_infos <- read.csv2(file.path(path,"se-qiime.tsv"), sep="\t", header=TRUE, stringsAsFactors = FALSE) %>% as_tibble() %>% mutate(Length = str_length(X.seed_sequence), blast_taxonomy = Taxon, observation_sum = as.integer(observation_sum))
      }
      
      if(file.exists(file.path(path,"pe-qiime.tsv"))){
        qiime_pe_infos <- read.csv2(file.path(path,"pe-qiime.tsv"), sep="\t", header=TRUE, stringsAsFactors = FALSE) %>% as_tibble() %>% mutate(Length = str_length(X.seed_sequence), blast_taxonomy = Taxon, observation_sum = as.integer(observation_sum))
      }
      
      if(file.exists(file.path(path,"usearch.tsv"))){
        usearch_infos <- read.csv2(file.path(path,"usearch.tsv"), sep="\t", header=TRUE, stringsAsFactors = FALSE) %>% as_tibble() %>% mutate(Length = str_length(X.seed_sequence), blast_taxonomy = taxonomy, observation_sum = as.integer(observation_sum))
      }
      
    
    if(its == "ITS1"){
      species <- real_infos_dict$ITS1$Taxonomy
      nb_species <- length(species)
    }else if(its == "ITS2"){
      species <- real_infos_dict$ITS2$Taxonomy
      nb_species <- length(species)
    }else if(its == "D1D2"){
      species <- real_infos_dict$D1D2$Taxonomy
      nb_species <- length(species)
    }else if(its == "RPB2"){
      species <- real_infos_dict$RPB2$Taxonomy
      nb_species <- length(species)
    }
      
    data <- tibble(richeness = c(
      nb_species,
      nrow(frogs_infos),
      nrow(dada2_frogs_infos),
      nrow(usearch_infos),
      nrow(dada2_se_infos),
      nrow(qiime_se_infos),
      nrow(dada2_pe_infos),
      nrow(qiime_pe_infos)
    ),
    tool = c("EXPECTED", 
             "FROGS", 
             "DADA2_FROGS",
             "USEARCH",
             "DADA2_SE",
             "QIIME_SE",
             "DADA2_PE",
             "QIIME_PE"),
    reads = c(
      metadata_infos %>% filter(Ecosystem == sp) %>% filter(Marker == its) %>% filter(str_detect(Sample, paste("^",type,sep=""))) %>% select(Reads) %>% sum,
      frogs_infos %>% select(observation_sum) %>% sum(),
      dada2_frogs_infos %>% select(observation_sum) %>% sum(),
      usearch_infos %>% select(observation_sum) %>% sum(),
      dada2_se_infos %>% select(observation_sum) %>% sum(),
      qiime_se_infos %>% select(observation_sum) %>% sum(),
      dada2_pe_infos %>% select(observation_sum) %>% sum(),
      qiime_pe_infos %>% select(observation_sum) %>% sum()
      
    ))
    data$tool <- factor(data$tool, levels=c("EXPECTED", 
                                            "FROGS", 
                                            "DADA2_FROGS",
                                            "USEARCH",
                                            "DADA2_SE",
                                            "QIIME_SE",
                                            "DADA2_PE",
                                            "QIIME_PE"))
    saveRDS(data, infos_file)
    }
    else{
      data <- readRDS(infos_file)
    }
    print(plot_depth(data))
    cat("\n")
  }
}

BREAD ITS1 ADN

BREAD ITS1 PCR

BREAD ITS2 ADN

BREAD ITS2 PCR

BREAD D1D2 ADN

BREAD D1D2 PCR

BREAD RPB2 ADN

BREAD RPB2 PCR

Expected vs. observed Richness (count of OTU, ASV, ZOTU)

for(its in amplicons){
  for(type in types){
    cat("  \n####",sp, " ", its, " ", type, " \n\n")
    path <- file.path(sp,its,type)
    infos_file <- file.path(path,paste0(sp,"_",its,"_",type,"_infos.rds"))
    data <- readRDS(infos_file)
    cat("\n")
    cat("\n")
    print(plot_richeness_graph(data))
    cat("\n")
  }
}

BREAD ITS1 ADN

BREAD ITS1 PCR

BREAD ITS2 ADN

BREAD ITS2 PCR

BREAD D1D2 ADN

BREAD D1D2 PCR

BREAD RPB2 ADN

BREAD RPB2 PCR

Metrics calculated in relation to the expected

The results obtained for each tool were compared to what was expected and different metrics were calculated. Affiliations and associated abundances are taken into account.

The metrics are:

  • Divergence (takes into account abundance)
  • False negatives
  • False positives
  • True positives
for(its in amplicons){
  for(type in types){
    cat("  \n####",sp, " ", its, " ", type, " \n")
    print(plot_divergence_real(type, its, divergence_file))
    cat(" \n")
    cat(" \n")
    print(plot_metrics_real(type, its, divergence_file))
    cat(" \n")
    print(plot_precision_and_recall_real(type, its, divergence_file))
    cat(" \n")
  }
}

BREAD ITS1 ADN

BREAD ITS1 PCR

BREAD ITS2 ADN

BREAD ITS2 PCR

BREAD D1D2 ADN

BREAD D1D2 PCR

BREAD RPB2 ADN

BREAD RPB2 PCR

cat("  \n###"," Comparison of markers and types for DADA2_FROGS", " \n")

Comparison of markers and types for DADA2_FROGS

print(plot_divergence(type, divergence_file) + facet_grid(~its))  

cat(" \n")
print(plot_metrics(type, divergence_file))

cat(" \n")
print(plot_precision_and_recall(type, divergence_file))

cat(" \n")

Taxonomies found vs. lost

Tools are clustered using the canberra distance.

for(its in amplicons){
  for(type in types){
    path <- file.path(sp,its,type)
    mat_file <- file.path(path,paste0(sp,"_",its,"_",type,"_abundmat.rds"))
    cat("  \n####",sp, " ", its, " ", type, " \n\n")
    if(!file.exists(mat_file)){
      if(its == "ITS1"){
        species <- real_infos_dict$ITS1$Taxonomy  
      }else if(its == "ITS2"){
        species <- real_infos_dict$ITS2$Taxonomy
      }else if(its == "D1D2"){
        species <- real_infos_dict$D1D2$Taxonomy
      }else if(its == "RPB2"){
        species <- real_infos_dict$RPB2$Taxonomy
      }
      
      nb_expected_species <- length(species)
      species <- c(species,"Others")
      mat = matrix(nrow = nb_expected_species+1, ncol = 8)
      rownames(mat) <- species
      colnames(mat) <- c(tools,"EXPECTED")
      mat[,"EXPECTED"] <- c(rep(1/nb_expected_species,times=nb_expected_species),0)
      
      mat[is.na(mat)] <- 0
      
      dada2_se_infos <- read.csv2(file.path(path,"dada2_se.tsv"), sep="\t", header=TRUE) %>% as_tibble() %>% mutate(Length = str_length(seed_sequence), blast_taxonomy = X.blast_taxonomy)
      dada2_pe_infos <- read.csv2(file.path(path,"dada2.tsv"), sep="\t", header=TRUE) %>% as_tibble() %>% mutate(Length = str_length(seed_sequence), blast_taxonomy = X.blast_taxonomy)
      frogs_infos <- read.csv2(file.path(path,"frogs.tsv"), sep="\t", header=TRUE) %>% as_tibble() %>% mutate(Length = str_length(seed_sequence))
      dada2_frogs_infos <- read.csv2(file.path(path,"dada2_frogs.tsv"), sep="\t", header=TRUE) %>% as_tibble() %>% mutate(Length = str_length(seed_sequence))
      
      frogs_multiaff_infos <- readr::read_tsv(file = file.path(sp,its,type,"multiaff.tsv")) %>% as_tibble() %>% separate(blast_taxonomy, ";(?=[\\S])", into = c("Kingdom","Phylum","Class","Order","Family","Genus","Species"), remove=FALSE)
      dada2_frogs_multiaff_infos <- readr::read_tsv(file = file.path(sp,its,type,"dada2_frogs_multiaff.tsv")) %>% as_tibble() %>% separate(blast_taxonomy, ";(?=[\\S])", into = c("Kingdom","Phylum","Class","Order","Family","Genus","Species"), remove=FALSE)
      
      mat <- fill_abundance_matrix(mat, frogs_infos, "FROGS", multiaff_infos=frogs_multiaff_infos)
      mat <- fill_abundance_matrix(mat, dada2_se_infos, "DADA2-se")
      mat <- fill_abundance_matrix(mat, dada2_pe_infos, "DADA2-pe")
      mat <- fill_abundance_matrix(mat, dada2_frogs_infos, "DADA2_FROGS", multiaff_infos=dada2_frogs_multiaff_infos)
      
      if(file.exists(file.path(path,"se-qiime.tsv"))){
        qiime_se_infos <- read.csv2(file.path(path,"se-qiime.tsv"), sep="\t", header=TRUE, stringsAsFactors = FALSE) %>% as_tibble() %>% mutate(Length = str_length(X.seed_sequence), blast_taxonomy = Taxon, observation_sum = as.integer(observation_sum))
        mat <- fill_abundance_matrix(mat, qiime_se_infos, "QIIME-se")
      }
      
      if(file.exists(file.path(path,"pe-qiime.tsv"))){
        qiime_pe_infos <- read.csv2(file.path(path,"pe-qiime.tsv"), sep="\t", header=TRUE, stringsAsFactors = FALSE) %>% as_tibble() %>% mutate(Length = str_length(X.seed_sequence), blast_taxonomy = Taxon, observation_sum = as.integer(observation_sum))
        mat <- fill_abundance_matrix(mat, qiime_pe_infos, "QIIME-pe")
      }
      
      if(file.exists(file.path(path,"usearch.tsv"))){
        usearch_infos <- read.csv2(file.path(path,"usearch.tsv"), sep="\t", header=TRUE, stringsAsFactors = FALSE) %>% as_tibble() %>% mutate(Length = str_length(X.seed_sequence), blast_taxonomy = taxonomy, observation_sum = as.integer(observation_sum))
        mat <- fill_abundance_matrix(mat, usearch_infos, "USEARCH")  
      }
      
      rownames(mat) <- rownames(mat) %>% as_tibble() %>% separate(value, ";(?=[\\S])", into = c("Kingdom","Phylum","Class","Order","Family","Genus","Species"), remove=FALSE) %>% select(Species) %>% pull()
      rownames(mat)[nrow(mat)] <- "Others"
      saveRDS(mat, mat_file)
    }else{
      mat <- readRDS(mat_file)
    }
    
    if(!metabarfood){
      mat <- subset( mat, select = -DADA2_FROGS )
    }
    
    l <-  build_heatmap_abundance_real(mat)
    h <- l[[1]]
    mat <- l[[2]]
    lgd_list <- l[[3]]
    draw(h, annotation_legend_list = lgd_list)
    cat(" \n")
  }
}

BREAD ITS1 ADN

BREAD ITS1 PCR

BREAD ITS2 ADN

BREAD ITS2 PCR

BREAD D1D2 ADN

BREAD D1D2 PCR

BREAD RPB2 ADN

BREAD RPB2 PCR

Reconstruction of the reference

for(its in amplicons){
  for(type in types){
    
    path <- file.path(sp,its,type)
    mat_file <- file.path(path,paste0(sp,"_",its,"_",type,"_presencemat.rds"))
    
    cat("  \n####",sp, " ", its, " ", type, " \n\n")
    if(!file.exists(mat_file)){
      if(its == "ITS1"){
        species <- real_infos_dict$ITS1$Taxonomy
        real_infos <- real_infos_dict$ITS1
      }else if(its == "ITS2"){
        species <- real_infos_dict$ITS2$Taxonomy
        real_infos <- real_infos_dict$ITS2
      }else if(its == "D1D2"){
        species <- real_infos_dict$D1D2$Taxonomy
        real_infos <- real_infos_dict$D1D2
      }else if(its == "RPB2"){
        species <- real_infos_dict$RPB2$Taxonomy
        real_infos <- real_infos_dict$RPB2
      }
    }
    if(!file.exists(mat_file)){
      nb_expected_species <- length(species)
      mat = matrix(nrow = nb_expected_species, ncol = 8)
      rownames(mat) <- species
      colnames(mat) <- c("EXPECTED","FROGS","DADA2_FROGS","USEARCH","DADA2-se","DADA2-pe","QIIME-se","QIIME-pe")
      mat[,"EXPECTED"] <- 1
      
      mat[is.na(mat)] <- 0
      
      dada2_se_infos <- read.csv2(file.path(path,"dada2_se.tsv"), sep="\t", header=TRUE, stringsAsFactors = FALSE) %>% as_tibble() %>% mutate(Length = str_length(seed_sequence), blast_taxonomy = X.blast_taxonomy)
      dada2_pe_infos <- read.csv2(file.path(path,"dada2.tsv"), sep="\t", header=TRUE) %>% as_tibble() %>% mutate(Length = str_length(seed_sequence), blast_taxonomy = X.blast_taxonomy)
      frogs_infos <- read.csv2(file.path(path,"frogs.tsv"), sep="\t", header=TRUE, stringsAsFactors = FALSE) %>% as_tibble() %>% mutate(Length = str_length(seed_sequence))
      dada2_frogs_infos <- read.csv2(file.path(path,"dada2_frogs.tsv"), sep="\t", header=TRUE, stringsAsFactors = FALSE) %>% as_tibble() %>% mutate(Length = str_length(seed_sequence))
      
      frogs_multiaff_infos <- readr::read_tsv(file = file.path(sp,its,type,"multiaff.tsv")) %>% as_tibble() %>% separate(blast_taxonomy, ";(?=[\\S])", into = c("Kingdom","Phylum","Class","Order","Family","Genus","Species"), remove=FALSE) %>% mutate(observation_name = `#observation_name`)
      dada2_frogs_multiaff_infos <- readr::read_tsv(file = file.path(sp,its,type,"dada2_frogs_multiaff.tsv")) %>% as_tibble() %>% separate(blast_taxonomy, ";(?=[\\S])", into = c("Kingdom","Phylum","Class","Order","Family","Genus","Species"), remove=FALSE) %>% mutate(observation_name = `#observation_name`)
      
      mat <- fill_presence_matrix(mat, real_infos, frogs_infos, "FROGS", multiaff_infos=frogs_multiaff_infos)
      mat <- fill_presence_matrix(mat, real_infos, dada2_se_infos, "DADA2-se")
      mat <- fill_presence_matrix(mat, real_infos, dada2_pe_infos, "DADA2-pe")
      mat <- fill_presence_matrix(mat, real_infos, dada2_frogs_infos, "DADA2_FROGS", multiaff_infos=dada2_frogs_multiaff_infos)
      
      if(file.exists(file.path(path,"se-qiime.tsv"))){
        qiime_se_infos <- read.csv2(file.path(path,"se-qiime.tsv"), sep="\t", header=TRUE, stringsAsFactors = FALSE) %>% as_tibble() %>% mutate(Length = str_length(X.seed_sequence), blast_taxonomy = Taxon, observation_sum = as.integer(observation_sum), seed_sequence = X.seed_sequence)
        mat <- fill_presence_matrix(mat, real_infos, qiime_se_infos, "QIIME-se")
      }else{
        mat[,"QIIME-se"] <- 0
      }
      
      if(file.exists(file.path(path,"pe-qiime.tsv"))){
        qiime_pe_infos <- read.csv2(file.path(path,"pe-qiime.tsv"), sep="\t", header=TRUE, stringsAsFactors = FALSE) %>% as_tibble() %>% mutate(Length = str_length(X.seed_sequence), blast_taxonomy = Taxon, observation_sum = as.integer(observation_sum), seed_sequence = X.seed_sequence)
        mat <- fill_presence_matrix(mat, real_infos, qiime_pe_infos, "QIIME-pe")
      }else{
        mat[,"QIIME-pe"] <- 0
      }
      
      
      if(file.exists(file.path(path,"usearch.tsv"))){
        usearch_infos <- read.csv2(file.path(path,"usearch.tsv"), sep="\t", header=TRUE, stringsAsFactors = FALSE) %>% as_tibble() %>% mutate(Length = str_length(X.seed_sequence), blast_taxonomy = taxonomy, observation_sum = as.integer(observation_sum), seed_sequence = X.seed_sequence)
        mat <- fill_presence_matrix(mat, real_infos, usearch_infos, "USEARCH")  
      }else{
        mat[,"USEARCH"] <- 0
      }
      rownames(mat) <- rownames(mat) %>% as_tibble() %>% separate(value, ";(?=[\\S])", into = c("Kingdom","Phylum","Class","Order","Family","Genus","Species"), remove=FALSE) %>% select(Species) %>% pull()
      saveRDS(mat, mat_file)
    }else{
      mat <- readRDS(mat_file)
    }
    
    l <-  build_heatmap_presence_real(mat)
    
    tt <- summarise_all(as_tibble(ceiling(mat)),sum)
    table <- tt %>% kable(caption = "Count of OTU/ASV/ZOTU detected") %>% kable_styling("striped", full_width = F)
    cat(" \n")
    print(table)
    cat(" \n\n\n")
    
    tt <- summarise_all(as_tibble(floor(mat)),sum)
    table <- tt %>% kable(caption = "Count of OTU/ASV/ZOTU perfectly reconstructed") %>% kable_styling("striped", full_width = F)
    cat(" \n")
    print(table)
    cat(" \n\n\n")
    
    h <- l[[1]]
    mat <- l[[2]]
    
    draw(h)
    cat(" \n")
    cat(" \n")
  }
}

BREAD ITS1 ADN

Count of OTU/ASV/ZOTU detected
EXPECTED FROGS DADA2_FROGS USEARCH DADA2-se DADA2-pe QIIME-se QIIME-pe
27 21 21 20 22 20 23 22
Count of OTU/ASV/ZOTU perfectly reconstructed
EXPECTED FROGS DADA2_FROGS USEARCH DADA2-se DADA2-pe QIIME-se QIIME-pe
27 19 19 16 6 17 1 18

BREAD ITS1 PCR

Count of OTU/ASV/ZOTU detected
EXPECTED FROGS DADA2_FROGS USEARCH DADA2-se DADA2-pe QIIME-se QIIME-pe
27 21 21 20 21 20 23 20
Count of OTU/ASV/ZOTU perfectly reconstructed
EXPECTED FROGS DADA2_FROGS USEARCH DADA2-se DADA2-pe QIIME-se QIIME-pe
27 19 18 16 6 16 1 18

BREAD ITS2 ADN

Count of OTU/ASV/ZOTU detected
EXPECTED FROGS DADA2_FROGS USEARCH DADA2-se DADA2-pe QIIME-se QIIME-pe
27 21 21 20 21 21 21 21
Count of OTU/ASV/ZOTU perfectly reconstructed
EXPECTED FROGS DADA2_FROGS USEARCH DADA2-se DADA2-pe QIIME-se QIIME-pe
27 20 20 19 0 20 0 20

BREAD ITS2 PCR

Count of OTU/ASV/ZOTU detected
EXPECTED FROGS DADA2_FROGS USEARCH DADA2-se DADA2-pe QIIME-se QIIME-pe
27 21 21 20 21 21 21 22
Count of OTU/ASV/ZOTU perfectly reconstructed
EXPECTED FROGS DADA2_FROGS USEARCH DADA2-se DADA2-pe QIIME-se QIIME-pe
27 20 19 18 0 19 0 19

BREAD D1D2 ADN

Count of OTU/ASV/ZOTU detected
EXPECTED FROGS DADA2_FROGS USEARCH DADA2-se DADA2-pe QIIME-se QIIME-pe
27 21 22 0 18 0 16 0
Count of OTU/ASV/ZOTU perfectly reconstructed
EXPECTED FROGS DADA2_FROGS USEARCH DADA2-se DADA2-pe QIIME-se QIIME-pe
27 18 18 0 0 0 0 0

BREAD D1D2 PCR

Count of OTU/ASV/ZOTU detected
EXPECTED FROGS DADA2_FROGS USEARCH DADA2-se DADA2-pe QIIME-se QIIME-pe
27 21 21 16 18 3 17 5
Count of OTU/ASV/ZOTU perfectly reconstructed
EXPECTED FROGS DADA2_FROGS USEARCH DADA2-se DADA2-pe QIIME-se QIIME-pe
27 18 18 0 0 0 0 0

BREAD RPB2 ADN

Count of OTU/ASV/ZOTU detected
EXPECTED FROGS DADA2_FROGS USEARCH DADA2-se DADA2-pe QIIME-se QIIME-pe
27 10 10 0 19 2 20 3
Count of OTU/ASV/ZOTU perfectly reconstructed
EXPECTED FROGS DADA2_FROGS USEARCH DADA2-se DADA2-pe QIIME-se QIIME-pe
27 6 6 0 0 0 0 0

BREAD RPB2 PCR

Count of OTU/ASV/ZOTU detected
EXPECTED FROGS DADA2_FROGS USEARCH DADA2-se DADA2-pe QIIME-se QIIME-pe
27 9 9 0 16 3 20 5
Count of OTU/ASV/ZOTU perfectly reconstructed
EXPECTED FROGS DADA2_FROGS USEARCH DADA2-se DADA2-pe QIIME-se QIIME-pe
27 3 4 0 0 0 0 0

Abundance comparison according to marker and type (DNA/PCR)

for(tool in tools){
  cat("  \n####",tool, " \n\n")
  htmp_list <- c()
  htmp_list_presence <- c()
  min <- 0
  max <- 0
  for(its in amplicons){
    for(type in types){
      path <- file.path(sp,its,type)
      mat_file <- file.path(path,paste0(sp,"_",its,"_",type,"_abundmat.rds"))
      mat_presence_file <- file.path(path,paste0(sp,"_",its,"_",type,"_presencemat.rds"))
      mat <- readRDS(mat_file)
      mat_presence <- readRDS(mat_presence_file)
      submat <- mat %>% as_tibble() %>% select(tool) %>% as.matrix()
      submat_presence <- mat_presence %>% as_tibble() %>% select(tool) %>% as.matrix()
      rownames(submat) <- rownames(mat)
      rownames(submat_presence) <- rownames(mat_presence)
      submat[submat==0] <- NA
      #submat_presence[submat_presence==0] <- NA
      max <- max(max,max(submat,na.rm = TRUE))
      colnames(submat) <- paste0(its," ",type)
      colnames(submat_presence) <- paste0(its," ",type)
      ht <- build_heatmap_one_tool(submat, min, max)
      ht_presence <- build_heatmap_one_tool_presence(submat_presence, 0, 1)
      htmp_list <- c(htmp_list,ht)
      htmp_list_presence <- c(htmp_list_presence,ht_presence)
    }
  }
  
  l <- c()
  for(i in htmp_list){
    l <- l + i  
  }
  
  l_presence <- c()
  for(i in htmp_list_presence){
    l_presence <- l_presence + i  
  }
  
  draw(l, row_title = "Expected species",
      column_title_gp = gpar(fontsize = 16), ht_gap = unit(0.1, "cm"), annotation_legend_list = list(
      Legend(at = 1, title = "", labels=c("Absent"), legend_gp = gpar(fill=absent_color))
    ))
  cat(" \n")
  
  # draw(l_presence, row_title = "Expected species",
  #     column_title_gp = gpar(fontsize = 16), ht_gap = unit(0.1, "cm"), annotation_legend_list = list(
  #     Legend(at = 1, title = "", labels=c("Absent"), legend_gp = gpar(fill=absent_color)),
  #     Legend(at = 2, title = "", labels=c("Not well reconstructed"), legend_gp = gpar(fill="#b1c5c4")),
  #     Legend(at = 3, title = "", labels=c("Perfect"), legend_gp = gpar(fill="#86a5a4"))
  #     )
  #     )
  # cat(" \n")
}

FROGS

DADA2_FROGS

USEARCH

DADA2-pe

DADA2-se

QIIME-se

QIIME-pe

WINE

sp <- "WINE"
divergence_file <- file.path(sp,"divergence.tsv")

Expected vs. observed depth (count of sequences)

for(its in amplicons){
  for(type in types){
    path <- file.path(sp,its,type)
    infos_file <- file.path(path,paste0(sp,"_",its,"_",type,"_infos.rds"))
    cat("  \n####",sp, " ", its, " ", type, " \n\n")
    if(!(file.exists(infos_file))){
      dada2_se_infos <- read.csv2(file.path(path,"dada2_se.tsv"), sep="\t", header=TRUE) %>% as_tibble() %>% mutate(Length = str_length(seed_sequence), blast_taxonomy = X.blast_taxonomy)
      dada2_pe_infos <- read.csv2(file.path(path,"dada2.tsv"), sep="\t", header=TRUE) %>% as_tibble() %>% mutate(Length = str_length(seed_sequence), blast_taxonomy = X.blast_taxonomy)
      frogs_infos <- read.csv2(file.path(path,"frogs.tsv"), sep="\t", header=TRUE) %>% as_tibble() %>% mutate(Length = str_length(seed_sequence))
      dada2_frogs_infos <- read.csv2(file.path(path,"dada2_frogs.tsv"), sep="\t", header=TRUE) %>% as_tibble() %>% mutate(Length = str_length(seed_sequence))
      
      frogs_multiaff_infos <- readr::read_tsv(file = file.path(sp,its,type,"multiaff.tsv")) %>% as_tibble() %>% separate(blast_taxonomy, ";(?=[\\S])", into = c("Kingdom","Phylum","Class","Order","Family","Genus","Species"), remove=FALSE)
      dada2_frogs_multiaff_infos <- readr::read_tsv(file = file.path(sp,its,type,"dada2_frogs_multiaff.tsv")) %>% as_tibble() %>% separate(blast_taxonomy, ";(?=[\\S])", into = c("Kingdom","Phylum","Class","Order","Family","Genus","Species"), remove=FALSE)
      
      if(file.exists(file.path(path,"se-qiime.tsv"))){
        qiime_se_infos <- read.csv2(file.path(path,"se-qiime.tsv"), sep="\t", header=TRUE, stringsAsFactors = FALSE) %>% as_tibble() %>% mutate(Length = str_length(X.seed_sequence), blast_taxonomy = Taxon, observation_sum = as.integer(observation_sum))
      }
      
      if(file.exists(file.path(path,"pe-qiime.tsv"))){
        qiime_pe_infos <- read.csv2(file.path(path,"pe-qiime.tsv"), sep="\t", header=TRUE, stringsAsFactors = FALSE) %>% as_tibble() %>% mutate(Length = str_length(X.seed_sequence), blast_taxonomy = Taxon, observation_sum = as.integer(observation_sum))
      }
      
      if(file.exists(file.path(path,"usearch.tsv"))){
        usearch_infos <- read.csv2(file.path(path,"usearch.tsv"), sep="\t", header=TRUE, stringsAsFactors = FALSE) %>% as_tibble() %>% mutate(Length = str_length(X.seed_sequence), blast_taxonomy = taxonomy, observation_sum = as.integer(observation_sum))
      }
      
    
    if(its == "ITS1"){
      species <- real_infos_dict$ITS1$Taxonomy
      nb_species <- length(species)
    }else if(its == "ITS2"){
      species <- real_infos_dict$ITS2$Taxonomy
      nb_species <- length(species)
    }else if(its == "D1D2"){
      species <- real_infos_dict$D1D2$Taxonomy
      nb_species <- length(species)
    }else if(its == "RPB2"){
      species <- real_infos_dict$RPB2$Taxonomy
      nb_species <- length(species)
    }
      
    data <- tibble(richeness = c(
      nb_species,
      nrow(frogs_infos),
      nrow(dada2_frogs_infos),
      nrow(usearch_infos),
      nrow(dada2_se_infos),
      nrow(qiime_se_infos),
      nrow(dada2_pe_infos),
      nrow(qiime_pe_infos)
    ),
    tool = c("EXPECTED", 
             "FROGS", 
             "DADA2_FROGS",
             "USEARCH",
             "DADA2_SE",
             "QIIME_SE",
             "DADA2_PE",
             "QIIME_PE"),
    reads = c(
      metadata_infos %>% filter(Ecosystem == sp) %>% filter(Marker == its) %>% filter(str_detect(Sample, paste("^",type,sep=""))) %>% select(Reads) %>% sum,
      frogs_infos %>% select(observation_sum) %>% sum(),
      dada2_frogs_infos %>% select(observation_sum) %>% sum(),
      usearch_infos %>% select(observation_sum) %>% sum(),
      dada2_se_infos %>% select(observation_sum) %>% sum(),
      qiime_se_infos %>% select(observation_sum) %>% sum(),
      dada2_pe_infos %>% select(observation_sum) %>% sum(),
      qiime_pe_infos %>% select(observation_sum) %>% sum()
      
    ))
    data$tool <- factor(data$tool, levels=c("EXPECTED", 
                                            "FROGS", 
                                            "DADA2_FROGS",
                                            "USEARCH",
                                            "DADA2_SE",
                                            "QIIME_SE",
                                            "DADA2_PE",
                                            "QIIME_PE"))
    saveRDS(data, infos_file)
    }
    else{
      data <- readRDS(infos_file)
    }
    print(plot_depth(data))
    cat("\n")
  }
}

WINE ITS1 ADN

WINE ITS1 PCR

WINE ITS2 ADN

WINE ITS2 PCR

WINE D1D2 ADN

WINE D1D2 PCR

WINE RPB2 ADN

WINE RPB2 PCR

Expected vs. observed Richness (count of OTU, ASV, ZOTU)

for(its in amplicons){
  for(type in types){
    cat("  \n####",sp, " ", its, " ", type, " \n\n")
    path <- file.path(sp,its,type)
    infos_file <- file.path(path,paste0(sp,"_",its,"_",type,"_infos.rds"))
    data <- readRDS(infos_file)
    cat("\n")
    cat("\n")
    print(plot_richeness_graph(data))
    cat("\n")
  }
}

WINE ITS1 ADN

WINE ITS1 PCR

WINE ITS2 ADN

WINE ITS2 PCR

WINE D1D2 ADN

WINE D1D2 PCR

WINE RPB2 ADN

WINE RPB2 PCR

Metrics calculated in relation to the expected

The results obtained for each tool were compared to what was expected and different metrics were calculated. Affiliations and associated abundances are taken into account.

The metrics are:

  • Divergence (takes into account abundance)
  • False negatives
  • False positives
  • True positives
for(its in amplicons){
  for(type in types){
    cat("  \n####",sp, " ", its, " ", type, " \n")
    print(plot_divergence_real(type, its, divergence_file))
    cat(" \n")
    cat(" \n")
    print(plot_metrics_real(type, its, divergence_file))
    cat(" \n")
    print(plot_precision_and_recall_real(type, its, divergence_file))
    cat(" \n")
  }
}

WINE ITS1 ADN

WINE ITS1 PCR

WINE ITS2 ADN

WINE ITS2 PCR

WINE D1D2 ADN

WINE D1D2 PCR

WINE RPB2 ADN

WINE RPB2 PCR

cat("  \n###"," Comparison of markers and types for DADA2_FROGS", " \n")

Comparison of markers and types for DADA2_FROGS

print(plot_divergence(type, divergence_file) + facet_grid(~its))  

cat(" \n")
print(plot_metrics(type, divergence_file))

cat(" \n")
print(plot_precision_and_recall(type, divergence_file))

cat(" \n")

Taxonomies found vs. lost

Tools are clustered using the canberra distance.

for(its in amplicons){
  for(type in types){
    path <- file.path(sp,its,type)
    mat_file <- file.path(path,paste0(sp,"_",its,"_",type,"_abundmat.rds"))
    cat("  \n####",sp, " ", its, " ", type, " \n\n")
    if(!file.exists(mat_file)){
      if(its == "ITS1"){
        species <- real_infos_dict$ITS1$Taxonomy  
      }else if(its == "ITS2"){
        species <- real_infos_dict$ITS2$Taxonomy
      }else if(its == "D1D2"){
        species <- real_infos_dict$D1D2$Taxonomy
      }else if(its == "RPB2"){
        species <- real_infos_dict$RPB2$Taxonomy
      }
      
      nb_expected_species <- length(species)
      species <- c(species,"Others")
      mat = matrix(nrow = nb_expected_species+1, ncol = 8)
      rownames(mat) <- species
      colnames(mat) <- c(tools,"EXPECTED")
      mat[,"EXPECTED"] <- c(rep(1/nb_expected_species,times=nb_expected_species),0)
      
      mat[is.na(mat)] <- 0
      
      dada2_se_infos <- read.csv2(file.path(path,"dada2_se.tsv"), sep="\t", header=TRUE) %>% as_tibble() %>% mutate(Length = str_length(seed_sequence), blast_taxonomy = X.blast_taxonomy)
      dada2_pe_infos <- read.csv2(file.path(path,"dada2.tsv"), sep="\t", header=TRUE) %>% as_tibble() %>% mutate(Length = str_length(seed_sequence), blast_taxonomy = X.blast_taxonomy)
      frogs_infos <- read.csv2(file.path(path,"frogs.tsv"), sep="\t", header=TRUE) %>% as_tibble() %>% mutate(Length = str_length(seed_sequence))
      dada2_frogs_infos <- read.csv2(file.path(path,"dada2_frogs.tsv"), sep="\t", header=TRUE) %>% as_tibble() %>% mutate(Length = str_length(seed_sequence))
      
      frogs_multiaff_infos <- readr::read_tsv(file = file.path(sp,its,type,"multiaff.tsv")) %>% as_tibble() %>% separate(blast_taxonomy, ";(?=[\\S])", into = c("Kingdom","Phylum","Class","Order","Family","Genus","Species"), remove=FALSE)
      dada2_frogs_multiaff_infos <- readr::read_tsv(file = file.path(sp,its,type,"dada2_frogs_multiaff.tsv")) %>% as_tibble() %>% separate(blast_taxonomy, ";(?=[\\S])", into = c("Kingdom","Phylum","Class","Order","Family","Genus","Species"), remove=FALSE)
      
      mat <- fill_abundance_matrix(mat, frogs_infos, "FROGS", multiaff_infos=frogs_multiaff_infos)
      mat <- fill_abundance_matrix(mat, dada2_se_infos, "DADA2-se")
      mat <- fill_abundance_matrix(mat, dada2_pe_infos, "DADA2-pe")
      mat <- fill_abundance_matrix(mat, dada2_frogs_infos, "DADA2_FROGS", multiaff_infos=dada2_frogs_multiaff_infos)
      
      if(file.exists(file.path(path,"se-qiime.tsv"))){
        qiime_se_infos <- read.csv2(file.path(path,"se-qiime.tsv"), sep="\t", header=TRUE, stringsAsFactors = FALSE) %>% as_tibble() %>% mutate(Length = str_length(X.seed_sequence), blast_taxonomy = Taxon, observation_sum = as.integer(observation_sum))
        mat <- fill_abundance_matrix(mat, qiime_se_infos, "QIIME-se")
      }
      
      if(file.exists(file.path(path,"pe-qiime.tsv"))){
        qiime_pe_infos <- read.csv2(file.path(path,"pe-qiime.tsv"), sep="\t", header=TRUE, stringsAsFactors = FALSE) %>% as_tibble() %>% mutate(Length = str_length(X.seed_sequence), blast_taxonomy = Taxon, observation_sum = as.integer(observation_sum))
        mat <- fill_abundance_matrix(mat, qiime_pe_infos, "QIIME-pe")
      }
      
      if(file.exists(file.path(path,"usearch.tsv"))){
        usearch_infos <- read.csv2(file.path(path,"usearch.tsv"), sep="\t", header=TRUE, stringsAsFactors = FALSE) %>% as_tibble() %>% mutate(Length = str_length(X.seed_sequence), blast_taxonomy = taxonomy, observation_sum = as.integer(observation_sum))
        mat <- fill_abundance_matrix(mat, usearch_infos, "USEARCH")  
      }
      
      rownames(mat) <- rownames(mat) %>% as_tibble() %>% separate(value, ";(?=[\\S])", into = c("Kingdom","Phylum","Class","Order","Family","Genus","Species"), remove=FALSE) %>% select(Species) %>% pull()
      rownames(mat)[nrow(mat)] <- "Others"
      saveRDS(mat, mat_file)
    }else{
      mat <- readRDS(mat_file)
    }
    
    if(!metabarfood){
      mat <- subset( mat, select = -DADA2_FROGS )
    }
    
    l <-  build_heatmap_abundance_real(mat)
    h <- l[[1]]
    mat <- l[[2]]
    lgd_list <- l[[3]]
    draw(h, annotation_legend_list = lgd_list)
    cat(" \n")
  }
}

WINE ITS1 ADN

WINE ITS1 PCR

WINE ITS2 ADN

WINE ITS2 PCR

WINE D1D2 ADN

WINE D1D2 PCR

WINE RPB2 ADN

WINE RPB2 PCR

Reconstruction of the reference

for(its in amplicons){
  for(type in types){
    
    path <- file.path(sp,its,type)
    mat_file <- file.path(path,paste0(sp,"_",its,"_",type,"_presencemat.rds"))
    
    cat("  \n####",sp, " ", its, " ", type, " \n\n")
    if(!file.exists(mat_file)){
      if(its == "ITS1"){
        species <- real_infos_dict$ITS1$Taxonomy
        real_infos <- real_infos_dict$ITS1
      }else if(its == "ITS2"){
        species <- real_infos_dict$ITS2$Taxonomy
        real_infos <- real_infos_dict$ITS2
      }else if(its == "D1D2"){
        species <- real_infos_dict$D1D2$Taxonomy
        real_infos <- real_infos_dict$D1D2
      }else if(its == "RPB2"){
        species <- real_infos_dict$RPB2$Taxonomy
        real_infos <- real_infos_dict$RPB2
      }
    }
    if(!file.exists(mat_file)){
      nb_expected_species <- length(species)
      mat = matrix(nrow = nb_expected_species, ncol = 8)
      rownames(mat) <- species
      colnames(mat) <- c("EXPECTED","FROGS","DADA2_FROGS","USEARCH","DADA2-se","DADA2-pe","QIIME-se","QIIME-pe")
      mat[,"EXPECTED"] <- 1
      
      mat[is.na(mat)] <- 0
      
      dada2_se_infos <- read.csv2(file.path(path,"dada2_se.tsv"), sep="\t", header=TRUE, stringsAsFactors = FALSE) %>% as_tibble() %>% mutate(Length = str_length(seed_sequence), blast_taxonomy = X.blast_taxonomy)
      dada2_pe_infos <- read.csv2(file.path(path,"dada2.tsv"), sep="\t", header=TRUE) %>% as_tibble() %>% mutate(Length = str_length(seed_sequence), blast_taxonomy = X.blast_taxonomy)
      frogs_infos <- read.csv2(file.path(path,"frogs.tsv"), sep="\t", header=TRUE, stringsAsFactors = FALSE) %>% as_tibble() %>% mutate(Length = str_length(seed_sequence))
      dada2_frogs_infos <- read.csv2(file.path(path,"dada2_frogs.tsv"), sep="\t", header=TRUE, stringsAsFactors = FALSE) %>% as_tibble() %>% mutate(Length = str_length(seed_sequence))
      
      frogs_multiaff_infos <- readr::read_tsv(file = file.path(sp,its,type,"multiaff.tsv")) %>% as_tibble() %>% separate(blast_taxonomy, ";(?=[\\S])", into = c("Kingdom","Phylum","Class","Order","Family","Genus","Species"), remove=FALSE) %>% mutate(observation_name = `#observation_name`)
      dada2_frogs_multiaff_infos <- readr::read_tsv(file = file.path(sp,its,type,"dada2_frogs_multiaff.tsv")) %>% as_tibble() %>% separate(blast_taxonomy, ";(?=[\\S])", into = c("Kingdom","Phylum","Class","Order","Family","Genus","Species"), remove=FALSE) %>% mutate(observation_name = `#observation_name`)
      
      mat <- fill_presence_matrix(mat, real_infos, frogs_infos, "FROGS", multiaff_infos=frogs_multiaff_infos)
      mat <- fill_presence_matrix(mat, real_infos, dada2_se_infos, "DADA2-se")
      mat <- fill_presence_matrix(mat, real_infos, dada2_pe_infos, "DADA2-pe")
      mat <- fill_presence_matrix(mat, real_infos, dada2_frogs_infos, "DADA2_FROGS", multiaff_infos=dada2_frogs_multiaff_infos)
      
      if(file.exists(file.path(path,"se-qiime.tsv"))){
        qiime_se_infos <- read.csv2(file.path(path,"se-qiime.tsv"), sep="\t", header=TRUE, stringsAsFactors = FALSE) %>% as_tibble() %>% mutate(Length = str_length(X.seed_sequence), blast_taxonomy = Taxon, observation_sum = as.integer(observation_sum), seed_sequence = X.seed_sequence)
        mat <- fill_presence_matrix(mat, real_infos, qiime_se_infos, "QIIME-se")
      }else{
        mat[,"QIIME-se"] <- 0
      }
      
      if(file.exists(file.path(path,"pe-qiime.tsv"))){
        qiime_pe_infos <- read.csv2(file.path(path,"pe-qiime.tsv"), sep="\t", header=TRUE, stringsAsFactors = FALSE) %>% as_tibble() %>% mutate(Length = str_length(X.seed_sequence), blast_taxonomy = Taxon, observation_sum = as.integer(observation_sum), seed_sequence = X.seed_sequence)
        mat <- fill_presence_matrix(mat, real_infos, qiime_pe_infos, "QIIME-pe")
      }else{
        mat[,"QIIME-pe"] <- 0
      }
      
      
      if(file.exists(file.path(path,"usearch.tsv"))){
        usearch_infos <- read.csv2(file.path(path,"usearch.tsv"), sep="\t", header=TRUE, stringsAsFactors = FALSE) %>% as_tibble() %>% mutate(Length = str_length(X.seed_sequence), blast_taxonomy = taxonomy, observation_sum = as.integer(observation_sum), seed_sequence = X.seed_sequence)
        mat <- fill_presence_matrix(mat, real_infos, usearch_infos, "USEARCH")  
      }else{
        mat[,"USEARCH"] <- 0
      }
      rownames(mat) <- rownames(mat) %>% as_tibble() %>% separate(value, ";(?=[\\S])", into = c("Kingdom","Phylum","Class","Order","Family","Genus","Species"), remove=FALSE) %>% select(Species) %>% pull()
      saveRDS(mat, mat_file)
    }else{
      mat <- readRDS(mat_file)
    }
    
    l <-  build_heatmap_presence_real(mat)
    
    tt <- summarise_all(as_tibble(ceiling(mat)),sum)
    table <- tt %>% kable(caption = "Count of OTU/ASV/ZOTU detected") %>% kable_styling("striped", full_width = F)
    cat(" \n")
    print(table)
    cat(" \n\n\n")
    
    tt <- summarise_all(as_tibble(floor(mat)),sum)
    table <- tt %>% kable(caption = "Count of OTU/ASV/ZOTU perfectly reconstructed") %>% kable_styling("striped", full_width = F)
    cat(" \n")
    print(table)
    cat(" \n\n\n")
    
    h <- l[[1]]
    mat <- l[[2]]
    
    draw(h)
    cat(" \n")
    cat(" \n")
  }
}

WINE ITS1 ADN

Count of OTU/ASV/ZOTU detected
EXPECTED FROGS DADA2_FROGS USEARCH DADA2-se DADA2-pe QIIME-se QIIME-pe
60 50 50 48 51 49 50 50
Count of OTU/ASV/ZOTU perfectly reconstructed
EXPECTED FROGS DADA2_FROGS USEARCH DADA2-se DADA2-pe QIIME-se QIIME-pe
60 46 46 43 24 41 2 45

WINE ITS1 PCR

Count of OTU/ASV/ZOTU detected
EXPECTED FROGS DADA2_FROGS USEARCH DADA2-se DADA2-pe QIIME-se QIIME-pe
60 53 53 49 54 52 54 54
Count of OTU/ASV/ZOTU perfectly reconstructed
EXPECTED FROGS DADA2_FROGS USEARCH DADA2-se DADA2-pe QIIME-se QIIME-pe
60 48 49 44 25 44 2 48

WINE ITS2 ADN

Count of OTU/ASV/ZOTU detected
EXPECTED FROGS DADA2_FROGS USEARCH DADA2-se DADA2-pe QIIME-se QIIME-pe
60 52 53 51 54 54 53 53
Count of OTU/ASV/ZOTU perfectly reconstructed
EXPECTED FROGS DADA2_FROGS USEARCH DADA2-se DADA2-pe QIIME-se QIIME-pe
60 49 49 48 4 49 0 49

WINE ITS2 PCR

Count of OTU/ASV/ZOTU detected
EXPECTED FROGS DADA2_FROGS USEARCH DADA2-se DADA2-pe QIIME-se QIIME-pe
60 52 53 51 54 54 54 54
Count of OTU/ASV/ZOTU perfectly reconstructed
EXPECTED FROGS DADA2_FROGS USEARCH DADA2-se DADA2-pe QIIME-se QIIME-pe
60 49 49 48 4 49 0 49

WINE D1D2 ADN

Count of OTU/ASV/ZOTU detected
EXPECTED FROGS DADA2_FROGS USEARCH DADA2-se DADA2-pe QIIME-se QIIME-pe
60 51 51 47 51 4 49 6
Count of OTU/ASV/ZOTU perfectly reconstructed
EXPECTED FROGS DADA2_FROGS USEARCH DADA2-se DADA2-pe QIIME-se QIIME-pe
60 41 41 0 0 0 0 0

WINE D1D2 PCR

Count of OTU/ASV/ZOTU detected
EXPECTED FROGS DADA2_FROGS USEARCH DADA2-se DADA2-pe QIIME-se QIIME-pe
60 51 51 46 50 5 49 9
Count of OTU/ASV/ZOTU perfectly reconstructed
EXPECTED FROGS DADA2_FROGS USEARCH DADA2-se DADA2-pe QIIME-se QIIME-pe
60 41 41 0 0 0 0 0

WINE RPB2 ADN

Count of OTU/ASV/ZOTU detected
EXPECTED FROGS DADA2_FROGS USEARCH DADA2-se DADA2-pe QIIME-se QIIME-pe
60 36 38 0 42 1 45 8
Count of OTU/ASV/ZOTU perfectly reconstructed
EXPECTED FROGS DADA2_FROGS USEARCH DADA2-se DADA2-pe QIIME-se QIIME-pe
60 23 24 0 0 0 0 0

WINE RPB2 PCR

Count of OTU/ASV/ZOTU detected
EXPECTED FROGS DADA2_FROGS USEARCH DADA2-se DADA2-pe QIIME-se QIIME-pe
60 30 31 0 38 3 39 10
Count of OTU/ASV/ZOTU perfectly reconstructed
EXPECTED FROGS DADA2_FROGS USEARCH DADA2-se DADA2-pe QIIME-se QIIME-pe
60 19 19 0 0 0 0 0

Abundance comparison according to marker and type (DNA/PCR)

for(tool in tools){
  cat("  \n####",tool, " \n\n")
  htmp_list <- c()
  htmp_list_presence <- c()
  min <- 0
  max <- 0
  for(its in amplicons){
    for(type in types){
      path <- file.path(sp,its,type)
      mat_file <- file.path(path,paste0(sp,"_",its,"_",type,"_abundmat.rds"))
      mat_presence_file <- file.path(path,paste0(sp,"_",its,"_",type,"_presencemat.rds"))
      mat <- readRDS(mat_file)
      mat_presence <- readRDS(mat_presence_file)
      submat <- mat %>% as_tibble() %>% select(tool) %>% as.matrix()
      submat_presence <- mat_presence %>% as_tibble() %>% select(tool) %>% as.matrix()
      rownames(submat) <- rownames(mat)
      rownames(submat_presence) <- rownames(mat_presence)
      submat[submat==0] <- NA
      #submat_presence[submat_presence==0] <- NA
      max <- max(max,max(submat,na.rm = TRUE))
      colnames(submat) <- paste0(its," ",type)
      colnames(submat_presence) <- paste0(its," ",type)
      ht <- build_heatmap_one_tool(submat, min, max)
      ht_presence <- build_heatmap_one_tool_presence(submat_presence, 0, 1)
      htmp_list <- c(htmp_list,ht)
      htmp_list_presence <- c(htmp_list_presence,ht_presence)
    }
  }
  
  l <- c()
  for(i in htmp_list){
    l <- l + i  
  }
  
  l_presence <- c()
  for(i in htmp_list_presence){
    l_presence <- l_presence + i  
  }
  
  draw(l, row_title = "Expected species",
      column_title_gp = gpar(fontsize = 16), ht_gap = unit(0.1, "cm"), annotation_legend_list = list(
      Legend(at = 1, title = "", labels=c("Absent"), legend_gp = gpar(fill=absent_color))
    ))
  cat(" \n")
  
  # draw(l_presence, row_title = "Expected species",
  #     column_title_gp = gpar(fontsize = 16), ht_gap = unit(0.1, "cm"), annotation_legend_list = list(
  #     Legend(at = 1, title = "", labels=c("Absent"), legend_gp = gpar(fill=absent_color)),
  #     Legend(at = 2, title = "", labels=c("Not well reconstructed"), legend_gp = gpar(fill="#b1c5c4")),
  #     Legend(at = 3, title = "", labels=c("Perfect"), legend_gp = gpar(fill="#86a5a4"))
  #     )
  #     )
  # cat(" \n")
}

FROGS

DADA2_FROGS

USEARCH

DADA2-pe

DADA2-se

QIIME-se

QIIME-pe