This report gives the results of the benchmark performed for the METABARFOOD project (https://forgemia.inra.fr/migale/metabarfood/). The code used is available in the code chunks.
We used FROGS (Bernard et al., 2021, https://doi.org/10.1093/bib/bbab318), USEARCH (Edgar, 2010, https://doi.org/10.1093/bioinformatics/btq461), QIIME2 (Bolyen et al., 2019, https://doi.org/10.1038/s41587-019-0209-9) and DADA2 (Callahan et al., 2016, https://doi.org/10.1038/nmeth.3869) following their own guidelines, and a custom combination of DADA2 and FROGS that we call DADA2_FROGS.
For the taxonomic affiliation of the OTUs/ASVs/ZOTUs built by each tool, we used a databank composed of the mock community sequences. No additional sequences were added, to avoid introducing noise that we did not want to assess in this analysis.
For each tool, the command lines used are given below.
#!/bin/bash
export PYTHONPATH=/save/frogs/src/FROGS_dev/lib:$PYTHONPATH
export PATH=/save/frogs/src/FROGS_dev/libexec/:$PATH
export PATH=/save/frogs/src/FROGS_dev/app/:$PATH
SAVE_DIR="/work/frogsfungi/FungiPubli/Real_mock/METABARFOOD/MEAT/results/DADA2_FROGS"
INPUT_DIR="/work/frogsfungi/FungiPubli/Real_mock/METABARFOOD/MEAT/"
SCRIPT_DIR="/work/frogsfungi/FungiPubli/Real_mock/METABARFOOD/MEAT/scripts/"
REFERENCE_DIR="/work/frogsfungi/FungiPubli/Real_mock/METABARFOOD/MEAT/REFERENCES/"
TYPE="ADN PCR"
ITS="ITS1 ITS2"
LONG="D1D2 RPB2"
CPU=8
declare -A FIVE_PRIM_PRIMERS=([ITS1]=CTTGGTCATTTAGAGGAAGTAA [ITS2]=GCATCGATGAAGAACGCAGC [D1D2]=GCATATCAATAAGCGGAGGAAAAG [RPB2]=TGGGGYATGGTNTGYCCYGC)
declare -A THREE_PRIM_PRIMERS=([ITS1]=GCATCGATGAAGAACGCAGC [ITS2]=GCAWAWCAAWAAGCGGAGGA [D1D2]=CCGTCTTGAAACACGGACC [RPB2]=CBTTCCCYGAYCAYAAYCARTC)
for its in $ITS
do
for type in $TYPE
do
mkdir -p "$SAVE_DIR/$its/$type/"
cd "$SAVE_DIR/$its/$type/"
mkdir -p LOG
# launch DADA2
FIVE_PRIM=${FIVE_PRIM_PRIMERS[$its]}
THREE_PRIM=${THREE_PRIM_PRIMERS[$its]}
echo "module load bioinfo/cutadapt-2.10 system/R-3.6.1 libraries/proj-4.9.3 libraries/gdal-2.3.0 system/Python-3.7.4 && $SCRIPT_DIR/dada2_frogs.R --five-prim-primer $FIVE_PRIM --three-prim-primer $THREE_PRIM --input $INPUT_DIR/$its/$type/ --output $SAVE_DIR/$its/$type/ --reference $REFERENCE_DIR/MEAT_${its}_GGtax_dada2.fasta --cpus $CPU" > "dada2_frogs-$its-$type.sh"
echo "remove_chimera.py --input-fasta dada2_frogs_temp.fasta --input-biom dada2_frogs_temp.biom --nb-cpus $CPU --out-abundance remove_chimera.biom --non-chimera remove_chimera.fasta --log-file remove_chimera.log --summary remove_chimera.html" >> "dada2_frogs-$its-$type.sh"
echo "otu_filters.py --min-abundance 0.00005 --nb-cpus $CPU --input-fasta remove_chimera.fasta --input-biom remove_chimera.biom --output-biom filters.biom --output-fasta filters.fasta --log-file filters.log --summary filters.html" >> "dada2_frogs-$its-$type.sh"
echo "itsx.py --nb-cpus $CPU --region $its --check-its-only --input-fasta filters.fasta --input-biom filters.biom --out-fasta itsx.fasta --out-abundance itsx.biom --summary itsx.html --log-file itsx.log " >> "dada2_frogs-$its-$type.sh"
echo "affiliation_OTU.py --nb-cpus $CPU --reference $REFERENCE_DIR/MEAT_${its}_GGtax.fasta --input-biom itsx.biom --input-fasta itsx.fasta --output-biom affiliation.biom --summary affiliation.html --log-file affiliation.log " >> "dada2_frogs-$its-$type.sh"
echo "affiliation_filters.py --output-biom frogs.biom --output-fasta frogs.fasta --summary affiliation_filters.html --log-file affiliation_filters.log --min-blast-coverage 0.9 --min-blast-identity 0.9 --delete --input-biom affiliation.biom --input-fasta itsx.fasta " >> "dada2_frogs-$its-$type.sh"
echo "biom_to_tsv.py --input-biom frogs.biom --input-fasta frogs.fasta --output-tsv frogs.tsv --output-multi-affi multiaff.tsv --log-file biom_to_tsv.log " >> "dada2_frogs-$its-$type.sh"
sbatch --mem=32G -J dada2_${its}_${type} --cpus-per-task=$CPU -o LOG/%x.out -e LOG/%x.err --wrap="sh dada2_frogs-$its-$type.sh"
cd -
done
done
for long in $LONG
do
for type in $TYPE
do
mkdir -p "$SAVE_DIR/$long/$type/"
cd "$SAVE_DIR/$long/$type/"
mkdir -p LOG
# launch DADA2
FIVE_PRIM=${FIVE_PRIM_PRIMERS[$long]}
THREE_PRIM=${THREE_PRIM_PRIMERS[$long]}
echo "module load bioinfo/cutadapt-2.10 system/R-3.6.1 libraries/proj-4.9.3 libraries/gdal-2.3.0 system/Python-3.7.4 && $SCRIPT_DIR/dada2_frogs.R --five-prim-primer $FIVE_PRIM --three-prim-primer $THREE_PRIM --input $INPUT_DIR/$long/$type/ --output $SAVE_DIR/$long/$type/ --reference $REFERENCE_DIR/MEAT_${long}_GGtax_dada2.fasta --cpus $CPU" > "dada2_frogs-$long-$type.sh"
echo "remove_chimera.py --input-fasta dada2_frogs_temp.fasta --input-biom dada2_frogs_temp.biom --nb-cpus $CPU --out-abundance remove_chimera.biom --non-chimera remove_chimera.fasta --log-file remove_chimera.log --summary remove_chimera.html" >> "dada2_frogs-$long-$type.sh"
echo "otu_filters.py --min-abundance 0.00005 --nb-cpus $CPU --input-fasta remove_chimera.fasta --input-biom remove_chimera.biom --output-biom filters.biom --output-fasta filters.fasta --log-file filters.log --summary filters.html" >> "dada2_frogs-$long-$type.sh"
echo "affiliation_OTU.py --nb-cpus $CPU --reference $REFERENCE_DIR/MEAT_${long}_GGtax.fasta --input-biom filters.biom --input-fasta filters.fasta --output-biom affiliation.biom --summary affiliation.html --log-file affiliation.log " >> "dada2_frogs-$long-$type.sh"
echo "affiliation_filters.py --output-biom frogs.biom --output-fasta frogs.fasta --summary affiliation_filters.html --log-file affiliation_filters.log --min-blast-coverage 0.9 --min-blast-identity 0.9 --delete --input-biom affiliation.biom --input-fasta filters.fasta " >> "dada2_frogs-$long-$type.sh"
echo "biom_to_tsv.py --input-biom frogs.biom --input-fasta frogs.fasta --output-tsv frogs.tsv --output-multi-affi multiaff.tsv --log-file biom_to_tsv.log " >> "dada2_frogs-$long-$type.sh"
sbatch --mem=32G -J dada2_${long}_${type} --cpus-per-task=$CPU -o LOG/%x.out -e LOG/%x.err --wrap="sh dada2_frogs-$long-$type.sh"
cd -
done
done
#!/usr/bin/env Rscript
library(Matrix, lib.loc="~/R/x86_64-pc-linux-gnu-library/3.6")
library(argparse)
library(ShortRead)
library(Biostrings)
library(tidyverse)
library(phyloseq)
library(dada2)
library(biomformat)
## Get arguments from bash script
args <- commandArgs(TRUE)
parser <- ArgumentParser()
parser$add_argument("-i", "--input", type="character", default=NULL,
help="dataset file name", metavar="character")
parser$add_argument("-o", "--output", type="character", default="out.txt",
help="output file name", metavar="character")
parser$add_argument("--reference", type="character", default=NULL,
help="reference database", metavar="character")
parser$add_argument("--cpus", type="integer", default=1,
help="number of cpus", metavar="N")
parser$add_argument("--five-prim-primer", type="character", default=NULL,
help="forward primer sequence")
parser$add_argument("--three-prim-primer", type="character", default=NULL,
help="reverse primer sequence")
args <- parser$parse_args()
FWD <- args$five_prim_primer ## forward primer sequence (from --five-prim-primer)
REV <- args$three_prim_primer ## reverse primer sequence (from --three-prim-primer)
input_dir <- args$input
output_dir <- args$output
## Input files
fnFs <- sort(list.files(input_dir, pattern = "_R1.fastq.gz", full.names = TRUE))
fnRs <- sort(list.files(input_dir, pattern = "_R2.fastq.gz", full.names = TRUE))
# Cutadapt
allOrients <- function(primer) {
# Create all orientations of the input sequence
require(Biostrings)
dna <- DNAString(primer) # Biostrings works with DNAString objects rather than character vectors
orients <- c(Forward = dna, Complement = complement(dna), Reverse = reverse(dna),
RevComp = reverseComplement(dna))
return(sapply(orients, toString)) # Convert back to character vector
}
FWD.orients <- allOrients(FWD)
REV.orients <- allOrients(REV)
primerHits <- function(primer, fn) {
# Counts number of reads in which the primer is found
nhits <- vcountPattern(primer, sread(readFastq(fn)), fixed = FALSE)
return(sum(nhits > 0))
}
rbind(FWD.ForwardReads = sapply(FWD.orients, primerHits, fn = fnFs[[1]]),
FWD.ReverseReads = sapply(FWD.orients, primerHits, fn = fnRs[[1]]),
REV.ForwardReads = sapply(REV.orients, primerHits, fn = fnFs[[1]]),
REV.ReverseReads = sapply(REV.orients, primerHits, fn = fnRs[[1]]))
cutadapt <- "/usr/local/bioinfo/src/cutadapt/cutadapt-2.10/cutadapt-2.10_venv/bin/cutadapt"
path.cut <- file.path(output_dir, "cutadapt")
if(!dir.exists(path.cut)) dir.create(path.cut)
fnFs.cut <- file.path(path.cut, basename(fnFs))
fnRs.cut <- file.path(path.cut, basename(fnRs))
FWD.RC <- dada2:::rc(FWD)
REV.RC <- dada2:::rc(REV)
# Trim FWD and the reverse-complement of REV off of R1 (forward reads)
R1.flags <- paste("-g", FWD, "-a", REV)
# Trim REV and the reverse-complement of FWD off of R2 (reverse reads)
R2.flags <- paste("-G", REV.RC, "-A", FWD.RC)
# Run Cutadapt
for(i in seq_along(fnFs)) {
system2(cutadapt, args = c(R1.flags, R2.flags, "-n", 2, # -n 2 required to remove FWD and REV from reads
"-o", fnFs.cut[i], "-p", fnRs.cut[i], # output files
fnFs[i], fnRs[i])) # input files
}
rbind(FWD.ForwardReads = sapply(FWD.orients, primerHits, fn = fnFs.cut[[1]]),
FWD.ReverseReads = sapply(FWD.orients, primerHits, fn = fnRs.cut[[1]]),
REV.ForwardReads = sapply(REV.orients, primerHits, fn = fnFs.cut[[1]]),
REV.ReverseReads = sapply(REV.orients, primerHits, fn = fnRs.cut[[1]]))
# Remove sequences with N
path.N <- file.path(output_dir, "filtered")
filtFs <- file.path(output_dir, "filtered", basename(fnFs)) # Put filtered reads in the filtered/ subdirectory
filtRs <- file.path(output_dir, "filtered", basename(fnRs))
out <- filterAndTrim(fnFs.cut, filtFs, fnRs.cut, filtRs, maxN = 0, maxEE = c(2, 2),
truncQ = 2, minLen = 50, rm.phix = TRUE, compress = TRUE, multithread = 16)
get.sample.name <- function(fname) paste(strsplit(basename(fname), "_R")[[1]][1],collapse="_")
sample.names <- unname(sapply(fnFs, get.sample.name))
# Dada
dadaFs <- dada(filtFs, selfConsist=TRUE, pool=TRUE, multithread = args$cpus)
dadaRs <- dada(filtRs, selfConsist=TRUE, pool=TRUE, multithread = args$cpus)
names(dadaFs) <- sample.names
names(dadaRs) <- sample.names
# Deal with pairs
mergers <- mergePairs(dadaFs, filtFs, dadaRs, filtRs, returnRejects = TRUE, trimOverhang = TRUE, verbose = TRUE, maxMismatch = 0)
concats <- mergePairs(dadaFs, filtFs, dadaRs, filtRs, justConcatenate=TRUE,verbose=TRUE)
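# mergePairs(returnRejects = TRUE) keeps rejected pairs; for those (accept == FALSE) we fall back
# to the concatenated version so that non-overlapping amplicons are kept rather than discarded.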
repair_merger <- function(merger, concat) {
merger[!merger$accept, ] <- concat[!merger$accept, ]
merger$sequence <- unlist(merger$sequence)
merger
}
repaired_mergers <- purrr::map2(mergers, concats, repair_merger)
seqtab <- makeSequenceTable(repaired_mergers)
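# justConcatenate = TRUE joins the reads with a 10-N spacer; expand it to the 100-N spacer
# used for FROGS "combined" sequences, which are tagged "_FROGS_combined" below.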
colnames(seqtab) <- gsub("NNNNNNNNNN", "NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN", colnames(seqtab))
asv_seqs <- colnames(seqtab)
asv_headers <- paste(">",colnames(seqtab),sep="")
asv_headers <- if_else(grepl("NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN", asv_headers), paste0(">",asv_seqs, "_FROGS_combined"), asv_headers)
asv_fasta <- c(rbind(asv_headers, asv_seqs))
write(asv_fasta, file.path(output_dir,"dada2_frogs_temp.fasta"))
asv_tab <- t(seqtab)
colnames(seqtab) <- if_else(grepl("NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN", colnames(seqtab)), paste0(colnames(seqtab), "_FROGS_combined"), colnames(seqtab))
row.names(asv_tab) <- sub(">", "", asv_headers)
write.table(asv_tab, file.path(output_dir,"ASVs_counts.txt"), sep="\t", quote=F)
st.biom <- make_biom(t(seqtab))
write_biom(st.biom, file.path(output_dir,"dada2_frogs_temp.biom"))
#!/bin/bash
export PYTHONPATH=/save/frogs/src/FROGS_dev/lib:$PYTHONPATH
export PATH=/save/frogs/src/FROGS_dev/libexec/:$PATH
export PATH=/save/frogs/src/FROGS_dev/app/:$PATH
SAVE_DIR="/work/frogsfungi/FungiPubli/Real_mock/METABARFOOD/MEAT/results/FROGS"
INPUT_DIR="/work/frogsfungi/FungiPubli/Real_mock/METABARFOOD/MEAT/"
REFERENCE_DIR="/work/frogsfungi/FungiPubli/Real_mock/METABARFOOD/MEAT/REFERENCES/"
TYPE="ADN PCR"
ITS="ITS1 ITS2"
LONG="D1D2 RPB2"
CPU=8
declare -A FIVE_PRIM_PRIMERS=([ITS1]=CTTGGTCATTTAGAGGAAGTAA [ITS2]=GCATCGATGAAGAACGCAGC [D1D2]=GCATATCAATAAGCGGAGGAAAAG [RPB2]=TGGGGYATGGTNTGYCCYGC)
declare -A THREE_PRIM_PRIMERS=([ITS1]=GCATCGATGAAGAACGCAGC [ITS2]=GCAWAWCAAWAAGCGGAGGA [D1D2]=CCGTCTTGAAACACGGACC [RPB2]=CBTTCCCYGAYCAYAAYCARTC)
for its in $ITS
do
FIVE_PRIM=${FIVE_PRIM_PRIMERS[$its]}
THREE_PRIM=${THREE_PRIM_PRIMERS[$its]}
for type in $TYPE
do
# Create output directory for each dataset
ARCHIVE_FILE="$INPUT_DIR/$its/$type/Archive.tar.gz"
mkdir -p "$SAVE_DIR/$its/$type/"
cd "$SAVE_DIR/$its/$type/"
mkdir -p LOG
# Preprocess
echo "preprocess.py illumina --input-archive $ARCHIVE_FILE --keep-unmerged --merge-software pear --min-amplicon-size 50 --max-amplicon-size 490 --five-prim-primer $FIVE_PRIM --three-prim-primer $THREE_PRIM --R1-size 250 --R2-size 250 --nb-cpus $CPU --output-dereplicated preprocess.fasta --output-count preprocess.tsv --summary preprocess.html --log-file preprocess.log" >> "frogs-$its-$type.sh"
# Clustering
echo "clustering.py --input-fasta preprocess.fasta --input-count preprocess.tsv --fastidious --distance 1 --nb-cpus $CPU --output-biom clustering.biom --output-fasta clustering.fasta --log-file clustering.log" >> "frogs-$its-$type.sh"
echo "remove_chimera.py --input-fasta clustering.fasta --input-biom clustering.biom --nb-cpus $CPU --out-abundance remove_chimera.biom --non-chimera remove_chimera.fasta --log-file remove_chimera.log --summary remove_chimera.html" >> "frogs-$its-$type.sh"
echo "otu_filters.py --min-abundance 0.00005 --nb-cpus $CPU --input-fasta remove_chimera.fasta --input-biom remove_chimera.biom --output-biom filters.biom --output-fasta filters.fasta --log-file filters.log --summary filters.html" >> "frogs-$its-$type.sh"
echo "itsx.py --nb-cpus $CPU --region $its --check-its-only --input-fasta filters.fasta --input-biom filters.biom --out-fasta itsx.fasta --out-abundance itsx.biom --summary itsx.html --log-file itsx.log " >> "frogs-$its-$type.sh"
echo "affiliation_OTU.py --nb-cpus $CPU --reference $REFERENCE_DIR/MEAT_${its}_GGtax.fasta --input-biom itsx.biom --input-fasta itsx.fasta --output-biom affiliation.biom --summary affiliation.html --log-file affiliation.log " >> "frogs-$its-$type.sh"
echo "affiliation_filters.py --output-biom frogs.biom --output-fasta frogs.fasta --summary affiliation_filters.html --log-file affiliation_filters.log --min-blast-coverage 0.9 --min-blast-identity 0.9 --delete --input-biom affiliation.biom --input-fasta itsx.fasta " >> "frogs-$its-$type.sh"
echo "biom_to_tsv.py --input-biom frogs.biom --input-fasta frogs.fasta --output-tsv frogs.tsv --output-multi-affi multiaff.tsv --log-file biom_to_tsv.log " >> "frogs-$its-$type.sh"
# launch FROGS
sbatch -J frogs_${its}_${type} --cpus-per-task=$CPU -o LOG/%x.out -e LOG/%x.err --wrap="sh frogs-$its-$type.sh"
cd -
done
done
for long in $LONG
do
FIVE_PRIM=${FIVE_PRIM_PRIMERS[$long]}
THREE_PRIM=${THREE_PRIM_PRIMERS[$long]}
for type in $TYPE
do
# Create output directory for each dataset
ARCHIVE_FILE="$INPUT_DIR/$long/$type/Archive.tar.gz"
mkdir -p "$SAVE_DIR/$long/$type/"
cd "$SAVE_DIR/$long/$type/"
mkdir -p LOG
# Preprocess
echo "preprocess.py illumina --input-archive $ARCHIVE_FILE --keep-unmerged --merge-software pear --min-amplicon-size 500 --max-amplicon-size 1000 --five-prim-primer $FIVE_PRIM --three-prim-primer $THREE_PRIM --R1-size 250 --R2-size 250 --nb-cpus $CPU --output-dereplicated preprocess.fasta --output-count preprocess.tsv --summary preprocess.html --log-file preprocess.log" >> "frogs-$long-$type.sh"
# Clustering
echo "clustering.py --input-fasta preprocess.fasta --input-count preprocess.tsv --fastidious --distance 1 --nb-cpus $CPU --output-biom clustering.biom --output-fasta clustering.fasta --log-file clustering.log" >> "frogs-$long-$type.sh"
echo "remove_chimera.py --input-fasta clustering.fasta --input-biom clustering.biom --nb-cpus $CPU --out-abundance remove_chimera.biom --non-chimera remove_chimera.fasta --log-file remove_chimera.log --summary remove_chimera.html" >> "frogs-$long-$type.sh"
echo "otu_filters.py --min-abundance 0.00005 --nb-cpus $CPU --input-fasta remove_chimera.fasta --input-biom remove_chimera.biom --output-biom filters.biom --output-fasta filters.fasta --log-file filters.log --summary filters.html" >> "frogs-$long-$type.sh"
echo "affiliation_OTU.py --nb-cpus $CPU --reference $REFERENCE_DIR/MEAT_${long}_GGtax.fasta --input-biom filters.biom --input-fasta filters.fasta --output-biom affiliation.biom --summary affiliation.html --log-file affiliation.log " >> "frogs-$long-$type.sh"
echo "affiliation_filters.py --output-biom frogs.biom --output-fasta frogs.fasta --summary affiliation_filters.html --log-file affiliation_filters.log --min-blast-identity 0.9 --delete --input-biom affiliation.biom --input-fasta filters.fasta " >> "frogs-$long-$type.sh"
echo "biom_to_tsv.py --input-biom frogs.biom --input-fasta frogs.fasta --output-tsv frogs.tsv --output-multi-affi multiaff.tsv --log-file biom_to_tsv.log " >> "frogs-$long-$type.sh"
# launch FROGS
sbatch -J frogs_${long}_${type} --cpus-per-task=$CPU -o LOG/%x.out -e LOG/%x.err --wrap="sh frogs-$long-$type.sh"
cd -
done
done
#!/bin/sh
READS_DIR=$1
REF=$2
OUT_DIR=$3
CPU=$4
FW_PRIMER=$5
RV_PRIMER=$6
mkdir -p $OUT_DIR/usearch/log
export PATH=/save/frogs/src/FROGS_dev/app:$PATH
module load bioinfo/usearch11.0.667
module load system/Python-3.7.4
echo -e ">fw_prim\n"$FW_PRIMER > $OUT_DIR/usearch/single_fw_prim.fa
echo -e ">fw_prim\n"$FW_PRIMER"\n>rv_prim\n"$RV_PRIMER > $OUT_DIR/usearch/paired_prim.fa
###############################################################################################
# Merge all R1/R2 file pairs
# Add sample name to read labels (-relabel @ option)
# Pool samples together into raw.fq
# keep R1 unmerged reads
mkdir $OUT_DIR/data
for R1 in $READS_DIR/*_R1.fastq.gz
do
R2=`echo $R1 |sed 's/_R1.fastq.gz/_R2.fastq.gz/'`
sample=`basename $R1 |sed 's/_R1.fastq.gz//'`
gzip -dc $R1 > $OUT_DIR/data/`basename $R1`
gzip -dc $R2 > $OUT_DIR/data/`basename $R2`
R1=$OUT_DIR/data/`basename $R1`
R2=$OUT_DIR/data/`basename $R2`
usearch -fastq_mergepairs $R1 -reverse $R2 \
-sample $sample -threads $CPU \
-fastqout $OUT_DIR/usearch/${sample}_raw_paired.fq -fastqout_notmerged_fwd $OUT_DIR/usearch/${sample}_raw_unpaired_fw.fq 2>&1 | sed -e 's/\r/\n/g' > $OUT_DIR/usearch/log/${sample}_fastq_mergepairs.log
awk -v S=$sample '{c++; if(c==1){print $0";sample="S";"}else{print $0; if(c==4){c=0}}}' $OUT_DIR/usearch/${sample}_raw_unpaired_fw.fq > $OUT_DIR/usearch/${sample}_raw_unpaired_fw_rename.fq
#rm $OUT_DIR/usearch/${sample}_raw_unpaired_fw.fq
usearch -search_oligodb $OUT_DIR/usearch/${sample}_raw_paired.fq -threads $CPU -db $OUT_DIR/usearch/paired_prim.fa -strand plus -userout $OUT_DIR/usearch/${sample}_paired_primer.tsv -userfields query+target+qlo+qhi 2>&1 | sed -e 's/\r/\n/g' > $OUT_DIR/usearch/log/${sample}_paired_search_oligodb.log
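# Trim the primer hits reported by search_oligodb: keep only the bases between the end of the
# forward primer hit and the start of the reverse primer hit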
/home/mbernard/save/tools/bin/bioawk -c fastx -v T=$OUT_DIR/usearch/${sample}_paired_primer.tsv 'BEGIN{while(getline<T>0){if($2=="fw_prim"){FW[$1]=$4}else{RV[$1]=$3}}}{
start=1
l=length($seq)
if(FW[$name] != ""){
start=FW[$name]+1
}
if(RV[$name] != ""){
l = RV[$name] - 1
}
l = l - start + 1
print "@"$name"\n"substr($seq,start,l)"\n+\n"substr($qual,start,l)
}' $OUT_DIR/usearch/${sample}_raw_paired.fq > $OUT_DIR/usearch/${sample}_raw_paired_truncated.fq
usearch -search_oligodb $OUT_DIR/usearch/${sample}_raw_unpaired_fw_rename.fq -threads $CPU -db $OUT_DIR/usearch/single_fw_prim.fa -strand plus -userout $OUT_DIR/usearch/${sample}_single_fw_primer.tsv -userfields query+target+qlo+qhi 2>&1 | sed -e 's/\r/\n/g' > $OUT_DIR/usearch/log/${sample}_single_fw_search_oligodb.log
/home/mbernard/save/tools/bin/bioawk -c fastx -v T=$OUT_DIR/usearch/${sample}_single_fw_primer.tsv 'BEGIN{while(getline<T>0){FW[$1]=$4}}{
if(FW[$name] != ""){
start=FW[$name]+1
l = length($seq)-start+1
print "@"$name"\n"substr($seq,start,l)"\n+\n"substr($qual,start,l)
}else{
print "@"$name"\n"$seq"\n+\n"$qual
}
}' $OUT_DIR/usearch/${sample}_raw_unpaired_fw_rename.fq > $OUT_DIR/usearch/${sample}_raw_unpaired_fw_rename_truncated.fq
done
#~ rm -r $OUT_DIR/data
# concat R1R2 merged and R1 unmerged reads
cat $OUT_DIR/usearch/*raw_paired_truncated.fq $OUT_DIR/usearch/*_raw_unpaired_fw_rename_truncated.fq > $OUT_DIR/usearch/amplicon.fq
#rm $OUT_DIR/usearch/*raw_paired.fq $OUT_DIR/usearch/*raw_unpaired_fw_rename.fq
###############################################################################################
# Amplicon sequence quality filter
# Discard reads with more than 1.0 total expected errors (-fastq_maxee 1.0): https://www.drive5.com/usearch/manual/exp_errs.html
# The expected error count is the sum of the per-base error probabilities derived from the Phred quality scores.
usearch -fastq_filter $OUT_DIR/usearch/amplicon.fq \
-fastq_maxee 1.0 -relabel Filt -threads $CPU \
-fastaout $OUT_DIR/usearch/amplicon_filtered.fa 2>&1 | sed -e 's/\r/\n/g' > $OUT_DIR/usearch/log/fastq_filter.log
###############################################################################################
# Find unique read sequences, store abundances, and sort by abundance
usearch -fastx_uniques $OUT_DIR/usearch/amplicon_filtered.fa \
-sizeout -relabel Uniq -threads $CPU \
-fastaout $OUT_DIR/usearch/amplicon_uniques.fa 2>&1 | sed -e 's/\r/\n/g' > $OUT_DIR/usearch/log/fastx_uniques.log
###############################################################################################
# Run UNOISE algorithm to get denoised sequences (ZOTUs)
# The -minsize option specifies the minimum abundance (size= annotation). Default is 8.
# Input sequences with lower abundances are discarded.
usearch -unoise3 $OUT_DIR/usearch/amplicon_uniques.fa \
-zotus $OUT_DIR/usearch/zotu.fa 2>&1 | sed -e 's/\r/\n/g' > $OUT_DIR/usearch/log/unoise3.log
# make ZOTU table
usearch -otutab $OUT_DIR/usearch/amplicon.fq \
-threads $CPU \
-otus $OUT_DIR/usearch/zotu.fa -otutabout $OUT_DIR/usearch/zotus.txt 2>&1 | sed -e 's/\r/\n/g' > $OUT_DIR/usearch/log/otutab.log
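# Reformat the ZOTU table for FROGS tsv_to_biom: rename the header and add an observation_sum column with per-ZOTU totals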
awk -F "\t" 'BEGIN{OFS="\t"}{if($1=="#OTU ID"){$1="observation_name\tobservation_sum"}else{s=0; for (i=2;i<= NF; i++){s+=$i} ; $1=$1"\t"s} print $0}' $OUT_DIR/usearch/zotus.txt > $OUT_DIR/usearch/zotus_reformat.txt
# keep only the ZOTU sequences that are present in the ZOTU table
/home/mbernard/save/tools/bin/bioawk -c fastx -v Z=$OUT_DIR/usearch/zotus_reformat.txt 'BEGIN{while(getline<Z>0){if($1!="observation_name"){tab[$1]=1}}}{if(tab[$name]==1){print ">"$name"\n"$seq}}' $OUT_DIR/usearch/zotu.fa > $OUT_DIR/usearch.fa
tsv_to_biom.py -t $OUT_DIR/usearch/zotus_reformat.txt -b $OUT_DIR/usearch/zotus.biom
###############################################################################################
# perform affiliation using Sintax
# If the -sintax_cutoff option is given then predictions are written a second time after applying the confidence
# threshold, keeping only ranks with high enough confidence. On V4 reads, using a cutoff of 0.8 gives predictions
# with similar accuracy to RDP at 80% bootstrap cutoff.
usearch -sintax $OUT_DIR/usearch.fa \
-db $REF -strand both -threads $CPU \
-tabbedout $OUT_DIR/usearch/zotus_tax.txt -sintax_cutoff 0.8 2>&1 | sed -e 's/\r/\n/g' > $OUT_DIR/usearch/log/sintax.log
cat $OUT_DIR/usearch/zotus_tax.txt | awk '{c++; if(c==1){print "#observation_name\tboostrapped_taxonomy\tstrand\ttaxonomy"}; gsub(",",";",$0) ; print $0}' > $OUT_DIR/usearch/zotus_tax_reformat.txt
# ==> remove the bootstrap cutoff?
# create final biom
/usr/local/bioinfo/src/Miniconda/Miniconda3-4.4.10/envs/qiime2-2019.4/bin/biom add-metadata -i $OUT_DIR/usearch/zotus.biom \
--observation-metadata-fp $OUT_DIR/usearch/zotus_tax_reformat.txt \
-o $OUT_DIR/usearch.biom --output-as-json \
--sc-separated boostrapped_taxonomy,taxonomy
# create abundance table in TSV format
biom_to_tsv.py -b $OUT_DIR/usearch.biom -f $OUT_DIR/usearch.fa -t $OUT_DIR/usearch.tsv
#!/usr/bin/sh
# The FROGS/app directory needs to be in the PATH
module load system/Miniconda3-4.4.10
module load bioinfo/qiime2-2019.4
MANIFEST=$1
OUT_DIR=`dirname $MANIFEST`
WORK_DIR=$OUT_DIR/qiime_workdir
mkdir -p $WORK_DIR
DB_PREFIX=$2
DB_REF_SEQ=$DB_PREFIX.seq.qza
DB_REF_CLASSIFIER=$DB_PREFIX.classifier.qza
P1=$3
P2=$4
### import
qiime tools import \
--type 'SampleData[PairedEndSequencesWithQuality]' \
--input-path $MANIFEST \
--output-path $WORK_DIR/pe-demux.qza \
--input-format PairedEndFastqManifestPhred33V2
qiime demux summarize \
--i-data $WORK_DIR/pe-demux.qza \
--o-visualization $WORK_DIR/pe-demux.qzv
# trimming primers
JOINING_INPUT=$WORK_DIR/pe-demux.qza
if [[ $P1 != "" ]]
then
qiime cutadapt trim-paired \
--i-demultiplexed-sequences $WORK_DIR/pe-demux.qza \
--p-front-f $P1 \
--p-front-r $P2 \
--o-trimmed-sequences $WORK_DIR/pe-demux-trimmed.qza
qiime demux summarize \
--i-data $WORK_DIR/pe-demux-trimmed.qza \
--o-visualization $WORK_DIR/pe-demux-trimmed.qzv
JOINING_INPUT=$WORK_DIR/pe-demux-trimmed.qza
fi
echo "qiime cutadapt trim-paired --i-demultiplexed-sequences $WORK_DIR/pe-demux.qza --p-front-f $P1 --p-front-r $P2 --o-trimmed-sequences $WORK_DIR/pe-demux-trimmed.qza"
########################################################################
### clustering pipeline : open reference clustering paired end
# The features produced by clustering methods are known as operational taxonomic units (OTUs), which is Esperanto for suboptimal, imprecise rubbish
# join pair
# default parameters:
# --p-minlen 1
# --p-maxns (optional)
# --p-allowmergestagger False
# --p-minovlen 10
# --p-maxdiffs 10
# --p-minmergelen (optional)
# --p-maxmergelen (optional)
# --p-maxee (optional)
# --p-qmin 0
# --p-qminout 0
# --p-qmax 41
# --p-qmaxout 41
qiime vsearch join-pairs \
--i-demultiplexed-seqs $JOINING_INPUT \
--p-allowmergestagger --p-minovlen 10 --p-minmergelen 20 --p-maxmergelen 490 \
--o-joined-sequences $WORK_DIR/pe-join.qza
qiime demux summarize \
--i-data $WORK_DIR/pe-join.qza \
--o-visualization $WORK_DIR/pe-join.qzv
# dereplication
qiime vsearch dereplicate-sequences \
--i-sequences $WORK_DIR/pe-join.qza \
--o-dereplicated-table $WORK_DIR/pe-join-derepTable.qza \
--o-dereplicated-sequences $WORK_DIR/pe-join-derepSeqs.qza
qiime feature-table summarize \
--i-table $WORK_DIR/pe-join-derepTable.qza \
--o-visualization $WORK_DIR/pe-join-table-or-99.qzv # should be renamed pe-join-derepTable.qzv
# clustering
# Given a feature table and the associated feature sequences, cluster the
# features against a reference database based on user-specified percent
# identity threshold of their sequences. Any sequences that don't match are
# then clustered de novo. This is not a general-purpose clustering method,
# but rather is intended to be used for clustering the results of quality-
# filtering/dereplication methods, such as DADA2, or for re-clustering a
# FeatureTable at a lower percent identity than it was originally clustered
# at. When a group of features in the input table are clustered into a
# single feature, the frequency of that single feature in a given sample is
# the sum of the frequencies of the features that were clustered in that
# sample. Feature identifiers will be inherited from the centroid feature of
# each cluster. For features that match a reference sequence, the centroid
# feature is that reference sequence, so its identifier will become the
# feature identifier. The clustered_sequences result will contain feature
# representative sequences that are derived from the sequences input for all
# features in clustered_table. This will always be the most abundant
# sequence in the cluster. The new_reference_sequences result will contain
# the entire reference database, plus feature representative sequences for
# any de novo features. This is intended to be used as a reference database
# in subsequent iterations of cluster_features_open_reference, if
# applicable. See the vsearch documentation for details on how sequence
# clustering is performed.
# default param : --p-strand plus
qiime vsearch cluster-features-open-reference \
--i-table $WORK_DIR/pe-join-derepTable.qza \
--i-sequences $WORK_DIR/pe-join-derepSeqs.qza \
--i-reference-sequences $DB_REF_SEQ \
--p-perc-identity 0.99 \
--o-clustered-table $WORK_DIR/pe-clusterTable-or-99.qza \
--o-clustered-sequences $WORK_DIR/pe-clusterSeqs-or-99.qza \
--o-new-reference-sequences $WORK_DIR/new-ref-seqs-or-99.qza
qiime feature-table summarize \
--i-table $WORK_DIR/pe-clusterTable-or-99.qza \
--o-visualization $WORK_DIR/pe-clusterTable-or-99.qzv
# chimera removal
# Apply the vsearch uchime_denovo method to identify chimeric feature
# sequences. The results of this method can be used to filter chimeric
# features from the corresponding feature table. For additional details,
# please refer to the vsearch documentation.
# default parameters (vsearch defaults, but abskew is missing: min abundance ratio of parent vs chimera (2.0)):
# --p-dn 1.4
# --p-mindiffs 3
# --p-mindiv 0.8
# --p-minh 0.28
# --p-xn 8.0
qiime vsearch uchime-denovo \
--i-table $WORK_DIR/pe-clusterTable-or-99.qza \
--i-sequences $WORK_DIR/pe-clusterSeqs-or-99.qza \
--output-dir $WORK_DIR/pe-uchime-dn-out
qiime feature-table filter-features \
--i-table $WORK_DIR/pe-clusterTable-or-99.qza \
--m-metadata-file $WORK_DIR/pe-uchime-dn-out/nonchimeras.qza \
--o-filtered-table $WORK_DIR/pe-nonchimeric-wo-borderline-table.qza
qiime feature-table summarize \
--i-table $WORK_DIR/pe-nonchimeric-wo-borderline-table.qza \
--o-visualization $WORK_DIR/pe-nonchimeric-wo-borderline-table.qzv
qiime feature-table filter-seqs \
--i-data $WORK_DIR/pe-clusterSeqs-or-99.qza \
--m-metadata-file $WORK_DIR/pe-uchime-dn-out/nonchimeras.qza \
--o-filtered-data $WORK_DIR/pe-nonchimeric-wo-borderline-seqs.qza
# filter single singleton
# q2-vsearch implements three different OTU clustering strategies: de novo, closed reference, and open reference.
# All should be preceded by basic quality-score-based filtering and followed by chimera filtering and aggressive
# OTU filtering (the treacherous trio, a.k.a. the Bokulich method). ==> 0.00005%
# ==> problem: there is no automated way to filter on a relative frequency here
# ==> so we filter out global singletons instead
qiime feature-table filter-features \
--i-table $WORK_DIR/pe-nonchimeric-wo-borderline-table.qza \
--p-min-frequency 2 \
--o-filtered-table $WORK_DIR/pe-nonchimeric-wo-borderline-noSingle-table.qza
qiime feature-table summarize \
--i-table $WORK_DIR/pe-nonchimeric-wo-borderline-noSingle-table.qza \
--o-visualization $WORK_DIR/pe-nonchimeric-wo-borderline-noSingle-table.qzv
qiime feature-table filter-seqs \
--i-data $WORK_DIR/pe-nonchimeric-wo-borderline-seqs.qza \
--i-table $WORK_DIR/pe-nonchimeric-wo-borderline-noSingle-table.qza \
--o-filtered-data $WORK_DIR/pe-nonchimeric-wo-borderline-noSingle-seqs.qza
# affiliation
qiime feature-classifier classify-sklearn \
--i-classifier $DB_REF_CLASSIFIER \
--i-reads $WORK_DIR/pe-nonchimeric-wo-borderline-noSingle-seqs.qza \
--o-classification $WORK_DIR/pe-nonchimeric-wo-borderline-noSingle-tax.qza
qiime metadata tabulate \
--m-input-file $WORK_DIR/pe-nonchimeric-wo-borderline-noSingle-tax.qza \
--o-visualization $WORK_DIR/pe-nonchimeric-wo-borderline-noSingle-tax.qzv
# export biom
qiime tools export \
--input-path $WORK_DIR/pe-nonchimeric-wo-borderline-noSingle-table.qza \
--output-path $WORK_DIR/export
qiime tools export \
--input-path $WORK_DIR/pe-nonchimeric-wo-borderline-noSingle-tax.qza \
--output-path $WORK_DIR/export
# create final biom
sed -i 's/Feature/#Feature/' $WORK_DIR/export/taxonomy.tsv
biom add-metadata -i $WORK_DIR/export/feature-table.biom \
--observation-metadata-fp $WORK_DIR/export/taxonomy.tsv \
-o $OUT_DIR/pe-qiime.biom \
--output-as-json
# export fasta
qiime tools export \
--input-path $WORK_DIR/pe-nonchimeric-wo-borderline-noSingle-seqs.qza \
--output-path $WORK_DIR/export
mv $WORK_DIR/export/dna-sequences.fasta $OUT_DIR/pe-qiime.fasta
# create abundance table in TSV format
biom_to_tsv.py -b $OUT_DIR/pe-qiime.biom -f $OUT_DIR/pe-qiime.fasta -t $OUT_DIR/pe-qiime.tsv
#!/usr/bin/sh
# The FROGS/app directory needs to be in the PATH
module load system/Miniconda3-4.4.10
module load bioinfo/qiime2-2019.4
MANIFEST=$1
OUT_DIR=`dirname $MANIFEST`
WORK_DIR=$OUT_DIR/qiime_workdir
mkdir -p $WORK_DIR
DB_PREFIX=$2
DB_REF_SEQ=$DB_PREFIX.seq.qza
DB_REF_CLASSIFIER=$DB_PREFIX.classifier.qza
CPU=$3
P1=$4
### import
qiime tools import \
--type 'SampleData[SequencesWithQuality]' \
--input-path $MANIFEST \
--output-path $WORK_DIR/se-demux.qza \
--input-format SingleEndFastqManifestPhred33V2
qiime demux summarize \
--i-data $WORK_DIR/se-demux.qza \
--o-visualization $WORK_DIR/se-demux.qzv
# trimming primers
JOINING_INPUT=$WORK_DIR/se-demux.qza
if [[ $P1 != "" ]]
then
qiime cutadapt trim-single \
--i-demultiplexed-sequences $WORK_DIR/se-demux.qza \
--p-front $P1 \
--o-trimmed-sequences $WORK_DIR/se-demux-trimmed.qza
qiime demux summarize \
--i-data $WORK_DIR/se-demux-trimmed.qza \
--o-visualization $WORK_DIR/se-demux-trimmed.qzv
JOINING_INPUT=$WORK_DIR/se-demux-trimmed.qza
fi
########################################################################
### clustering pipeline : open reference clustering single end
# The features produced by clustering methods are known as operational taxonomic units (OTUs), which is Esperanto for suboptimal, imprecise rubbish
qiime vsearch dereplicate-sequences \
--i-sequences $JOINING_INPUT \
--o-dereplicated-table $WORK_DIR/se-derepTable.qza \
--o-dereplicated-sequences $WORK_DIR/se-derepSeqs.qza
qiime feature-table summarize \
--i-table $WORK_DIR/se-derepTable.qza \
--o-visualization $WORK_DIR/se-derepTable.qzv
# clustering
# Given a feature table and the associated feature sequences, cluster the
# features against a reference database based on user-specified percent
# identity threshold of their sequences. Any sequences that don't match are
# then clustered de novo. This is not a general-purpose clustering method,
# but rather is intended to be used for clustering the results of quality-
# filtering/dereplication methods, such as DADA2, or for re-clustering a
# FeatureTable at a lower percent identity than it was originally clustered
# at. When a group of features in the input table are clustered into a
# single feature, the frequency of that single feature in a given sample is
# the sum of the frequencies of the features that were clustered in that
# sample. Feature identifiers will be inherited from the centroid feature of
# each cluster. For features that match a reference sequence, the centroid
# feature is that reference sequence, so its identifier will become the
# feature identifier. The clustered_sequences result will contain feature
# representative sequences that are derived from the sequences input for all
# features in clustered_table. This will always be the most abundant
# sequence in the cluster. The new_reference_sequences result will contain
# the entire reference database, plus feature representative sequences for
# any de novo features. This is intended to be used as a reference database
# in subsequent iterations of cluster_features_open_reference, if
# applicable. See the vsearch documentation for details on how sequence
# clustering is performed.
# default param : --p-strand plus
qiime vsearch cluster-features-open-reference \
--i-table $WORK_DIR/se-derepTable.qza \
--i-sequences $WORK_DIR/se-derepSeqs.qza \
--i-reference-sequences $DB_REF_SEQ \
--p-perc-identity 0.99 --p-threads $CPU \
--o-clustered-table $WORK_DIR/se-clusterTable-or-99.qza \
--o-clustered-sequences $WORK_DIR/se-clusterSeqs-or-99.qza \
--o-new-reference-sequences $WORK_DIR/new-ref-seqs-or-99.qza
qiime feature-table summarize \
--i-table $WORK_DIR/se-clusterTable-or-99.qza \
--o-visualization $WORK_DIR/se-clusterTable-or-99.qzv
# chimera removal
# Apply the vsearch uchime_denovo method to identify chimeric feature
# sequences. The results of this method can be used to filter chimeric
# features from the corresponding feature table. For additional details,
# please refer to the vsearch documentation.
# default parameters (vsearch defaults, but abskew is missing: min abundance ratio of parent vs chimera (2.0)):
# --p-dn 1.4
# --p-mindiffs 3
# --p-mindiv 0.8
# --p-minh 0.28
# --p-xn 8.0
qiime vsearch uchime-denovo \
--i-table $WORK_DIR/se-clusterTable-or-99.qza \
--i-sequences $WORK_DIR/se-clusterSeqs-or-99.qza \
--output-dir $WORK_DIR/se-uchime-dn-out
qiime feature-table filter-features \
--i-table $WORK_DIR/se-clusterTable-or-99.qza \
--m-metadata-file $WORK_DIR/se-uchime-dn-out/nonchimeras.qza \
--o-filtered-table $WORK_DIR/se-nonchimeric-wo-borderline-table.qza
qiime feature-table summarize \
--i-table $WORK_DIR/se-nonchimeric-wo-borderline-table.qza \
--o-visualization $WORK_DIR/se-nonchimeric-wo-borderline-table.qzv
qiime feature-table filter-seqs \
--i-data $WORK_DIR/se-clusterSeqs-or-99.qza \
--m-metadata-file $WORK_DIR/se-uchime-dn-out/nonchimeras.qza \
--o-filtered-data $WORK_DIR/se-nonchimeric-wo-borderline-seqs.qza
# filter single singleton
# q2-vsearch implements three different OTU clustering strategies: de novo, closed reference, and open reference.
# All should be preceded by basic quality-score-based filtering and followed by chimera filtering and aggressive
# OTU filtering (the treacherous trio, a.k.a. the Bokulich method). ==> 0.00005%
# ==> problem: there is no automated way to filter on a relative frequency here
# ==> so we filter out global singletons instead
qiime feature-table filter-features \
--i-table $WORK_DIR/se-nonchimeric-wo-borderline-table.qza \
--p-min-frequency 2 \
--o-filtered-table $WORK_DIR/se-nonchimeric-wo-borderline-noSingle-table.qza
qiime feature-table summarize \
--i-table $WORK_DIR/se-nonchimeric-wo-borderline-noSingle-table.qza \
--o-visualization $WORK_DIR/se-nonchimeric-wo-borderline-noSingle-table.qzv
qiime feature-table filter-seqs \
--i-data $WORK_DIR/se-nonchimeric-wo-borderline-seqs.qza \
--i-table $WORK_DIR/se-nonchimeric-wo-borderline-noSingle-table.qza \
--o-filtered-data $WORK_DIR/se-nonchimeric-wo-borderline-noSingle-seqs.qza
# affiliation
qiime feature-classifier classify-sklearn \
--i-classifier $DB_REF_CLASSIFIER \
--i-reads $WORK_DIR/se-nonchimeric-wo-borderline-noSingle-seqs.qza \
--o-classification $WORK_DIR/se-nonchimeric-wo-borderline-noSingle-tax.qza
qiime metadata tabulate \
--m-input-file $WORK_DIR/se-nonchimeric-wo-borderline-noSingle-tax.qza \
--o-visualization $WORK_DIR/se-nonchimeric-wo-borderline-noSingle-tax.qzv
# export biom
qiime tools export \
--input-path $WORK_DIR/se-nonchimeric-wo-borderline-noSingle-table.qza \
--output-path $WORK_DIR/export
qiime tools export \
--input-path $WORK_DIR/se-nonchimeric-wo-borderline-noSingle-tax.qza \
--output-path $WORK_DIR/export
# create final biom
sed -i 's/Feature/#Feature/' $WORK_DIR/export/taxonomy.tsv
biom add-metadata -i $WORK_DIR/export/feature-table.biom \
--observation-metadata-fp $WORK_DIR/export/taxonomy.tsv \
-o $OUT_DIR/se-qiime.biom \
--output-as-json
# export fasta
qiime tools export \
--input-path $WORK_DIR/se-nonchimeric-wo-borderline-noSingle-seqs.qza \
--output-path $WORK_DIR/export
mv $WORK_DIR/export/dna-sequences.fasta $OUT_DIR/se-qiime.fasta
# create abundance table in TSV format
biom_to_tsv.py -b $OUT_DIR/se-qiime.biom -f $OUT_DIR/se-qiime.fasta -t $OUT_DIR/se-qiime.tsv
#!/bin/bash
#export PYTHONPATH=/save/frogs/src/FROGS_dev/lib:$PYTHONPATH
#export PATH=/save/frogs/src/FROGS_dev/libexec/:$PATH
#export PATH=/save/frogs/src/FROGS_dev/app/:$PATH
SAVE_DIR="/work/frogsfungi/FungiPubli/Real_mock/METABARFOOD/MEAT/results/DADA2"
INPUT_DIR="/work/frogsfungi/FungiPubli/Real_mock/METABARFOOD/MEAT/"
SCRIPT_DIR="/work/frogsfungi/FungiPubli/Real_mock/METABARFOOD/MEAT/scripts/"
#REFERENCE="/work/frogsfungi/FungiPubli/JeuTestDuMoment/Unite_ref/Unite_7.1_clean_certae_refs_for_DADA2.fasta"
REFERENCE_DIR="/work/frogsfungi/FungiPubli/Real_mock/METABARFOOD/MEAT/REFERENCES/"
TYPE="ADN PCR"
ITS="ITS1 ITS2 D1D2 RPB2"
CPU=8
declare -A FIVE_PRIM_PRIMERS=([ITS1]=CTTGGTCATTTAGAGGAAGTAA [ITS2]=GCATCGATGAAGAACGCAGC [D1D2]=GCATATCAATAAGCGGAGGAAAAG [RPB2]=TGGGGYATGGTNTGYCCYGC)
declare -A THREE_PRIM_PRIMERS=([ITS1]=GCATCGATGAAGAACGCAGC [ITS2]=GCAWAWCAAWAAGCGGAGGA [D1D2]=CCGTCTTGAAACACGGACC [RPB2]=CBTTCCCYGAYCAYAAYCARTC)
for its in $ITS
do
for type in $TYPE
do
mkdir -p "$SAVE_DIR/$its/$type/"
cd "$SAVE_DIR/$its/$type/"
mkdir -p LOG
# launch DADA2
FIVE_PRIM=${FIVE_PRIM_PRIMERS[$its]}
THREE_PRIM=${THREE_PRIM_PRIMERS[$its]}
echo "module load bioinfo/cutadapt-2.10 system/R-3.6.1 libraries/proj-4.9.3 libraries/gdal-2.3.0 system/Python-3.7.4 && $SCRIPT_DIR/dada2.R --five-prim-primer $FIVE_PRIM --three-prim-primer $THREE_PRIM --input $INPUT_DIR/$its/$type/ --output $SAVE_DIR/$its/$type/ --reference $REFERENCE_DIR/MEAT_${its}_GGtax_dada2.fasta --cpus $CPU" > "dada2-$its-$type.sh"
sbatch --mem=32G -J dada2_${its}_${type} --cpus-per-task=$CPU -o LOG/%x.out -e LOG/%x.err --wrap="sh dada2-$its-$type.sh"
cd -
done
done
#!/usr/bin/env Rscript
library(argparse)
library(ShortRead)
library(Biostrings)
library(tidyverse)
library(phyloseq)
library(dada2)
## Get arguments from bash script
args <- commandArgs(TRUE)
parser <- ArgumentParser()
parser$add_argument("-i", "--input", type="character", default=NULL,
help="dataset file name", metavar="character")
parser$add_argument("-o", "--output", type="character", default="out.txt",
help="output file name", metavar="character")
parser$add_argument("--reference", type="character", default=NULL,
help="reference database", metavar="character")
parser$add_argument("--cpus", type="integer", default=1,
help="number of cpus", metavar="N")
parser$add_argument("--five-prim-primer", type="character", default=NULL,
help="forward primer sequence")
parser$add_argument("--three-prim-primer", type="character", default=NULL,
help="reverse primer sequence")
args <- parser$parse_args()
FWD <- args$five_prim_primer ## forward primer sequence (from --five-prim-primer)
REV <- args$three_prim_primer ## reverse primer sequence (from --three-prim-primer)
input_dir <- args$input
output_dir <- args$output
## Input files
fnFs <- sort(list.files(input_dir, pattern = "_R1.fastq.gz", full.names = TRUE))
fnRs <- sort(list.files(input_dir, pattern = "_R2.fastq.gz", full.names = TRUE))
# Cutadapt
allOrients <- function(primer) {
# Create all orientations of the input sequence
require(Biostrings)
dna <- DNAString(primer) # Biostrings works with DNAString objects rather than character vectors
orients <- c(Forward = dna, Complement = complement(dna), Reverse = reverse(dna),
RevComp = reverseComplement(dna))
return(sapply(orients, toString)) # Convert back to character vector
}
FWD.orients <- allOrients(FWD)
REV.orients <- allOrients(REV)
primerHits <- function(primer, fn) {
# Counts number of reads in which the primer is found
nhits <- vcountPattern(primer, sread(readFastq(fn)), fixed = FALSE)
return(sum(nhits > 0))
}
rbind(FWD.ForwardReads = sapply(FWD.orients, primerHits, fn = fnFs[[1]]),
FWD.ReverseReads = sapply(FWD.orients, primerHits, fn = fnRs[[1]]),
REV.ForwardReads = sapply(REV.orients, primerHits, fn = fnFs[[1]]),
REV.ReverseReads = sapply(REV.orients, primerHits, fn = fnRs[[1]]))
cutadapt <- "/usr/local/bioinfo/src/cutadapt/cutadapt-2.10/cutadapt-2.10_venv/bin/cutadapt"
path.cut <- file.path(output_dir, "cutadapt")
if(!dir.exists(path.cut)) dir.create(path.cut)
fnFs.cut <- file.path(path.cut, basename(fnFs))
fnRs.cut <- file.path(path.cut, basename(fnRs))
FWD.RC <- dada2:::rc(FWD)
REV.RC <- dada2:::rc(REV)
# Trim FWD and the reverse-complement of REV off of R1 (forward reads)
R1.flags <- paste("-g", FWD, "-a", REV)
# Trim REV and the reverse-complement of FWD off of R2 (reverse reads)
R2.flags <- paste("-G", REV.RC, "-A", FWD.RC)
# Run Cutadapt
for(i in seq_along(fnFs)) {
system2(cutadapt, args = c(R1.flags, R2.flags, "-n", 2, # -n 2 required to remove FWD and REV from reads
"-o", fnFs.cut[i], "-p", fnRs.cut[i], # output files
fnFs[i], fnRs[i])) # input files
}
rbind(FWD.ForwardReads = sapply(FWD.orients, primerHits, fn = fnFs.cut[[1]]),
FWD.ReverseReads = sapply(FWD.orients, primerHits, fn = fnRs.cut[[1]]),
REV.ForwardReads = sapply(REV.orients, primerHits, fn = fnFs.cut[[1]]),
REV.ReverseReads = sapply(REV.orients, primerHits, fn = fnRs.cut[[1]]))
# Remove sequences with N
path.N <- file.path(output_dir, "filtered")
filtFs <- file.path(output_dir, "filtered", basename(fnFs)) # Put filtered reads in the filtered/ subdirectory
filtRs <- file.path(output_dir, "filtered", basename(fnRs))
out <- filterAndTrim(fnFs.cut, filtFs, fnRs.cut, filtRs, maxN = 0, maxEE = c(2, 2),
truncQ = 2, minLen = 50, rm.phix = TRUE, compress = TRUE, multithread = 16)
get.sample.name <- function(fname) paste(strsplit(basename(fname), "_R")[[1]][1],collapse="_")
sample.names <- unname(sapply(fnFs, get.sample.name))
# Learn errors
#errF <- learnErrors(filtFs, multithread = args$cpus)
#errR <- learnErrors(filtRs, multithread = args$cpus)
# Dada
dadaFs <- dada(filtFs, selfConsist=TRUE, pool=TRUE, multithread = args$cpus)
dadaRs <- dada(filtRs, selfConsist=TRUE, pool=TRUE, multithread = args$cpus)
#dadaFs <- dada(filtFs, err=errF, multithread = args$cpus)
#dadaRs <- dada(filtRs, err=errR, multithread = args$cpus)
names(dadaFs) <- sample.names
names(dadaRs) <- sample.names
# Dereplication
#derepFs <- derepFastq(filtFs, verbose = TRUE)
#derepRs <- derepFastq(filtRs, verbose = TRUE)
#names(derepFs) <- sample.names
#names(derepRs) <- sample.names
# Deal with pairs
mergers <- mergePairs(dadaFs, filtFs, dadaRs, filtRs)
seqtab <- makeSequenceTable(mergers)
seqtab.nochim <- removeBimeraDenovo(seqtab, multithread=args$cpus)
taxa <- assignTaxonomy(seqtab.nochim, args$reference, multithread = args$cpus)
saveRDS(taxa,file.path(output_dir,"taxa.rds"))
taxa <- taxa[,c(1,2,3,4,5,6,7)]
colnames(taxa) <- c("Kingdom", "Phylum", "Class", "Order" , "Family" , "Genus" , "Species")
### Track reads
getN <- function(x) sum(getUniques(x))
track <- cbind(out, sapply(dadaFs, getN), sapply(dadaRs, getN), sapply(mergers, getN), rowSums(seqtab.nochim))
# If processing a single sample, remove the sapply calls: e.g. replace sapply(dadaFs, getN) with getN(dadaFs)
colnames(track) <- c("input", "filtered", "denoisedF", "denoisedR", "merged", "nonchim")
rownames(track) <- sample.names
print(track)
###
asv_seqs <- colnames(seqtab.nochim)
biomtable <- t(seqtab.nochim)
b <- as_tibble(biomtable)
obs_sum <- rowSums(b)
b$observation_sum <- obs_sum
b$observation_name <- paste0("ASV", 1:nrow(b))
b$seed_sequence <- asv_seqs
taxonomy <- as_tibble(taxa) %>% unite("#taxonomy", 1:7 , sep=";", remove = FALSE)
b$"#taxonomy" <- taxonomy$"#taxonomy"
col_order <- c("#taxonomy", "seed_sequence", "observation_name", "observation_sum",
rownames(seqtab.nochim))
b2 <- b[, col_order]
write.table(b2, file.path(output_dir,"dada2.tsv"), sep="\t", quote=F, row.names=F)
tsv_to_biom <- "/save/frogs/src/FROGS_dev/app/tsv_to_biom.py"
system2(tsv_to_biom, args = c("--input-tsv", file.path(output_dir,"dada2.tsv"),
"--output-biom", file.path(output_dir,"dada2.biom"),
"--output-fasta", file.path(output_dir,"dada2.fasta"),
"--log-file", file.path(output_dir,"tsv_to_biom.log")
)
)
samples.out <- rownames(seqtab.nochim)
ps <- phyloseq(otu_table(seqtab.nochim, taxa_are_rows=FALSE) ,
tax_table(taxa))
saveRDS(ps, file.path(output_dir,"dada2.rds"))
###############
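# Alternative export using Cluster_N identifiers (overwrites the ASV-named dada2.tsv and dada2.biom written above)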
ids_study <- paste("Cluster", 1:ncol(seqtab.nochim), sep = "_")
asv_headers <- paste(">",ids_study, sep="")
asv_seqs <- colnames(seqtab.nochim)
asv_fasta <- c(rbind(asv_headers, asv_seqs))
taxanomy_fields <- apply(unname(taxa), 1, function(x) paste(x, collapse=";"))
seqtab.nochim_t <- as.data.frame(t(seqtab.nochim))
samples_ids <- colnames(seqtab.nochim_t)
seqtab.nochim_t$ids <- ids_study # rownames(seqtab.nochim_t)
seqtab.nochim_t$"#blast_taxonomy" <- taxanomy_fields
seqtab.nochim_t$"seed_sequence" <- asv_seqs
seqtab.nochim_t$"observation_name" <- ids_study
seqtab.nochim_t$observation_sum <- rowSums(t(seqtab.nochim))
seqtab.nochim_t <- seqtab.nochim_t[, c("#blast_taxonomy", "seed_sequence", "observation_name", "observation_sum", samples_ids)]
write.table(x = rbind(colnames(seqtab.nochim_t), seqtab.nochim_t),
file = file.path(output_dir, "dada2.tsv"),
quote = FALSE,
sep = "\t",
col.names = FALSE,
row.names = FALSE)
system2("/save/frogs/src/FROGS_dev/app/tsv_to_biom.py", args=c("--input-tsv", file.path(output_dir, "dada2.tsv"), "--output-biom", file.path(output_dir, "dada2.biom")))
SAVE_DIR="/work/frogsfungi/FungiPubli/Real_mock/METABARFOOD/MEAT/results/DADA2_SE"
INPUT_DIR="/work/frogsfungi/FungiPubli/Real_mock/METABARFOOD/MEAT/"
SCRIPT_DIR="/work/frogsfungi/FungiPubli/Real_mock/METABARFOOD/MEAT/scripts/"
#REFERENCE="/work/frogsfungi/FungiPubli/JeuTestDuMoment/Unite_ref/Unite_7.1_clean_certae_refs_for_DADA2.fasta"
REFERENCE_DIR="/work/frogsfungi/FungiPubli/Real_mock/METABARFOOD/MEAT/REFERENCES/"
TYPE="ADN PCR"
ITS="ITS1 ITS2"
LONG="D1D2 RPB2"
CPU=8
declare -A FIVE_PRIM_PRIMERS=([ITS1]=CTTGGTCATTTAGAGGAAGTAA [ITS2]=GCATCGATGAAGAACGCAGC [D1D2]=GCATATCAATAAGCGGAGGAAAAG [RPB2]=TGGGGYATGGTNTGYCCYGC)
declare -A THREE_PRIM_PRIMERS=([ITS1]=GCATCGATGAAGAACGCAGC [ITS2]=GCAWAWCAAWAAGCGGAGGA [D1D2]=CCGTCTTGAAACACGGACC [RPB2]=CBTTCCCYGAYCAYAAYCARTC)
for its in $ITS
do
for type in $TYPE
do
mkdir -p "$SAVE_DIR/$its/$type/"
cd "$SAVE_DIR/$its/$type/"
mkdir -p LOG
# launch DADA2
FIVE_PRIM=${FIVE_PRIM_PRIMERS[$its]}
THREE_PRIM=${THREE_PRIM_PRIMERS[$its]}
echo "module load bioinfo/cutadapt-2.10 system/R-3.6.1 libraries/proj-4.9.3 libraries/gdal-2.3.0 system/Python-3.7.4 && $SCRIPT_DIR/dada2_se.R --five-prim-primer $FIVE_PRIM --three-prim-primer $THREE_PRIM --input $INPUT_DIR/$its/$type/ --output $SAVE_DIR/$its/$type/ --reference $REFERENCE_DIR/MEAT_${its}_GGtax_dada2.fasta --cpus $CPU" > "dada2-$its-$type.sh"
sbatch --mem=32G -J dada2_${its}_${type} --cpus-per-task=$CPU -o LOG/%x.out -e LOG/%x.err --wrap="sh dada2-$its-$type.sh"
cd -
done
done
for long in $LONG
do
for type in $TYPE
do
mkdir -p "$SAVE_DIR/$long/$type/"
cd "$SAVE_DIR/$long/$type/"
mkdir -p LOG
# launch DADA2
FIVE_PRIM=${FIVE_PRIM_PRIMERS[$long]}
THREE_PRIM=${THREE_PRIM_PRIMERS[$long]}
echo "module load bioinfo/cutadapt-2.10 system/R-3.6.1 libraries/proj-4.9.3 libraries/gdal-2.3.0 system/Python-3.7.4 && $SCRIPT_DIR/dada2_se.R --five-prim-primer $FIVE_PRIM --three-prim-primer $THREE_PRIM --input $INPUT_DIR/$long/$type/ --output $SAVE_DIR/$long/$type/ --reference $REFERENCE_DIR/MEAT_${long}_GGtax_dada2.fasta --cpus $CPU" > "dada2-$long-$type.sh"
sbatch --mem=32G -J dada2_${long}_${type} --cpus-per-task=$CPU -o LOG/%x.out -e LOG/%x.err --wrap="sh dada2-$long-$type.sh"
cd -
done
done
#!/usr/bin/env Rscript
library(argparse)
library(ShortRead)
library(Biostrings)
library(tidyverse)
library(phyloseq)
library(dada2)
## Get arguments from bash script
args <- commandArgs(TRUE)
parser <- ArgumentParser()
parser$add_argument("-i", "--input", type="character", default=NULL,
help="dataset file name", metavar="character")
parser$add_argument("-o", "--output", type="character", default="out.txt",
help="output file name", metavar="character")
parser$add_argument("--reference", type="character", default=NULL,
help="reference database", metavar="character")
parser$add_argument("--cpus", type="integer", default=1,
help="number of cpus", metavar="N")
parser$add_argument("--five-prim-primer", type="character", default=NULL,
help="forward primer sequence")
parser$add_argument("--three-prim-primer", type="character", default=NULL,
help="reverse primer sequence")
args <- parser$parse_args()
FWD <- args$five_prim_primer ## forward primer sequence (from --five-prim-primer)
REV <- args$three_prim_primer ## reverse primer sequence (from --three-prim-primer)
input_dir <- args$input
output_dir <- args$output
## Input files
fnFs <- sort(list.files(input_dir, pattern = "_R1.fastq.gz", full.names = TRUE))
#fnRs <- sort(list.files(input_dir, pattern = "_R2.fastq.gz", full.names = TRUE))
# Cutadapt
allOrients <- function(primer) {
# Create all orientations of the input sequence
require(Biostrings)
dna <- DNAString(primer) # Biostrings works with DNAString objects rather than character vectors
orients <- c(Forward = dna, Complement = complement(dna), Reverse = reverse(dna),
RevComp = reverseComplement(dna))
return(sapply(orients, toString)) # Convert back to character vector
}
FWD.orients <- allOrients(FWD)
REV.orients <- allOrients(REV)
primerHits <- function(primer, fn) {
# Counts number of reads in which the primer is found
nhits <- vcountPattern(primer, sread(readFastq(fn)), fixed = FALSE)
return(sum(nhits > 0))
}
rbind(FWD.ForwardReads = sapply(FWD.orients, primerHits, fn = fnFs[[1]]),
#FWD.ReverseReads = sapply(FWD.orients, primerHits, fn = fnRs[[1]]),
REV.ForwardReads = sapply(REV.orients, primerHits, fn = fnFs[[1]]))
#REV.ReverseReads = sapply(REV.orients, primerHits, fn = fnRs[[1]]))
cutadapt <- "/usr/local/bioinfo/src/cutadapt/cutadapt-2.10/cutadapt-2.10_venv/bin/cutadapt"
path.cut <- file.path(output_dir, "cutadapt")
if(!dir.exists(path.cut)) dir.create(path.cut)
fnFs.cut <- file.path(path.cut, basename(fnFs))
#fnRs.cut <- file.path(path.cut, basename(fnRs))
FWD.RC <- dada2:::rc(FWD)
#REV.RC <- dada2:::rc(REV)
# Trim FWD and the reverse-complement of REV off of R1 (forward reads)
R1.flags <- paste("-g", FWD, "-a", REV)
# Trim REV and the reverse-complement of FWD off of R2 (reverse reads)
#R2.flags <- paste("-G", REV.RC, "-A", FWD.RC)
# Run Cutadapt
for(i in seq_along(fnFs)) {
system2(cutadapt, args = c(R1.flags, "-n", 2, # -n 2 required to remove FWD and REV from reads
"-o", fnFs.cut[i], # output files
fnFs[i])) # input files
}
rbind(FWD.ForwardReads = sapply(FWD.orients, primerHits, fn = fnFs.cut[[1]]),
#FWD.ReverseReads = sapply(FWD.orients, primerHits, fn = fnRs.cut[[1]]))
REV.ForwardReads = sapply(REV.orients, primerHits, fn = fnFs.cut[[1]]))#,
#REV.ReverseReads = sapply(REV.orients, primerHits, fn = fnRs.cut[[1]]))
# Remove sequences with N
path.N <- file.path(output_dir, "filtered")
filtFs <- file.path(output_dir, "filtered", basename(fnFs)) # Put filtered reads in the filtered/ subdirectory
#filtRs <- file.path(output_dir, "filtered", basename(fnRs))
filterAndTrim(fnFs.cut, filtFs, maxN = 0, maxEE = 2,
truncQ = 2, minLen = 50, rm.phix = TRUE, compress = TRUE, multithread = 16)
get.sample.name <- function(fname) paste(strsplit(basename(fname), "_R")[[1]][1],collapse="_")
sample.names <- unname(sapply(fnFs, get.sample.name))
# Learn errors
#errF <- learnErrors(filtFs, selfConsist=TRUE, pool=TRUE, multithread = args$cpus)
#errR <- learnErrors(filtRs, multithread = args$cpus)
# Dada
dadaFs <- dada(filtFs, selfConsist=TRUE, pool=TRUE, multithread = args$cpus)
#dadaRs <- dada(filtRs, err=errR, multithread = args$cpus)
names(dadaFs) <- sample.names
#names(dadaRs) <- sample.names
# Dereplication
#derepFs <- derepFastq(filtFs, verbose = TRUE)
#derepRs <- derepFastq(filtRs, verbose = TRUE)
#names(derepFs) <- sample.names
#names(derepRs) <- sample.names
# Deal with pairs
#mergers <- mergePairs(dadaFs, derepFs, dadaRs, derepRs)
seqtab <- makeSequenceTable(dadaFs)
seqtab.nochim <- removeBimeraDenovo(seqtab, multithread=args$cpus)
taxa <- assignTaxonomy(seqtab.nochim, args$reference, multithread = args$cpus)
saveRDS(taxa,file.path(output_dir,"taxa.rds"))
taxa <- taxa[,c(1,2,3,4,5,6,7)]
colnames(taxa) <- c("Kingdom", "Phylum", "Class", "Order" , "Family" , "Genus" , "Species")
asv_seqs <- colnames(seqtab.nochim)
biomtable <- t(seqtab.nochim)
b <- as_tibble(biomtable)
obs_sum <- rowSums(b)
b$observation_sum <- obs_sum
b$observation_name <- paste0("ASV", 1:nrow(b))
b$seed_sequence <- asv_seqs
taxonomy <- as_tibble(taxa) %>% unite("#taxonomy", 1:7 , sep=";", remove = FALSE)
b$"#taxonomy" <- taxonomy$"#taxonomy"
col_order <- c("#taxonomy", "seed_sequence", "observation_name", "observation_sum",
rownames(seqtab.nochim))
b2 <- b[, col_order]
write.table(b2, file.path(output_dir,"dada2.tsv"), sep="\t", quote=F, row.names=F)
tsv_to_biom <- "/save/frogs/src/FROGS_dev/app/tsv_to_biom.py"
system2(tsv_to_biom, args = c("--input-tsv", file.path(output_dir,"dada2.tsv"),
"--output-biom", file.path(output_dir,"dada2.biom"),
"--output-fasta", file.path(output_dir,"dada2.fasta"),
"--log-file", file.path(output_dir,"tsv_to_biom.log")
)
)
samples.out <- rownames(seqtab.nochim)
ps <- phyloseq(otu_table(seqtab.nochim, taxa_are_rows=FALSE) ,
tax_table(taxa))
saveRDS(ps, file.path(output_dir,"dada2.rds"))
###############
ids_study <- paste("Cluster", 1:ncol(seqtab.nochim), sep = "_")
asv_headers <- paste(">",ids_study, sep="")
asv_seqs <- colnames(seqtab.nochim)
asv_fasta <- c(rbind(asv_headers, asv_seqs))
taxanomy_fields <- apply(unname(taxa), 1, function(x) paste(x, collapse=";"))
seqtab.nochim_t <- as.data.frame(t(seqtab.nochim))
samples_ids <- colnames(seqtab.nochim_t)
seqtab.nochim_t$ids <- ids_study # rownames(seqtab.nochim_t)
seqtab.nochim_t$"#blast_taxonomy" <- taxanomy_fields
seqtab.nochim_t$"seed_sequence" <- asv_seqs
seqtab.nochim_t$"observation_name" <- ids_study
seqtab.nochim_t$observation_sum <- rowSums(t(seqtab.nochim))
seqtab.nochim_t <- seqtab.nochim_t[, c("#blast_taxonomy", "seed_sequence", "observation_name", "observation_sum", samples_ids)]
write.table(x = rbind(colnames(seqtab.nochim_t), seqtab.nochim_t),
file = file.path(output_dir, "dada2.tsv"),
quote = FALSE,
sep = "\t",
col.names = FALSE,
row.names = FALSE)
system2("/save/frogs/src/FROGS_dev/app/tsv_to_biom.py", args=c("--input-tsv", file.path(output_dir, "dada2.tsv"), "--output-biom", file.path(output_dir, "dada2.biom")))
Different metrics were computed to compare the methods: the numbers of True Positives (TP), False Positives (FP) and False Negatives (FN), and the divergence (%), all at species level.
From these metrics we can compute the Precision (TP/(TP+FP)) and the Recall rate (TP/(TP+FN)) of each method.
Finally, as the expected sequences were known exactly, we counted the number of perfectly reconstructed sequences: OTUs/ASVs/ZOTUs whose nucleotide sequence is strictly identical to the reference sequence. For the long markers, whose complete sequence cannot be obtained with 2x250 bp reads, 100% identity of the OTU/ASV/ZOTU sequence to the reference is enough for it to be considered perfectly reconstructed.
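As an illustration, Precision and Recall can be derived directly from these counts. The snippet below is a minimal sketch with made-up values (the tool names and counts are illustrative, not benchmark results):
library(tibble)
library(dplyr)
# Illustrative counts only; real values come from the divergence tables of this report
metrics <- tibble(
  tool           = c("toolA", "toolB"),
  True.Positive  = c(30, 25),
  False.Positive = c(5, 2),
  False.Negative = c(10, 15)
)
metrics <- metrics %>%
  mutate(Precision   = True.Positive / (True.Positive + False.Positive),
         Recall.Rate = True.Positive / (True.Positive + False.Negative))
metrics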
source("functions.R")
plot_divergence <- function(dataset, divergence_file){
p1<-ggplot(as_tibble(read.table(divergence_file, sep="\t", header=TRUE)) %>% mutate(SampleID=paste0(sample,"-",its)) %>% filter(tool == "DADA2_FROGS") %>% mutate(its=replace(its, its=="D1D2", "D1/D2")) %>% mutate(sample=str_replace(sample,"ADN","DNA")) %>% mutate(its=factor(its, levels=c("ITS1","ITS2","D1/D2","RPB2"))) , aes(x=sample, color=data, y=Divergence, fill=SampleID)) +
geom_boxplot() + theme(plot.title = element_text(hjust = 0.5), plot.subtitle = element_text(hjust = 0.5), legend.position = "none",axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
#stat_summary(fun = "median", geom = "point", aes(color = tool)) +
#scale_fill_manual(values=method_palette_wo_real, aesthetics = c("colour", "fill")) +
#stat_compare_means(ref.group = "FROGS", method = "wilcox.test", label="p.signif" ) +
ggtitle("Divergence (%) at species level", subtitle = "") +
scale_y_continuous(expand = expansion(0, 0)) +
scale_x_discrete(expand = expansion(0, 0)) #+
#expand_limits(y = 0)
return(p1)
}
plot_metrics <- function(dataset, divergence_file){
p2<-ggplot(as_tibble(read.table(divergence_file, sep="\t", header=TRUE)) %>% mutate(SampleID=paste0(sample,"-",its)) %>% filter(tool == "DADA2_FROGS") %>% mutate(its=replace(its, its=="D1D2", "D1/D2")) %>% mutate(sample=str_replace(sample,"ADN","DNA")) %>% mutate(data=str_replace(data,"ADN","DNA")), aes(x=sample, color=data,y=False.Negative, fill=SampleID)) + facet_grid(~its)+
geom_boxplot() + guides(fill = FALSE) +
scale_y_continuous(expand = expansion(0, 0)) +
scale_x_discrete(expand = expansion(0, 0)) +
theme(axis.title.x=element_blank(),axis.text.x=element_blank(),axis.ticks.x=element_blank(), legend.position = "none") #+
#expand_limits(y = 0)
p3<-ggplot(as_tibble(read.table(divergence_file, sep="\t", header=TRUE)) %>% mutate(SampleID=paste0(sample,"-",its)) %>% filter(tool == "DADA2_FROGS") %>% mutate(its=replace(its, its=="D1D2", "D1/D2")) %>% mutate(sample=str_replace(sample,"ADN","DNA")), aes(x=sample, color=data, y=False.Positive, fill=SampleID)) + facet_grid(~its)+
geom_boxplot() + guides(fill = FALSE) +
scale_y_continuous(expand = expansion(0, 0)) +
scale_x_discrete(expand = expansion(0, 0)) +
theme(axis.title.x=element_blank(),axis.text.x=element_blank(),axis.ticks.x=element_blank(), legend.position = "none") #+
#expand_limits(y = 0)
p4<-ggplot(as_tibble(read.table(divergence_file, sep="\t", header=TRUE)) %>% mutate(SampleID=paste0(sample,"-",its)) %>% filter(tool == "DADA2_FROGS") %>% mutate(its=replace(its, its=="D1D2", "D1/D2")) %>% mutate(sample=str_replace(sample,"ADN","DNA")), aes(x=sample, color=data, y=True.Positive, fill=SampleID)) + facet_grid(~its)+
geom_boxplot() + guides(fill = FALSE) +
scale_y_continuous(expand = expansion(0, 0)) +
scale_x_discrete(expand = expansion(0, 0)) +
theme(axis.title.x=element_blank(),axis.text.x=element_blank(),axis.ticks.x=element_blank(), legend.position = "none") #+
#expand_limits(y = 0)
p <- ggarrange(p2, p3, p4, ncol=3, nrow=1, common.legend = TRUE, legend="right")
p <- annotate_figure(p,
top = text_grob("Metrics at species level")
)
return(p)
}
plot_precision_and_recall <- function(dataset, divergence_file){
infos <- as_tibble(read.table(divergence_file, sep="\t", header=TRUE)) %>% mutate(SampleID=paste0(sample,"-",its)) %>% filter(tool == "DADA2_FROGS") %>% mutate(its=replace(its, its=="D1D2", "D1/D2")) %>% mutate(sample=str_replace(sample,"ADN","DNA")) %>% mutate(data=str_replace(data,"ADN","DNA")) %>% mutate(Precision = (True.Positive/(True.Positive + False.Positive))) %>% mutate(Recall.Rate = (True.Positive/(True.Positive + False.Negative)))
p1<-ggplot(infos, aes(x=sample, color=data, y=Recall.Rate, fill=SampleID)) + facet_grid(~its)+
geom_boxplot() + guides(fill = FALSE) + theme(plot.title = element_text(hjust = 0.5), plot.subtitle = element_text(hjust = 0.5), legend.position = "none", axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) #+
#theme(axis.title.x=element_blank(),axis.text.x=element_blank(),axis.ticks.x=element_blank()) #+
#expand_limits(y = 0)
p2<-ggplot(infos, aes(x=sample, color=data, y=Precision, fill=SampleID)) + facet_grid(~its)+
geom_boxplot() + theme(plot.title = element_text(hjust = 0.5), plot.subtitle = element_text(hjust = 0.5), legend.position = "none", axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))# +
#theme(axis.title.x=element_blank(),axis.text.x=element_blank(),axis.ticks.x=element_blank()) #+
#expand_limits(y = 0)
return(ggarrange(p1, p2, ncol=2, nrow=1, common.legend = TRUE, legend="right"))
}
metabarfood <- TRUE
amplicons <- c("ITS1","ITS2", "D1D2", "RPB2")
types <- c("ADN","PCR")
absent_color <- "black"
absent_color <- "#F6CFB8"
partially_color <- "#b1c5c4"
partially_color <- "#DAF7A6"
perfect_color <- "#cce5ff"
perfect_color <- "#86a5a4"
perfect_color <- "#cce5ff"
tools <- c("FROGS", "DADA2_FROGS", "USEARCH", "DADA2-pe", "DADA2-se", "QIIME-se", "QIIME-pe")
dedicated_palette <- c("#cccc00", "#8eadac", "#9a9a9a", "#56B4E9", "#8a6479", "#8a8964", "#ad8e9f", "#adac8e")
method_palette <- setNames(
object = dedicated_palette,
nm = c("EXPECTED", "FROGS", "DADA2_FROGS", "USEARCH", "DADA2-se", "QIIME-se", "DADA2-pe", "QIIME-pe")
)
dedicated_palette_wo_real <- c("#8eadac", "#9a9a9a", "#56B4E9", "#8a6479", "#8a8964", "#ad8e9f", "#adac8e")
method_palette_wo_real <- setNames(
object = dedicated_palette_wo_real,
nm = c("FROGS", "DADA2_FROGS", "USEARCH", "DADA2-se", "QIIME-se", "DADA2-pe", "QIIME-pe")
)
This table shows, for each of the 118 strains, its taxonomy, the sequences of the four markers used in this study, and its presence or absence in each of the four ecosystems.
ggd <- read.delim("REFERENCES/118sp_wo_droso.tsv", sep="\t", header=T, stringsAsFactors = F)
datatable(ggd, extensions='Buttons', options= list(dom='Bfrtip',buttons = c('excel','csv')))
These results aggregate the four ecosystems (bread, cheese, fermented meat and wine).
a <- read.csv2("data/divergence_all_ecos.tsv", sep="\t", stringsAsFactors = F, dec = ".", header = T) %>% as_tibble()
a$Recall.rate <- as.numeric(a$Recall.rate)
a$Precision <- as.numeric(a$Precision)
a$Divergence <- as.numeric(a$Divergence)
means <- aggregate(Recall.rate ~ tool, a, mean)
means$Recall.rate = as.numeric(format(round(means$Recall.rate, 2),nsmall=2))
a$tool <- factor(a$tool, levels=c("FROGS",
"DADA2_FROGS",
"USEARCH",
"DADA2-se",
"QIIME-se",
"DADA2-pe",
"QIIME-pe"))
# RECALL RATE
p1 <- a %>%
ggplot(aes(tool,Recall.rate, fill=tool)) +
geom_boxplot(outlier.size=0.3) +
scale_fill_manual(values=dedicated_palette_wo_real) +
geom_jitter(shape=16, position=position_jitter(0.2), size=0.5) +
theme(axis.title.x=element_blank(),axis.text.x=element_blank(),axis.ticks.x=element_blank())
p1 <- p1 +labs(title="A. Recall rate",x="",y="")
# PRECISION
means <- aggregate(Precision ~ tool, a, mean)
means$Precision = as.numeric(format(round(means$Precision, 2),nsmall=2))
p2 <- a %>%
ggplot(aes(tool,Precision, fill=tool)) +
geom_boxplot(outlier.size=0.3) +
scale_fill_manual(values=dedicated_palette_wo_real) +
geom_jitter(shape=16, position=position_jitter(0.2), size=0.5) +
theme(axis.title.x=element_blank(),axis.text.x=element_blank(),axis.ticks.x=element_blank())
p2 <- p2 +labs(title="B. Precision",x="",y="")
# DIVERGENCE
means <- aggregate(Divergence ~ tool, a, mean)
means$Divergence = as.numeric(format(round(means$Divergence, 2),nsmall=2))
p3 <- a %>%
ggplot(aes(tool,Divergence, fill=tool)) +
geom_boxplot(outlier.size=0.3) +
scale_fill_manual(values=dedicated_palette_wo_real) +
geom_jitter(shape=16, position=position_jitter(0.2), size=0.5) +
theme(axis.title.x=element_blank(),axis.text.x=element_blank(),axis.ticks.x=element_blank())
p3 <- p3 +labs(title="C. Divergence",x="",y="")
b <- read.csv2("data/PR_all_ecos.tsv", sep="\t", stringsAsFactors = F, dec = ".", header = T) %>% as_tibble()
b$Expected <- as.numeric(b$Expected)
b$Found <- as.numeric(b$Found)
b$Perfect <- as.numeric(b$Perfect)
b <- b %>% mutate(Percent.Found = Found/Expected*100)
b <- b %>% mutate(Percent.Perfect = Perfect/Expected*100)
b$tool <- factor(b$tool, levels=c("FROGS",
"DADA2_FROGS",
"USEARCH",
"DADA2-se",
"QIIME-se",
"DADA2-pe",
"QIIME-pe"))
means <- aggregate(Percent.Found ~ tool, b, mean)
means$Percent.Found = as.numeric(format(round(means$Percent.Found, 2),nsmall=2))
p <- b %>%
ggplot(aes(tool,Percent.Found, fill=tool)) +
geom_boxplot(outlier.size=0.3) +
scale_fill_manual(values=dedicated_palette_wo_real) +
geom_jitter(shape=16, position=position_jitter(0.2), size=0.5) +
theme(legend.position = "none")
p +labs(title="Proportion of species detected",
y ="", x = "")
#a <- a %>% filter(its == "ITS1" | its == "ITS2")
means <- aggregate(Percent.Perfect ~ tool, b, mean)
means$Percent.Perfect = as.numeric(format(round(means$Percent.Perfect, 2),nsmall=2))
p4 <- b %>%
ggplot(aes(tool,Percent.Perfect, fill=tool)) +
geom_boxplot(outlier.size=0.3) +
scale_fill_manual(values=dedicated_palette_wo_real) +
geom_jitter(shape=16, position=position_jitter(0.2), size=0.5) +
theme(axis.title.x=element_blank(),axis.text.x=element_blank(),axis.ticks.x=element_blank())
p4 <- p4 +labs(title="D. % of perfectly constructed sequences",
x ="", y = "")
b <- b %>% mutate(Percent.Perfect2 = Perfect/Found*100)
means <- aggregate(Percent.Perfect2 ~ tool, b, mean)
means$Percent.Perfect2 = as.numeric(format(round(means$Percent.Perfect2, 2),nsmall=2))
b <- b %>% replace(is.na(.), 0)
p <- b %>%
ggplot(aes(tool,Percent.Perfect2, fill=tool)) +
#geom_violin(scale = "count") +
geom_boxplot(outlier.size=0.3) +
scale_fill_manual(values=dedicated_palette_wo_real) +
geom_jitter(shape=16, position=position_jitter(0.2), size=0.5) +
theme(legend.position = "none")
p +labs(title="Proportion of perfectly constructed ASVs/OTUs/ZOTUs",
x ="tools", y = "")
ggarrange(p1, p2, p3, p4, ncol=2, nrow=2, common.legend = TRUE, legend="right")
DADA2_FROGS appears to be the most accurate method! This solution was chosen for further analyses.
sp <- "MEAT"
divergence_file <- file.path(sp,"divergence.tsv")
for(its in amplicons){
for(type in types){
#path <- file.path("../benchmark_real",sp,its,type)
path <- file.path(sp,its,type)
infos_file <- file.path(path,paste0(sp,"_",its,"_",type,"_infos.rds"))
if(its=="D1D2"){
if(type=="ADN"){
cat(" \n####",sp, " ", "D1/D2", " ", "DNA", " \n\n")
}else{
cat(" \n####",sp, " ", "D1/D2", " ", type, " \n\n")
}
}else{
if(type=="ADN"){
cat(" \n####",sp, " ", its, " ", "DNA", " \n\n")
}
else{
cat(" \n####",sp, " ", its, " ", type, " \n\n")
}
}
if(!(file.exists(infos_file))){
dada2_se_infos <- read.csv2(file.path(path,"dada2_se.tsv"), sep="\t", header=TRUE) %>% as_tibble() %>% mutate(Length = str_length(seed_sequence), blast_taxonomy = X.blast_taxonomy)
dada2_pe_infos <- read.csv2(file.path(path,"dada2.tsv"), sep="\t", header=TRUE) %>% as_tibble() %>% mutate(Length = str_length(seed_sequence), blast_taxonomy = X.blast_taxonomy)
frogs_infos <- read.csv2(file.path(path,"frogs.tsv"), sep="\t", header=TRUE) %>% as_tibble() %>% mutate(Length = str_length(seed_sequence))
dada2_frogs_infos <- read.csv2(file.path(path,"dada2_frogs.tsv"), sep="\t", header=TRUE) %>% as_tibble() %>% mutate(Length = str_length(seed_sequence))
frogs_multiaff_infos <- readr::read_tsv(file = file.path(sp,its,type,"multiaff.tsv")) %>% as_tibble() %>% separate(blast_taxonomy, ";(?=[\\S])", into = c("Kingdom","Phylum","Class","Order","Family","Genus","Species"), remove=FALSE)
dada2_frogs_multiaff_infos <- readr::read_tsv(file = file.path(sp,its,type,"dada2_frogs_multiaff.tsv")) %>% as_tibble() %>% separate(blast_taxonomy, ";(?=[\\S])", into = c("Kingdom","Phylum","Class","Order","Family","Genus","Species"), remove=FALSE)
if(file.exists(file.path(path,"se-qiime.tsv"))){
qiime_se_infos <- read.csv2(file.path(path,"se-qiime.tsv"), sep="\t", header=TRUE, stringsAsFactors = FALSE) %>% as_tibble() %>% mutate(Length = str_length(X.seed_sequence), blast_taxonomy = Taxon, observation_sum = as.integer(observation_sum))
}
if(file.exists(file.path(path,"pe-qiime.tsv"))){
qiime_pe_infos <- read.csv2(file.path(path,"pe-qiime.tsv"), sep="\t", header=TRUE, stringsAsFactors = FALSE) %>% as_tibble() %>% mutate(Length = str_length(X.seed_sequence), blast_taxonomy = Taxon, observation_sum = as.integer(observation_sum))
}
if(file.exists(file.path(path,"usearch.tsv"))){
usearch_infos <- read.csv2(file.path(path,"usearch.tsv"), sep="\t", header=TRUE, stringsAsFactors = FALSE) %>% as_tibble() %>% mutate(Length = str_length(X.seed_sequence), blast_taxonomy = taxonomy, observation_sum = as.integer(observation_sum))
}
if(its == "ITS1"){
species <- real_infos_dict$ITS1$Taxonomy
nb_species <- length(species)
}else if(its == "ITS2"){
species <- real_infos_dict$ITS2$Taxonomy
nb_species <- length(species)
}else if(its == "D1D2"){
species <- real_infos_dict$D1D2$Taxonomy
nb_species <- length(species)
}else if(its == "RPB2"){
species <- real_infos_dict$RPB2$Taxonomy
nb_species <- length(species)
}
data <- tibble(richeness = c(
nb_species,
nrow(frogs_infos),
nrow(dada2_frogs_infos),
nrow(usearch_infos),
nrow(dada2_se_infos),
nrow(qiime_se_infos),
nrow(dada2_pe_infos),
nrow(qiime_pe_infos)
),
tool = c("EXPECTED",
"FROGS",
"DADA2_FROGS",
"USEARCH",
"DADA2_SE",
"QIIME_SE",
"DADA2_PE",
"QIIME_PE"),
reads = c(
metadata_infos %>% filter(Ecosystem == sp) %>% filter(Marker == its) %>% filter(str_detect(Sample, paste("^",type,sep=""))) %>% select(Reads) %>% sum,
frogs_infos %>% select(observation_sum) %>% sum(),
dada2_frogs_infos %>% select(observation_sum) %>% sum(),
usearch_infos %>% select(observation_sum) %>% sum(),
dada2_se_infos %>% select(observation_sum) %>% sum(),
qiime_se_infos %>% select(observation_sum) %>% sum(),
dada2_pe_infos %>% select(observation_sum) %>% sum(),
qiime_pe_infos %>% select(observation_sum) %>% sum()
))
data$tool <- factor(data$tool, levels=c("EXPECTED",
"FROGS",
"DADA2_FROGS",
"USEARCH",
"DADA2_SE",
"QIIME_SE",
"DADA2_PE",
"QIIME_PE"))
saveRDS(data, infos_file)
}
else{
data <- readRDS(infos_file)
}
print(plot_depth(data))
cat("\n")
}
}
for(its in amplicons){
for(type in types){
if(its=="D1D2"){
if(type=="ADN"){
cat(" \n####",sp, " ", "D1/D2", " ", "DNA", " \n\n")
}else{
cat(" \n####",sp, " ", "D1/D2", " ", type, " \n\n")
}
}else{
if(type=="ADN"){
cat(" \n####",sp, " ", its, " ", "DNA", " \n\n")
}
else{
cat(" \n####",sp, " ", its, " ", type, " \n\n")
}
}
#path <- file.path("../benchmark_real",sp,its,type)
path <- file.path(sp,its,type)
infos_file <- file.path(path,paste0(sp,"_",its,"_",type,"_infos.rds"))
data <- readRDS(infos_file)
cat("\n")
cat("\n")
print(plot_richeness_graph(data))
cat("\n")
}
}
The results obtained with each tool were compared to the expected composition, and several metrics were calculated; affiliations and their associated abundances are taken into account.
The metrics are the numbers of True Positives, False Positives and False Negatives, and the divergence (%), all at species level.
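The comparison itself is implemented in functions.R (plot_metrics_real and related helpers). The sketch below only illustrates the general idea of counting TP/FP/FN from a vector of expected species and the species affiliated by one tool; the actual implementation may differ (it also accounts for abundances), and the helper name compute_counts is hypothetical.
# Illustrative sketch only; the real metric computation lives in functions.R
compute_counts <- function(expected_species, observed_species) {
  obs <- unique(observed_species)
  tp <- sum(obs %in% expected_species)       # expected species recovered by the tool
  fp <- sum(!(obs %in% expected_species))    # affiliations not present in the mock
  fn <- sum(!(expected_species %in% obs))    # expected species missed by the tool
  c(True.Positive = tp, False.Positive = fp, False.Negative = fn)
}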
for(its in amplicons){
for(type in types){
if(its=="D1D2"){
if(type=="ADN"){
cat(" \n####",sp, " ", "D1/D2", " ", "DNA", " \n\n")
}else{
cat(" \n####",sp, " ", "D1/D2", " ", type, " \n\n")
}
}else{
if(type=="ADN"){
cat(" \n####",sp, " ", its, " ", "DNA", " \n\n")
}
else{
cat(" \n####",sp, " ", its, " ", type, " \n\n")
}
}
print(plot_divergence_real(type, its, divergence_file))
cat(" \n")
cat(" \n")
print(plot_metrics_real(type, its, divergence_file))
cat(" \n")
print(plot_precision_and_recall_real(type, its, divergence_file))
cat(" \n")
}
}
cat(" \n###"," Comparison of markers and types for DADA2_FROGS", " \n")
print(plot_divergence(type, divergence_file) + facet_grid(~its))
cat(" \n")
print(plot_metrics(type, divergence_file))
cat(" \n")
print(plot_precision_and_recall(type, divergence_file))
cat(" \n")
Tools are clustered using the Canberra distance.
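For reference, this kind of clustering can be reproduced on a species-by-tool abundance matrix with base R. The matrix, species names and tool names below are illustrative placeholders:
# Illustrative sketch: hierarchical clustering of tools with the Canberra distance
example_mat <- matrix(runif(40), nrow = 10,
                      dimnames = list(paste0("sp", 1:10),
                                      c("EXPECTED", "FROGS", "DADA2_FROGS", "USEARCH")))
tool_dist <- dist(t(example_mat), method = "canberra") # distance between tools (columns)
plot(hclust(tool_dist))                                # dendrogram of tools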
for(its in amplicons){
for(type in types){
#path <- file.path("../benchmark_real",sp,its,type)
path <- file.path(sp,its,type)
mat_file <- file.path(path,paste0(sp,"_",its,"_",type,"_abundmat.rds"))
if(its=="D1D2"){
if(type=="ADN"){
cat(" \n####",sp, " ", "D1/D2", " ", "DNA", " \n\n")
}else{
cat(" \n####",sp, " ", "D1/D2", " ", type, " \n\n")
}
}else{
if(type=="ADN"){
cat(" \n####",sp, " ", its, " ", "DNA", " \n\n")
}
else{
cat(" \n####",sp, " ", its, " ", type, " \n\n")
}
}
if(!file.exists(mat_file)){
if(its == "ITS1"){
species <- real_infos_dict$ITS1$Taxonomy
}else if(its == "ITS2"){
species <- real_infos_dict$ITS2$Taxonomy
}else if(its == "D1D2"){
species <- real_infos_dict$D1D2$Taxonomy
}else if(its == "RPB2"){
species <- real_infos_dict$RPB2$Taxonomy
}
nb_expected_species <- length(species)
species <- c(species,"Others")
mat = matrix(nrow = nb_expected_species+1, ncol = 8)
rownames(mat) <- species
colnames(mat) <- c(tools,"EXPECTED")
mat[,"EXPECTED"] <- c(rep(1/nb_expected_species,times=nb_expected_species),0)
mat[is.na(mat)] <- 0
dada2_se_infos <- read.csv2(file.path(path,"dada2_se.tsv"), sep="\t", header=TRUE) %>% as_tibble() %>% mutate(Length = str_length(seed_sequence), blast_taxonomy = X.blast_taxonomy)
dada2_pe_infos <- read.csv2(file.path(path,"dada2.tsv"), sep="\t", header=TRUE) %>% as_tibble() %>% mutate(Length = str_length(seed_sequence), blast_taxonomy = X.blast_taxonomy)
frogs_infos <- read.csv2(file.path(path,"frogs.tsv"), sep="\t", header=TRUE) %>% as_tibble() %>% mutate(Length = str_length(seed_sequence))
dada2_frogs_infos <- read.csv2(file.path(path,"dada2_frogs.tsv"), sep="\t", header=TRUE) %>% as_tibble() %>% mutate(Length = str_length(seed_sequence))
frogs_multiaff_infos <- readr::read_tsv(file = file.path(sp,its,type,"multiaff.tsv")) %>% as_tibble() %>% separate(blast_taxonomy, ";(?=[\\S])", into = c("Kingdom","Phylum","Class","Order","Family","Genus","Species"), remove=FALSE)
dada2_frogs_multiaff_infos <- readr::read_tsv(file = file.path(sp,its,type,"dada2_frogs_multiaff.tsv")) %>% as_tibble() %>% separate(blast_taxonomy, ";(?=[\\S])", into = c("Kingdom","Phylum","Class","Order","Family","Genus","Species"), remove=FALSE)
mat <- fill_abundance_matrix(mat, frogs_infos, "FROGS", multiaff_infos=frogs_multiaff_infos)
mat <- fill_abundance_matrix(mat, dada2_se_infos, "DADA2-se")
mat <- fill_abundance_matrix(mat, dada2_pe_infos, "DADA2-pe")
mat <- fill_abundance_matrix(mat, dada2_frogs_infos, "DADA2_FROGS", multiaff_infos=dada2_frogs_multiaff_infos)
if(file.exists(file.path(path,"se-qiime.tsv"))){
qiime_se_infos <- read.csv2(file.path(path,"se-qiime.tsv"), sep="\t", header=TRUE, stringsAsFactors = FALSE) %>% as_tibble() %>% mutate(Length = str_length(X.seed_sequence), blast_taxonomy = Taxon, observation_sum = as.integer(observation_sum))
mat <- fill_abundance_matrix(mat, qiime_se_infos, "QIIME-se")
}
if(file.exists(file.path(path,"pe-qiime.tsv"))){
qiime_pe_infos <- read.csv2(file.path(path,"pe-qiime.tsv"), sep="\t", header=TRUE, stringsAsFactors = FALSE) %>% as_tibble() %>% mutate(Length = str_length(X.seed_sequence), blast_taxonomy = Taxon, observation_sum = as.integer(observation_sum))
mat <- fill_abundance_matrix(mat, qiime_pe_infos, "QIIME-pe")
}
if(file.exists(file.path(path,"usearch.tsv"))){
usearch_infos <- read.csv2(file.path(path,"usearch.tsv"), sep="\t", header=TRUE, stringsAsFactors = FALSE) %>% as_tibble() %>% mutate(Length = str_length(X.seed_sequence), blast_taxonomy = taxonomy, observation_sum = as.integer(observation_sum))
mat <- fill_abundance_matrix(mat, usearch_infos, "USEARCH")
}
rownames(mat) <- rownames(mat) %>% as_tibble() %>% separate(value, ";(?=[\\S])", into = c("Kingdom","Phylum","Class","Order","Family","Genus","Species"), remove=FALSE) %>% select(Species) %>% pull()
rownames(mat)[nrow(mat)] <- "Others"
saveRDS(mat, mat_file)
}else{
mat <- readRDS(mat_file)
}
if(!metabarfood){
mat <- subset( mat, select = -DADA2_FROGS )
}
l <- build_heatmap_abundance_real(mat)
h <- l[[1]]
mat <- l[[2]]
lgd_list <- l[[3]]
draw(h, annotation_legend_list = lgd_list)
cat(" \n")
}
}
for(its in amplicons){
for(type in types){
#path <- file.path("../benchmark_real",sp,its,type)
path <- file.path(sp,its,type)
mat_file <- file.path(path,paste0(sp,"_",its,"_",type,"_presencemat.rds"))
if(its=="D1D2"){
if(type=="ADN"){
cat(" \n####",sp, " ", "D1/D2", " ", "DNA", " \n\n")
}else{
cat(" \n####",sp, " ", "D1/D2", " ", type, " \n\n")
}
}else{
if(type=="ADN"){
cat(" \n####",sp, " ", its, " ", "DNA", " \n\n")
}
else{
cat(" \n####",sp, " ", its, " ", type, " \n\n")
}
}
if(!file.exists(mat_file)){
if(its == "ITS1"){
species <- real_infos_dict$ITS1$Taxonomy
real_infos <- real_infos_dict$ITS1
}else if(its == "ITS2"){
species <- real_infos_dict$ITS2$Taxonomy
real_infos <- real_infos_dict$ITS2
}else if(its == "D1D2"){
species <- real_infos_dict$D1D2$Taxonomy
real_infos <- real_infos_dict$D1D2
}else if(its == "RPB2"){
species <- real_infos_dict$RPB2$Taxonomy
real_infos <- real_infos_dict$RPB2
}
}
if(!file.exists(mat_file)){
nb_expected_species <- length(species)
mat = matrix(nrow = nb_expected_species, ncol = 8)
rownames(mat) <- species
colnames(mat) <- c("EXPECTED","FROGS","DADA2_FROGS","USEARCH","DADA2-se","DADA2-pe","QIIME-se","QIIME-pe")
mat[,"EXPECTED"] <- 1
mat[is.na(mat)] <- 0
dada2_se_infos <- read.csv2(file.path(path,"dada2_se.tsv"), sep="\t", header=TRUE, stringsAsFactors = FALSE) %>% as_tibble() %>% mutate(Length = str_length(seed_sequence), blast_taxonomy = X.blast_taxonomy)
dada2_pe_infos <- read.csv2(file.path(path,"dada2.tsv"), sep="\t", header=TRUE) %>% as_tibble() %>% mutate(Length = str_length(seed_sequence), blast_taxonomy = X.blast_taxonomy)
frogs_infos <- read.csv2(file.path(path,"frogs.tsv"), sep="\t", header=TRUE, stringsAsFactors = FALSE) %>% as_tibble() %>% mutate(Length = str_length(seed_sequence))
dada2_frogs_infos <- read.csv2(file.path(path,"dada2_frogs.tsv"), sep="\t", header=TRUE, stringsAsFactors = FALSE) %>% as_tibble() %>% mutate(Length = str_length(seed_sequence))
frogs_multiaff_infos <- readr::read_tsv(file = file.path(sp,its,type,"multiaff.tsv")) %>% as_tibble() %>% separate(blast_taxonomy, ";(?=[\\S])", into = c("Kingdom","Phylum","Class","Order","Family","Genus","Species"), remove=FALSE) %>% mutate(observation_name = `#observation_name`)
dada2_frogs_multiaff_infos <- readr::read_tsv(file = file.path(sp,its,type,"dada2_frogs_multiaff.tsv")) %>% as_tibble() %>% separate(blast_taxonomy, ";(?=[\\S])", into = c("Kingdom","Phylum","Class","Order","Family","Genus","Species"), remove=FALSE) %>% mutate(observation_name = `#observation_name`)
mat <- fill_presence_matrix(mat, real_infos, frogs_infos, "FROGS", multiaff_infos=frogs_multiaff_infos)
mat <- fill_presence_matrix(mat, real_infos, dada2_se_infos, "DADA2-se")
mat <- fill_presence_matrix(mat, real_infos, dada2_pe_infos, "DADA2-pe")
mat <- fill_presence_matrix(mat, real_infos, dada2_frogs_infos, "DADA2_FROGS", multiaff_infos=dada2_frogs_multiaff_infos)
if(file.exists(file.path(path,"se-qiime.tsv"))){
qiime_se_infos <- read.csv2(file.path(path,"se-qiime.tsv"), sep="\t", header=TRUE, stringsAsFactors = FALSE) %>% as_tibble() %>% mutate(Length = str_length(X.seed_sequence), blast_taxonomy = Taxon, observation_sum = as.integer(observation_sum), seed_sequence = X.seed_sequence)
mat <- fill_presence_matrix(mat, real_infos, qiime_se_infos, "QIIME-se")
}
if(file.exists(file.path(path,"pe-qiime.tsv"))){
qiime_pe_infos <- read.csv2(file.path(path,"pe-qiime.tsv"), sep="\t", header=TRUE, stringsAsFactors = FALSE) %>% as_tibble() %>% mutate(Length = str_length(X.seed_sequence), blast_taxonomy = Taxon, observation_sum = as.integer(observation_sum), seed_sequence = X.seed_sequence)
mat <- fill_presence_matrix(mat, real_infos, qiime_pe_infos, "QIIME-pe")
}
if(file.exists(file.path(path,"usearch.tsv"))){
usearch_infos <- read.csv2(file.path(path,"usearch.tsv"), sep="\t", header=TRUE, stringsAsFactors = FALSE) %>% as_tibble() %>% mutate(Length = str_length(X.seed_sequence), blast_taxonomy = taxonomy, observation_sum = as.integer(observation_sum), seed_sequence = X.seed_sequence)
mat <- fill_presence_matrix(mat, real_infos, usearch_infos, "USEARCH")
}
rownames(mat) <- rownames(mat) %>% as_tibble() %>% separate(value, ";(?=[\\S])", into = c("Kingdom","Phylum","Class","Order","Family","Genus","Species"), remove=FALSE) %>% select(Species) %>% pull()
saveRDS(mat, mat_file)
}else{
mat <- readRDS(mat_file)
}
l <- build_heatmap_presence_real(mat)
tt <- summarise_all(as_tibble(ceiling(mat)),sum)
table <- tt %>% kable(caption = "Count of OTU/ASV/ZOTU detected") %>% kable_styling("striped", full_width = F)
cat(" \n")
print(table)
cat(" \n\n\n")
tt <- summarise_all(as_tibble(floor(mat)),sum)
table <- tt %>% kable(caption = "Count of OTU/ASV/ZOTU perfectly reconstructed") %>% kable_styling("striped", full_width = F)
cat(" \n")
print(table)
cat(" \n\n\n")
h <- l[[1]]
mat <- l[[2]]
draw(h)
cat(" \n")
cat(" \n")
}
}
Count of OTU/ASV/ZOTU detected — MEAT (40 expected species; one row per marker and sample type)

Marker | Type | EXPECTED | FROGS | DADA2_FROGS | USEARCH | DADA2-se | DADA2-pe | QIIME-se | QIIME-pe |
---|---|---|---|---|---|---|---|---|---|
ITS1 | DNA | 40 | 34 | 35 | 24 | 32 | 30 | 28 | 28 |
ITS1 | PCR | 40 | 38 | 38 | 29 | 36 | 34 | 31 | 31 |
ITS2 | DNA | 40 | 35 | 36 | 25 | 29 | 28 | 25 | 25 |
ITS2 | PCR | 40 | 37 | 36 | 25 | 31 | 30 | 25 | 25 |
D1/D2 | DNA | 40 | 34 | 37 | 23 | 23 | 3 | 24 | 1 |
D1/D2 | PCR | 40 | 35 | 37 | 23 | 24 | 4 | 24 | 1 |
RPB2 | DNA | 40 | 29 | 32 | 0 | 34 | 11 | 33 | 4 |
RPB2 | PCR | 40 | 33 | 33 | 0 | 34 | 4 | 33 | 7 |

Count of OTU/ASV/ZOTU perfectly reconstructed — MEAT (40 expected species; one row per marker and sample type)

Marker | Type | EXPECTED | FROGS | DADA2_FROGS | USEARCH | DADA2-se | DADA2-pe | QIIME-se | QIIME-pe |
---|---|---|---|---|---|---|---|---|---|
ITS1 | DNA | 40 | 26 | 30 | 23 | 7 | 25 | 1 | 27 |
ITS1 | PCR | 40 | 26 | 33 | 26 | 7 | 28 | 1 | 28 |
ITS2 | DNA | 40 | 25 | 29 | 25 | 4 | 28 | 0 | 24 |
ITS2 | PCR | 40 | 24 | 30 | 25 | 4 | 30 | 0 | 24 |
D1/D2 | DNA | 40 | 27 | 32 | 0 | 0 | 0 | 0 | 0 |
D1/D2 | PCR | 40 | 28 | 33 | 0 | 0 | 0 | 0 | 0 |
RPB2 | DNA | 40 | 13 | 12 | 0 | 0 | 0 | 0 | 0 |
RPB2 | PCR | 40 | 15 | 14 | 0 | 0 | 0 | 0 | 0 |
for(tool in tools){
cat(" \n####",tool, " \n\n")
htmp_list <- c()
htmp_list_presence <- c()
min <- 0
max <- 0
for(its in amplicons){
for(type in types){
path <- file.path(sp,its,type)
mat_file <- file.path(path,paste0(sp,"_",its,"_",type,"_abundmat.rds"))
mat_presence_file <- file.path(path,paste0(sp,"_",its,"_",type,"_presencemat.rds"))
mat <- readRDS(mat_file)
mat_presence <- readRDS(mat_presence_file)
submat <- mat %>% as_tibble() %>% select(tool) %>% as.matrix()
submat_presence <- mat_presence %>% as_tibble() %>% select(tool) %>% as.matrix()
rownames(submat) <- rownames(mat)
rownames(submat_presence) <- rownames(mat_presence)
submat[submat==0] <- NA
#submat_presence[submat_presence==0] <- NA
max <- max(max,max(submat,na.rm = TRUE))
colnames(submat) <- paste0(its," ",type)
colnames(submat_presence) <- paste0(its," ",type)
ht <- build_heatmap_one_tool(submat, min, max)
ht_presence <- build_heatmap_one_tool_presence(submat_presence, 0, 1)
htmp_list <- c(htmp_list,ht)
htmp_list_presence <- c(htmp_list_presence,ht_presence)
}
}
l <- c()
for(i in htmp_list){
l <- l + i
}
l_presence <- c()
for(i in htmp_list_presence){
l_presence <- l_presence + i
}
draw(l, row_title = "Expected species",
column_title_gp = gpar(fontsize = 16), ht_gap = unit(0.1, "cm"), annotation_legend_list = list(
Legend(at = 1, title = "", labels=c("Absent"), legend_gp = gpar(fill=absent_color))
))
cat(" \n")
# draw(l_presence, row_title = "Expected species",
# column_title_gp = gpar(fontsize = 16), ht_gap = unit(0.1, "cm"), annotation_legend_list = list(
# Legend(at = 1, title = "", labels=c("Absent"), legend_gp = gpar(fill=absent_color)),
# Legend(at = 2, title = "", labels=c("Not well reconstructed"), legend_gp = gpar(fill="#b1c5c4")),
# Legend(at = 3, title = "", labels=c("Perfect"), legend_gp = gpar(fill="#86a5a4"))
# )
# )
# cat(" \n")
}
sp <- "CHEESE"
divergence_file <- file.path(sp,"divergence.tsv")
for(its in amplicons){
for(type in types){
path <- file.path(sp,its,type)
infos_file <- file.path(path,paste0(sp,"_",its,"_",type,"_infos.rds"))
cat(" \n####",sp, " ", its, " ", type, " \n\n")
if(!(file.exists(infos_file))){
dada2_se_infos <- read.csv2(file.path(path,"dada2_se.tsv"), sep="\t", header=TRUE) %>% as_tibble() %>% mutate(Length = str_length(seed_sequence), blast_taxonomy = X.blast_taxonomy)
dada2_pe_infos <- read.csv2(file.path(path,"dada2.tsv"), sep="\t", header=TRUE) %>% as_tibble() %>% mutate(Length = str_length(seed_sequence), blast_taxonomy = X.blast_taxonomy)
frogs_infos <- read.csv2(file.path(path,"frogs.tsv"), sep="\t", header=TRUE) %>% as_tibble() %>% mutate(Length = str_length(seed_sequence))
dada2_frogs_infos <- read.csv2(file.path(path,"dada2_frogs.tsv"), sep="\t", header=TRUE) %>% as_tibble() %>% mutate(Length = str_length(seed_sequence))
frogs_multiaff_infos <- readr::read_tsv(file = file.path(sp,its,type,"multiaff.tsv")) %>% as_tibble() %>% separate(blast_taxonomy, ";(?=[\\S])", into = c("Kingdom","Phylum","Class","Order","Family","Genus","Species"), remove=FALSE)
dada2_frogs_multiaff_infos <- readr::read_tsv(file = file.path(sp,its,type,"dada2_frogs_multiaff.tsv")) %>% as_tibble() %>% separate(blast_taxonomy, ";(?=[\\S])", into = c("Kingdom","Phylum","Class","Order","Family","Genus","Species"), remove=FALSE)
if(file.exists(file.path(path,"se-qiime.tsv"))){
qiime_se_infos <- read.csv2(file.path(path,"se-qiime.tsv"), sep="\t", header=TRUE, stringsAsFactors = FALSE) %>% as_tibble() %>% mutate(Length = str_length(X.seed_sequence), blast_taxonomy = Taxon, observation_sum = as.integer(observation_sum))
}
if(file.exists(file.path(path,"pe-qiime.tsv"))){
qiime_pe_infos <- read.csv2(file.path(path,"pe-qiime.tsv"), sep="\t", header=TRUE, stringsAsFactors = FALSE) %>% as_tibble() %>% mutate(Length = str_length(X.seed_sequence), blast_taxonomy = Taxon, observation_sum = as.integer(observation_sum))
}
if(file.exists(file.path(path,"usearch.tsv"))){
usearch_infos <- read.csv2(file.path(path,"usearch.tsv"), sep="\t", header=TRUE, stringsAsFactors = FALSE) %>% as_tibble() %>% mutate(Length = str_length(X.seed_sequence), blast_taxonomy = taxonomy, observation_sum = as.integer(observation_sum))
}
if(its == "ITS1"){
species <- real_infos_dict$ITS1$Taxonomy
nb_species <- length(species)
}else if(its == "ITS2"){
species <- real_infos_dict$ITS2$Taxonomy
nb_species <- length(species)
}else if(its == "D1D2"){
species <- real_infos_dict$D1D2$Taxonomy
nb_species <- length(species)
}else if(its == "RPB2"){
species <- real_infos_dict$RPB2$Taxonomy
nb_species <- length(species)
}
data <- tibble(richeness = c(
nb_species,
nrow(frogs_infos),
nrow(dada2_frogs_infos),
nrow(usearch_infos),
nrow(dada2_se_infos),
nrow(qiime_se_infos),
nrow(dada2_pe_infos),
nrow(qiime_pe_infos)
),
tool = c("EXPECTED",
"FROGS",
"DADA2_FROGS",
"USEARCH",
"DADA2_SE",
"QIIME_SE",
"DADA2_PE",
"QIIME_PE"),
reads = c(
metadata_infos %>% filter(Ecosystem == sp) %>% filter(Marker == its) %>% filter(str_detect(Sample, paste("^",type,sep=""))) %>% select(Reads) %>% sum,
frogs_infos %>% select(observation_sum) %>% sum(),
dada2_frogs_infos %>% select(observation_sum) %>% sum(),
usearch_infos %>% select(observation_sum) %>% sum(),
dada2_se_infos %>% select(observation_sum) %>% sum(),
qiime_se_infos %>% select(observation_sum) %>% sum(),
dada2_pe_infos %>% select(observation_sum) %>% sum(),
qiime_pe_infos %>% select(observation_sum) %>% sum()
))
data$tool <- factor(data$tool, levels=c("EXPECTED",
"FROGS",
"DADA2_FROGS",
"USEARCH",
"DADA2_SE",
"QIIME_SE",
"DADA2_PE",
"QIIME_PE"))
saveRDS(data, infos_file)
}
else{
data <- readRDS(infos_file)
}
print(plot_depth(data))
cat("\n")
}
}
for(its in amplicons){
for(type in types){
cat(" \n####",sp, " ", its, " ", type, " \n\n")
path <- file.path(sp,its,type)
infos_file <- file.path(path,paste0(sp,"_",its,"_",type,"_infos.rds"))
data <- readRDS(infos_file)
cat("\n")
cat("\n")
print(plot_richeness_graph(data))
cat("\n")
}
}
The results obtained with each tool were compared to the expected composition, and several metrics were calculated; affiliations and their associated abundances are taken into account.
The metrics are the numbers of True Positives, False Positives and False Negatives, and the divergence (%), all at species level.
for(its in amplicons){
for(type in types){
cat(" \n####",sp, " ", its, " ", type, " \n")
print(plot_divergence_real(type, its, divergence_file))
cat(" \n")
cat(" \n")
print(plot_metrics_real(type, its, divergence_file))
cat(" \n")
print(plot_precision_and_recall_real(type, its, divergence_file))
cat(" \n")
}
}
cat(" \n###"," Comparison of markers and types for DADA2_FROGS", " \n")
print(plot_divergence(type, divergence_file) + facet_grid(~its))
cat(" \n")
print(plot_metrics(type, divergence_file))
cat(" \n")
print(plot_precision_and_recall(type, divergence_file))
cat(" \n")
Tools are clustered using the Canberra distance.
for(its in amplicons){
for(type in types){
path <- file.path(sp,its,type)
mat_file <- file.path(path,paste0(sp,"_",its,"_",type,"_abundmat.rds"))
cat(" \n####",sp, " ", its, " ", type, " \n\n")
if(!file.exists(mat_file)){
if(its == "ITS1"){
species <- real_infos_dict$ITS1$Taxonomy
}else if(its == "ITS2"){
species <- real_infos_dict$ITS2$Taxonomy
}else if(its == "D1D2"){
species <- real_infos_dict$D1D2$Taxonomy
}else if(its == "RPB2"){
species <- real_infos_dict$RPB2$Taxonomy
}
nb_expected_species <- length(species)
species <- c(species,"Others")
mat = matrix(nrow = nb_expected_species+1, ncol = 8)
rownames(mat) <- species
colnames(mat) <- c(tools,"EXPECTED")
mat[,"EXPECTED"] <- c(rep(1/nb_expected_species,times=nb_expected_species),0)
mat[is.na(mat)] <- 0
dada2_se_infos <- read.csv2(file.path(path,"dada2_se.tsv"), sep="\t", header=TRUE) %>% as_tibble() %>% mutate(Length = str_length(seed_sequence), blast_taxonomy = X.blast_taxonomy)
dada2_pe_infos <- read.csv2(file.path(path,"dada2.tsv"), sep="\t", header=TRUE) %>% as_tibble() %>% mutate(Length = str_length(seed_sequence), blast_taxonomy = X.blast_taxonomy)
frogs_infos <- read.csv2(file.path(path,"frogs.tsv"), sep="\t", header=TRUE) %>% as_tibble() %>% mutate(Length = str_length(seed_sequence))
dada2_frogs_infos <- read.csv2(file.path(path,"dada2_frogs.tsv"), sep="\t", header=TRUE) %>% as_tibble() %>% mutate(Length = str_length(seed_sequence))
frogs_multiaff_infos <- readr::read_tsv(file = file.path(sp,its,type,"multiaff.tsv")) %>% as_tibble() %>% separate(blast_taxonomy, ";(?=[\\S])", into = c("Kingdom","Phylum","Class","Order","Family","Genus","Species"), remove=FALSE)
dada2_frogs_multiaff_infos <- readr::read_tsv(file = file.path(sp,its,type,"dada2_frogs_multiaff.tsv")) %>% as_tibble() %>% separate(blast_taxonomy, ";(?=[\\S])", into = c("Kingdom","Phylum","Class","Order","Family","Genus","Species"), remove=FALSE)
mat <- fill_abundance_matrix(mat, frogs_infos, "FROGS", multiaff_infos=frogs_multiaff_infos)
mat <- fill_abundance_matrix(mat, dada2_se_infos, "DADA2-se")
mat <- fill_abundance_matrix(mat, dada2_pe_infos, "DADA2-pe")
mat <- fill_abundance_matrix(mat, dada2_frogs_infos, "DADA2_FROGS", multiaff_infos=dada2_frogs_multiaff_infos)
if(file.exists(file.path(path,"se-qiime.tsv"))){
qiime_se_infos <- read.csv2(file.path(path,"se-qiime.tsv"), sep="\t", header=TRUE, stringsAsFactors = FALSE) %>% as_tibble() %>% mutate(Length = str_length(X.seed_sequence), blast_taxonomy = Taxon, observation_sum = as.integer(observation_sum))
mat <- fill_abundance_matrix(mat, qiime_se_infos, "QIIME-se")
}
if(file.exists(file.path(path,"pe-qiime.tsv"))){
qiime_pe_infos <- read.csv2(file.path(path,"pe-qiime.tsv"), sep="\t", header=TRUE, stringsAsFactors = FALSE) %>% as_tibble() %>% mutate(Length = str_length(X.seed_sequence), blast_taxonomy = Taxon, observation_sum = as.integer(observation_sum))
mat <- fill_abundance_matrix(mat, qiime_pe_infos, "QIIME-pe")
}
if(file.exists(file.path(path,"usearch.tsv"))){
usearch_infos <- read.csv2(file.path(path,"usearch.tsv"), sep="\t", header=TRUE, stringsAsFactors = FALSE) %>% as_tibble() %>% mutate(Length = str_length(X.seed_sequence), blast_taxonomy = taxonomy, observation_sum = as.integer(observation_sum))
mat <- fill_abundance_matrix(mat, usearch_infos, "USEARCH")
}
rownames(mat) <- rownames(mat) %>% as_tibble() %>% separate(value, ";(?=[\\S])", into = c("Kingdom","Phylum","Class","Order","Family","Genus","Species"), remove=FALSE) %>% select(Species) %>% pull()
rownames(mat)[nrow(mat)] <- "Others"
saveRDS(mat, mat_file)
}else{
mat <- readRDS(mat_file)
}
if(!metabarfood){
mat <- subset( mat, select = -DADA2_FROGS )
}
l <- build_heatmap_abundance_real(mat)
h <- l[[1]]
mat <- l[[2]]
lgd_list <- l[[3]]
draw(h, annotation_legend_list = lgd_list)
cat(" \n")
}
}
for(its in amplicons){
for(type in types){
path <- file.path(sp,its,type)
mat_file <- file.path(path,paste0(sp,"_",its,"_",type,"_presencemat.rds"))
cat(" \n####",sp, " ", its, " ", type, " \n\n")
if(!file.exists(mat_file)){
if(its == "ITS1"){
species <- real_infos_dict$ITS1$Taxonomy
real_infos <- real_infos_dict$ITS1
}else if(its == "ITS2"){
species <- real_infos_dict$ITS2$Taxonomy
real_infos <- real_infos_dict$ITS2
}else if(its == "D1D2"){
species <- real_infos_dict$D1D2$Taxonomy
real_infos <- real_infos_dict$D1D2
}else if(its == "RPB2"){
species <- real_infos_dict$RPB2$Taxonomy
real_infos <- real_infos_dict$RPB2
}
}
if(!file.exists(mat_file)){
nb_expected_species <- length(species)
mat = matrix(nrow = nb_expected_species, ncol = 8)
rownames(mat) <- species
colnames(mat) <- c("EXPECTED","FROGS","DADA2_FROGS","USEARCH","DADA2-se","DADA2-pe","QIIME-se","QIIME-pe")
mat[,"EXPECTED"] <- 1
mat[is.na(mat)] <- 0
dada2_se_infos <- read.csv2(file.path(path,"dada2_se.tsv"), sep="\t", header=TRUE, stringsAsFactors = FALSE) %>% as_tibble() %>% mutate(Length = str_length(seed_sequence), blast_taxonomy = X.blast_taxonomy)
dada2_pe_infos <- read.csv2(file.path(path,"dada2.tsv"), sep="\t", header=TRUE) %>% as_tibble() %>% mutate(Length = str_length(seed_sequence), blast_taxonomy = X.blast_taxonomy)
frogs_infos <- read.csv2(file.path(path,"frogs.tsv"), sep="\t", header=TRUE, stringsAsFactors = FALSE) %>% as_tibble() %>% mutate(Length = str_length(seed_sequence))
dada2_frogs_infos <- read.csv2(file.path(path,"dada2_frogs.tsv"), sep="\t", header=TRUE, stringsAsFactors = FALSE) %>% as_tibble() %>% mutate(Length = str_length(seed_sequence))
frogs_multiaff_infos <- readr::read_tsv(file = file.path(sp,its,type,"multiaff.tsv")) %>% as_tibble() %>% separate(blast_taxonomy, ";(?=[\\S])", into = c("Kingdom","Phylum","Class","Order","Family","Genus","Species"), remove=FALSE) %>% mutate(observation_name = `#observation_name`)
dada2_frogs_multiaff_infos <- readr::read_tsv(file = file.path(sp,its,type,"dada2_frogs_multiaff.tsv")) %>% as_tibble() %>% separate(blast_taxonomy, ";(?=[\\S])", into = c("Kingdom","Phylum","Class","Order","Family","Genus","Species"), remove=FALSE) %>% mutate(observation_name = `#observation_name`)
mat <- fill_presence_matrix(mat, real_infos, frogs_infos, "FROGS", multiaff_infos=frogs_multiaff_infos)
mat <- fill_presence_matrix(mat, real_infos, dada2_se_infos, "DADA2-se")
mat <- fill_presence_matrix(mat, real_infos, dada2_pe_infos, "DADA2-pe")
mat <- fill_presence_matrix(mat, real_infos, dada2_frogs_infos, "DADA2_FROGS", multiaff_infos=dada2_frogs_multiaff_infos)
if(file.exists(file.path(path,"se-qiime.tsv"))){
qiime_se_infos <- read.csv2(file.path(path,"se-qiime.tsv"), sep="\t", header=TRUE, stringsAsFactors = FALSE) %>% as_tibble() %>% mutate(Length = str_length(X.seed_sequence), blast_taxonomy = Taxon, observation_sum = as.integer(observation_sum), seed_sequence = X.seed_sequence)
mat <- fill_presence_matrix(mat, real_infos, qiime_se_infos, "QIIME-se")
}else{
mat[,"QIIME-se"] <- 0
}
if(file.exists(file.path(path,"pe-qiime.tsv"))){
qiime_pe_infos <- read.csv2(file.path(path,"pe-qiime.tsv"), sep="\t", header=TRUE, stringsAsFactors = FALSE) %>% as_tibble() %>% mutate(Length = str_length(X.seed_sequence), blast_taxonomy = Taxon, observation_sum = as.integer(observation_sum), seed_sequence = X.seed_sequence)
mat <- fill_presence_matrix(mat, real_infos, qiime_pe_infos, "QIIME-pe")
}else{
mat[,"QIIME-pe"] <- 0
}
if(file.exists(file.path(path,"usearch.tsv"))){
usearch_infos <- read.csv2(file.path(path,"usearch.tsv"), sep="\t", header=TRUE, stringsAsFactors = FALSE) %>% as_tibble() %>% mutate(Length = str_length(X.seed_sequence), blast_taxonomy = taxonomy, observation_sum = as.integer(observation_sum), seed_sequence = X.seed_sequence)
mat <- fill_presence_matrix(mat, real_infos, usearch_infos, "USEARCH")
}else{
mat[,"USEARCH"] <- 0
}
rownames(mat) <- rownames(mat) %>% as_tibble() %>% separate(value, ";(?=[\\S])", into = c("Kingdom","Phylum","Class","Order","Family","Genus","Species"), remove=FALSE) %>% select(Species) %>% pull()
saveRDS(mat, mat_file)
}else{
mat <- readRDS(mat_file)
}
l <- build_heatmap_presence_real(mat)
tt <- summarise_all(as_tibble(ceiling(mat)),sum)
table <- tt %>% kable(caption = "Count of OTU/ASV/ZOTU detected") %>% kable_styling("striped", full_width = F)
cat(" \n")
print(table)
cat(" \n\n\n")
tt <- summarise_all(as_tibble(floor(mat)),sum)
table <- tt %>% kable(caption = "Count of OTU/ASV/ZOTU perfectly reconstructed") %>% kable_styling("striped", full_width = F)
cat(" \n")
print(table)
cat(" \n\n\n")
h <- l[[1]]
mat <- l[[2]]
draw(h)
cat(" \n")
cat(" \n")
}
}
Count of OTU/ASV/ZOTU detected — CHEESE (25 expected species; one row per marker and sample type)

Marker | Type | EXPECTED | FROGS | DADA2_FROGS | USEARCH | DADA2-se | DADA2-pe | QIIME-se | QIIME-pe |
---|---|---|---|---|---|---|---|---|---|
ITS1 | DNA | 25 | 23 | 23 | 20 | 23 | 24 | 23 | 23 |
ITS1 | PCR | 25 | 23 | 23 | 21 | 23 | 24 | 23 | 23 |
ITS2 | DNA | 25 | 24 | 24 | 0 | 24 | 24 | 24 | 24 |
ITS2 | PCR | 25 | 24 | 24 | 23 | 24 | 24 | 24 | 25 |
D1/D2 | DNA | 25 | 21 | 21 | 20 | 20 | 10 | 19 | 14 |
D1/D2 | PCR | 25 | 20 | 21 | 19 | 20 | 5 | 19 | 7 |
RPB2 | DNA | 25 | 18 | 17 | 22 | 22 | 8 | 23 | 13 |
RPB2 | PCR | 25 | 15 | 15 | 0 | 20 | 6 | 21 | 6 |

Count of OTU/ASV/ZOTU perfectly reconstructed — CHEESE (25 expected species; one row per marker and sample type)

Marker | Type | EXPECTED | FROGS | DADA2_FROGS | USEARCH | DADA2-se | DADA2-pe | QIIME-se | QIIME-pe |
---|---|---|---|---|---|---|---|---|---|
ITS1 | DNA | 25 | 21 | 22 | 19 | 7 | 21 | 0 | 21 |
ITS1 | PCR | 25 | 21 | 22 | 20 | 7 | 22 | 0 | 21 |
ITS2 | DNA | 25 | 23 | 23 | 0 | 4 | 22 | 0 | 22 |
ITS2 | PCR | 25 | 23 | 23 | 23 | 4 | 23 | 0 | 22 |
D1/D2 | DNA | 25 | 13 | 14 | 0 | 0 | 0 | 0 | 0 |
D1/D2 | PCR | 25 | 13 | 14 | 0 | 0 | 0 | 0 | 0 |
RPB2 | DNA | 25 | 13 | 12 | 0 | 0 | 0 | 0 | 0 |
RPB2 | PCR | 25 | 10 | 10 | 0 | 0 | 0 | 0 | 0 |
for(tool in tools){
cat(" \n####",tool, " \n\n")
htmp_list <- c()
htmp_list_presence <- c()
min <- 0
max <- 0
for(its in amplicons){
for(type in types){
path <- file.path(sp,its,type)
mat_file <- file.path(path,paste0(sp,"_",its,"_",type,"_abundmat.rds"))
mat_presence_file <- file.path(path,paste0(sp,"_",its,"_",type,"_presencemat.rds"))
mat <- readRDS(mat_file)
mat_presence <- readRDS(mat_presence_file)
submat <- mat %>% as_tibble() %>% select(tool) %>% as.matrix()
submat_presence <- mat_presence %>% as_tibble() %>% select(tool) %>% as.matrix()
rownames(submat) <- rownames(mat)
rownames(submat_presence) <- rownames(mat_presence)
submat[submat==0] <- NA
#submat_presence[submat_presence==0] <- NA
max <- max(max,max(submat,na.rm = TRUE))
colnames(submat) <- paste0(its," ",type)
colnames(submat_presence) <- paste0(its," ",type)
ht <- build_heatmap_one_tool(submat, min, max)
ht_presence <- build_heatmap_one_tool_presence(submat_presence, 0, 1)
htmp_list <- c(htmp_list,ht)
htmp_list_presence <- c(htmp_list_presence,ht_presence)
}
}
l <- c()
for(i in htmp_list){
l <- l + i
}
l_presence <- c()
for(i in htmp_list_presence){
l_presence <- l_presence + i
}
draw(l, row_title = "Expected species",
column_title_gp = gpar(fontsize = 16), ht_gap = unit(0.1, "cm"), annotation_legend_list = list(
Legend(at = 1, title = "", labels=c("Absent"), legend_gp = gpar(fill=absent_color))
))
cat(" \n")
# draw(l_presence, row_title = "Expected species",
# column_title_gp = gpar(fontsize = 16), ht_gap = unit(0.1, "cm"), annotation_legend_list = list(
# Legend(at = 1, title = "", labels=c("Absent"), legend_gp = gpar(fill=absent_color)),
# Legend(at = 2, title = "", labels=c("Not well reconstructed"), legend_gp = gpar(fill="#b1c5c4")),
# Legend(at = 3, title = "", labels=c("Perfect"), legend_gp = gpar(fill="#86a5a4"))
# )
# )
# cat(" \n")
}
sp <- "BREAD"
divergence_file <- file.path(sp,"divergence.tsv")
for(its in amplicons){
for(type in types){
path <- file.path(sp,its,type)
infos_file <- file.path(path,paste0(sp,"_",its,"_",type,"_infos.rds"))
cat(" \n####",sp, " ", its, " ", type, " \n\n")
if(!(file.exists(infos_file))){
dada2_se_infos <- read.csv2(file.path(path,"dada2_se.tsv"), sep="\t", header=TRUE) %>% as_tibble() %>% mutate(Length = str_length(seed_sequence), blast_taxonomy = X.blast_taxonomy)
dada2_pe_infos <- read.csv2(file.path(path,"dada2.tsv"), sep="\t", header=TRUE) %>% as_tibble() %>% mutate(Length = str_length(seed_sequence), blast_taxonomy = X.blast_taxonomy)
frogs_infos <- read.csv2(file.path(path,"frogs.tsv"), sep="\t", header=TRUE) %>% as_tibble() %>% mutate(Length = str_length(seed_sequence))
dada2_frogs_infos <- read.csv2(file.path(path,"dada2_frogs.tsv"), sep="\t", header=TRUE) %>% as_tibble() %>% mutate(Length = str_length(seed_sequence))
frogs_multiaff_infos <- readr::read_tsv(file = file.path(sp,its,type,"multiaff.tsv")) %>% as_tibble() %>% separate(blast_taxonomy, ";(?=[\\S])", into = c("Kingdom","Phylum","Class","Order","Family","Genus","Species"), remove=FALSE)
dada2_frogs_multiaff_infos <- readr::read_tsv(file = file.path(sp,its,type,"dada2_frogs_multiaff.tsv")) %>% as_tibble() %>% separate(blast_taxonomy, ";(?=[\\S])", into = c("Kingdom","Phylum","Class","Order","Family","Genus","Species"), remove=FALSE)
if(file.exists(file.path(path,"se-qiime.tsv"))){
qiime_se_infos <- read.csv2(file.path(path,"se-qiime.tsv"), sep="\t", header=TRUE, stringsAsFactors = FALSE) %>% as_tibble() %>% mutate(Length = str_length(X.seed_sequence), blast_taxonomy = Taxon, observation_sum = as.integer(observation_sum))
}
if(file.exists(file.path(path,"pe-qiime.tsv"))){
qiime_pe_infos <- read.csv2(file.path(path,"pe-qiime.tsv"), sep="\t", header=TRUE, stringsAsFactors = FALSE) %>% as_tibble() %>% mutate(Length = str_length(X.seed_sequence), blast_taxonomy = Taxon, observation_sum = as.integer(observation_sum))
}
if(file.exists(file.path(path,"usearch.tsv"))){
usearch_infos <- read.csv2(file.path(path,"usearch.tsv"), sep="\t", header=TRUE, stringsAsFactors = FALSE) %>% as_tibble() %>% mutate(Length = str_length(X.seed_sequence), blast_taxonomy = taxonomy, observation_sum = as.integer(observation_sum))
}
if(its == "ITS1"){
species <- real_infos_dict$ITS1$Taxonomy
nb_species <- length(species)
}else if(its == "ITS2"){
species <- real_infos_dict$ITS2$Taxonomy
nb_species <- length(species)
}else if(its == "D1D2"){
species <- real_infos_dict$D1D2$Taxonomy
nb_species <- length(species)
}else if(its == "RPB2"){
species <- real_infos_dict$RPB2$Taxonomy
nb_species <- length(species)
}
data <- tibble(richeness = c(
nb_species,
nrow(frogs_infos),
nrow(dada2_frogs_infos),
nrow(usearch_infos),
nrow(dada2_se_infos),
nrow(qiime_se_infos),
nrow(dada2_pe_infos),
nrow(qiime_pe_infos)
),
tool = c("EXPECTED",
"FROGS",
"DADA2_FROGS",
"USEARCH",
"DADA2_SE",
"QIIME_SE",
"DADA2_PE",
"QIIME_PE"),
reads = c(
metadata_infos %>% filter(Ecosystem == sp) %>% filter(Marker == its) %>% filter(str_detect(Sample, paste("^",type,sep=""))) %>% select(Reads) %>% sum,
frogs_infos %>% select(observation_sum) %>% sum(),
dada2_frogs_infos %>% select(observation_sum) %>% sum(),
usearch_infos %>% select(observation_sum) %>% sum(),
dada2_se_infos %>% select(observation_sum) %>% sum(),
qiime_se_infos %>% select(observation_sum) %>% sum(),
dada2_pe_infos %>% select(observation_sum) %>% sum(),
qiime_pe_infos %>% select(observation_sum) %>% sum()
))
data$tool <- factor(data$tool, levels=c("EXPECTED",
"FROGS",
"DADA2_FROGS",
"USEARCH",
"DADA2_SE",
"QIIME_SE",
"DADA2_PE",
"QIIME_PE"))
saveRDS(data, infos_file)
}
else{
data <- readRDS(infos_file)
}
print(plot_depth(data))
cat("\n")
}
}
for(its in amplicons){
for(type in types){
cat(" \n####",sp, " ", its, " ", type, " \n\n")
path <- file.path(sp,its,type)
infos_file <- file.path(path,paste0(sp,"_",its,"_",type,"_infos.rds"))
data <- readRDS(infos_file)
cat("\n")
cat("\n")
print(plot_richeness_graph(data))
cat("\n")
}
}
The results obtained with each tool were compared to the expected composition, and several metrics were calculated; affiliations and their associated abundances are taken into account.
The metrics are the numbers of True Positives, False Positives and False Negatives, and the divergence (%), all at species level.
for(its in amplicons){
for(type in types){
cat(" \n####",sp, " ", its, " ", type, " \n")
print(plot_divergence_real(type, its, divergence_file))
cat(" \n")
cat(" \n")
print(plot_metrics_real(type, its, divergence_file))
cat(" \n")
print(plot_precision_and_recall_real(type, its, divergence_file))
cat(" \n")
}
}
cat(" \n###"," Comparison of markers and types for DADA2_FROGS", " \n")
print(plot_divergence(type, divergence_file) + facet_grid(~its))
cat(" \n")
print(plot_metrics(type, divergence_file))
cat(" \n")
print(plot_precision_and_recall(type, divergence_file))
cat(" \n")
Tools are clustered using the Canberra distance.
for(its in amplicons){
for(type in types){
path <- file.path(sp,its,type)
mat_file <- file.path(path,paste0(sp,"_",its,"_",type,"_abundmat.rds"))
cat(" \n####",sp, " ", its, " ", type, " \n\n")
if(!file.exists(mat_file)){
if(its == "ITS1"){
species <- real_infos_dict$ITS1$Taxonomy
}else if(its == "ITS2"){
species <- real_infos_dict$ITS2$Taxonomy
}else if(its == "D1D2"){
species <- real_infos_dict$D1D2$Taxonomy
}else if(its == "RPB2"){
species <- real_infos_dict$RPB2$Taxonomy
}
nb_expected_species <- length(species)
species <- c(species,"Others")
mat = matrix(nrow = nb_expected_species+1, ncol = 8)
rownames(mat) <- species
colnames(mat) <- c(tools,"EXPECTED")
mat[,"EXPECTED"] <- c(rep(1/nb_expected_species,times=nb_expected_species),0)
mat[is.na(mat)] <- 0
dada2_se_infos <- read.csv2(file.path(path,"dada2_se.tsv"), sep="\t", header=TRUE) %>% as_tibble() %>% mutate(Length = str_length(seed_sequence), blast_taxonomy = X.blast_taxonomy)
dada2_pe_infos <- read.csv2(file.path(path,"dada2.tsv"), sep="\t", header=TRUE) %>% as_tibble() %>% mutate(Length = str_length(seed_sequence), blast_taxonomy = X.blast_taxonomy)
frogs_infos <- read.csv2(file.path(path,"frogs.tsv"), sep="\t", header=TRUE) %>% as_tibble() %>% mutate(Length = str_length(seed_sequence))
dada2_frogs_infos <- read.csv2(file.path(path,"dada2_frogs.tsv"), sep="\t", header=TRUE) %>% as_tibble() %>% mutate(Length = str_length(seed_sequence))
frogs_multiaff_infos <- readr::read_tsv(file = file.path(sp,its,type,"multiaff.tsv")) %>% as_tibble() %>% separate(blast_taxonomy, ";(?=[\\S])", into = c("Kingdom","Phylum","Class","Order","Family","Genus","Species"), remove=FALSE)
dada2_frogs_multiaff_infos <- readr::read_tsv(file = file.path(sp,its,type,"dada2_frogs_multiaff.tsv")) %>% as_tibble() %>% separate(blast_taxonomy, ";(?=[\\S])", into = c("Kingdom","Phylum","Class","Order","Family","Genus","Species"), remove=FALSE)
mat <- fill_abundance_matrix(mat, frogs_infos, "FROGS", multiaff_infos=frogs_multiaff_infos)
mat <- fill_abundance_matrix(mat, dada2_se_infos, "DADA2-se")
mat <- fill_abundance_matrix(mat, dada2_pe_infos, "DADA2-pe")
mat <- fill_abundance_matrix(mat, dada2_frogs_infos, "DADA2_FROGS", multiaff_infos=dada2_frogs_multiaff_infos)
if(file.exists(file.path(path,"se-qiime.tsv"))){
qiime_se_infos <- read.csv2(file.path(path,"se-qiime.tsv"), sep="\t", header=TRUE, stringsAsFactors = FALSE) %>% as_tibble() %>% mutate(Length = str_length(X.seed_sequence), blast_taxonomy = Taxon, observation_sum = as.integer(observation_sum))
mat <- fill_abundance_matrix(mat, qiime_se_infos, "QIIME-se")
}
if(file.exists(file.path(path,"pe-qiime.tsv"))){
qiime_pe_infos <- read.csv2(file.path(path,"pe-qiime.tsv"), sep="\t", header=TRUE, stringsAsFactors = FALSE) %>% as_tibble() %>% mutate(Length = str_length(X.seed_sequence), blast_taxonomy = Taxon, observation_sum = as.integer(observation_sum))
mat <- fill_abundance_matrix(mat, qiime_pe_infos, "QIIME-pe")
}
if(file.exists(file.path(path,"usearch.tsv"))){
usearch_infos <- read.csv2(file.path(path,"usearch.tsv"), sep="\t", header=TRUE, stringsAsFactors = FALSE) %>% as_tibble() %>% mutate(Length = str_length(X.seed_sequence), blast_taxonomy = taxonomy, observation_sum = as.integer(observation_sum))
mat <- fill_abundance_matrix(mat, usearch_infos, "USEARCH")
}
rownames(mat) <- rownames(mat) %>% as_tibble() %>% separate(value, ";(?=[\\S])", into = c("Kingdom","Phylum","Class","Order","Family","Genus","Species"), remove=FALSE) %>% select(Species) %>% pull()
rownames(mat)[nrow(mat)] <- "Others"
saveRDS(mat, mat_file)
}else{
mat <- readRDS(mat_file)
}
if(!metabarfood){
mat <- subset( mat, select = -DADA2_FROGS )
}
l <- build_heatmap_abundance_real(mat)
h <- l[[1]]
mat <- l[[2]]
lgd_list <- l[[3]]
draw(h, annotation_legend_list = lgd_list)
cat(" \n")
}
}
for(its in amplicons){
for(type in types){
path <- file.path(sp,its,type)
mat_file <- file.path(path,paste0(sp,"_",its,"_",type,"_presencemat.rds"))
cat(" \n####",sp, " ", its, " ", type, " \n\n")
if(!file.exists(mat_file)){
if(its == "ITS1"){
species <- real_infos_dict$ITS1$Taxonomy
real_infos <- real_infos_dict$ITS1
}else if(its == "ITS2"){
species <- real_infos_dict$ITS2$Taxonomy
real_infos <- real_infos_dict$ITS2
}else if(its == "D1D2"){
species <- real_infos_dict$D1D2$Taxonomy
real_infos <- real_infos_dict$D1D2
}else if(its == "RPB2"){
species <- real_infos_dict$RPB2$Taxonomy
real_infos <- real_infos_dict$RPB2
}
}
if(!file.exists(mat_file)){
nb_expected_species <- length(species)
mat = matrix(nrow = nb_expected_species, ncol = 8)
rownames(mat) <- species
colnames(mat) <- c("EXPECTED","FROGS","DADA2_FROGS","USEARCH","DADA2-se","DADA2-pe","QIIME-se","QIIME-pe")
mat[,"EXPECTED"] <- 1
mat[is.na(mat)] <- 0
dada2_se_infos <- read.csv2(file.path(path,"dada2_se.tsv"), sep="\t", header=TRUE, stringsAsFactors = FALSE) %>% as_tibble() %>% mutate(Length = str_length(seed_sequence), blast_taxonomy = X.blast_taxonomy)
dada2_pe_infos <- read.csv2(file.path(path,"dada2.tsv"), sep="\t", header=TRUE) %>% as_tibble() %>% mutate(Length = str_length(seed_sequence), blast_taxonomy = X.blast_taxonomy)
frogs_infos <- read.csv2(file.path(path,"frogs.tsv"), sep="\t", header=TRUE, stringsAsFactors = FALSE) %>% as_tibble() %>% mutate(Length = str_length(seed_sequence))
dada2_frogs_infos <- read.csv2(file.path(path,"dada2_frogs.tsv"), sep="\t", header=TRUE, stringsAsFactors = FALSE) %>% as_tibble() %>% mutate(Length = str_length(seed_sequence))
frogs_multiaff_infos <- readr::read_tsv(file = file.path(sp,its,type,"multiaff.tsv")) %>% as_tibble() %>% separate(blast_taxonomy, ";(?=[\\S])", into = c("Kingdom","Phylum","Class","Order","Family","Genus","Species"), remove=FALSE) %>% mutate(observation_name = `#observation_name`)
dada2_frogs_multiaff_infos <- readr::read_tsv(file = file.path(sp,its,type,"dada2_frogs_multiaff.tsv")) %>% as_tibble() %>% separate(blast_taxonomy, ";(?=[\\S])", into = c("Kingdom","Phylum","Class","Order","Family","Genus","Species"), remove=FALSE) %>% mutate(observation_name = `#observation_name`)
mat <- fill_presence_matrix(mat, real_infos, frogs_infos, "FROGS", multiaff_infos=frogs_multiaff_infos)
mat <- fill_presence_matrix(mat, real_infos, dada2_se_infos, "DADA2-se")
mat <- fill_presence_matrix(mat, real_infos, dada2_pe_infos, "DADA2-pe")
mat <- fill_presence_matrix(mat, real_infos, dada2_frogs_infos, "DADA2_FROGS", multiaff_infos=dada2_frogs_multiaff_infos)
if(file.exists(file.path(path,"se-qiime.tsv"))){
qiime_se_infos <- read.csv2(file.path(path,"se-qiime.tsv"), sep="\t", header=TRUE, stringsAsFactors = FALSE) %>% as_tibble() %>% mutate(Length = str_length(X.seed_sequence), blast_taxonomy = Taxon, observation_sum = as.integer(observation_sum), seed_sequence = X.seed_sequence)
mat <- fill_presence_matrix(mat, real_infos, qiime_se_infos, "QIIME-se")
}else{
mat[,"QIIME-se"] <- 0
}
if(file.exists(file.path(path,"pe-qiime.tsv"))){
qiime_pe_infos <- read.csv2(file.path(path,"pe-qiime.tsv"), sep="\t", header=TRUE, stringsAsFactors = FALSE) %>% as_tibble() %>% mutate(Length = str_length(X.seed_sequence), blast_taxonomy = Taxon, observation_sum = as.integer(observation_sum), seed_sequence = X.seed_sequence)
mat <- fill_presence_matrix(mat, real_infos, qiime_pe_infos, "QIIME-pe")
}else{
mat[,"QIIME-pe"] <- 0
}
if(file.exists(file.path(path,"usearch.tsv"))){
usearch_infos <- read.csv2(file.path(path,"usearch.tsv"), sep="\t", header=TRUE, stringsAsFactors = FALSE) %>% as_tibble() %>% mutate(Length = str_length(X.seed_sequence), blast_taxonomy = taxonomy, observation_sum = as.integer(observation_sum), seed_sequence = X.seed_sequence)
mat <- fill_presence_matrix(mat, real_infos, usearch_infos, "USEARCH")
}else{
mat[,"USEARCH"] <- 0
}
rownames(mat) <- rownames(mat) %>% as_tibble() %>% separate(value, ";(?=[\\S])", into = c("Kingdom","Phylum","Class","Order","Family","Genus","Species"), remove=FALSE) %>% select(Species) %>% pull()
saveRDS(mat, mat_file)
}else{
mat <- readRDS(mat_file)
}
l <- build_heatmap_presence_real(mat)
# ceiling(): any non-zero presence value is rounded up to 1, so the column sums count the species detected by each tool
tt <- summarise_all(as_tibble(ceiling(mat)),sum)
table <- tt %>% kable(caption = "Count of OTU/ASV/ZOTU detected") %>% kable_styling("striped", full_width = F)
cat(" \n")
print(table)
cat(" \n\n\n")
# floor(): only cells equal to 1 are kept, so the column sums count the species whose sequence was perfectly reconstructed
tt <- summarise_all(as_tibble(floor(mat)),sum)
table <- tt %>% kable(caption = "Count of OTU/ASV/ZOTU perfectly reconstructed") %>% kable_styling("striped", full_width = F)
cat(" \n")
print(table)
cat(" \n\n\n")
h <- l[[1]]
mat <- l[[2]]
draw(h)
cat(" \n")
cat(" \n")
}
}
Count of OTU/ASV/ZOTU detected

EXPECTED | FROGS | DADA2_FROGS | USEARCH | DADA2-se | DADA2-pe | QIIME-se | QIIME-pe |
---|---|---|---|---|---|---|---|
27 | 21 | 21 | 20 | 22 | 20 | 23 | 22 |

Count of OTU/ASV/ZOTU perfectly reconstructed

EXPECTED | FROGS | DADA2_FROGS | USEARCH | DADA2-se | DADA2-pe | QIIME-se | QIIME-pe |
---|---|---|---|---|---|---|---|
27 | 19 | 19 | 16 | 6 | 17 | 1 | 18 |

Count of OTU/ASV/ZOTU detected

EXPECTED | FROGS | DADA2_FROGS | USEARCH | DADA2-se | DADA2-pe | QIIME-se | QIIME-pe |
---|---|---|---|---|---|---|---|
27 | 21 | 21 | 20 | 21 | 20 | 23 | 20 |

Count of OTU/ASV/ZOTU perfectly reconstructed

EXPECTED | FROGS | DADA2_FROGS | USEARCH | DADA2-se | DADA2-pe | QIIME-se | QIIME-pe |
---|---|---|---|---|---|---|---|
27 | 19 | 18 | 16 | 6 | 16 | 1 | 18 |

Count of OTU/ASV/ZOTU detected

EXPECTED | FROGS | DADA2_FROGS | USEARCH | DADA2-se | DADA2-pe | QIIME-se | QIIME-pe |
---|---|---|---|---|---|---|---|
27 | 21 | 21 | 20 | 21 | 21 | 21 | 21 |

Count of OTU/ASV/ZOTU perfectly reconstructed

EXPECTED | FROGS | DADA2_FROGS | USEARCH | DADA2-se | DADA2-pe | QIIME-se | QIIME-pe |
---|---|---|---|---|---|---|---|
27 | 20 | 20 | 19 | 0 | 20 | 0 | 20 |

Count of OTU/ASV/ZOTU detected

EXPECTED | FROGS | DADA2_FROGS | USEARCH | DADA2-se | DADA2-pe | QIIME-se | QIIME-pe |
---|---|---|---|---|---|---|---|
27 | 21 | 21 | 20 | 21 | 21 | 21 | 22 |

Count of OTU/ASV/ZOTU perfectly reconstructed

EXPECTED | FROGS | DADA2_FROGS | USEARCH | DADA2-se | DADA2-pe | QIIME-se | QIIME-pe |
---|---|---|---|---|---|---|---|
27 | 20 | 19 | 18 | 0 | 19 | 0 | 19 |

Count of OTU/ASV/ZOTU detected

EXPECTED | FROGS | DADA2_FROGS | USEARCH | DADA2-se | DADA2-pe | QIIME-se | QIIME-pe |
---|---|---|---|---|---|---|---|
27 | 21 | 22 | 0 | 18 | 0 | 16 | 0 |

Count of OTU/ASV/ZOTU perfectly reconstructed

EXPECTED | FROGS | DADA2_FROGS | USEARCH | DADA2-se | DADA2-pe | QIIME-se | QIIME-pe |
---|---|---|---|---|---|---|---|
27 | 18 | 18 | 0 | 0 | 0 | 0 | 0 |

Count of OTU/ASV/ZOTU detected

EXPECTED | FROGS | DADA2_FROGS | USEARCH | DADA2-se | DADA2-pe | QIIME-se | QIIME-pe |
---|---|---|---|---|---|---|---|
27 | 21 | 21 | 16 | 18 | 3 | 17 | 5 |

Count of OTU/ASV/ZOTU perfectly reconstructed

EXPECTED | FROGS | DADA2_FROGS | USEARCH | DADA2-se | DADA2-pe | QIIME-se | QIIME-pe |
---|---|---|---|---|---|---|---|
27 | 18 | 18 | 0 | 0 | 0 | 0 | 0 |

Count of OTU/ASV/ZOTU detected

EXPECTED | FROGS | DADA2_FROGS | USEARCH | DADA2-se | DADA2-pe | QIIME-se | QIIME-pe |
---|---|---|---|---|---|---|---|
27 | 10 | 10 | 0 | 19 | 2 | 20 | 3 |

Count of OTU/ASV/ZOTU perfectly reconstructed

EXPECTED | FROGS | DADA2_FROGS | USEARCH | DADA2-se | DADA2-pe | QIIME-se | QIIME-pe |
---|---|---|---|---|---|---|---|
27 | 6 | 6 | 0 | 0 | 0 | 0 | 0 |

Count of OTU/ASV/ZOTU detected

EXPECTED | FROGS | DADA2_FROGS | USEARCH | DADA2-se | DADA2-pe | QIIME-se | QIIME-pe |
---|---|---|---|---|---|---|---|
27 | 9 | 9 | 0 | 16 | 3 | 20 | 5 |

Count of OTU/ASV/ZOTU perfectly reconstructed

EXPECTED | FROGS | DADA2_FROGS | USEARCH | DADA2-se | DADA2-pe | QIIME-se | QIIME-pe |
---|---|---|---|---|---|---|---|
27 | 3 | 4 | 0 | 0 | 0 | 0 | 0 |
for(tool in tools){
cat(" \n####",tool, " \n\n")
htmp_list <- c()
htmp_list_presence <- c()
min <- 0
max <- 0
for(its in amplicons){
for(type in types){
path <- file.path(sp,its,type)
mat_file <- file.path(path,paste0(sp,"_",its,"_",type,"_abundmat.rds"))
mat_presence_file <- file.path(path,paste0(sp,"_",its,"_",type,"_presencemat.rds"))
mat <- readRDS(mat_file)
mat_presence <- readRDS(mat_presence_file)
submat <- mat %>% as_tibble() %>% select(tool) %>% as.matrix()
submat_presence <- mat_presence %>% as_tibble() %>% select(tool) %>% as.matrix()
rownames(submat) <- rownames(mat)
rownames(submat_presence) <- rownames(mat_presence)
submat[submat==0] <- NA
#submat_presence[submat_presence==0] <- NA
max <- max(max,max(submat,na.rm = TRUE))
colnames(submat) <- paste0(its," ",type)
colnames(submat_presence) <- paste0(its," ",type)
ht <- build_heatmap_one_tool(submat, min, max)
ht_presence <- build_heatmap_one_tool_presence(submat_presence, 0, 1)
htmp_list <- c(htmp_list,ht)
htmp_list_presence <- c(htmp_list_presence,ht_presence)
}
}
# l starts as NULL (c() with no arguments); ComplexHeatmap's "+" then appends each heatmap side by side
l <- c()
for(i in htmp_list){
l <- l + i
}
l_presence <- c()
for(i in htmp_list_presence){
l_presence <- l_presence + i
}
draw(l, row_title = "Expected species",
column_title_gp = gpar(fontsize = 16), ht_gap = unit(0.1, "cm"), annotation_legend_list = list(
Legend(at = 1, title = "", labels=c("Absent"), legend_gp = gpar(fill=absent_color))
))
cat(" \n")
# draw(l_presence, row_title = "Expected species",
# column_title_gp = gpar(fontsize = 16), ht_gap = unit(0.1, "cm"), annotation_legend_list = list(
# Legend(at = 1, title = "", labels=c("Absent"), legend_gp = gpar(fill=absent_color)),
# Legend(at = 2, title = "", labels=c("Not well reconstructed"), legend_gp = gpar(fill="#b1c5c4")),
# Legend(at = 3, title = "", labels=c("Perfect"), legend_gp = gpar(fill="#86a5a4"))
# )
# )
# cat(" \n")
}
sp <- "WINE"
divergence_file <- file.path(sp,"divergence.tsv")
for(its in amplicons){
for(type in types){
path <- file.path(sp,its,type)
infos_file <- file.path(path,paste0(sp,"_",its,"_",type,"_infos.rds"))
cat(" \n####",sp, " ", its, " ", type, " \n\n")
if(!(file.exists(infos_file))){
dada2_se_infos <- read.csv2(file.path(path,"dada2_se.tsv"), sep="\t", header=TRUE) %>% as_tibble() %>% mutate(Length = str_length(seed_sequence), blast_taxonomy = X.blast_taxonomy)
dada2_pe_infos <- read.csv2(file.path(path,"dada2.tsv"), sep="\t", header=TRUE) %>% as_tibble() %>% mutate(Length = str_length(seed_sequence), blast_taxonomy = X.blast_taxonomy)
frogs_infos <- read.csv2(file.path(path,"frogs.tsv"), sep="\t", header=TRUE) %>% as_tibble() %>% mutate(Length = str_length(seed_sequence))
dada2_frogs_infos <- read.csv2(file.path(path,"dada2_frogs.tsv"), sep="\t", header=TRUE) %>% as_tibble() %>% mutate(Length = str_length(seed_sequence))
frogs_multiaff_infos <- readr::read_tsv(file = file.path(sp,its,type,"multiaff.tsv")) %>% as_tibble() %>% separate(blast_taxonomy, ";(?=[\\S])", into = c("Kingdom","Phylum","Class","Order","Family","Genus","Species"), remove=FALSE)
dada2_frogs_multiaff_infos <- readr::read_tsv(file = file.path(sp,its,type,"dada2_frogs_multiaff.tsv")) %>% as_tibble() %>% separate(blast_taxonomy, ";(?=[\\S])", into = c("Kingdom","Phylum","Class","Order","Family","Genus","Species"), remove=FALSE)
if(file.exists(file.path(path,"se-qiime.tsv"))){
qiime_se_infos <- read.csv2(file.path(path,"se-qiime.tsv"), sep="\t", header=TRUE, stringsAsFactors = FALSE) %>% as_tibble() %>% mutate(Length = str_length(X.seed_sequence), blast_taxonomy = Taxon, observation_sum = as.integer(observation_sum))
}
if(file.exists(file.path(path,"pe-qiime.tsv"))){
qiime_pe_infos <- read.csv2(file.path(path,"pe-qiime.tsv"), sep="\t", header=TRUE, stringsAsFactors = FALSE) %>% as_tibble() %>% mutate(Length = str_length(X.seed_sequence), blast_taxonomy = Taxon, observation_sum = as.integer(observation_sum))
}
if(file.exists(file.path(path,"usearch.tsv"))){
usearch_infos <- read.csv2(file.path(path,"usearch.tsv"), sep="\t", header=TRUE, stringsAsFactors = FALSE) %>% as_tibble() %>% mutate(Length = str_length(X.seed_sequence), blast_taxonomy = taxonomy, observation_sum = as.integer(observation_sum))
}
if(its == "ITS1"){
species <- real_infos_dict$ITS1$Taxonomy
nb_species <- length(species)
}else if(its == "ITS2"){
species <- real_infos_dict$ITS2$Taxonomy
nb_species <- length(species)
}else if(its == "D1D2"){
species <- real_infos_dict$D1D2$Taxonomy
nb_species <- length(species)
}else if(its == "RPB2"){
species <- real_infos_dict$RPB2$Taxonomy
nb_species <- length(species)
}
# NB: assumes the QIIME and USEARCH tables exist for this marker/type; otherwise the corresponding *_infos objects are undefined (or carried over from a previous iteration)
data <- tibble(richeness = c(
nb_species,
nrow(frogs_infos),
nrow(dada2_frogs_infos),
nrow(usearch_infos),
nrow(dada2_se_infos),
nrow(qiime_se_infos),
nrow(dada2_pe_infos),
nrow(qiime_pe_infos)
),
tool = c("EXPECTED",
"FROGS",
"DADA2_FROGS",
"USEARCH",
"DADA2_SE",
"QIIME_SE",
"DADA2_PE",
"QIIME_PE"),
reads = c(
metadata_infos %>% filter(Ecosystem == sp) %>% filter(Marker == its) %>% filter(str_detect(Sample, paste("^",type,sep=""))) %>% select(Reads) %>% sum,
frogs_infos %>% select(observation_sum) %>% sum(),
dada2_frogs_infos %>% select(observation_sum) %>% sum(),
usearch_infos %>% select(observation_sum) %>% sum(),
dada2_se_infos %>% select(observation_sum) %>% sum(),
qiime_se_infos %>% select(observation_sum) %>% sum(),
dada2_pe_infos %>% select(observation_sum) %>% sum(),
qiime_pe_infos %>% select(observation_sum) %>% sum()
))
data$tool <- factor(data$tool, levels=c("EXPECTED",
"FROGS",
"DADA2_FROGS",
"USEARCH",
"DADA2_SE",
"QIIME_SE",
"DADA2_PE",
"QIIME_PE"))
saveRDS(data, infos_file)
}
else{
data <- readRDS(infos_file)
}
print(plot_depth(data))
cat("\n")
}
}
for(its in amplicons){
for(type in types){
cat(" \n####",sp, " ", its, " ", type, " \n\n")
path <- file.path(sp,its,type)
infos_file <- file.path(path,paste0(sp,"_",its,"_",type,"_infos.rds"))
data <- readRDS(infos_file)
cat("\n")
cat("\n")
print(plot_richeness_graph(data))
cat("\n")
}
}
The results obtained with each tool were compared to the expected composition and several metrics were computed, taking into account both the affiliations and their associated abundances.
The metrics include divergence, precision and recall, plotted below for each marker and type.
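As an illustration, the sketch below shows how such metrics could be derived from a pair of expected/observed species abundance vectors. It is only a sketch: the helpers plot_divergence_real(), plot_metrics_real() and plot_precision_and_recall_real() used in the next chunk do the actual work from divergence.tsv, and both the function name compute_metrics_sketch and the divergence formula (a Bray-Curtis-like dissimilarity on relative abundances) are assumptions, not the report's exact implementation.
# Hypothetical illustration only (not the report's exact implementation).
# expected, observed: named numeric vectors of relative abundances, names = species.
compute_metrics_sketch <- function(expected, observed) {
  species <- union(names(expected), names(observed))
  e <- setNames(numeric(length(species)), species); e[names(expected)] <- expected
  o <- setNames(numeric(length(species)), species); o[names(observed)] <- observed
  tp <- sum(names(observed) %in% names(expected))     # expected species recovered
  fp <- sum(!(names(observed) %in% names(expected)))  # unexpected taxa reported
  fn <- sum(!(names(expected) %in% names(observed)))  # expected taxa missed
  tibble::tibble(
    divergence = 0.5 * sum(abs(e - o)),  # Bray-Curtis-like dissimilarity (assumption)
    precision  = tp / (tp + fp),
    recall     = tp / (tp + fn)
  )
}
# Example call: compute_metrics_sketch(c(A = 0.5, B = 0.5), c(A = 0.6, C = 0.4))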
for(its in amplicons){
for(type in types){
cat(" \n####",sp, " ", its, " ", type, " \n")
print(plot_divergence_real(type, its, divergence_file))
cat(" \n")
cat(" \n")
print(plot_metrics_real(type, its, divergence_file))
cat(" \n")
print(plot_precision_and_recall_real(type, its, divergence_file))
cat(" \n")
}
}
cat(" \n###"," Comparison of markers and types for DADA2_FROGS", " \n")
print(plot_divergence(type, divergence_file) + facet_grid(~its))
cat(" \n")
print(plot_metrics(type, divergence_file))
cat(" \n")
print(plot_precision_and_recall(type, divergence_file))
cat(" \n")
Tools are clustered using the Canberra distance.
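For reference, this clustering can be reproduced outside the heatmap helpers with base R. The snippet below is a sketch that loads one of the *_abundmat.rds matrices saved by the chunks above (species in rows, tools in columns) and clusters the tools with the Canberra distance; the marker/type combination in the path and the complete linkage are example choices, and the report's heatmap functions perform the actual clustering internally.
# Sketch: hierarchical clustering of tools with the Canberra distance,
# using one of the saved abundance matrices (example path).
mat <- readRDS(file.path("WINE", "ITS1", "ADN", "WINE_ITS1_ADN_abundmat.rds"))
d   <- dist(t(mat), method = "canberra")   # distance between tool columns
hc  <- hclust(d)                           # default complete linkage
plot(hc, main = "Tool clustering (Canberra distance)")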
for(its in amplicons){
for(type in types){
path <- file.path(sp,its,type)
mat_file <- file.path(path,paste0(sp,"_",its,"_",type,"_abundmat.rds"))
cat(" \n####",sp, " ", its, " ", type, " \n\n")
if(!file.exists(mat_file)){
if(its == "ITS1"){
species <- real_infos_dict$ITS1$Taxonomy
}else if(its == "ITS2"){
species <- real_infos_dict$ITS2$Taxonomy
}else if(its == "D1D2"){
species <- real_infos_dict$D1D2$Taxonomy
}else if(its == "RPB2"){
species <- real_infos_dict$RPB2$Taxonomy
}
nb_expected_species <- length(species)
species <- c(species,"Others")
mat = matrix(nrow = nb_expected_species+1, ncol = 8)
rownames(mat) <- species
colnames(mat) <- c(tools,"EXPECTED")
mat[,"EXPECTED"] <- c(rep(1/nb_expected_species,times=nb_expected_species),0)
mat[is.na(mat)] <- 0
dada2_se_infos <- read.csv2(file.path(path,"dada2_se.tsv"), sep="\t", header=TRUE) %>% as_tibble() %>% mutate(Length = str_length(seed_sequence), blast_taxonomy = X.blast_taxonomy)
dada2_pe_infos <- read.csv2(file.path(path,"dada2.tsv"), sep="\t", header=TRUE) %>% as_tibble() %>% mutate(Length = str_length(seed_sequence), blast_taxonomy = X.blast_taxonomy)
frogs_infos <- read.csv2(file.path(path,"frogs.tsv"), sep="\t", header=TRUE) %>% as_tibble() %>% mutate(Length = str_length(seed_sequence))
dada2_frogs_infos <- read.csv2(file.path(path,"dada2_frogs.tsv"), sep="\t", header=TRUE) %>% as_tibble() %>% mutate(Length = str_length(seed_sequence))
frogs_multiaff_infos <- readr::read_tsv(file = file.path(sp,its,type,"multiaff.tsv")) %>% as_tibble() %>% separate(blast_taxonomy, ";(?=[\\S])", into = c("Kingdom","Phylum","Class","Order","Family","Genus","Species"), remove=FALSE)
dada2_frogs_multiaff_infos <- readr::read_tsv(file = file.path(sp,its,type,"dada2_frogs_multiaff.tsv")) %>% as_tibble() %>% separate(blast_taxonomy, ";(?=[\\S])", into = c("Kingdom","Phylum","Class","Order","Family","Genus","Species"), remove=FALSE)
mat <- fill_abundance_matrix(mat, frogs_infos, "FROGS", multiaff_infos=frogs_multiaff_infos)
mat <- fill_abundance_matrix(mat, dada2_se_infos, "DADA2-se")
mat <- fill_abundance_matrix(mat, dada2_pe_infos, "DADA2-pe")
mat <- fill_abundance_matrix(mat, dada2_frogs_infos, "DADA2_FROGS", multiaff_infos=dada2_frogs_multiaff_infos)
if(file.exists(file.path(path,"se-qiime.tsv"))){
qiime_se_infos <- read.csv2(file.path(path,"se-qiime.tsv"), sep="\t", header=TRUE, stringsAsFactors = FALSE) %>% as_tibble() %>% mutate(Length = str_length(X.seed_sequence), blast_taxonomy = Taxon, observation_sum = as.integer(observation_sum))
mat <- fill_abundance_matrix(mat, qiime_se_infos, "QIIME-se")
}
if(file.exists(file.path(path,"pe-qiime.tsv"))){
qiime_pe_infos <- read.csv2(file.path(path,"pe-qiime.tsv"), sep="\t", header=TRUE, stringsAsFactors = FALSE) %>% as_tibble() %>% mutate(Length = str_length(X.seed_sequence), blast_taxonomy = Taxon, observation_sum = as.integer(observation_sum))
mat <- fill_abundance_matrix(mat, qiime_pe_infos, "QIIME-pe")
}
if(file.exists(file.path(path,"usearch.tsv"))){
usearch_infos <- read.csv2(file.path(path,"usearch.tsv"), sep="\t", header=TRUE, stringsAsFactors = FALSE) %>% as_tibble() %>% mutate(Length = str_length(X.seed_sequence), blast_taxonomy = taxonomy, observation_sum = as.integer(observation_sum))
mat <- fill_abundance_matrix(mat, usearch_infos, "USEARCH")
}
rownames(mat) <- rownames(mat) %>% as_tibble() %>% separate(value, ";(?=[\\S])", into = c("Kingdom","Phylum","Class","Order","Family","Genus","Species"), remove=FALSE) %>% select(Species) %>% pull()
rownames(mat)[nrow(mat)] <- "Others"
saveRDS(mat, mat_file)
}else{
mat <- readRDS(mat_file)
}
if(!metabarfood){
mat <- subset( mat, select = -DADA2_FROGS )
}
l <- build_heatmap_abundance_real(mat)
h <- l[[1]]
mat <- l[[2]]
lgd_list <- l[[3]]
draw(h, annotation_legend_list = lgd_list)
cat(" \n")
}
}
for(its in amplicons){
for(type in types){
path <- file.path(sp,its,type)
mat_file <- file.path(path,paste0(sp,"_",its,"_",type,"_presencemat.rds"))
cat(" \n####",sp, " ", its, " ", type, " \n\n")
if(!file.exists(mat_file)){
if(its == "ITS1"){
species <- real_infos_dict$ITS1$Taxonomy
real_infos <- real_infos_dict$ITS1
}else if(its == "ITS2"){
species <- real_infos_dict$ITS2$Taxonomy
real_infos <- real_infos_dict$ITS2
}else if(its == "D1D2"){
species <- real_infos_dict$D1D2$Taxonomy
real_infos <- real_infos_dict$D1D2
}else if(its == "RPB2"){
species <- real_infos_dict$RPB2$Taxonomy
real_infos <- real_infos_dict$RPB2
}
}
if(!file.exists(mat_file)){
nb_expected_species <- length(species)
mat = matrix(nrow = nb_expected_species, ncol = 8)
rownames(mat) <- species
colnames(mat) <- c("EXPECTED","FROGS","DADA2_FROGS","USEARCH","DADA2-se","DADA2-pe","QIIME-se","QIIME-pe")
mat[,"EXPECTED"] <- 1
mat[is.na(mat)] <- 0
dada2_se_infos <- read.csv2(file.path(path,"dada2_se.tsv"), sep="\t", header=TRUE, stringsAsFactors = FALSE) %>% as_tibble() %>% mutate(Length = str_length(seed_sequence), blast_taxonomy = X.blast_taxonomy)
dada2_pe_infos <- read.csv2(file.path(path,"dada2.tsv"), sep="\t", header=TRUE) %>% as_tibble() %>% mutate(Length = str_length(seed_sequence), blast_taxonomy = X.blast_taxonomy)
frogs_infos <- read.csv2(file.path(path,"frogs.tsv"), sep="\t", header=TRUE, stringsAsFactors = FALSE) %>% as_tibble() %>% mutate(Length = str_length(seed_sequence))
dada2_frogs_infos <- read.csv2(file.path(path,"dada2_frogs.tsv"), sep="\t", header=TRUE, stringsAsFactors = FALSE) %>% as_tibble() %>% mutate(Length = str_length(seed_sequence))
frogs_multiaff_infos <- readr::read_tsv(file = file.path(sp,its,type,"multiaff.tsv")) %>% as_tibble() %>% separate(blast_taxonomy, ";(?=[\\S])", into = c("Kingdom","Phylum","Class","Order","Family","Genus","Species"), remove=FALSE) %>% mutate(observation_name = `#observation_name`)
dada2_frogs_multiaff_infos <- readr::read_tsv(file = file.path(sp,its,type,"dada2_frogs_multiaff.tsv")) %>% as_tibble() %>% separate(blast_taxonomy, ";(?=[\\S])", into = c("Kingdom","Phylum","Class","Order","Family","Genus","Species"), remove=FALSE) %>% mutate(observation_name = `#observation_name`)
mat <- fill_presence_matrix(mat, real_infos, frogs_infos, "FROGS", multiaff_infos=frogs_multiaff_infos)
mat <- fill_presence_matrix(mat, real_infos, dada2_se_infos, "DADA2-se")
mat <- fill_presence_matrix(mat, real_infos, dada2_pe_infos, "DADA2-pe")
mat <- fill_presence_matrix(mat, real_infos, dada2_frogs_infos, "DADA2_FROGS", multiaff_infos=dada2_frogs_multiaff_infos)
if(file.exists(file.path(path,"se-qiime.tsv"))){
qiime_se_infos <- read.csv2(file.path(path,"se-qiime.tsv"), sep="\t", header=TRUE, stringsAsFactors = FALSE) %>% as_tibble() %>% mutate(Length = str_length(X.seed_sequence), blast_taxonomy = Taxon, observation_sum = as.integer(observation_sum), seed_sequence = X.seed_sequence)
mat <- fill_presence_matrix(mat, real_infos, qiime_se_infos, "QIIME-se")
}else{
mat[,"QIIME-se"] <- 0
}
if(file.exists(file.path(path,"pe-qiime.tsv"))){
qiime_pe_infos <- read.csv2(file.path(path,"pe-qiime.tsv"), sep="\t", header=TRUE, stringsAsFactors = FALSE) %>% as_tibble() %>% mutate(Length = str_length(X.seed_sequence), blast_taxonomy = Taxon, observation_sum = as.integer(observation_sum), seed_sequence = X.seed_sequence)
mat <- fill_presence_matrix(mat, real_infos, qiime_pe_infos, "QIIME-pe")
}else{
mat[,"QIIME-pe"] <- 0
}
if(file.exists(file.path(path,"usearch.tsv"))){
usearch_infos <- read.csv2(file.path(path,"usearch.tsv"), sep="\t", header=TRUE, stringsAsFactors = FALSE) %>% as_tibble() %>% mutate(Length = str_length(X.seed_sequence), blast_taxonomy = taxonomy, observation_sum = as.integer(observation_sum), seed_sequence = X.seed_sequence)
mat <- fill_presence_matrix(mat, real_infos, usearch_infos, "USEARCH")
}else{
mat[,"USEARCH"] <- 0
}
rownames(mat) <- rownames(mat) %>% as_tibble() %>% separate(value, ";(?=[\\S])", into = c("Kingdom","Phylum","Class","Order","Family","Genus","Species"), remove=FALSE) %>% select(Species) %>% pull()
saveRDS(mat, mat_file)
}else{
mat <- readRDS(mat_file)
}
l <- build_heatmap_presence_real(mat)
# ceiling(): any non-zero presence value is rounded up to 1, so the column sums count the species detected by each tool
tt <- summarise_all(as_tibble(ceiling(mat)),sum)
table <- tt %>% kable(caption = "Count of OTU/ASV/ZOTU detected") %>% kable_styling("striped", full_width = F)
cat(" \n")
print(table)
cat(" \n\n\n")
# floor(): only cells equal to 1 are kept, so the column sums count the species whose sequence was perfectly reconstructed
tt <- summarise_all(as_tibble(floor(mat)),sum)
table <- tt %>% kable(caption = "Count of OTU/ASV/ZOTU perfectly reconstructed") %>% kable_styling("striped", full_width = F)
cat(" \n")
print(table)
cat(" \n\n\n")
h <- l[[1]]
mat <- l[[2]]
draw(h)
cat(" \n")
cat(" \n")
}
}
Count of OTU/ASV/ZOTU detected

EXPECTED | FROGS | DADA2_FROGS | USEARCH | DADA2-se | DADA2-pe | QIIME-se | QIIME-pe |
---|---|---|---|---|---|---|---|
60 | 50 | 50 | 48 | 51 | 49 | 50 | 50 |

Count of OTU/ASV/ZOTU perfectly reconstructed

EXPECTED | FROGS | DADA2_FROGS | USEARCH | DADA2-se | DADA2-pe | QIIME-se | QIIME-pe |
---|---|---|---|---|---|---|---|
60 | 46 | 46 | 43 | 24 | 41 | 2 | 45 |

Count of OTU/ASV/ZOTU detected

EXPECTED | FROGS | DADA2_FROGS | USEARCH | DADA2-se | DADA2-pe | QIIME-se | QIIME-pe |
---|---|---|---|---|---|---|---|
60 | 53 | 53 | 49 | 54 | 52 | 54 | 54 |

Count of OTU/ASV/ZOTU perfectly reconstructed

EXPECTED | FROGS | DADA2_FROGS | USEARCH | DADA2-se | DADA2-pe | QIIME-se | QIIME-pe |
---|---|---|---|---|---|---|---|
60 | 48 | 49 | 44 | 25 | 44 | 2 | 48 |

Count of OTU/ASV/ZOTU detected

EXPECTED | FROGS | DADA2_FROGS | USEARCH | DADA2-se | DADA2-pe | QIIME-se | QIIME-pe |
---|---|---|---|---|---|---|---|
60 | 52 | 53 | 51 | 54 | 54 | 53 | 53 |

Count of OTU/ASV/ZOTU perfectly reconstructed

EXPECTED | FROGS | DADA2_FROGS | USEARCH | DADA2-se | DADA2-pe | QIIME-se | QIIME-pe |
---|---|---|---|---|---|---|---|
60 | 49 | 49 | 48 | 4 | 49 | 0 | 49 |

Count of OTU/ASV/ZOTU detected

EXPECTED | FROGS | DADA2_FROGS | USEARCH | DADA2-se | DADA2-pe | QIIME-se | QIIME-pe |
---|---|---|---|---|---|---|---|
60 | 52 | 53 | 51 | 54 | 54 | 54 | 54 |

Count of OTU/ASV/ZOTU perfectly reconstructed

EXPECTED | FROGS | DADA2_FROGS | USEARCH | DADA2-se | DADA2-pe | QIIME-se | QIIME-pe |
---|---|---|---|---|---|---|---|
60 | 49 | 49 | 48 | 4 | 49 | 0 | 49 |

Count of OTU/ASV/ZOTU detected

EXPECTED | FROGS | DADA2_FROGS | USEARCH | DADA2-se | DADA2-pe | QIIME-se | QIIME-pe |
---|---|---|---|---|---|---|---|
60 | 51 | 51 | 47 | 51 | 4 | 49 | 6 |

Count of OTU/ASV/ZOTU perfectly reconstructed

EXPECTED | FROGS | DADA2_FROGS | USEARCH | DADA2-se | DADA2-pe | QIIME-se | QIIME-pe |
---|---|---|---|---|---|---|---|
60 | 41 | 41 | 0 | 0 | 0 | 0 | 0 |

Count of OTU/ASV/ZOTU detected

EXPECTED | FROGS | DADA2_FROGS | USEARCH | DADA2-se | DADA2-pe | QIIME-se | QIIME-pe |
---|---|---|---|---|---|---|---|
60 | 51 | 51 | 46 | 50 | 5 | 49 | 9 |

Count of OTU/ASV/ZOTU perfectly reconstructed

EXPECTED | FROGS | DADA2_FROGS | USEARCH | DADA2-se | DADA2-pe | QIIME-se | QIIME-pe |
---|---|---|---|---|---|---|---|
60 | 41 | 41 | 0 | 0 | 0 | 0 | 0 |

Count of OTU/ASV/ZOTU detected

EXPECTED | FROGS | DADA2_FROGS | USEARCH | DADA2-se | DADA2-pe | QIIME-se | QIIME-pe |
---|---|---|---|---|---|---|---|
60 | 36 | 38 | 0 | 42 | 1 | 45 | 8 |

Count of OTU/ASV/ZOTU perfectly reconstructed

EXPECTED | FROGS | DADA2_FROGS | USEARCH | DADA2-se | DADA2-pe | QIIME-se | QIIME-pe |
---|---|---|---|---|---|---|---|
60 | 23 | 24 | 0 | 0 | 0 | 0 | 0 |

Count of OTU/ASV/ZOTU detected

EXPECTED | FROGS | DADA2_FROGS | USEARCH | DADA2-se | DADA2-pe | QIIME-se | QIIME-pe |
---|---|---|---|---|---|---|---|
60 | 30 | 31 | 0 | 38 | 3 | 39 | 10 |

Count of OTU/ASV/ZOTU perfectly reconstructed

EXPECTED | FROGS | DADA2_FROGS | USEARCH | DADA2-se | DADA2-pe | QIIME-se | QIIME-pe |
---|---|---|---|---|---|---|---|
60 | 19 | 19 | 0 | 0 | 0 | 0 | 0 |
for(tool in tools){
cat(" \n####",tool, " \n\n")
htmp_list <- c()
htmp_list_presence <- c()
min <- 0
max <- 0
for(its in amplicons){
for(type in types){
path <- file.path(sp,its,type)
mat_file <- file.path(path,paste0(sp,"_",its,"_",type,"_abundmat.rds"))
mat_presence_file <- file.path(path,paste0(sp,"_",its,"_",type,"_presencemat.rds"))
mat <- readRDS(mat_file)
mat_presence <- readRDS(mat_presence_file)
submat <- mat %>% as_tibble() %>% select(tool) %>% as.matrix()
submat_presence <- mat_presence %>% as_tibble() %>% select(tool) %>% as.matrix()
rownames(submat) <- rownames(mat)
rownames(submat_presence) <- rownames(mat_presence)
submat[submat==0] <- NA
#submat_presence[submat_presence==0] <- NA
max <- max(max,max(submat,na.rm = TRUE))
colnames(submat) <- paste0(its," ",type)
colnames(submat_presence) <- paste0(its," ",type)
ht <- build_heatmap_one_tool(submat, min, max)
ht_presence <- build_heatmap_one_tool_presence(submat_presence, 0, 1)
htmp_list <- c(htmp_list,ht)
htmp_list_presence <- c(htmp_list_presence,ht_presence)
}
}
# l starts as NULL (c() with no arguments); ComplexHeatmap's "+" then appends each heatmap side by side
l <- c()
for(i in htmp_list){
l <- l + i
}
l_presence <- c()
for(i in htmp_list_presence){
l_presence <- l_presence + i
}
draw(l, row_title = "Expected species",
column_title_gp = gpar(fontsize = 16), ht_gap = unit(0.1, "cm"), annotation_legend_list = list(
Legend(at = 1, title = "", labels=c("Absent"), legend_gp = gpar(fill=absent_color))
))
cat(" \n")
# draw(l_presence, row_title = "Expected species",
# column_title_gp = gpar(fontsize = 16), ht_gap = unit(0.1, "cm"), annotation_legend_list = list(
# Legend(at = 1, title = "", labels=c("Absent"), legend_gp = gpar(fill=absent_color)),
# Legend(at = 2, title = "", labels=c("Not well reconstructed"), legend_gp = gpar(fill="#b1c5c4")),
# Legend(at = 3, title = "", labels=c("Perfect"), legend_gp = gpar(fill="#86a5a4"))
# )
# )
# cat(" \n")
}