All data used to produce this document are available in this repository: https://forgemia.inra.fr/migale/metabarfood/. The code used is available in the code chunks.

Introduction

This report gives information about the sequences of the mock communities for the METABARFOOD project.

Description

# Load the reference table and keep only the rows flagged TRUE for at least
# one of the MEAT, BREAD, CHEESE or WINE columns.
ggd <- read.delim(
  "REFERENCES/118sp.tsv",
  sep = "\t",
  header = TRUE,
  stringsAsFactors = FALSE
) %>%
  filter(MEAT == TRUE | BREAD == TRUE | CHEESE == TRUE | WINE == TRUE)

# Reshape to one row per (strain, marker) pair, add the sequence length and
# rename the D1D2 marker to its display name "D1/D2".
ggd2 <- ggd %>%
  select(
    ITS1, ITS2, D1D2, RPB2,
    Domain, Phylum, Class, Order, Family, Genus, Species, Strain
  ) %>%
  pivot_longer(
    c(ITS1, ITS2, D1D2, RPB2),
    names_to = "Marker",
    values_to = "Sequence"
  ) %>%
  mutate(Length = nchar(Sequence)) %>%
  # Drop entries of at most 2 characters (presumably placeholders for
  # missing sequences -- TODO confirm against the input file).
  filter(Length > 2) %>%
  mutate(Marker = replace(Marker, Marker == "D1D2", "D1/D2"))

A total of 469 sequences were used in mock communities. Their id, taxonomic affiliation, sequence and length are available in the following table.

3 RPB2 sequences could not be identified (PCR amplification failure) and are missing.
# Interactive table of the sequences, using the Buttons extension with
# Excel and CSV buttons.
datatable(
  ggd2,
  extensions = "Buttons",
  options = list(
    dom = "Bfrtip",
    buttons = c("excel", "csv")
  )
)

Diversity

Genus and species diversity in the mock communities is represented in the following figure. Each point corresponds to a genus, and the number of species per genus is indicated on the vertical axis. For example, in Meat, one genus is represented by 19 species in the mock community (upper point) and 8 genera are represented by a single species.

# Per-ecosystem composition table, read as tab-separated and converted to a
# tibble.
b <- as_tibble(
  read.csv2("REFERENCES/references_compositions_species.tsv", sep = "\t")
)

# Violin plot of the Species column per ecosystem, overlaid with a centred
# dot plot; legends are hidden because the x axis already names the ecosystems.
ggplot(b, aes(x = Ecosystem, y = Species, color = Ecosystem)) +
  geom_violin(show.legend = FALSE) +
  geom_dotplot(
    binaxis = "y",
    stackdir = "center",
    dotsize = 1,
    show.legend = FALSE
  ) +
  ylab("Nb Species by Genus")
# Optional title, currently disabled:
# + ggtitle("Microdiversity for each ecosytem")

Length distribution

The sequence lengths and their distribution are plotted below for each marker.

# Box plot of sequence length per marker, with the median drawn as a point.
p <- ggplot(ggd2, aes(x = Marker, y = Length, color = Marker)) +
  geom_boxplot() +
  stat_summary(
    mapping = aes(color = Marker),
    fun = "median",
    geom = "point"
  ) +
  theme(
    plot.title = element_text(hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5),
    legend.position = "right"
  )
p

# Overlaid histograms of sequence length, one per marker.
#
# The legend order is set with `breaks`, which matches on the Marker values.
# Overriding `labels` positionally would mislabel the legend, because the
# character Marker column is ordered alphabetically (D1/D2, ITS1, ITS2, RPB2).
p <- ggplot(ggd2, aes(x = Length, fill = Marker, color = Marker)) +
  geom_histogram(position = "identity", alpha = 0.5, bins = 100) +
  scale_fill_discrete(breaks = c("ITS1", "ITS2", "D1/D2", "RPB2")) +
  scale_x_continuous(breaks = seq(50, 2000, 50)) +
  theme(legend.position = "right") +
  # Hide the colour legend; "none" replaces the deprecated `FALSE`.
  guides(color = "none")
p

Representativity in public databanks

The 468 sequences were affiliated against public databanks:

  • UNITE 9.0 for ITS1 and ITS2
  • SILVA 28S v138 for D1/D2
  • nt (nt_2021-07-30) for RPB2

The best hit was kept and % of coverage and % of identity were plotted for each marker, as described in the following command lines.

Results are available for each marker in the tables below.

# Move to the working directory used by the commands below.
cd /projet/metabarfood/work/FINAL_REFS

# Activate the conda environment named frogs-3.2.3.
conda activate frogs-3.2.3

### ITS1 ###
# Build the ITS1 FASTA from 118_final.tsv. Lines containing "Domain", comment
# lines and rows whose 13th column is "none" are skipped. Each FASTA header is
# column 8, a tab, then columns 1-7 joined with ";"; the sequence is column 13.
perl -F"\t" -lane 'if((!/^#/) && ($F[12] ne "none")){print ">$F[7]\t$F[0];$F[1];$F[2];$F[3];$F[4];$F[5];$F[6]\n$F[12]"}' <(grep -v "Domain" 118_final.tsv) > ALL_ITS1.fasta
# Build the abundance table, one row per sequence with a count of 1. Spaces are
# swapped for "#" while the helper scripts run, then restored.
# Written with ">" rather than ">>" so that re-running this step does not
# append a second header and duplicate rows to an existing file.
cat <(echo -e "#blast_taxonomy\tblast_subject\tobservation_name\tobservation_sum\tSample1") <(sed "s/ /#/g" ALL_ITS1.fasta | perl fasta2tab.pl - | perl -lane 'print "$F[1]\t$F[0]\t$F[0]\t1\t1"' | sed "s/#/ /g") > ALL_ITS1.tsv
tsv_to_biom.py --input-tsv ALL_ITS1.tsv --output-biom ALL_ITS1.biom
# Affiliation against the UNITE 8.3 reference, then against the UNITE 9.0 reference.
affiliation_OTU.py --input-fasta ALL_ITS1.fasta --input-biom ALL_ITS1.biom --nb-cpus 16 --log-file affiliation.log --output-biom affiliation_ITS1.biom --summary affiliation_ITS1.html --reference /db/frogs_databanks/assignation/Unite_Fungi_8.3_20210510/Unite_Fungi_8.3_20210510.fasta
affiliation_OTU.py --input-fasta ALL_ITS1.fasta --input-biom ALL_ITS1.biom --nb-cpus 16 --log-file affiliation.log --output-biom affiliation_ITS1_U9.biom --summary affiliation_ITS1_U9.html --reference ~/work/PROJECTS/METABARFOOD/Unite_Fungi_9.0_20221016/Unite_Fungi_9.0_20221016.fasta
# Export both affiliation results back to TSV.
biom_to_tsv.py --input-biom affiliation_ITS1.biom --output-tsv affiliation_ITS1.tsv --input-fasta ALL_ITS1.fasta
biom_to_tsv.py --input-biom affiliation_ITS1_U9.biom --output-tsv affiliation_ITS1_U9.tsv --input-fasta ALL_ITS1.fasta
# Put the affiliation table (sorted on column 8) side by side with the input
# table (sorted on column 2) and keep columns 1-7 and 11. Spaces are replaced
# by "#" during the sort, presumably so that they are not treated as field
# separators.
paste <(sort -k8,8 <(sed "s/ /#/g" <(grep -v "^#" affiliation_ITS1.tsv))) <(sort -k2,2 <(sed "s/ /#/g" <(grep -v "^#" ALL_ITS1.tsv))) | sed "s/#/ /g" |cut -f 1-7,11 > infos_ITS1.tsv
paste <(sort -k8,8 <(sed "s/ /#/g" <(grep -v "^#" affiliation_ITS1_U9.tsv))) <(sort -k2,2 <(sed "s/ /#/g" <(grep -v "^#" ALL_ITS1.tsv))) | sed "s/#/ /g" |cut -f 1-7,11 > infos_ITS1_U9.tsv
# Rewrite the rank prefixes of the UNITE 9.0 result (d: -> k__, p: -> p__, ...).
sed -i -e 's/d:/k__/g' -e 's/p:/p__/g' -e 's/c:/c__/g' -e 's/o:/o__/g' -e 's/f:/f__/g' -e 's/g:/g__/g' -e 's/s:/s__/g' infos_ITS1_U9.tsv
# Affiliation statistics reports for both references.
affiliations_stat.py --input-biom affiliation_ITS1.biom --output-file affiliations_stats_ITS1.html --log-file affiliations_stats.log --multiple-tag blast_affiliations --tax-consensus-tag blast_taxonomy --identity-tag perc_identity --coverage-tag perc_query_coverage
affiliations_stat.py --input-biom affiliation_ITS1_U9.biom --output-file affiliations_stats_ITS1_U9.html --log-file affiliations_stats_U9.log --multiple-tag blast_affiliations --tax-consensus-tag blast_taxonomy --identity-tag perc_identity --coverage-tag perc_query_coverage

### ITS2 ###
# Build the ITS2 FASTA from 118_final.tsv. Lines containing "Domain", comment
# lines and rows whose 14th column is "none" are skipped. Each FASTA header is
# column 8, a tab, then columns 1-7 joined with ";"; the sequence is column 14.
perl -F"\t" -lane 'if((!/^#/) && ($F[13] ne "none")){print ">$F[7]\t$F[0];$F[1];$F[2];$F[3];$F[4];$F[5];$F[6]\n$F[13]"}' <(grep -v "Domain" 118_final.tsv) > ALL_ITS2.fasta
# Build the abundance table, one row per sequence with a count of 1.
# Written with ">" rather than ">>" so that re-running this step does not
# append a second header and duplicate rows to an existing file.
cat <(echo -e "#blast_taxonomy\tblast_subject\tobservation_name\tobservation_sum\tSample1") <(sed "s/ /#/g" ALL_ITS2.fasta | perl fasta2tab.pl - | perl -lane 'print "$F[1]\t$F[0]\t$F[0]\t1\t1"' | sed "s/#/ /g") > ALL_ITS2.tsv
tsv_to_biom.py --input-tsv ALL_ITS2.tsv --output-biom ALL_ITS2.biom
# Affiliation against the UNITE 8.3 reference, then against the UNITE 9.0 reference.
affiliation_OTU.py --input-fasta ALL_ITS2.fasta --input-biom ALL_ITS2.biom --nb-cpus 16 --log-file affiliation.log --output-biom affiliation_ITS2.biom --summary affiliation_ITS2.html --reference /db/frogs_databanks/assignation/Unite_Fungi_8.3_20210510/Unite_Fungi_8.3_20210510.fasta
# The UNITE 9.0 run must read the ITS2 inputs and write affiliation_ITS2_U9.*:
# the steps below read affiliation_ITS2_U9.biom, and reusing the ITS1 file
# names here would overwrite the ITS1 results instead.
affiliation_OTU.py --input-fasta ALL_ITS2.fasta --input-biom ALL_ITS2.biom --nb-cpus 16 --log-file affiliation.log --output-biom affiliation_ITS2_U9.biom --summary affiliation_ITS2_U9.html --reference ~/work/PROJECTS/METABARFOOD/Unite_Fungi_9.0_20221016/Unite_Fungi_9.0_20221016.fasta
# Export both affiliation results back to TSV.
biom_to_tsv.py --input-biom affiliation_ITS2.biom --output-tsv affiliation_ITS2.tsv --input-fasta ALL_ITS2.fasta
biom_to_tsv.py --input-biom affiliation_ITS2_U9.biom --output-tsv affiliation_ITS2_U9.tsv --input-fasta ALL_ITS2.fasta
# Put the affiliation table (sorted on column 8) side by side with the input
# table (sorted on column 2) and keep columns 1-7 and 11.
paste <(sort -k8,8 <(sed "s/ /#/g" <(grep -v "^#" affiliation_ITS2.tsv))) <(sort -k2,2 <(sed "s/ /#/g" <(grep -v "^#" ALL_ITS2.tsv))) | sed "s/#/ /g" |cut -f 1-7,11 > infos_ITS2.tsv
paste <(sort -k8,8 <(sed "s/ /#/g" <(grep -v "^#" affiliation_ITS2_U9.tsv))) <(sort -k2,2 <(sed "s/ /#/g" <(grep -v "^#" ALL_ITS2.tsv))) | sed "s/#/ /g" |cut -f 1-7,11 > infos_ITS2_U9.tsv
# Rewrite the rank prefixes of the UNITE 9.0 result (d: -> k__, p: -> p__, ...).
sed -i -e 's/d:/k__/g' -e 's/p:/p__/g' -e 's/c:/c__/g' -e 's/o:/o__/g' -e 's/f:/f__/g' -e 's/g:/g__/g' -e 's/s:/s__/g' infos_ITS2_U9.tsv
# Affiliation statistics reports for both references.
affiliations_stat.py --input-biom affiliation_ITS2.biom --output-file affiliations_stats_ITS2.html --log-file affiliations_stats.log --multiple-tag blast_affiliations --tax-consensus-tag blast_taxonomy --identity-tag perc_identity --coverage-tag perc_query_coverage
affiliations_stat.py --input-biom affiliation_ITS2_U9.biom --output-file affiliations_stats_ITS2_U9.html --log-file affiliations_stats_U9.log --multiple-tag blast_affiliations --tax-consensus-tag blast_taxonomy --identity-tag perc_identity --coverage-tag perc_query_coverage

### D1/D2 ###
# Build the D1/D2 FASTA from 118_final.tsv. Lines containing "Domain", comment
# lines and rows whose 15th column is "none" are skipped. Each FASTA header is
# column 8, a tab, then columns 1-7 joined with ";"; the sequence is column 15.
perl -F"\t" -lane 'if((!/^#/) && ($F[14] ne "none")){print ">$F[7]\t$F[0];$F[1];$F[2];$F[3];$F[4];$F[5];$F[6]\n$F[14]"}' <(grep -v "Domain" 118_final.tsv) > ALL_D1D2.fasta
# Each sed command that follows replaces one complete sequence in
# ALL_D1D2.fasta with an edited copy of it.
# NOTE(review): the edits appear to turn ambiguous N bases into A -- confirm.
sed -i "s/GCATATCAATAAGCGGAGGAAAAGAAACCAACAGGGATTGCCTCAGTAACGGCGAGTGAAGCGGCAAAAGCTCAAATTTGAAATCTGAGGCTCTCAGCCCCCGAGTTGTAATTTGAAGATGGTGTTCTGGTGCCGGCCCCCTGTCTACGTTCCTTGGAACAGGACATCACAGAGGGTGAGAATCCCGTCTGGCGGGGCGGCCTGGCTCCGTGTAGAGCGCCATCGACGAGTCGAGTTGTTTGGGAATGCAGCTCTAAGTGGGTGGTAAATTCCATCTAAAGCTAAATATTGGCGAGAGACCGATAGCGAACAAGTACAGTGATGGAAAGATGAAAAGAACTTTGAAAAGAGAGTGAAAAAGTACGTGAAATTGTTGAAAGGGAAGGGTATTGGATCAGACTTGGTGCTGTGCGAATAGCGGCNCTTCTTGGGCTGCCCACTCGCACTCCACCGGGCCAGCATCGGTTTGGGCGGCAAGACAATGGCGGGGGAACGTGGCACTGCTCTCGGGCAGTGTGTTTATAGCCCCCGCTGATGTTGCCTGCCTAGACCGAGGACTGCGGCTTCTGCCTAGGATGCTGGCGTAATGATCCAACACCGCCCGTCTTGAAACACGGACC/GCATATCAATAAGCGGAGGAAAAGAAACCAACAGGGATTGCCTCAGTAACGGCGAGTGAAGCGGCAAAAGCTCAAATTTGAAATCTGAGGCTCTCAGCCCCCGAGTTGTAATTTGAAGATGGTGTTCTGGTGCCGGCCCCCTGTCTACGTTCCTTGGAACAGGACATCACAGAGGGTGAGAATCCCGTCTGGCGGGGCGGCCTGGCTCCGTGTAGAGCGCCATCGACGAGTCGAGTTGTTTGGGAATGCAGCTCTAAGTGGGTGGTAAATTCCATCTAAAGCTAAATATTGGCGAGAGACCGATAGCGAACAAGTACAGTGATGGAAAGATGAAAAGAACTTTGAAAAGAGAGTGAAAAAGTACGTGAAATTGTTGAAAGGGAAGGGTATTGGATCAGACTTGGTGCTGTGCGAATAGCGGCACTTCTTGGGCTGCCCACTCGCACTCCACCGGGCCAGCATCGGTTTGGGCGGCAAGACAATGGCGGGGGAACGTGGCACTGCTCTCGGGCAGTGTGTTTATAGCCCCCGCTGATGTTGCCTGCCTAGACCGAGGACTGCGGCTTCTGCCTAGGATGCTGGCGTAATGATCCAACACCGCCCGTCTTGAAACACGGACC/" ALL_D1D2.fasta
sed -i "s/GCATATCAATAAGCGGAGGAAAAGAAACCAACAGGGATTGCCTCAGTAGCGGCGAGTGAAGCGGCAAGAGCTCAGATTTGAAATCTCACCTAGTGTGCGAGTTGTAAATTGCAGGTTGGAGTCTCGGGTTAGACGTGTGTGCAAGTCCCTTGGAACAGGGTGCCACTGAGGGTGAGAGCCCCGTANCGTGCATGTCGACACCTGTGAGGCCCTTCTGACGAGTCGAGTTGTTTGGGAATGCAGCTCTAAGTGGGTGGTAAATTCCATCTAAGGCTAAATATTGGCGAGAGACCGATAGCGAACAAGTACTGTGAAGGAAAGATGAAAAGCACTTTGAAAAGAGAGTGAAACAGCACGTGAAATTGTTGAAAGGGAAGGGTATTGGGCTCGACATGGGATTTACGCATCGTTGCCTCTCGTGGGCGGCGCTCTGGGTTTTTCCTGGGCCAGCATCGGTTTTCGTTGCAGGATAAGGACAATTGGAATGTGGCTCCTCGGAGTGTTATAGCCTTTTGTAGATGCTGCGTATGGGGACCGAGGGCTGCGGCGGACTCGTTTCGTCTCGGATGCTGGCACAACGGCGCAATACCGCCCGTCTTGAAACACGGACC/GCATATCAATAAGCGGAGGAAAAGAAACCAACAGGGATTGCCTCAGTAGCGGCGAGTGAAGCGGCAAGAGCTCAGATTTGAAATCTCACCTAGTGTGCGAGTTGTAAATTGCAGGTTGGAGTCTCGGGTTAGACGTGTGTGCAAGTCCCTTGGAACAGGGTGCCACTGAGGGTGAGAGCCCCGTAACGTGCATGTCGACACCTGTGAGGCCCTTCTGACGAGTCGAGTTGTTTGGGAATGCAGCTCTAAGTGGGTGGTAAATTCCATCTAAGGCTAAATATTGGCGAGAGACCGATAGCGAACAAGTACTGTGAAGGAAAGATGAAAAGCACTTTGAAAAGAGAGTGAAACAGCACGTGAAATTGTTGAAAGGGAAGGGTATTGGGCTCGACATGGGATTTACGCATCGTTGCCTCTCGTGGGCGGCGCTCTGGGTTTTTCCTGGGCCAGCATCGGTTTTCGTTGCAGGATAAGGACAATTGGAATGTGGCTCCTCGGAGTGTTATAGCCTTTTGTAGATGCTGCGTATGGGGACCGAGGGCTGCGGCGGACTCGTTTCGTCTCGGATGCTGGCACAACGGCGCAATACCGCCCGTCTTGAAACACGGACC/" ALL_D1D2.fasta
sed -i "s/GCATATCAATAAGCGGAGGAAAAGNAACCAACAGGGATTGCCTTAGTAACGGCGAGTGAAGCGGCAAAAGCTCAAATTTGAAATCTGGCACCTTCGGTGTCCGAGTTGTAATTTGTAGAAGGTAACTTTGGAATTGGCTCTTGTCTATGTTCCTTGGAACAGGACGTCACAGAGGGTGAGAATCCCGTGCGATGAGGTGCCCAATTCTATGTAAAGTGCTTTCGAAGAGTCGAGTTGTTTGGGAATGCAGCTCTAAGTGGGTGGTAAATTCCATCTAAAGCTAAATATTGGCGAGAGACCGATAGCGAACAAGTACAGTGATGGAAAGATGAAAAGAACTTTGAAAAGAGAGTGAAAAAGTACGTGAAATTGTTGAAAGGGAAGGGCTTGAGATCAGACTTGGTATTTTGCGTTCCTTCCCCTCGTGGGTTGGTTCCTCGCAGCTTACCGGGCCAGCATCGGTTTGGATGGTAGGATAATGACATTGGAATGTGACACCGCTTCGGTGGTGTGTTATAGCCTTTGTTGATACTGCCTGTCTAGACCGAGGACTGCGTCTTTGACTAGGATGTTGGCATAATGATCTTAAGTCGCCCGTCTTGAAACACGGACC/GCATATCAATAAGCGGAGGAAAAGAAACCAACAGGGATTGCCTTAGTAACGGCGAGTGAAGCGGCAAAAGCTCAAATTTGAAATCTGGCACCTTCGGTGTCCGAGTTGTAATTTGTAGAAGGTAACTTTGGAATTGGCTCTTGTCTATGTTCCTTGGAACAGGACGTCACAGAGGGTGAGAATCCCGTGCGATGAGGTGCCCAATTCTATGTAAAGTGCTTTCGAAGAGTCGAGTTGTTTGGGAATGCAGCTCTAAGTGGGTGGTAAATTCCATCTAAAGCTAAATATTGGCGAGAGACCGATAGCGAACAAGTACAGTGATGGAAAGATGAAAAGAACTTTGAAAAGAGAGTGAAAAAGTACGTGAAATTGTTGAAAGGGAAGGGCTTGAGATCAGACTTGGTATTTTGCGTTCCTTCCCCTCGTGGGTTGGTTCCTCGCAGCTTACCGGGCCAGCATCGGTTTGGATGGTAGGATAATGACATTGGAATGTGACACCGCTTCGGTGGTGTGTTATAGCCTTTGTTGATACTGCCTGTCTAGACCGAGGACTGCGTCTTTGACTAGGATGTTGGCATAATGATCTTAAGTCGCCCGTCTTGAAACACGGACC/" ALL_D1D2.fasta
sed -i "s/GCATATCAATAAGCGGAGGAAAAGAAACCAACAGGGATTGCCTTAGTAGCGGCGAGTGAAGCGGCAAAAGCTCAAATTTGAAATCTGGCGCCTTCGGTGTCCGAGTTGTAATTTGAAGATTGTAACCTTGGGGTTGGCTCTTGTCTATGTTTCTTGGAACAGGACGTCACAGAGGGTGAGAATCCCGTGCGATGAGATGCCCAATTCTATGTAAGGTGCTTTCGAAGAGTCGAGTTGTTTGGGAATGCAGCTCTAAGTGGGTGGTAAATTCCATCTAAAGCTAAATATTGGCGAGAGACCGATAGCGAACAAGTACAGTGATGGAAAGATGAAAAGAACTTTGAAAAGAGAGTGAAAAAGTACGTGAAATTGTTGAAAGGGAAGGGTTTGAGATCAGACTCGATATTTTGTGAGCCTTGCCTTCGTGGCGGGGTGACCCGCAGCTTATCGGGCCAGCATCGGTTTGGNCGGTAGGATAATGGCGTAGGAATGTGACTTTACTTCGGTGAAGTGTTATAGCCTGCGTTGATGCTGCCTGCCTAGACCGAGGACTGCGATTTTATCAAGGATGCTGGCATAATGATCCCAAACCGCCCGTCTTGAAACACGGACC/GCATATCAATAAGCGGAGGAAAAGAAACCAACAGGGATTGCCTTAGTAGCGGCGAGTGAAGCGGCAAAAGCTCAAATTTGAAATCTGGCGCCTTCGGTGTCCGAGTTGTAATTTGAAGATTGTAACCTTGGGGTTGGCTCTTGTCTATGTTTCTTGGAACAGGACGTCACAGAGGGTGAGAATCCCGTGCGATGAGATGCCCAATTCTATGTAAGGTGCTTTCGAAGAGTCGAGTTGTTTGGGAATGCAGCTCTAAGTGGGTGGTAAATTCCATCTAAAGCTAAATATTGGCGAGAGACCGATAGCGAACAAGTACAGTGATGGAAAGATGAAAAGAACTTTGAAAAGAGAGTGAAAAAGTACGTGAAATTGTTGAAAGGGAAGGGTTTGAGATCAGACTCGATATTTTGTGAGCCTTGCCTTCGTGGCGGGGTGACCCGCAGCTTATCGGGCCAGCATCGGTTTGGACGGTAGGATAATGGCGTAGGAATGTGACTTTACTTCGGTGAAGTGTTATAGCCTGCGTTGATGCTGCCTGCCTAGACCGAGGACTGCGATTTTATCAAGGATGCTGGCATAATGATCCCAAACCGCCCGTCTTGAAACACGGACC/" ALL_D1D2.fasta
sed -i "s/GCATATCAATAAGCGGAGGAAAAGAAACCAACCGGGATTGCCTCAGTAACGGCGAGTGAAGCGGCAAAAGCTCAAATTTGAAATCTGGCACTCTCAGTGTCCGAGTTGTAATTTGTAGAAGTAGTTTTGGGGCTGGTCCTTGTCTATGTTCCTTGGAACAGGACGTCATAGAGGGTGAGAATCCCGTGTGGCGAGGATCCCAGTTCTTTGTAAAACGCTTTCGAAGAGTCGAGTTGTTTGGGAATGCAGCTCTAAGTGGGTGGTAAATTCCATCTAAAGCTAAATATTGGCGAGAGACCGATAGCGAACAAGTACAGTGATGGAAAGATGAAAAGAACTTTGAAAAGAGAGTGAAAAAGTACGTGAAATTGTTGAAAGGGAAGGGCATTTGATCAGACATGGTGTTTGGACGCCCTGCGCTCCTTGTGGGTGCAGGACACGCTTTTCACTGGGCCAACATCGGTTTTGGCAGCAGGATAAATCGTTAGGAACGTAGCTGCCCTCGGGTAGTGTTACAGCCTGGCGGAATACTGCTAGCCGGGACTGANGACTGCGTCTTTTGACAAGGATGTTGGCATAATGGTTAAATGCCGCCCGTCTTGAAACACGGACC/GCATATCAATAAGCGGAGGAAAAGAAACCAACCGGGATTGCCTCAGTAACGGCGAGTGAAGCGGCAAAAGCTCAAATTTGAAATCTGGCACTCTCAGTGTCCGAGTTGTAATTTGTAGAAGTAGTTTTGGGGCTGGTCCTTGTCTATGTTCCTTGGAACAGGACGTCATAGAGGGTGAGAATCCCGTGTGGCGAGGATCCCAGTTCTTTGTAAAACGCTTTCGAAGAGTCGAGTTGTTTGGGAATGCAGCTCTAAGTGGGTGGTAAATTCCATCTAAAGCTAAATATTGGCGAGAGACCGATAGCGAACAAGTACAGTGATGGAAAGATGAAAAGAACTTTGAAAAGAGAGTGAAAAAGTACGTGAAATTGTTGAAAGGGAAGGGCATTTGATCAGACATGGTGTTTGGACGCCCTGCGCTCCTTGTGGGTGCAGGACACGCTTTTCACTGGGCCAACATCGGTTTTGGCAGCAGGATAAATCGTTAGGAACGTAGCTGCCCTCGGGTAGTGTTACAGCCTGGCGGAATACTGCTAGCCGGGACTGAAGACTGCGTCTTTTGACAAGGATGTTGGCATAATGGTTAAATGCCGCCCGTCTTGAAACACGGACC/" ALL_D1D2.fasta
sed -i "s/GCATATCAATAAGCGGAGGAAAAGNAACCAACCGGGATTGCCTTAGTAACGGCGAGTGAAGCGGCAAAAGCTCAAATTTGAAATCTGGTACCTTCGGTGCCCGAGTTGTAATTTGTAGAGGGCGACTTTGGGGCGGCTCCTTGTCTATGTTCCTTGGAACAGGACGTCATAGAGGGTGAGAATCCCGTGTGGCGAGGAGTGCGGTTCCGTGTAAAGCGCTCTCGAAGAGTCGAGTTGTTTGGGAATGCAGCTCTAAGTGGGTGGTAAATTCCATCTAAAGCTAAATATTGGCGAGAGACCGATAGCGAACAAGTACAGTGATGGAAAGATGAAAAGAACTTTGAAAAGAGAGTGAAAAAGTACGTGAAATTGTTGAAAGGGAAGGGCATTTGATCAGACATGGTGTTTTGTGCCCCTCGCTCCTTGTGGGTGGGGGAATCTCGCAGCTCACTGGGCCAGCATCAGTTTTGGCGGTCGGATAAAACCAGGGGAACGTAGCTTGCTTCAGGAAGTATTATAGCCTCTGGGAATACGGCCAGCCGGGACTGAGGAACGCGATTCGTCAAGGATGCTGGCATAATGGTTATATGCCGCCCGTCTTGAAACACGGACC/GCATATCAATAAGCGGAGGAAAAGAAACCAACCGGGATTGCCTTAGTAACGGCGAGTGAAGCGGCAAAAGCTCAAATTTGAAATCTGGTACCTTCGGTGCCCGAGTTGTAATTTGTAGAGGGCGACTTTGGGGCGGCTCCTTGTCTATGTTCCTTGGAACAGGACGTCATAGAGGGTGAGAATCCCGTGTGGCGAGGAGTGCGGTTCCGTGTAAAGCGCTCTCGAAGAGTCGAGTTGTTTGGGAATGCAGCTCTAAGTGGGTGGTAAATTCCATCTAAAGCTAAATATTGGCGAGAGACCGATAGCGAACAAGTACAGTGATGGAAAGATGAAAAGAACTTTGAAAAGAGAGTGAAAAAGTACGTGAAATTGTTGAAAGGGAAGGGCATTTGATCAGACATGGTGTTTTGTGCCCCTCGCTCCTTGTGGGTGGGGGAATCTCGCAGCTCACTGGGCCAGCATCAGTTTTGGCGGTCGGATAAAACCAGGGGAACGTAGCTTGCTTCAGGAAGTATTATAGCCTCTGGGAATACGGCCAGCCGGGACTGAGGAACGCGATTCGTCAAGGATGCTGGCATAATGGTTATATGCCGCCCGTCTTGAAACACGGACC/" ALL_D1D2.fasta
sed -i "s/GCATATCAATAAGCGGAGGAAAAGAAACCAACAGGGATTGCCTCAGTAACGGCGAGTGAAGCGGCAAAAGCTCAAATTTGAAATCCTTCGGGAATTGTATTTTGAAGGTGGGTTTGGTTAGGAAAAGTTACTTTAAGTCCATTGGAAAATGGCGCCATGGAGGGTGATAGCCCCGTAAAAGTANCCCTTTTCCTTTTATCCATTCCCTCCAAAGAGTCGAGTTGTTTGGGAATGCAGCTCTAAGTGGGTGGTAAATTCCATCTAAAGCTAAATATTGGCGAGAGACCGATAGCGAACAAGTACAGTGATGGAAAGATGAAAAGCACTTTGAAAAGAGAGTGAAAAAGTACGTGAAATTGTTGAAAGGGAAGGGCTTGCAAGCAGACACAACCTCGGTTGGGCCAGCATCGGAGTGGGGGGAGACAAAAAAGAAAAGGAATGTAACTCTTTCGAGTATTATAGCCTTTTTCTCATATCTCCACCCCCTTCCGAGGCCTGCGATTCTTCAAGGATGCTGGCGTAATGGTTGCAAGTCGCCCGTCTTGAAACACGGACC/GCATATCAATAAGCGGAGGAAAAGAAACCAACAGGGATTGCCTCAGTAACGGCGAGTGAAGCGGCAAAAGCTCAAATTTGAAATCCTTCGGGAATTGTATTTTGAAGGTGGGTTTGGTTAGGAAAAGTTACTTTAAGTCCATTGGAAAATGGCGCCATGGAGGGTGATAGCCCCGTAAAAGTAACCCTTTTCCTTTTATCCATTCCCTCCAAAGAGTCGAGTTGTTTGGGAATGCAGCTCTAAGTGGGTGGTAAATTCCATCTAAAGCTAAATATTGGCGAGAGACCGATAGCGAACAAGTACAGTGATGGAAAGATGAAAAGCACTTTGAAAAGAGAGTGAAAAAGTACGTGAAATTGTTGAAAGGGAAGGGCTTGCAAGCAGACACAACCTCGGTTGGGCCAGCATCGGAGTGGGGGGAGACAAAAAAGAAAAGGAATGTAACTCTTTCGAGTATTATAGCCTTTTTCTCATATCTCCACCCCCTTCCGAGGCCTGCGATTCTTCAAGGATGCTGGCGTAATGGTTGCAAGTCGCCCGTCTTGAAACACGGACC/" ALL_D1D2.fasta
sed -i "s/GCATATCAATAAGCGGAGGAAAAGNAACCAACAGGGATTGCCTTAGTAACGGCGAGTGAAGCGGCAAAAGCTCAAATTTGAAATCTGGCACCTTCGGTGTCCGAGTTGTAATTTGAAGAAGGTAACTTTGGAGTTGGCTCTTGTCTATGTTCCTTGGAACAGGACGTCACAGAGGGTGAGAATCCCGTGCGATGAGATGCCCAATTCTATGTAAAGTGCTTTCGAAGAGTCGAGTTGTTTGGGAATGCAGCTCTAAGTGGGTGGTAAATTCCATCTAAAGCTAAATATTGGCGAGAGACCGATAGCGAACAAGTACAGTGATGGAAAGATGAAAAGAACTTTGAAAAGAGAGTGAAAAAGTACGTGAAATTGTTGAAAGGGAAGGGCTTGAGATCAGACTTGGTATTTTGCGATCCTTTCCTTCTTGGTTGGGTTCCTCGCAGCTTACTGGGCCAGCATCGGTTTGGATGGTAGGATAATGACTAAGGAATGTGGCTCTACTTCGGTGGAGTGTTATAGCCTTGGTTGATACTGCCTGTCTAGACCGAGGACTGCGTCTTTTGACTAGGATGTTGGCATAATGATCTTAAGCCACCCGTCTTGAAACACGGACC/GCATATCAATAAGCGGAGGAAAAGAAACCAACAGGGATTGCCTTAGTAACGGCGAGTGAAGCGGCAAAAGCTCAAATTTGAAATCTGGCACCTTCGGTGTCCGAGTTGTAATTTGAAGAAGGTAACTTTGGAGTTGGCTCTTGTCTATGTTCCTTGGAACAGGACGTCACAGAGGGTGAGAATCCCGTGCGATGAGATGCCCAATTCTATGTAAAGTGCTTTCGAAGAGTCGAGTTGTTTGGGAATGCAGCTCTAAGTGGGTGGTAAATTCCATCTAAAGCTAAATATTGGCGAGAGACCGATAGCGAACAAGTACAGTGATGGAAAGATGAAAAGAACTTTGAAAAGAGAGTGAAAAAGTACGTGAAATTGTTGAAAGGGAAGGGCTTGAGATCAGACTTGGTATTTTGCGATCCTTTCCTTCTTGGTTGGGTTCCTCGCAGCTTACTGGGCCAGCATCGGTTTGGATGGTAGGATAATGACTAAGGAATGTGGCTCTACTTCGGTGGAGTGTTATAGCCTTGGTTGATACTGCCTGTCTAGACCGAGGACTGCGTCTTTTGACTAGGATGTTGGCATAATGATCTTAAGCCACCCGTCTTGAAACACGGACC/" ALL_D1D2.fasta

# Build the abundance table, one row per sequence with a count of 1.
# Written with ">" rather than ">>" so that re-running this step does not
# append a second header and duplicate rows to an existing file.
cat <(echo -e "#blast_taxonomy\tblast_subject\tobservation_name\tobservation_sum\tSample1") <(sed "s/ /#/g" ALL_D1D2.fasta | perl fasta2tab.pl - | perl -lane 'print "$F[1]\t$F[0]\t$F[0]\t1\t1"' | sed "s/#/ /g") > ALL_D1D2.tsv
tsv_to_biom.py --input-tsv ALL_D1D2.tsv --output-biom ALL_D1D2.biom
# Affiliation against the local D1D2.fasta reference.
affiliation_OTU.py --input-fasta ALL_D1D2.fasta --input-biom ALL_D1D2.biom --nb-cpus 16 --log-file affiliation.log --output-biom affiliation_D1D2.biom --summary affiliation_D1D2.html --reference D1D2.fasta
biom_to_tsv.py --input-biom affiliation_D1D2.biom --output-tsv affiliation_D1D2.tsv --input-fasta ALL_D1D2.fasta
# Put the affiliation table (sorted on column 8) side by side with the input
# table (sorted on column 2) and keep columns 1-7 and 11.
paste <(sort -k8,8 <(sed "s/ /#/g" <(grep -v "^#" affiliation_D1D2.tsv))) <(sort -k2,2 <(sed "s/ /#/g" <(grep -v "^#" ALL_D1D2.tsv))) | sed "s/#/ /g" |cut -f 1-7,11 > infos_D1D2.tsv
# Affiliation statistics report.
affiliations_stat.py --input-biom affiliation_D1D2.biom --output-file affiliations_stats_D1D2.html --log-file affiliations_stats.log --multiple-tag blast_affiliations --tax-consensus-tag blast_taxonomy --identity-tag perc_identity --coverage-tag perc_query_coverage

### RPB2 ###
# Remove every line of 118_final.tsv that contains one of these four
# identifiers as a whole word, in a single grep call.
grep -v -w -e CBS10083 -e CBS121719_UBOCC_A_110191 -e CBS131276_UBOCC_A_109153 -e CBS20 118_final.tsv > 118_final_RPB2.tsv
# Build the RPB2 FASTA from the filtered table. Lines containing "Domain",
# comment lines and rows whose 16th column is "none" are skipped. Each FASTA
# header is column 8, a tab, then columns 1-7 joined with ";".
perl -F"\t" -lane 'if((!/^#/) && ($F[15] ne "none")){print ">$F[7]\t$F[0];$F[1];$F[2];$F[3];$F[4];$F[5];$F[6]\n$F[15]"}' <(grep -v "Domain" 118_final_RPB2.tsv) > ALL_RPB2.fasta
sed -i "s/TGGGGTTTGGTGTGTCCTGCTGAGACTCCTGAAGGCCAAGCGTGTGGTTTGGTGAAGAATTTGTCTTTGATGACATGTATTTCCGTTGGTACTCCATCAGAGCCAATCCTGTCGTTTTTAAGAGATTGGGGATTGGAGCCTTTGGAAGATTATGTTCCTTCCAACTCACCTGATGCAACGAGAGTGTTTGTTAATGGTGTTTGGGTGGGTGTGCATAGAGAGCCAGCGTCATTGGTTGAGTACATGCGTGACCTTAGAAGGAATGGAGATATTTCACCGGAGGTATCCATCATTAGAGATATCAGAGAAAAAGAACTCAAGATTTTTACTGATGCTGGTAGAGTGTACCGTCCATTGTTTATTGTCGACGACAATCCAGATTCTGAAACCAAGGGTGATTTGAAATTGCGTAAGGAACATGTCAATCAATTATTGGAGTCTTCATACGAAGGATATGATGATGAAGCAACCTACACTTGGTCATCATTAGTGAGAGAAGGTATAGTTGAGTATGTTGATGCCGAAGAAGAAGAGACAATTATGATTGCCATGACACCAGATGATCTAGAAGCATCCAAGAGTAACTTAACTGAGACTCAACAGCAAGATTTNCAAATGGAGGAACAGGAACTAGATCCAGCAAAGAGAATTAAACCAACCGGTAGTAGCAATACTCATACATTCACACATTGTGAGATCCATCCTTCAATGATTCTAGGAGTTGCCGCATCTATTATCCCA/TGGGGTTTGGTGTGTCCTGCTGAGACTCCTGAAGGCCAAGCGTGTGGTTTGGTGAAGAATTTGTCTTTGATGACATGTATTTCCGTTGGTACTCCATCAGAGCCAATCCTGTCGTTTTTAAGAGATTGGGGATTGGAGCCTTTGGAAGATTATGTTCCTTCCAACTCACCTGATGCAACGAGAGTGTTTGTTAATGGTGTTTGGGTGGGTGTGCATAGAGAGCCAGCGTCATTGGTTGAGTACATGCGTGACCTTAGAAGGAATGGAGATATTTCACCGGAGGTATCCATCATTAGAGATATCAGAGAAAAAGAACTCAAGATTTTTACTGATGCTGGTAGAGTGTACCGTCCATTGTTTATTGTCGACGACAATCCAGATTCTGAAACCAAGGGTGATTTGAAATTGCGTAAGGAACATGTCAATCAATTATTGGAGTCTTCATACGAAGGATATGATGATGAAGCAACCTACACTTGGTCATCATTAGTGAGAGAAGGTATAGTTGAGTATGTTGATGCCGAAGAAGAAGAGACAATTATGATTGCCATGACACCAGATGATCTAGAAGCATCCAAGAGTAACTTAACTGAGACTCAACAGCAAGATTTACAAATGGAGGAACAGGAACTAGATCCAGCAAAGAGAATTAAACCAACCGGTAGTAGCAATACTCATACATTCACACATTGTGAGATCCATCCTTCAATGATTCTAGGAGTTGCCGCATCTATTATCCCA/" ALL_RPB2.fasta
sed -i "s/TGGGGTTTGGTTTGTCCTGCNGAGACTCCAGAAGGTCAAGCGTGTGGTTTGGTGAAGAACTTGTCGCTTATGTCNTGTATCTCTGTGGGTACGCCCTCAGATCCTATCTTGTCGTTTCTTCGTGATTGGGGTTTAGAGCCTTTNGANGATTATACTCCTGCCAACTCACCAGATGTCACTAGAGTTTTTGTTAATGGTGTTTGGGTAGGTGTCCACAGNGAGCCAGCTTCTTTGGTTGATTATATGCGTGATCTTAGACGAAATGGTGACATTTCNCCTGAAGTGTCACTCATCAGAGATATTAGGGANAAGGAATTCAAAATCTTTACTGATGCAGGTAGAGTTTATCGTCCATTATTCATTGTTGACGATAAACCTGATTCTGCTACCAAAGGTGATTTGAAACTTCAAAAAGAACATGTTAATCAGCTTTTGGAGTCTTCATACGAAGGATATGATGAAGATGANGAGGATGCGGCACCAAAATATACATGGTCTTCATTGGTTAAGGAAGGTATTGTTGAATATGTCGATGCTGAAGAAGAAGAAACGATTATGATTGCCATGACGCCCGATGATCTTGAAGCTTCAAAGAGCAGTTTGACCGAGACTCAACAACAAAACTTGCAAATGGAAGAGCAAGAGCTTGATCCAGCAAAGAGAATCAAACCCACAAACAGTAGTAACACGCATACGTTTACTCATTGTGAGATTCACCCTTCAATGATTCTTGGTGTTGCAGCCTCGATTATTCCGTTCCCTGATCATAATCAATC/TGGGGTTTGGTTTGTCCTGCAGAGACTCCAGAAGGTCAAGCGTGTGGTTTGGTGAAGAACTTGTCGCTTATGTCATGTATCTCTGTGGGTACGCCCTCAGATCCTATCTTGTCGTTTCTTCGTGATTGGGGTTTAGAGCCTTTAGAAGATTATACTCCTGCCAACTCACCAGATGTCACTAGAGTTTTTGTTAATGGTGTTTGGGTAGGTGTCCACAGAGAGCCAGCTTCTTTGGTTGATTATATGCGTGATCTTAGACGAAATGGTGACATTTCACCTGAAGTGTCACTCATCAGAGATATTAGGGAAAAGGAATTCAAAATCTTTACTGATGCAGGTAGAGTTTATCGTCCATTATTCATTGTTGACGATAAACCTGATTCTGCTACCAAAGGTGATTTGAAACTTCAAAAAGAACATGTTAATCAGCTTTTGGAGTCTTCATACGAAGGATATGATGAAGATGAAGAGGATGCGGCACCAAAATATACATGGTCTTCATTGGTTAAGGAAGGTATTGTTGAATATGTCGATGCTGAAGAAGAAGAAACGATTATGATTGCCATGACGCCCGATGATCTTGAAGCTTCAAAGAGCAGTTTGACCGAGACTCAACAACAAAACTTGCAAATGGAAGAGCAAGAGCTTGATCCAGCAAAGAGAATCAAACCCACAAACAGTAGTAACACGCATACGTTTACTCATTGTGAGATTCACCCTTCAATGATTCTTGGTGTTGCAGCCTCGATTATTCCGTTCCCTGATCATAATCAATC/" ALL_RPB2.fasta
sed -i "s/TGGGGGCTTGTGTGTCCTGCTGAGACCCCGGAAGGACAGGCGTGTGGTTTGGTGAAGAATTTGTCGTTGATGTCGTGTATCTCGGTGGGGTCACCCTCCGATATGATTTTGGACTATTTGGATGAGTGGGGGATGGAGCCGCTGGAAGAGTATGTACCTTCTGATGCGGAGCACAGCACCAAGATCTTTGTGAACGGTGTGTGGGTCGGAACGCATCGTATTCCTGCGGATCTTGTGCGGAATATTAAGGATTTGAGAAGGCGTGGAGACATCTCGCCTGAAGTTTCCATTATCAGAAACATCAGGGAAAAGGAGTTCAAATTGTTTACCGATGCAGGACGTGTGTATNGGCCACTTTTCATTGTAGATGACGACCCGGAGAGCGAAAATAAGGGAGAGCTCAAGCTCAACAAGTCCCACATTGAGCGGCTCCAGGCAGNGCAAGATGAGACAGGTGAGTACGGCATGGATGTGGACGAGGAGGAGGAAGACGGAATNTACGGATGGTCGTCTCTGGTGAAGAATGGTGTCGTTGAGTACGTAGACGCCGAGGAAGAGGAAACCATCATGATNGCCATGGCTCCAGAAGATCTCACCAGTGGAAGAGTTGGTGTTGATGAGCAGCGCAATGTGGATATGGATCGCGATCCCGGTTTGAGAATCAAGCCCGCCATTAATCCTTCCACGCACACTTTTACGCATTGCGAGATTCATCCGTCGATGATTCTGGGAGTGGCAGCTTCGATCATTCCGTTCCCGGATCACAATCAGTC/TGGGGGCTTGTGTGTCCTGCTGAGACCCCGGAAGGACAGGCGTGTGGTTTGGTGAAGAATTTGTCGTTGATGTCGTGTATCTCGGTGGGGTCACCCTCCGATATGATTTTGGACTATTTGGATGAGTGGGGGATGGAGCCGCTGGAAGAGTATGTACCTTCTGATGCGGAGCACAGCACCAAGATCTTTGTGAACGGTGTGTGGGTCGGAACGCATCGTATTCCTGCGGATCTTGTGCGGAATATTAAGGATTTGAGAAGGCGTGGAGACATCTCGCCTGAAGTTTCCATTATCAGAAACATCAGGGAAAAGGAGTTCAAATTGTTTACCGATGCAGGACGTGTGTATAGGCCACTTTTCATTGTAGATGACGACCCGGAGAGCGAAAATAAGGGAGAGCTCAAGCTCAACAAGTCCCACATTGAGCGGCTCCAGGCAGAGCAAGATGAGACAGGTGAGTACGGCATGGATGTGGACGAGGAGGAGGAAGACGGAATATACGGATGGTCGTCTCTGGTGAAGAATGGTGTCGTTGAGTACGTAGACGCCGAGGAAGAGGAAACCATCATGATAGCCATGGCTCCAGAAGATCTCACCAGTGGAAGAGTTGGTGTTGATGAGCAGCGCAATGTGGATATGGATCGCGATCCCGGTTTGAGAATCAAGCCCGCCATTAATCCTTCCACGCACACTTTTACGCATTGCGAGATTCATCCGTCGATGATTCTGGGAGTGGCAGCTTCGATCATTCCGTTCCCGGATCACAATCAGTC/" ALL_RPB2.fasta
sed -i "s/TGGGGTATGGAACCANTNGAAGATTATGTNCCNCATCAATCTCCAGATGCTACAAGNGTCTTNGTTAACGGTGTATGGCACGGTGTTCATAGAAACCCAGCTAGATTAATGGAAACTTTAAGAACNTTAAGAAGAAAGGGTGATATNAACCCAGAAGTCTCTATGGTTAGAGATATTCGTGAAAAGGAANTAAAGATTTTCACTGATGCTGGTAGAGTTTACAGACCGNTATTCATTGTTGAAGANGATCATGAATTAGGTCGTAAGGAANTGAAGGTAAGAAAGGGTCATATCAATAAGTTGATGGCTACTGAATATCAAGATATTGAAGGTGGTCTAGATGAATCGGAAGAATACACATGGACATCTNTGTTAAGTGAAGGTCTGGTAGAATATATTGACGCTGAAGAAGAAGAAACTATTCTGATTGCTATGCAACCAGAAGATTTGGAACCGATTACAGAAGAAGCCACTGCTGCTATAAATGAAATGGATCCAGCTAGACGTATCAAGGCTGTTCAACATGCCACTACATTTACACANTGTGAAATTCATCCATCTATGATTTTAGGTGTTGCTGCTTCTATTATTCCTTTCCCAGATCATAACCAATC/TGGGGTATGGAACCAATAGAAGATTATGTACCACATCAATCTCCAGATGCTACAAGAGTCTTAGTTAACGGTGTATGGCACGGTGTTCATAGAAACCCAGCTAGATTAATGGAAACTTTAAGAACATTAAGAAGAAAGGGTGATATAAACCCAGAAGTCTCTATGGTTAGAGATATTCGTGAAAAGGAAATAAAGATTTTCACTGATGCTGGTAGAGTTTACAGACCGATATTCATTGTTGAAGAAGATCATGAATTAGGTCGTAAGGAAATGAAGGTAAGAAAGGGTCATATCAATAAGTTGATGGCTACTGAATATCAAGATATTGAAGGTGGTCTAGATGAATCGGAAGAATACACATGGACATCTATGTTAAGTGAAGGTCTGGTAGAATATATTGACGCTGAAGAAGAAGAAACTATTCTGATTGCTATGCAACCAGAAGATTTGGAACCGATTACAGAAGAAGCCACTGCTGCTATAAATGAAATGGATCCAGCTAGACGTATCAAGGCTGTTCAACATGCCACTACATTTACACAATGTGAAATTCATCCATCTATGATTTTAGGTGTTGCTGCTTCTATTATTCCTTTCCCAGATCATAACCAATC/" ALL_RPB2.fasta
sed -i "s/TGGGGGTTGGTGTGTCCCGCCGAGACCCCCGAATGGCAGGCGTGTGGTTTGGTGAAGAACTTGTCGTTGATGACGTGTATCTCGGTGGGTCAGGACCTGGAGTTGGTGGCATCGGTGTTGGACGAGTGGGGGATGGAGCCGTTGGAGGACTACGTCCCCAGCAACTCGCCCGAAGCCACGAGAGTGTTCCTCAACGGGGTGTGGTTGGGTACGCATAGAGACCCGGCGATGCTTTTGGAGACGATGCGCCTCATGCGGCGGAAGGAGGAGATCCCCTCCGAAGTGCTGATTATCCGCGATATCCGCGAGCGCGAGTTCAAAGTGTTTACTGACGCCGGGCGTGTGTACCGTCCGTTGTTCATTGTTGACGACGACCGCGAGCTGGACAAGTTCGGCCACTTGAAGCTCACCAAGGAGCACGCCCGCCAGTTGGAGACCCCCGAGATCTTCGAAGACGAAGACGGAACTCCCAAGCCGTGGTCGCATCTCGTGACTAACGGCATTGTAGAATACGTCGACGCCGAGGAAGAAGAGACGATCATGATTGCCATGCTGCCGGAAGACCTCGAACTGCTGAGAAACGCCATGAACGGCATCGTCAAGACGGAGACGGAAGAGGAGATCGACCCCGCCAAGCGGATCACCCAGCTGTTCCTGGGCAAGAACCACACCTTCACTCNNTGTGANATCCACCCGCTGATGATCTTGGGGGTGGCGGCGCTGATTATCCCC/TGGGGGTTGGTGTGTCCCGCCGAGACCCCCGAATGGCAGGCGTGTGGTTTGGTGAAGAACTTGTCGTTGATGACGTGTATCTCGGTGGGTCAGGACCTGGAGTTGGTGGCATCGGTGTTGGACGAGTGGGGGATGGAGCCGTTGGAGGACTACGTCCCCAGCAACTCGCCCGAAGCCACGAGAGTGTTCCTCAACGGGGTGTGGTTGGGTACGCATAGAGACCCGGCGATGCTTTTGGAGACGATGCGCCTCATGCGGCGGAAGGAGGAGATCCCCTCCGAAGTGCTGATTATCCGCGATATCCGCGAGCGCGAGTTCAAAGTGTTTACTGACGCCGGGCGTGTGTACCGTCCGTTGTTCATTGTTGACGACGACCGCGAGCTGGACAAGTTCGGCCACTTGAAGCTCACCAAGGAGCACGCCCGCCAGTTGGAGACCCCCGAGATCTTCGAAGACGAAGACGGAACTCCCAAGCCGTGGTCGCATCTCGTGACTAACGGCATTGTAGAATACGTCGACGCCGAGGAAGAAGAGACGATCATGATTGCCATGCTGCCGGAAGACCTCGAACTGCTGAGAAACGCCATGAACGGCATCGTCAAGACGGAGACGGAAGAGGAGATCGACCCCGCCAAGCGGATCACCCAGCTGTTCCTGGGCAAGAACCACACCTTCACTCAATGTGAAATCCACCCGCTGATGATCTTGGGGGTGGCGGCGCTGATTATCCCC/" ALL_RPB2.fasta
sed -i "s/TGGGGTCTTGTATGCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGTTTGGTGAAAAATTTGTCCTTGATGTCTTGTATTTCAGTGGGGACGTCGTCAGAACCAATTTTGTACTTCTTGGAGGAGTGGGGTATGGAGCCGCTCGAAGATTACGTTCCTTCAAATTCCCCAGACTCAACCAGAGTCTTTGTCAATGGCGTTTGGGTCGGTACTCACAGAGAGCCAGCGCATTTAGTTGACACCATGCGCAACTTAAGAAGAAGGGGAGACATTTCCCCAGAAGTCTCAATCATCAGAGACATTCGTGAAAAGGAATTCAGTATCTTTACCGACGCAGGTCGGGTTTATCGCCCCTTGTTTGTGGTTGACGATGATCTGGAAAGTGATACTTTTGGAGAATTAAAATTGCAGAAGGAGAACATTCACAAGCTCATGAACTCGGAGTATGATGAATTTGATGAAGAGAGCAATGCGGAGAATTATAACTGGTCTTCTTTGGTGAACGAAGGCATAGTGGAGTACGTCGATGCCGAGGAAGAGGAAACCATCATGATTGCAATGACGCCCGAGGACTTGGAGGCTTCAAAATCGCCCTTGAGCGACACGCAACAAAAGGATATCCAGATGGAAGAACAAGAATTGGATCCGGCCAAGAGAATCAAGCCCACCTTCAACAGCAACACTCACACGTTTACGCATTGTGAGATTCATCCCTCGATGATCTTAGGCGTGGCTGCGTCGATTATCCCC/TGGGGTCTTGTATGCAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAGTTTGGTGAAAAATTTGTCCTTGATGTCTTGTATTTCAGTGGGGACGTCGTCAGAACCAATTTTGTACTTCTTGGAGGAGTGGGGTATGGAGCCGCTCGAAGATTACGTTCCTTCAAATTCCCCAGACTCAACCAGAGTCTTTGTCAATGGCGTTTGGGTCGGTACTCACAGAGAGCCAGCGCATTTAGTTGACACCATGCGCAACTTAAGAAGAAGGGGAGACATTTCCCCAGAAGTCTCAATCATCAGAGACATTCGTGAAAAGGAATTCAGTATCTTTACCGACGCAGGTCGGGTTTATCGCCCCTTGTTTGTGGTTGACGATGATCTGGAAAGTGATACTTTTGGAGAATTAAAATTGCAGAAGGAGAACATTCACAAGCTCATGAACTCGGAGTATGATGAATTTGATGAAGAGAGCAATGCGGAGAATTATAACTGGTCTTCTTTGGTGAACGAAGGCATAGTGGAGTACGTCGATGCCGAGGAAGAGGAAACCATCATGATTGCAATGACGCCCGAGGACTTGGAGGCTTCAAAATCGCCCTTGAGCGACACGCAACAAAAGGATATCCAGATGGAAGAACAAGAATTGGATCCGGCCAAGAGAATCAAGCCCACCTTCAACAGCAACACTCACACGTTTACGCATTGTGAGATTCATCCCTCGATGATCTTAGGCGTGGCTGCGTCGATTATCCCC/" ALL_RPB2.fasta
sed -i "s/TGGGGTATGGAACCACTGGAAGATTATGTTCCACATCAATCTCCAGACGCTACNAGAGTNTTTGTTAACGGTGTATGGCACGGTGTTCATAGAAATCCAGCNAGATTAATGGAAACTTTAAGAACCTTAAGAAGAAAGGGTGATATTAACCCAGAAGTCTCTATGGTTAGAGATATTCGTGAAAAGGAATTAAAGATTTTCACTGATGCTGGTAGAGTTTACAGNCCATTATTCATTGTCGAAGATGATCAAGAATTGGGCCGTAAGGANCTAAAGGTAAGAAAGGGNCATATCAACAANTTGATGGCTACNGAATACCAAGATATCGANGGTGGTCTTGATGAATCAGAAGAATACACATGGACATCTCTGTTAAGTGAAGGTTTNGTGGAATACATCGATGCAGAAGAAGAAGAAACGATTCTGATTGCTATGCAACCAGAAGATCTGGAACCTGTCACAGAAGAAGCCGCCGCCGCTATTAATGAAATGGATCCAGCTAGACGTATTAAAGCTGTTCAAAATGCTACTACCTTTACACACTGTGAAATTCATCCTTCAATGATTTTAGGTGTTGCTGCATCTATTATCCCATTCCCAGATCATAACCANTC/TGGGGTATGGAACCACTGGAAGATTATGTTCCACATCAATCTCCAGACGCTACAAGAGTATTTGTTAACGGTGTATGGCACGGTGTTCATAGAAATCCAGCAAGATTAATGGAAACTTTAAGAACCTTAAGAAGAAAGGGTGATATTAACCCAGAAGTCTCTATGGTTAGAGATATTCGTGAAAAGGAATTAAAGATTTTCACTGATGCTGGTAGAGTTTACAGACCATTATTCATTGTCGAAGATGATCAAGAATTGGGCCGTAAGGAACTAAAGGTAAGAAAGGGACATATCAACAAATTGATGGCTACAGAATACCAAGATATCGAAGGTGGTCTTGATGAATCAGAAGAATACACATGGACATCTCTGTTAAGTGAAGGTTTAGTGGAATACATCGATGCAGAAGAAGAAGAAACGATTCTGATTGCTATGCAACCAGAAGATCTGGAACCTGTCACAGAAGAAGCCGCCGCCGCTATTAATGAAATGGATCCAGCTAGACGTATTAAAGCTGTTCAAAATGCTACTACCTTTACACACTGTGAAATTCATCCTTCAATGATTTTAGGTGTTGCTGCATCTATTATCCCATTCCCAGATCATAACCAATC/" ALL_RPB2.fasta
sed -i "s/TGGGGTCTTGTGTGTCCCGCAGAAACACCTGAAGGACAGGCTTGCGGTCTGGTCAAGAACTTGTCTCTCATGTGTTACGTCAGTGTTGGTACGCCTGCCGAGCCCATCGTCGAGTTCATGAACCAGCGAAACATGGAATTGCTCGAAGAGTATGAGCCCAAGAACAACCCGGATGCCACAAAGGTCTTCGTCAACGGTGTATGGGTTGGTGTCCACAGAGACCCCTCTCAGCTCGTCAAGGTTGTGCAAAGTCTCCGCCGTAACGGCACCATCTCTTTCGAAATCTCACTCATCAGAGATGTTCGTGAGCGAGAGTTCAAGATTTTCACTGATGCTGGTCGTGTCATGAGACCGCTGTTCGTTGTCAACAATGACCCTGCAAGTCCGACCAAGGGTCAGCTTACCCTGAACAGATCTCACATTTCTCAGCTGCTTAATGCACGCGAGACTGACACTATCGGTCTTAGCGAAGAGGAGCGCGACGGTACAATTTATGGCTGGAAGAACTTGATCAGTGATGGTGTCGTTGAGTACCTCGATGCCGAGGAAGAAGAGGTTGCCATGATGGTCATGTCACCTGAAGACCTCGACGAGCATCGCCAGATGAGAGCTGGACTCGTCTATGAAGAGCCAGTGACTGACCCTCATCGAAGAATCAAGAGCAGGCCAAACGCCAACGTTAGAACATGGACTCATTGCGAGATTCACCCTGCCATGATTCTTGGTATTTGCGCTTCCATTATTCCTTTCCCNGATCACAACCAGTC/TGGGGTCTTGTGTGTCCCGCAGAAACACCTGAAGGACAGGCTTGCGGTCTGGTCAAGAACTTGTCTCTCATGTGTTACGTCAGTGTTGGTACGCCTGCCGAGCCCATCGTCGAGTTCATGAACCAGCGAAACATGGAATTGCTCGAAGAGTATGAGCCCAAGAACAACCCGGATGCCACAAAGGTCTTCGTCAACGGTGTATGGGTTGGTGTCCACAGAGACCCCTCTCAGCTCGTCAAGGTTGTGCAAAGTCTCCGCCGTAACGGCACCATCTCTTTCGAAATCTCACTCATCAGAGATGTTCGTGAGCGAGAGTTCAAGATTTTCACTGATGCTGGTCGTGTCATGAGACCGCTGTTCGTTGTCAACAATGACCCTGCAAGTCCGACCAAGGGTCAGCTTACCCTGAACAGATCTCACATTTCTCAGCTGCTTAATGCACGCGAGACTGACACTATCGGTCTTAGCGAAGAGGAGCGCGACGGTACAATTTATGGCTGGAAGAACTTGATCAGTGATGGTGTCGTTGAGTACCTCGATGCCGAGGAAGAAGAGGTTGCCATGATGGTCATGTCACCTGAAGACCTCGACGAGCATCGCCAGATGAGAGCTGGACTCGTCTATGAAGAGCCAGTGACTGACCCTCATCGAAGAATCAAGAGCAGGCCAAACGCCAACGTTAGAACATGGACTCATTGCGAGATTCACCCTGCCATGATTCTTGGTATTTGCGCTTCCATTATTCCTTTCCCAGATCACAACCAGTC/" ALL_RPB2.fasta
# Build the abundance table, one row per sequence with a count of 1.
# Written with ">" rather than ">>" so that re-running this step does not
# append a second header and duplicate rows to an existing file.
cat <(echo -e "#blast_taxonomy\tblast_subject\tobservation_name\tobservation_sum\tSample1") <(sed "s/ /#/g" ALL_RPB2.fasta | perl fasta2tab.pl - | perl -lane 'print "$F[1]\t$F[0]\t$F[0]\t1\t1"' | sed "s/#/ /g") > ALL_RPB2.tsv
tsv_to_biom.py --input-tsv ALL_RPB2.tsv --output-biom ALL_RPB2.biom
# Affiliation against the local RPB2.fasta reference.
affiliation_OTU.py --input-fasta ALL_RPB2.fasta --input-biom ALL_RPB2.biom --nb-cpus 16 --log-file affiliation.log --output-biom affiliation_RPB2.biom --summary affiliation_RPB2.html --reference RPB2.fasta
biom_to_tsv.py --input-biom affiliation_RPB2.biom --output-tsv affiliation_RPB2.tsv --input-fasta ALL_RPB2.fasta
# Put the affiliation table (sorted on column 8) side by side with the input
# table (sorted on column 2) and keep columns 1-7 and 11.
paste <(sort -k8,8 <(sed "s/ /#/g" <(grep -v "^#" affiliation_RPB2.tsv))) <(sort -k2,2 <(sed "s/ /#/g" <(grep -v "^#" ALL_RPB2.tsv))) | sed "s/#/ /g" |cut -f 1-7,11 > infos_RPB2.tsv
# Affiliation statistics report.
affiliations_stat.py --input-biom affiliation_RPB2.biom --output-file affiliations_stats_RPB2.html --log-file affiliations_stats.log --multiple-tag blast_affiliations --tax-consensus-tag blast_taxonomy --identity-tag perc_identity --coverage-tag perc_query_coverage

ITS1

ITS2

D1/D2

RPB2

The following figures show the identity and coverage percentages of the best hit for all markers.

Phylogenetic trees

The 118 sequences of ITS1, ITS2, D1/D2 and RPB2 were aligned and a phylogenetic tree was built for each marker with the FROGS tree tool. Static and interactive trees are available. Newick files are also available for visualization in dedicated viewers such as iTOL.

# Example for RPB2 sequences
cd /save_projet/metabarfood/REFERENCES/
conda activate frogs-3.2.3
# Convert the references table to BIOM.
tsv_to_biom.py -t references.tsv --output-biom test.biom
# references.tsv contains the 118 sequences; the samples are MEAT, CHEESE, WINE and BREAD, with an abundance of 1 when present and 0 when absent
# Build the tree from the RPB2 sequences; outputs a Newick file and an HTML report.
tree.py --input-sequences ALL_RPB2.fasta --biom-file test.biom --out-tree ALL_RPB2.nwk --html ALL_RPB2.html

ITS1

library(phyloseq)
library(phyloseq.extended)
library(ggplot2)
library(plotly)
# Import the BIOM file as a phyloseq object and attach the ITS1 tree.
physeq_its1 <- import_frogs("REFERENCES/TREES/test.biom")
phy_tree(physeq_its1) <- read_tree("REFERENCES/TREES/ALL_ITS1.nwk")

# Sample metadata: one row per ecosystem, with the row names equal to the
# Name column.
metadata <- data.frame(
  Name = c("BREAD", "CHEESE", "WINE", "MEAT"),
  row.names = c("BREAD", "CHEESE", "WINE", "MEAT")
)
sample_data(physeq_its1) <- metadata
# Tag every sample with the marker this object describes.
sample_data(physeq_its1)$Marker <- "ITS1"

#p <- plot_tree(physeq_its1, nodelabf=nodeplotblank, ladderize="left", color="Class", shape="Sample", base.spacing=0.04, label.tips = "taxa_names") + coord_polar(theta="y") #+ theme(legend.position="bottom")
#p

#p <- plot_tree(physeq_its1, nodelabf=nodeplotblank, ladderize="left", color="Genus", shape="Sample", base.spacing=0.04, label.tips = "taxa_names") + coord_polar(theta="y") #+ theme(legend.position="bottom")
#p

#p <- plot_tree(physeq_its1, nodelabf=nodeplotblank, ladderize="left", color="Class", shape="Sample", base.spacing=0.04, label.tips = "taxa_names")  #+ theme(legend.position="bottom")
#p
#ggplotly(p)

p <- ggtree(physeq_its1, layout="fan", open.angle=10)+ geom_tiplab(offset=0.3, size = 2) +
  geom_point2(mapping=aes(color=Class, shape=Name),  size=2, data = . %>% filter(Name == "BREAD"), position = position_nudge(x = 0.05)) + 
  geom_point2(mapping=aes(color=Class, shape=Name), size=2, data = . %>% filter(Name == "WINE"), position = position_nudge(x = 0.1)) +
  geom_point2(mapping=aes(color=Class, shape=Name), size=2,  data = . %>% filter(Name == "CHEESE"), position = position_nudge(x = 0.15)) +
  geom_point2(mapping=aes(color=Class, shape=Name), size=2,  data = . %>% filter(Name == "MEAT"), position = position_nudge(x = 0.2))
p + scale_color_brewer(palette = "Paired")

p <- ggtree(physeq_its1, layout="circular", branch.length="none", open.angle=10)+ geom_tiplab(linesize = 0.01, offset=3, size = 2) + 
  geom_point2(mapping=aes(color=Class, shape=Name),  size=2, data = . %>% filter(Name == "BREAD"), position = position_nudge(x = 0.5)) + 
  geom_point2(mapping=aes(color=Class, shape=Name), size=2, data = . %>% filter(Name == "WINE"), position = position_nudge(x = 1.5)) +
  geom_point2(mapping=aes(color=Class, shape=Name), size=2,  data = . %>% filter(Name == "CHEESE"), position = position_nudge(x = 2)) +
  geom_point2(mapping=aes(color=Class, shape=Name), size=2,  data = . %>% filter(Name == "MEAT"), position = position_nudge(x = 2.5))
p + scale_color_brewer(palette = "Paired")

ITS2

# Load the shared BIOM table, then attach the ITS2 Newick tree to it.
physeq_its2 <- import_frogs("REFERENCES/TREES/test.biom")
phy_tree(physeq_its2) <- read_tree("REFERENCES/TREES/ALL_ITS2.nwk")

# Sample metadata: one row per ecosystem, with row names equal to `Name`.
metadata <- data.frame(
  Name = c("BREAD", "CHEESE", "WINE", "MEAT"),
  row.names = c("BREAD", "CHEESE", "WINE", "MEAT")
)
sample_data(physeq_its2) <- metadata
sample_data(physeq_its2)$Marker <- "ITS2"

# Earlier plot_tree() attempts, kept for reference.
#p <- plot_tree(physeq_its2, nodelabf=nodeplotblank, ladderize="left", color="Class", shape="Sample", base.spacing=0.04, label.tips = "taxa_names") + coord_polar(theta="y") #+ theme(legend.position="bottom")
#p
#p <- plot_tree(physeq_its1, nodelabf=nodeplotblank, ladderize="left", color="Genus", shape="Sample", base.spacing=0.04, label.tips = "taxa_names") + coord_polar(theta="y") #+ theme(legend.position="bottom")
#p

#p <- plot_tree(physeq_its2, nodelabf=nodeplotblank, ladderize="left", color="Class", shape="Sample", base.spacing=0.04, label.tips = "taxa_names")  #+ theme(legend.position="bottom")
#p
#ggplotly(p)

# Builds one geom_point2() layer per ecosystem, in the order BREAD, WINE,
# CHEESE, MEAT. Each layer keeps only the rows of its ecosystem and is shifted
# along x by the matching value of `nudges`, so the four markers sit side by
# side instead of overlapping.
ecosystem_points <- function(nudges) {
  Map(
    function(ecosystem, nudge) {
      # The filter below only runs when the plot is drawn; fix the value now.
      force(ecosystem)
      geom_point2(
        mapping = aes(color = Class, shape = Name),
        size = 2,
        data = . %>% filter(Name == ecosystem),
        position = position_nudge(x = nudge)
      )
    },
    c("BREAD", "WINE", "CHEESE", "MEAT"),
    nudges
  )
}

# Fan layout using the branch lengths of the tree.
p <- ggtree(physeq_its2, layout = "fan", open.angle = 10) +
  geom_tiplab(offset = 0.3, size = 2) +
  ecosystem_points(c(0.05, 0.1, 0.15, 0.2))
p + scale_color_brewer(palette = "Paired")

# Circular layout ignoring branch lengths (topology only).
p <- ggtree(
  physeq_its2,
  layout = "circular",
  branch.length = "none",
  open.angle = 10
) +
  geom_tiplab(linesize = 0.01, offset = 3, size = 2) +
  ecosystem_points(c(0.5, 1.5, 2, 2.5))
p + scale_color_brewer(palette = "Paired")

D1/D2

# Load the shared BIOM table, then attach the D1/D2 Newick tree to it.
physeq_d1d2 <- import_frogs("REFERENCES/TREES/test.biom")
phy_tree(physeq_d1d2) <- read_tree("REFERENCES/TREES/ALL_D1D2.nwk")

# Sample metadata: one row per ecosystem, with row names equal to `Name`.
metadata <- data.frame(
  Name = c("BREAD", "CHEESE", "WINE", "MEAT"),
  row.names = c("BREAD", "CHEESE", "WINE", "MEAT")
)
sample_data(physeq_d1d2) <- metadata
sample_data(physeq_d1d2)$Marker <- "D1/D2"

# Earlier plot_tree() attempts, kept for reference.
#p <- plot_tree(physeq_d1d2, nodelabf=nodeplotblank, ladderize="left", color="Class", shape="Sample", base.spacing=0.04, label.tips = "taxa_names") + coord_polar(theta="y") #+ theme(legend.position="bottom")
#p

#p <- plot_tree(physeq_d1d2, ladderize="left", nodelabf=nodeplotblank, color="Class", shape="Sample", base.spacing=0.05) + coord_polar(theta="y", start = 2)
#p

# Builds one geom_point2() layer per ecosystem, in the order BREAD, WINE,
# CHEESE, MEAT. Each layer keeps only the rows of its ecosystem and is shifted
# along x by the matching value of `nudges`, so the four markers sit side by
# side instead of overlapping.
ecosystem_points <- function(nudges) {
  Map(
    function(ecosystem, nudge) {
      # The filter below only runs when the plot is drawn; fix the value now.
      force(ecosystem)
      geom_point2(
        mapping = aes(color = Class, shape = Name),
        size = 2,
        data = . %>% filter(Name == ecosystem),
        position = position_nudge(x = nudge)
      )
    },
    c("BREAD", "WINE", "CHEESE", "MEAT"),
    nudges
  )
}

# Fan layout using the branch lengths of the tree.
p <- ggtree(physeq_d1d2, layout = "fan", open.angle = 10) +
  geom_tiplab(offset = 0.3, size = 2) +
  ecosystem_points(c(0.05, 0.1, 0.15, 0.2))
p + scale_color_brewer(palette = "Paired")

# Circular layout ignoring branch lengths (topology only).
p <- ggtree(
  physeq_d1d2,
  layout = "circular",
  branch.length = "none",
  open.angle = 10
) +
  geom_tiplab(linesize = 0.01, offset = 3, size = 2) +
  ecosystem_points(c(0.5, 1.5, 2, 2.5))
p + scale_color_brewer(palette = "Paired")

RPB2

CBS17 (Rhodotorula mucilaginosa var. mucilaginosa), CBS20 (Rhodotorula glutinis) and CBS10083 (Cryptococcus neoformans) were excluded because they were too distant from the other sequences.

# Load the shared BIOM table and drop the three excluded species before
# attaching the RPB2 Newick tree.
physeq_rpb2 <- import_frogs("REFERENCES/TREES/test.biom")
physeq_rpb2 <- subset_taxa(
  physeq_rpb2,
  Species != "Rhodotorula mucilaginosa var. mucilaginosa" &
    Species != "Rhodotorula glutinis" &
    Species != "Cryptococcus neoformans"
)
phy_tree(physeq_rpb2) <- read_tree("REFERENCES/TREES/ALL_RPB2.nwk")

# Sample metadata: one row per ecosystem, with row names equal to `Name`.
metadata <- data.frame(
  Name = c("BREAD", "CHEESE", "WINE", "MEAT"),
  row.names = c("BREAD", "CHEESE", "WINE", "MEAT")
)
sample_data(physeq_rpb2) <- metadata
sample_data(physeq_rpb2)$Marker <- "RPB2"

# Earlier plot_tree() attempts, kept for reference.
#p <- plot_tree(physeq_rpb2, nodelabf=nodeplotblank, ladderize="left", color="Class", shape="Sample", base.spacing=0.04, label.tips = "taxa_names") + coord_polar(theta="y") #+ theme(legend.position="bottom")
#p
#p <- plot_tree(physeq_its1, nodelabf=nodeplotblank, ladderize="left", color="Genus", shape="Sample", base.spacing=0.04, label.tips = "taxa_names") + coord_polar(theta="y") #+ theme(legend.position="bottom")
#p
#p <- plot_tree(physeq_rpb2, nodelabf=nodeplotblank, ladderize="left", color="Class", shape="Sample", base.spacing=0.04, label.tips = "taxa_names")  #+ theme(legend.position="bottom")
#p
#ggplotly(p)

# Builds one geom_point2() layer per ecosystem, in the order BREAD, WINE,
# CHEESE, MEAT. Each layer keeps only the rows of its ecosystem and is shifted
# along x by the matching value of `nudges`, so the four markers sit side by
# side instead of overlapping.
ecosystem_points <- function(nudges) {
  Map(
    function(ecosystem, nudge) {
      # The filter below only runs when the plot is drawn; fix the value now.
      force(ecosystem)
      geom_point2(
        mapping = aes(color = Class, shape = Name),
        size = 2,
        data = . %>% filter(Name == ecosystem),
        position = position_nudge(x = nudge)
      )
    },
    c("BREAD", "WINE", "CHEESE", "MEAT"),
    nudges
  )
}

# Fan layout using the branch lengths of the tree.
p <- ggtree(physeq_rpb2, layout = "fan", open.angle = 10) +
  geom_tiplab(offset = 0.3, size = 2) +
  ecosystem_points(c(0.05, 0.1, 0.15, 0.2))
p + scale_color_brewer(palette = "Paired")

# Circular layout ignoring branch lengths (topology only).
p <- ggtree(
  physeq_rpb2,
  layout = "circular",
  branch.length = "none",
  open.angle = 10
) +
  geom_tiplab(linesize = 0.01, offset = 3, size = 2) +
  ecosystem_points(c(0.5, 1.5, 2, 2.5))
p + scale_color_brewer(palette = "Paired")

# Combined figure: the four marker trees side by side, one facet per marker,
# with tips coloured by Family.
markers <- c("ITS1", "ITS2", "D1/D2", "RPB2")

# c() on phyloseq objects returns a plain list. Naming its elements makes
# ggtree use the marker names as the `.id` facet variable, so the facet strips
# are labelled directly. This replaces a custom labeller that used the
# deprecated `function(variable, value)` signature and whose keys ("Tree#1")
# did not match the ids ggtree generates for unnamed trees.
trees <- c(physeq_its1, physeq_its2, physeq_d1d2, physeq_rpb2)
names(trees) <- markers
# Tag the list as "multiPhylo" so that ggtree() treats it as a set of trees.
class(trees) <- "multiPhylo"

# `scales` is spelled out in full rather than relying on partial argument
# matching of `scale`.
p <- ggtree(trees, open.angle = 10) +
  facet_wrap(~.id, scales = "free") +
  theme_tree2() +
  geom_tippoint(
    mapping = aes(color = Family),
    size = 1.5,
    show.legend = TRUE
  )
p