Reading amplicon data from raw reads with dada2 can be done using the following steps:
- Load the libraries and prepare a plot for each fastq file:
# Attach dada2 (provides plotQualityProfile) and cowplot (provides plot_grid).
library(dada2)
library(cowplot)

# Directory holding the raw reads, resolved against the working directory.
fq_dir <- file.path(getwd(), "datasets", "ch5", "fq")

# `pattern` is a regular expression. Escape the dot and anchor at the end of
# the name so only files ending in "fq.gz" are listed; the unanchored "fq.gz"
# would also pick up the "<name>.fq.gz.trimmed.filtered" files that the
# filtering step writes into this same directory.
read_files <- list.files(fq_dir, full.names = TRUE, pattern = "fq\\.gz$")

# Call plotQualityProfile() once per file and lay the results out in one grid.
quality_plots <- lapply(read_files, plotQualityProfile)
plot_grid(plotlist = quality_plots)
- Quality-trim and dereplicate the files:
# Filter and trim each raw read file. The output is written next to its input,
# with ".trimmed.filtered" appended to the input path.
# See ?fastqFilter for the exact meaning of each threshold.
for (fq in read_files) {
  out_fq <- paste0(fq, ".trimmed.filtered")
  fastqFilter(
    fq,
    out_fq,
    trimLeft = 10,
    truncLen = 250,
    maxN = 0,
    maxEE = 2,
    truncQ = 2,
    compress = TRUE
  )
}

# List the filtered files from disk rather than reusing the paths built in the
# loop, then dereplicate them.
trimmed_files <- list.files(
  fq_dir,
  full.names = TRUE,
  pattern = "trimmed.filtered"
)
derep_reads <- derepFastq(trimmed_files)
- Estimate the dada2 model from a subset of samples:
# `trimmed_files` and `derep_reads` were already produced by the filtering and
# dereplication step, so they are reused here instead of listing the files and
# dereplicating them a second time.

# Fit the model on at most the first five samples. head() stops at the end of
# the list, whereas `derep_reads[1:5]` would pad the subset with NULL entries
# when fewer than five samples are available.
model_subset <- head(derep_reads, 5)

# No error rates are supplied (err = NULL); selfConsist = TRUE is requested so
# that dada() estimates them from the data.
dd_model <- dada(model_subset, err = NULL, selfConsist = TRUE)
- Infer the sequence composition of the samples using the error parameters estimated in the previous step (Step 3):
# Run dada() on every dereplicated sample with pooling enabled, supplying the
# `err_out` component of the first model fitted on the sample subset.
dada_all <- dada(
  derep_reads,
  err = dd_model[[1]]$err_out,
  pool = TRUE
)
- Assign taxonomy to the sequences:
# Build the sequence table from the inferred samples.
sequence_tb <- makeSequenceTable(dada_all)

# Assign taxonomy against the reference FASTA under datasets/ch5 in the
# working directory.
taxonomy_tb <- assignTaxonomy(
  sequence_tb,
  refFasta = file.path(getwd(), "datasets", "ch5", "rdp_train_set_14.fa")
)

# Show the first six columns of the first row of the result.
taxonomy_tb[1, 1:6]