Hi,
I am trying to create a sequence table from the dereplicated FASTA files by Vsearch and run the chimera filtering by DADA2 over it. I created an OTU table with Vsearch using the commands below:
#!/bin/bash
# Combine all FASTA files into one
cat *.fasta > combined.fasta
# Perform global dereplication to simplify the database
vsearch --derep_fulllength combined.fasta --output dereplicated.fasta --sizeout --minuniquesize 1
# Use the dereplicated sequences as a database; find exact matches and count occurrences
vsearch --search_exact combined.fasta --db dereplicated.fasta --otutabout otu_table.txt --threads 8
echo "OTU table generated with exact sequence matches."
Later on, I transposed the table and also changed the OTU ids, which in this case are my FASTA headers, to the related sequences. I try to transform the table into a "matrix" "array" as well. Here is the script that i use for this matter:
library(data.table)
library(tibble)
library(parallel)
library(doParallel)
library(dada2)
# Load OTU table
otu_table <- fread("otu_table.txt", data.table = FALSE)
# Transpose the table
otu_table_t <- t(otu_table)
# Convert transposed data to data frame and handle potential issues with row names
otu_table_df <- as.data.frame(otu_table_t, stringsAsFactors = FALSE)
names(otu_table_df) <- otu_table_df[1, ] # Set first row as column names
otu_table_df <- otu_table_df[-1, ] # Remove the first row after setting column names
library(Biostrings)
# Load sequences from FASTA file
sequences <- readDNAStringSet("dereplicated.fasta") # adjust the path as needed
fasta_ids <- unlist(mclapply(names(sequences), function(name) {
strsplit(name, ";")[[1]][1]
}, mc.cores = detectCores()))
# Update column names in otu_table_df using parallel mapply
colnames(otu_table_df) <- mcmapply(function(id) {
sequence_index <- which(fasta_ids == id)
if(length(sequence_index) > 0) {
as.character(sequences[sequence_index])
} else {
NA # Ensures that unmatched IDs are flagged
}
}, colnames(otu_table_df), mc.cores = detectCores())
# Save the processed data
write.csv(otu_table_df, "otu_parallel.csv", row.names = TRUE, quote = FALSE)
# Convert the data frame to a matrix and coerce types if necessary
otu_matrix <- data.matrix(otu_table_df)
if(any(is.na(otu_matrix))) {
warning("There are NA values in the matrix; these will need to be handled before proceeding.")
}
# Check the structure and class of the matrix to ensure it's suitable for DADA2
print(paste("Dimensions of matrix:", dim(otu_matrix)))
print(paste("Class of matrix:", class(otu_matrix)))
# Remove or impute NA/infinite values if necessary
otu_matrix[is.na(otu_matrix)] <- 0 # Example: Replace NAs with 0
otu_matrix[is.infinite(otu_matrix)] <- 0 # Replace infinite values with 0
# Re-check if all values are now appropriate
if(any(is.na(otu_matrix)) || any(is.infinite(otu_matrix))) {
stop("Data still contains NA or infinite values, please check and correct them.")
}
# Ensure otu_table_df is loaded and inspect data types
print("Data types in the original data frame:")
print(sapply(otu_table_df, class))
# Convert all columns in the data frame to numeric type if they are not already
otu_table_df[] <- lapply(otu_table_df, function(x) as.numeric(as.character(x)))
# Handle any possible NA values that could be introduced by incorrect conversions
otu_table_df[is.na(otu_table_df)] <- 0
# Convert the cleaned data frame to a matrix
otu_matrix <- data.matrix(otu_table_df)
but when I proceed to the chimera removal, i get this error:
nonchimeric_seqtab <- removeBimeraDenovo(otu_matrix, minSampleFraction = 0.9, ignoreNNegatives = 1, method = "consensus", minFoldParentOverAbundance = 8, minParentAbundance = 2, allowOneOff = TRUE, minOneOffParentDistance = 2, maxShift = 16, multithread = TRUE, verbose = TRUE)
Error in isBimeraDenovoTable(unqs[[i]], ..., verbose = verbose) : Input must be a valid sequence table.
I am not sure what else I should do to ensure that the sequence table is properly formatted for the DADA2 to process. I appreciate your help in advance.