I want to bin the contigs using the ESOM method. But the length of input sequences needs > 2kb and <5kb as the suggestion in the original paper. What kind of software can I use to split the contigs longer than 5 kb into smaller ones.
# Load libraries
library("Biostrings")
library(seqinr)
# Variables
size_to_split<-99999
input_file<-"assembly.fasta"
# End of variables
assembly<-readDNAStringSet(filepath=input_file)
newframe<-data.frame(name_seq=character(0),size_seq=integer(0),seq_seq=character(0),stringsAsFactors = F)
name_seq = names(assembly)
seq_seq = paste(assembly)
size_seq<-nchar(seq_seq)
mydataframe <- data.frame(name_seq, size_seq, seq_seq, stringsAsFactors = F)
# For every contig
for (i in which(size_seq>size_to_split)) {
# Test if the contig is longer than the limit of size_to_split
if (mydataframe[i,2]>size_to_split) {
# Get the number of fragments needed for the contig
fragments<-ceiling(mydataframe[i,2]/size_to_split)
# For every fragment
for (n in 1:fragments) {
# Calculate the start position
start<-(1+((n-1)*size_to_split))
# Check if the fragment is the last fragment
if (n==fragments) {
# Set the stop position to the length tf the contig
stop=nchar(mydataframe[i,3])
} else {
# Else calculate the stop position
stop<- (((n)*size_to_split))
}
splitted<-substr(mydataframe[i,3], start = start, stop = stop)
tmp_frame<-data.frame(name_seq=(paste0(i,"_",n)),size_seq=nchar(splitted),seq_seq=splitted,stringsAsFactors = F)
newframe<-rbind(newframe,tmp_frame)
}
} else {
newframe<-rbind(newframe,mydataframe[i,])
}
}
newframe2<-rbind(newframe,
mydataframe[which((size_seq<=size_to_split)),])
# Write the splitted sequence to a new file
write.fasta(sequences = as.list(newframe2$seq_seq),names = newframe$name_seq,file.out = paste0("splitted_",input_file))