This repository describes a R workflow for the generation of random DNA sequences, based on the approach described in Tourlousse et al. (2017).
Approach: The process for generating the sequences starts from a set of random k-mers that are subsequently concatenated into longer sequences and screened based on user-defined conditions, namely homopolymers, GC content, repeats, palindromes and between-sequence identity.
library(dplyr)
library(Biostrings)
library(stringr)
library(stringi)
library(seqinr)
A set of k-mers is generated using R's sample
command, using the following set of parameters:
kmer_seed = 123
# seed for reproducibility #
kmer_size = 12
# k-mer size #
kmer_count = 10000
# number of k-mers to generate #
kmer_baseProbs = c(0.25,0.25,0.25,0.25)
# sampling probabilities for "A" "C" "T" "G" #
kmer_GClow = 30
# lower bound on k-mer GC content #
kmer_GChigh = 70
# upper bound on k-mer GC content #
kmer_maximumLengthHompolymers = 4
# upper bound on homopolymer length #
Based on this set of parameters, additional script parameters are set as follows:
set.seed(kmer_seed)
seedNumbers <- sample(1:(kmer_count*10000), kmer_count*100, replace = FALSE)
outputCounter <- 1
seedCounter <- 1
The k-mers are then generated using the following loop:
kmersDF <- NULL
while (outputCounter <= kmer_count) {
set.seed(seedNumbers[seedCounter])
tmp_kmersDF <- data.frame("kmerID" = paste0("kmer", seedCounter),
"kmer" = paste(Biostrings::DNA_BASES[sample(1:4,
kmer_size,
rep = TRUE,
prob = kmer_baseProbs)],
collapse=""))
tmp_kmersDF <- tmp_kmersDF %>%
dplyr::filter(!grepl(paste(rep("A", kmer_maximumLengthHompolymers), collapse=""), kmer)) %>%
dplyr::filter(!grepl(paste(rep("C", kmer_maximumLengthHompolymers), collapse=""), kmer)) %>%
dplyr::filter(!grepl(paste(rep("T", kmer_maximumLengthHompolymers), collapse=""), kmer)) %>%
dplyr::filter(!grepl(paste(rep("G", kmer_maximumLengthHompolymers), collapse=""), kmer)) %>%
dplyr::mutate(As = str_count(kmer, "A")) %>%
dplyr::mutate(Cs = str_count(kmer, "C")) %>%
dplyr::mutate(Gs = str_count(kmer, "G")) %>%
dplyr::mutate(Ts = str_count(kmer, "T")) %>%
dplyr::filter(As > 0 & Cs > 0 & Gs > 0 & Ts > 0) %>%
dplyr::mutate(GCcontent = round((Cs+Gs)/kmer_size*100, 1)) %>%
dplyr::filter(GCcontent >= kmer_GClow & GCcontent <= kmer_GChigh) %>%
dplyr::mutate_if(is.factor, as.character)
kmersDF <- rbind(kmersDF, tmp_kmersDF) %>% dplyr::distinct(kmer, .keep_all = TRUE)
outputCounter = 1 + nrow(kmersDF)
seedCounter = seedCounter + 1
}
rm(tmp_kmersDF)
Based on the above generated k-mers, full-length random sequences are then generated by randomly combining the k-mers without replacement, using the following set of parameters:
seq_seed = 456
# seed for reproducibility #
seq_size = 1200
# sequence size #
seq_count = 50
# number of sequences to generate #
seq_GClow = 45
# lower bound on sequence GC content #
seq_GChigh = 55
# upper bound on sequence GC content #
seq_maximumLengthExactRepeatsPalindromes = 10
# maximum length of perfect repeats, inverted repeats and palindromes #
seq_maximumLengthHompolymers = 4
# upper bound on homopolymer length #
Based on this set of parameters, additional script parameters are set as follows:
set.seed(seq_seed)
outputCounter <- 1
seedCounter <- 1
seedNumbers <- sample(1:(seq_count*10000), seq_count*100, replace = FALSE)
numberKmers = seq_size / kmer_size
seq_kmerProbs = as.vector(rep(1/kmer_count, kmer_count))
The sequences are then generated using the following loop:
seqsDF <- NULL
while (outputCounter <= seq_count) {
set.seed(seedNumbers[seedCounter])
tmp_seqsDF <- data.frame("seqID" = paste0("seq", seedCounter),
"sequence" = paste(as.vector(kmersDF$kmer)[sample(1:kmer_count,
numberKmers,
rep = FALSE,
prob = seq_kmerProbabilities)],
collapse="")) %>%
dplyr::mutate_if(is.factor, as.character)
tmp_repeats <- NULL
for(i in 1:(seq_size-seq_maximumLengthExactRepeatsPalindromes+1)) {
string = substr(tmp_seqsDF$sequence, i, i+seq_maximumLengthExactRepeatsPalindromes-1)
tmp_repeats <- rbind(tmp_repeats,
data.frame("position" = i,
"count_repeats" = str_count(tmp_seqsDF$sequence,
Biostrings::toString(DNAString(string))) - 1,
"count_invertedrepeats" = str_count(tmp_seqsDF$sequence,
Biostrings::toString(reverse(DNAString(string)))),
"count_palindromes" = str_count(tmp_seqsDF$sequence,
Biostrings::toString(reverseComplement(DNAString(string))))))
}
if(max(tmp_repeats$count_repeats) > 0 |
max(tmp_repeats$count_invertedrepeats) > 0 |
max(tmp_repeats$count_palindromes) > 0) {
tmp_seqsDF <- data.frame("seqID" = as.character(),
"sequence" = as.character())
}
tmp_seqsDF <- tmp_seqsDF %>%
dplyr::filter(!grepl(paste(rep("A", seq_maximumLengthHompolymers), collapse=""), sequence)) %>%
dplyr::filter(!grepl(paste(rep("C", seq_maximumLengthHompolymers), collapse=""), sequence)) %>%
dplyr::filter(!grepl(paste(rep("T", seq_maximumLengthHompolymers), collapse=""), sequence)) %>%
dplyr::filter(!grepl(paste(rep("G", seq_maximumLengthHompolymers), collapse=""), sequence)) %>%
dplyr::mutate(As = str_count(sequence, "A")) %>%
dplyr::mutate(Cs = str_count(sequence, "C")) %>%
dplyr::mutate(Ts = str_count(sequence, "T")) %>%
dplyr::mutate(Gs = str_count(sequence, "G")) %>%
dplyr::mutate(GCcontent = round((Cs+Gs)/seq_size*100,1)) %>%
dplyr::filter(GCcontent >= seq_GClow & GCcontent <= seq_GChigh) %>%
dplyr::mutate_if(is.factor, as.character)
seqsDF <- rbind(seqsDF, tmp_seqsDF)
outputCounter = 1 + nrow(seqsDF)
seedCounter = seedCounter + 1
}
rm(tmp_seqsDF)
Based on the above generated sequences, a set of filtered sequences is then generated based on filtering based on between-sequence identities, using pairwise local alignment with Biostrings's pairwiseAlignment
command. The following parameters are used:
seqfilter_seed = 789
# seed for reproducibility #
seqfilter_count = 5
# number of sequences to generate #
seqfilter_maximumAlignmentLength = 16
# upper bound on pairwise alignment lenghts, including indels #
Based on this set of parameters, additional script parameters are set as follows:
selectedSeqsDFcomplement <- seqsDF
selectedSeqsDF <- NULL
selectedSeqsDF_rows <- 0
outputCounter <- 0
n = seqfilter_count
The sequences are then generated using the following loop:
while(n > 0) {
set.seed(seqfilter_seed)
selectedSeqsDF <- rbind(selectedSeqsDF,
selectedSeqsDFcomplement %>%
dplyr::sample_n(., n, replace = FALSE,
weight = as.vector(rep(1/nrow(selectedSeqsDFcomplement), nrow(selectedSeqsDFcomplement)))))
selectedSeqsDFcomplement <- selectedSeqsDFcomplement %>%
dplyr::filter(., !seqID %in% as.vector(selectedSeqsDF$seqID))
alignmentDistances <- NULL
for(i in 1:(nrow(selectedSeqsDF)-1)) {
for(j in seq(i+1, nrow(selectedSeqsDF), 1)) {
localAligment <- pairwiseAlignment(pattern = selectedSeqsDF[i, 2], subject = selectedSeqsDF[j, 2],
substitutionMatrix = nucleotideSubstitutionMatrix(match = 2,
mismatch = -3, baseOnly = TRUE, type = "DNA"),
type = "local", gapOpening = 5, gapExtension = 2)
alignmentDistances <- rbind(alignmentDistances,
data.frame("pattern" = selectedSeqsDF[i, 1],
"subject" = selectedSeqsDF[j, 1],
"length_pattern" = nchar(localAligment@pattern),
"length_subject" = nchar(localAligment@subject),
"identity" = round(pid(localAligment, type = "PID1"), 2)))
}
}
alignmentDistances <- alignmentDistances %>%
dplyr::filter(length_pattern >= seqfilter_maximumAlignmentLength &
length_subject >= seqfilter_maximumAlignmentLength)
selectedSeqsDF <- selectedSeqsDF %>% dplyr::filter(., !seqID %in% as.vector(alignmentDistances$pattern))
n <- seqfilter_count - nrow(selectedSeqsDF)
}