RandomSequenceGeneratoR

This repository describes a R workflow for the generation of random DNA sequences, based on the approach described in Tourlousse et al. (2017).

Approach: The process for generating the sequences starts from a set of random k-mers that are subsequently concatenated into longer sequences and screened based on user-defined conditions, namely homopolymers, GC content, repeats, palindromes and between-sequence identity.

Loading required R packages

library(dplyr)
library(Biostrings)
library(stringr)
library(stringi)
library(seqinr)

Generating set of k-mers

A set of k-mers is generated using R's sample command, using the following set of parameters:

kmer_seed = 123 # seed for reproducibility #

kmer_size = 12 # k-mer size #

kmer_count = 10000 # number of k-mers to generate #

kmer_baseProbs = c(0.25,0.25,0.25,0.25) # sampling probabilities for "A" "C" "T" "G" #

kmer_GClow = 30 # lower bound on k-mer GC content #

kmer_GChigh = 70 # upper bound on k-mer GC content #

kmer_maximumLengthHompolymers = 4 # upper bound on homopolymer length #

Based on this set of parameters, additional script parameters are set as follows:

set.seed(kmer_seed)
seedNumbers <- sample(1:(kmer_count*10000), kmer_count*100, replace = FALSE)
outputCounter <- 1
seedCounter <- 1

The k-mers are then generated using the following loop:

kmersDF <- NULL
while (outputCounter <= kmer_count) {
  set.seed(seedNumbers[seedCounter])
  tmp_kmersDF <- data.frame("kmerID" = paste0("kmer", seedCounter),
                            "kmer" = paste(Biostrings::DNA_BASES[sample(1:4,
                                                                        kmer_size,
                                                                        rep = TRUE,
                                                                        prob = kmer_baseProbs)],
                                           collapse=""))
  tmp_kmersDF <- tmp_kmersDF %>% 
    dplyr::filter(!grepl(paste(rep("A", kmer_maximumLengthHompolymers), collapse=""), kmer)) %>% 
    dplyr::filter(!grepl(paste(rep("C", kmer_maximumLengthHompolymers), collapse=""), kmer)) %>% 
    dplyr::filter(!grepl(paste(rep("T", kmer_maximumLengthHompolymers), collapse=""), kmer)) %>% 
    dplyr::filter(!grepl(paste(rep("G", kmer_maximumLengthHompolymers), collapse=""), kmer)) %>%
    dplyr::mutate(As = str_count(kmer, "A")) %>%
    dplyr::mutate(Cs = str_count(kmer, "C")) %>%
    dplyr::mutate(Gs = str_count(kmer, "G")) %>%
    dplyr::mutate(Ts = str_count(kmer, "T")) %>%
    dplyr::filter(As > 0 & Cs > 0 & Gs > 0 & Ts > 0) %>%
    dplyr::mutate(GCcontent = round((Cs+Gs)/kmer_size*100, 1)) %>%
    dplyr::filter(GCcontent >= kmer_GClow & GCcontent <= kmer_GChigh) %>%
    dplyr::mutate_if(is.factor, as.character)
  kmersDF <- rbind(kmersDF, tmp_kmersDF) %>% dplyr::distinct(kmer, .keep_all = TRUE) 
  outputCounter = 1 + nrow(kmersDF)
  seedCounter = seedCounter + 1
}
rm(tmp_kmersDF)

Generating set of full length sequences

Based on the above generated k-mers, full-length random sequences are then generated by randomly combining the k-mers without replacement, using the following set of parameters:

seq_seed = 456 # seed for reproducibility #

seq_size = 1200 # sequence size #

seq_count = 50 # number of sequences to generate #

seq_GClow = 45 # lower bound on sequence GC content #

seq_GChigh = 55 # upper bound on sequence GC content #

seq_maximumLengthExactRepeatsPalindromes = 10 # maximum length of perfect repeats, inverted repeats and palindromes #

seq_maximumLengthHompolymers = 4 # upper bound on homopolymer length #

Based on this set of parameters, additional script parameters are set as follows:

set.seed(seq_seed)
outputCounter <- 1
seedCounter <- 1
seedNumbers <- sample(1:(seq_count*10000), seq_count*100, replace = FALSE)
numberKmers = seq_size / kmer_size
seq_kmerProbs = as.vector(rep(1/kmer_count, kmer_count))

The sequences are then generated using the following loop:

seqsDF <- NULL
while (outputCounter <= seq_count) {
  set.seed(seedNumbers[seedCounter])
  tmp_seqsDF <- data.frame("seqID" = paste0("seq", seedCounter),
                           "sequence" = paste(as.vector(kmersDF$kmer)[sample(1:kmer_count, 
                                                                             numberKmers,
                                                                             rep = FALSE, 
                                                                             prob = seq_kmerProbabilities)], 
                                             collapse="")) %>%
    dplyr::mutate_if(is.factor, as.character)
  tmp_repeats <- NULL
  for(i in 1:(seq_size-seq_maximumLengthExactRepeatsPalindromes+1)) {
    string = substr(tmp_seqsDF$sequence, i, i+seq_maximumLengthExactRepeatsPalindromes-1)
    tmp_repeats <- rbind(tmp_repeats, 
                         data.frame("position" = i,
                                    "count_repeats" = str_count(tmp_seqsDF$sequence, 
                                     Biostrings::toString(DNAString(string))) - 1,
                                    "count_invertedrepeats" = str_count(tmp_seqsDF$sequence, 
                                     Biostrings::toString(reverse(DNAString(string)))),
                                    "count_palindromes" = str_count(tmp_seqsDF$sequence, 
                                     Biostrings::toString(reverseComplement(DNAString(string))))))                               
  }
  if(max(tmp_repeats$count_repeats) > 0 | 
     max(tmp_repeats$count_invertedrepeats) > 0 | 
     max(tmp_repeats$count_palindromes) > 0) {
    tmp_seqsDF <- data.frame("seqID" = as.character(), 
                             "sequence" = as.character())
  }
  tmp_seqsDF <- tmp_seqsDF %>% 
    dplyr::filter(!grepl(paste(rep("A", seq_maximumLengthHompolymers), collapse=""), sequence)) %>% 
    dplyr::filter(!grepl(paste(rep("C", seq_maximumLengthHompolymers), collapse=""), sequence)) %>% 
    dplyr::filter(!grepl(paste(rep("T", seq_maximumLengthHompolymers), collapse=""), sequence)) %>% 
    dplyr::filter(!grepl(paste(rep("G", seq_maximumLengthHompolymers), collapse=""), sequence)) %>%
    dplyr::mutate(As = str_count(sequence, "A")) %>%
    dplyr::mutate(Cs = str_count(sequence, "C")) %>%
    dplyr::mutate(Ts = str_count(sequence, "T")) %>%
    dplyr::mutate(Gs = str_count(sequence, "G")) %>%
    dplyr::mutate(GCcontent = round((Cs+Gs)/seq_size*100,1)) %>%
    dplyr::filter(GCcontent >= seq_GClow & GCcontent <= seq_GChigh) %>%
    dplyr::mutate_if(is.factor, as.character)
  seqsDF <- rbind(seqsDF, tmp_seqsDF)
  outputCounter = 1 + nrow(seqsDF)
  seedCounter = seedCounter + 1
}
rm(tmp_seqsDF)

Filtering of full length sequences

Based on the above generated sequences, a set of filtered sequences is then generated based on filtering based on between-sequence identities, using pairwise local alignment with Biostrings's pairwiseAlignment command. The following parameters are used:

seqfilter_seed = 789 # seed for reproducibility #

seqfilter_count = 5 # number of sequences to generate #

seqfilter_maximumAlignmentLength = 16 # upper bound on pairwise alignment lenghts, including indels #

Based on this set of parameters, additional script parameters are set as follows:

selectedSeqsDFcomplement <- seqsDF
selectedSeqsDF <- NULL
selectedSeqsDF_rows <- 0
outputCounter <- 0
n = seqfilter_count

The sequences are then generated using the following loop:

while(n > 0) {
set.seed(seqfilter_seed)
  selectedSeqsDF <- rbind(selectedSeqsDF,
                          selectedSeqsDFcomplement %>%
                            dplyr::sample_n(., n, replace = FALSE,  
                            weight = as.vector(rep(1/nrow(selectedSeqsDFcomplement), nrow(selectedSeqsDFcomplement)))))
  selectedSeqsDFcomplement <- selectedSeqsDFcomplement %>% 
    dplyr::filter(., !seqID %in% as.vector(selectedSeqsDF$seqID))
  alignmentDistances <- NULL
  for(i in 1:(nrow(selectedSeqsDF)-1)) {
  for(j in seq(i+1, nrow(selectedSeqsDF), 1)) {
    localAligment <- pairwiseAlignment(pattern = selectedSeqsDF[i, 2], subject = selectedSeqsDF[j, 2],
                                       substitutionMatrix = nucleotideSubstitutionMatrix(match = 2,
                                       mismatch = -3, baseOnly = TRUE, type = "DNA"),
                                       type = "local", gapOpening = 5, gapExtension = 2) 
    alignmentDistances <- rbind(alignmentDistances,
                                data.frame("pattern" = selectedSeqsDF[i, 1],
                                           "subject" = selectedSeqsDF[j, 1],
                                           "length_pattern" = nchar(localAligment@pattern),
                                           "length_subject" = nchar(localAligment@subject),
                                           "identity" = round(pid(localAligment, type = "PID1"), 2)))
    }
  }
  alignmentDistances <- alignmentDistances %>% 
    dplyr::filter(length_pattern >= seqfilter_maximumAlignmentLength & 
                  length_subject >= seqfilter_maximumAlignmentLength)
  selectedSeqsDF <- selectedSeqsDF %>% dplyr::filter(., !seqID %in% as.vector(alignmentDistances$pattern))
  n <- seqfilter_count - nrow(selectedSeqsDF)
}

Name		Name	Last commit message	Last commit date
Latest commit History 12 Commits
README.md		README.md

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Repository files navigation

RandomSequenceGeneratoR

Loading required R packages

Generating set of k-mers

Generating set of full length sequences

Filtering of full length sequences

About

Releases

Packages

aistBMRG/RandomSequenceGeneratoR

Folders and files

Latest commit

History

Repository files navigation

RandomSequenceGeneratoR

Loading required R packages

Generating set of k-mers

Generating set of full length sequences

Filtering of full length sequences

About

Resources

Stars

Watchers

Forks

Releases

Packages 0

Packages