Prepare 3 tab format for RNAseq dataset uploading

The following R code walks through preparing a dataset for upload to gEAR using the 3-tab format. If you would like assistance with the formatting or upload process, please contact our curator team.

  1. load in count file
		# Read the tab-delimited count table named by `countsFileName` (that
		# variable must be set before this step). check.names = FALSE keeps the
		# column names exactly as they appear in the file.
		count <- read.delim(countsFileName, sep = "\t", header = TRUE, check.names = FALSE)

		# Call the first column "gene" and reuse its values as row names.
		# Row names must be unique, so this stops with an error if a value repeats.
		names(count)[1] <- "gene"
		rownames(count) <- count$gene
  1. generate the observation file
		# Raise the row/column limits used when tables are displayed
		# (repr.* options; presumably for a Jupyter/IRkernel session — confirm).
		options(repr.matrix.max.rows=50, repr.matrix.max.cols=200)

		# Read the tab-delimited sample metadata from phenotype.txt.
		meta = read.delim("phenotype.txt")

			# Rename the columns; replace "gender" with your variable of interest.
			# NOTE(review): `obs` is not created anywhere in this snippet — presumably
			# it is built from `meta` and has three columns; confirm before running.
			names(obs) = c("observations","genotype", "gender")

#keep replicate name the same

#pipe the data frame to group_by(), then pipe the result to mutate(), which numbers the rows within each group and stores that number in the variable replicate

		# %>%, group_by(), mutate() and row_number() come from dplyr.
		library(dplyr)

		# Number the samples within each genotype/gender combination.
		obs <- obs %>%
		  group_by(genotype, gender) %>%
		  mutate(replicate = row_number())
		# Convert the grouped result back to a plain data frame.
		obs <- data.frame(obs)

		# Write to a .tab file so the final tar step (which bundles *.tab) picks it
		# up; file = "" would only print the table to the console.
		# NOTE(review): "observations.tab" is the assumed gEAR 3-tab file name — confirm.
		write.table(obs, file = "observations.tab", sep = "\t", row.names = FALSE, quote = FALSE)
  1. annotation: check whether the count file contains Ensembl IDs
			# Report whether any Ensembl IDs were found.
			# NOTE(review): `temp` is not defined in this snippet — presumably it holds
			# the rows of the count table whose IDs look like Ensembl IDs; confirm.
			if (nrow(temp) == 0) {
			  print('Error: no ensemble detected')
			} else {
			  print('ensembl ID are detected')
			}

Here we are using the mouse annotation database. If you wish to use a different database, run listDatasets(mart) to get a full list of the available annotations.

                        # useMart(), listDatasets(), useDataset() and getBM() come from biomaRt.
                        library(biomaRt)

                        # Open the 'ensembl' mart and list the datasets it offers; inspect
                        # `datasets` to pick a different species.
                        mart <- useMart('ensembl')
                        datasets <- listDatasets(mart)
                        mart <- useDataset('mmusculus_gene_ensembl', mart = mart)

                        # Fetch the Ensembl gene ID / gene name pairs and call the name
                        # column "gene_symbol".
                        ensembl <- getBM(attributes = c('ensembl_gene_id', 'external_gene_name'), mart = mart)
                        names(ensembl)[2] <- "gene_symbol"

                        # Only keep the first Ensembl ID for each gene, so that a later merge
                        # on gene_symbol cannot return several rows for one gene.
                        ensembl <- ensembl[!duplicated(ensembl$gene_symbol), ]


	#merge by gene_symbol or ensembl ID 

			# Attach Ensembl IDs to the count table. `count` carries its identifiers
			# in the column "gene" (named when the count file was loaded), so that is
			# the key on the count side; the merged key column keeps the name
			# "gene_symbol" from `ensembl`.
			count.ann <- merge(ensembl, count, by.x = "gene_symbol", by.y = "gene")

			# After the merge, column 1 is the gene symbol and column 2 the Ensembl ID.
			names(count.ann)[c(2, 1)] <- c("ensembl_ID", "gene_symbol")

			# Expression table: every column except gene_symbol. Written to a .tab
			# file so the final tar step (which bundles *.tab) picks it up;
			# file = "" would only print the table to the console.
			# NOTE(review): the file names below are assumed from the gEAR 3-tab
			# convention — confirm.
			write.table(count.ann[-1], file = "expression.tab", sep = "\t", quote = FALSE, row.names = FALSE)

			# NOTE(review): `genes_to_use` is not defined in this snippet — presumably
			# the ensembl_ID / gene_symbol columns of count.ann; confirm.
			write.table(genes_to_use, file = "genes.tab", sep = "\t", quote = FALSE, row.names = FALSE)

If the count file contains raw counts, basic normalization needs to be completed before uploading.




##quantile normalization of cpm

		# Quantile-normalise the columns of a numeric table.
		#
		# df: numeric data frame or matrix with one column per sample; expected to
		#     have at least two rows and no missing values.
		# Returns a numeric matrix with the dimensions and row names of `df`. The
		# value ranked k within a column is replaced by the mean of the k-th
		# smallest values of all columns; tied values share the lowest rank.
		quantile_normalisation <- function(df){
		  df_rank <- apply(df, 2, rank, ties.method = "min")
		  df_sorted <- data.frame(apply(df, 2, sort))
		  df_mean <- apply(df_sorted, 1, mean)

		  # Look up the mean that belongs to each rank.
		  index_to_mean <- function(my_index, my_mean){
		    my_mean[my_index]
		  }

		  df_final <- apply(df_rank, 2, index_to_mean, my_mean = df_mean)
		  # The lookup carries over the names of the sorted values, so restore the
		  # original row order labels.
		  rownames(df_final) <- rownames(df)
		  df_final
		}
		# Save the normalised table as tab-delimited text, keeping the row names.
		# NOTE(review): `nq` and `outfile_prefix` are not defined in this snippet —
		# presumably `nq` is the output of quantile_normalisation(); confirm. Nothing
		# is appended to the prefix, so the file is only bundled by the *.tab tar
		# step if the prefix itself ends in ".tab".
		write.table(nq, file = paste0(outfile_prefix, ""), sep = "\t", quote = FALSE, row.names = TRUE)

Compress files together for uploading

	# Bundle every .tab file in the working directory into one gzip-compressed
	# archive named upload.tar.gz.
	system('tar -czvf upload.tar.gz *.tab')