% (removed: non-LaTeX artifacts from web-page extraction — GitHub page chrome and line-number gutter)
%% BioMed_Central_Tex_Template_v1.05
%% %
% bmc_article.tex ver: 1.05 % %
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% %%
%% LaTeX template for BioMed Central %%
%% journal article submissions %%
%% %%
%% <27 January 2006> %%
%% %%
%% %%
%% Uses: %%
%% cite.sty, url.sty, bmc_article.cls %%
%% ifthen.sty, multicol.sty %%
%% %%
%% %%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% %%
%% For instructions on how to fill out this Tex template %%
%% document please refer to Readme.pdf and the instructions for %%
%% authors page on the biomed central website %%
%% http://www.biomedcentral.com/info/authors/ %%
%% %%
%% Please do not use \input{...} to include other tex files. %%
%% Submit your LaTeX manuscript as one .tex document. %%
%% %%
%% All additional figures and files should be attached %%
%% separately and not embedded in the \TeX\ document itself. %%
%% %%
%% BioMed Central currently uses the MikTex distribution %%
%% (for Windows) of TeX and LaTeX. This is available from %%
%% http://www.miktex.org %%
%% %%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\NeedsTeXFormat{LaTeX2e}[1995/12/01] \documentclass[10pt]{bmc_article}
% Load packages
\usepackage{cite} % Make references as [1-4], not [1,2,3,4]
\usepackage{url} % Formatting web addresses
\usepackage{ifthen} % Conditional
\usepackage{multicol} %Columns
\usepackage[utf8]{inputenc} %unicode support
%\usepackage[applemac]{inputenc} %applemac support if unicode package fails
%\usepackage[latin1]{inputenc} %%UNIX support if unicode package fails
\urlstyle{rm}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% %%
%% If you wish to display your graphics for %%
%% your own use using includegraphic or %%
%% includegraphics, then comment out the %%
%% following two lines of code. %%
%% NB: These line *must* be included when %%
%% submitting to BMC. %%
%% All figure files must be submitted as %%
%% separate graphics through the BMC %%
%% submission process, not included in the %%
%% submitted article. %%
%% %%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\def\includegraphic{} \def\includegraphics{}
\setlength{\topmargin}{0.0cm} \setlength{\textheight}{21.5cm} \setlength{\oddsidemargin}{0cm}
\setlength{\textwidth}{16.5cm} \setlength{\columnsep}{0.6cm}
\newboolean{publ}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% %%
%% You may change the following style settings %%
%% Should you wish to format your article %%
%% in a publication style for printing out and %%
%% sharing with colleagues, but ensure that %%
%% before submitting to BMC that the style is %%
%% returned to the Review style setting. %%
%% %%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%Review style settings
\newenvironment{bmcformat}{\begin{raggedright}\baselineskip20pt\sloppy\setboolean{publ}{false}}{\end{raggedright}\baselineskip20pt\sloppy}
%Publication style settings
%\newenvironment{bmcformat}{\fussy\setboolean{publ}{true}}{\fussy}
% Begin ...
\begin{document} \begin{bmcformat}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% %%
%% Enter the title of your article here %%
%% %%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\title{Six Questions and Answers Defining Cloud Computing for Digital, Sequencing-Based Biological Research}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% %%
%% Enter the authors here %%
%% %%
%% Ensure \and is entered between all but %%
%% the last two authors. This will be %%
%% replaced by a comma in the final article %%
%% %%
%% Ensure there are no trailing spaces at %%
%% the ends of the lines %%
%% %%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\author{Konstantinos Krampis\correspondingauthor$^{1}$%
\email{Konstantinos Krampis\correspondingauthor - agbiotec@gmail.com}%
\and
Granger Sutton$^{1}$%
\and
Vivek Sarangi$^{1}$%
}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% %%
%% Enter the authors' addresses here %%
%% %%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\address{ \iid(1) Informatics Department, J. Craig Venter Institute, 9704 Medical Center Dr., Rockville, MD 20850, USA }
\maketitle
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% %%
%% The Abstract begins here %%
%% %%
%% The Section headings here are those for %%
%% a Research article submitted to a %%
%% BMC-Series journal. %%
%% %%
%% If your article is not of this type, %%
%% then refer to the Instructions for %%
%% authors on http://www.biomedcentral.com %%
%% and change the section headings %%
%% accordingly. %%
%% %%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{abstract}
% Do not use inserted blank lines (ie \\) until main body of text.
\paragraph*{Background:} Text for this section of the abstract.
\paragraph*{Results:} Text for this section of the abstract \ldots
\paragraph*{Conclusions:} Text for this section of the abstract \ldots \end{abstract}
\ifthenelse{\boolean{publ}}{\begin{multicols}{2}}{}
%%%%%%%%%%%%%%%%
%% Background %%
%%
\section*{Background}
\subsection*{Next-Generation Sequencing, Computing, and Digital Biological Research}
Advances in recent years in the areas of high-throughput sequencing and synthetic genomics have defined ``The
Digital Age of Biology'' \cite{venterdublin}, where Schr\"{o}dinger's vision of ``Life is code'' \cite{Schrodinger1992} is
now implemented in technologies that can be used to convert digital code into DNA that runs a living organism
\cite{Gibson2010}. \pb
The latest generation of sequencing technologies is also being used in the area of metagenomics, where large-scale
studies of uncultivated microbial communities are performed. The J. Craig Venter Institute (JCVI) for example has been
involved in several such metagenomic projects, including the Sorcerer II Global Ocean Sampling (GOS,
\cite{Rusch2007}) expedition to study marine microbial diversity, and also the National Institutes of Health
funded Human Microbiome Project to study human associated microbial communities \cite{Nelson2010}. \pb
Sequencing technologies continue to move in a direction where throughput per run is increasing while cost
per basepair is decreasing (review in \cite{Mason2012}). For instance, one of the most widely used instruments
in the field currently, Illumina's GAIIx system can produce up to 95 Giga-base (Gb) of sequence per run \cite{Illumina}
while the ABI SOLiD sequencer has yields of a similar range \cite{solid5500}. With the latest generation of
instruments such as for example the HiSeq system, yield has reached 600 Gb \cite{Illumina}, while the Pacific
BioSciences sequencer can produce 90 Gb in short amounts of time \cite{PacBio}. \pb
Small form-factor, benchtop sequencers are also available which can be acquired at a fraction of the cost, making them
affordable for independent researchers running small laboratories. Examples in this category
include the GS Junior by 454, Ion Proton by Life Technologies and MiSeq by Illumina, providing sequencing
capacity of 0.035Gb, 1Gb and 1.5Gb respectively (review in \cite{Loman2012a}).
That level of throughput is adequate for sequencing bacterial, small fungal or viral genomes and along with the
low cost per run (US \$225 -\$1100), sequencing has started to become a standard technique even in small laboratories.
Example applications of sequencing for basic biological research include Single Nucleotide Polymorphism
(SNP) variation discovery, gene expression analysis (RNAseq) and DNA-protein interaction analysis
(ChiPseq) (review in \cite{Mardis2008}). \pb
While sequencers generate datasets of significant size, they are typically bundled with only minimal computational and
storage capacity for data capture during a run of the instrument. For example, the un-assembled reads
returned from a single lane, single run of the Illumina GAIIx instrument after base calling are approximately 100 GigaByte
(GB) in size. Therefore, for laboratories acquiring a sequencer, scientific value cannot be obtained
from this investment, unless it is accompanied by an almost equal or greater expense for informatics hardware
infrastructure. In addition, significant software engineering and bioinformatic data analysis expertise is required
\cite{gogol2012overview}, in order to go from sequence data to valuable information such as assembled and
annotated genomes. This means that besides large capacity computing servers, trained personnel competent to install,
configure and use specific software to analyze and store the generated data is required. Furthermore, public databases
and software tools currently available online are not an option for downstream sequence data analysis, since
BLAST \cite{altschul1990basic} available from NCBI for example \cite{johnson2008ncbi}, cannot accept input
sequence files of more than 0.5 GB size for sequence similarity search. \pb
An additional conundrum specific to bioinformatic analysis of sequencing data, is that computationally
intensive tasks that require extensive compute resources such as genome assembly or whole genome alignments
for example, are followed by genome annotation that is much less computationally demanding. This leads to sub-optimal utilization
of computer hardware installed within data centers, while maintenance costs including electricity, cooling and salaries
for informatics support personnel are at constant or even increasing levels. For smaller academic institutions this can pose a significant
impediment in leveraging sequencing technology for research, as in addition to securing the funds for purchasing computing
hardware with adequate capacity to handle large-scale genomic datasets, they also need to maintain informatics systems
that are not utilized at their full capacity most of the time. A second complication is related to the fact that
databases with reference genomes \cite{Pruitt2009} are constantly growing in size, and for most bioinformatic analysis
tasks they need to be downloaded to local storage systems when performing comparative genome annotations. As these
databases grow larger the process becomes more time consuming, incurring higher bandwidth and storage costs for replicating
the data locally. Finally, building a data analysis infrastructure for next-generation sequencing also involves hiring trained
bioinformatics engineers competent to implement specialized software tools and data analysis pipelines, which can incur
higher costs than that of acquiring the computer hardware or maintaining a data center. \pb
%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% Results and Discussion %%
%%
\section*{Results and Discussion}
\subsection*{What Role can Cloud Computing play for Digital Biological Research?}
Computational data analysis can become a major bottleneck for smaller laboratories transitioning their experimental
techniques to sequencing-based methods. Furthermore, sequencing instrument capacities follow an uptrend that surpasses
that of Moore's Law \cite{schaller1997moore} while the cost per base pair follows an inverted trend, resulting in the genomics
community constantly taking on sequencing projects of increasing scale and scope. Currently, most federally-funded projects
conclude with upload of the sequences or annotated genomes to NCBI \cite{ncbi} databases such as the Sequence
Read Archive (SRA,\cite{sra}). This provides little value to researchers that do not have access to computational resources
or informatics expertise, since multiple steps including data download, provision of high-performance computer servers and large-scale data storage, in addition to
compiling and installing specialized bioinformatics software are required in order to utilize these datasets. \pb
As an alternative to investing in informatics infrastructure, researchers can rent computational
and storage capacity from Cloud services such as Amazon EC2 \cite{awsec2}. This can potentially be
a better economic model for smaller research laboratories, as the cost for hardware and data center
maintenance cannot be justified for only a few sequencing experiments. The Amazon EC2 Cloud,
employs a charge model similar to traditional utilities such as electricity and users are billed based
on the amount of computational capacity consumed on an hourly basis \cite{ec2price}. This particular Cloud
service consists of thousands of computer servers with petabytes of storage, leveraging economies of
scale to achieve low operational costs that are in turn offered as savings to users. Furthermore, the Amazon Cloud
has data centers in US East and West regions, European Union, Asia and Australia \cite{ec2regions}, providing researchers
worldwide with the ability to tap into a large pool of computational resources, outside of institutional,
economic or geographic boundaries. Overall, renting computational capacity from the Cloud has the
potential to eliminate many of the upfront capital expenses for building information technology infrastructures
for next generation sequencing, and result in transformation of the analysis and data processing tasks into well
defined operational costs. \pb
The 1000 Human genomes project \cite{clarke2012} has demonstrated a new approach for distributing genomic datasets
in combination with computational capacity, by depositing sequence reads and reference genome mapping data on the
Amazon Cloud storage. These datasets can be directly accessed by renting servers loaded with pre-installed bioinformatics
software on the Cloud, and such examples have been provided by the Cloud BioLinux project \cite{1000tube1}, \cite{1000tube2},
\cite{Krampis2012}. Overall, by placing data from public projects on Cloud compute platforms, value for the community is
immediately increased; no infrastructure other than a computer with internet access is required for a researcher to upload
data generated at his or her laboratory, rent computational capacity and perform comparative analysis with
data available on the Cloud storage. \pb
On the other hand, large research institutes with in-house bioinformatic core facilities that complete sequencing of new
genomes on a regular basis, also have the resources and expertise to deliver more to the community than just sequence
reads and annotated genomes uploaded to publicly-accessible repositories \cite{Pruitt2009}. Currently, while many of the centers
deposit software on open-source code repositories such as SourceForge \cite{sourceforge} or GitHub \cite{github}, an
impediment for researchers trying to utilize the released software is the requirement to download, compile and configure all the
dependencies including the type of operating system, software libraries, and computer hardware. This creates a bottleneck
for laboratories lacking the required informatics expertise or computational infrastructure, that could be alleviated if bioinformatic
core facilities capture and distribute the bioinformatics expertise developed during each sequencing project on Cloud servers
containing pre-configured bioinformatics tools, made publicly available along with the data on the Cloud. This approach,
can democratize access to computational resources required for high throughput sequencing data analysis and
allow further adoption of sequencing technology for basic biology research. Specialized compute servers with bioinformatics tools
and data pipelines available on the Cloud through the Amazon EC2 service for example, could provide publicly accessible,
high performance data analysis platforms for use by research groups acquiring sequencing capability. These platforms
could be rented on-demand at low cost, removing the need for implementing informatics infrastructure at each laboratory. \pb
\subsection*{Are Cloud-Based Bioinformatics Software Suites Available on the Cloud?}
A number of systems that allow making bioinformatics tools accessible online through web portals have been
developed during the past years, including the Biology Workbench \cite{Subramaniam1998Biology},
PISE \cite{Letondal2001Web}, wEMBOSS \cite{Sarachu2005WEMBOSS}, Mobyle \cite{Neron2009Mobyle}
BioManager \cite{Cattley2007BioManager} and BioExtract \cite{Lushbough2008Implementing}. Some
of those systems are not actively developed anymore; most require significant software development effort
and back-end system customization in order to deploy the tools through a web interface; with the exception of Mobyle,
none of these portals provides users with a straightforward option to create and edit complex data analysis workflows;
users have the option to download the source code for each system, but need to provision the hardware and
software engineering expertise to set up their own instance of the portal; large-scale data management or collaborative
data sharing and exchange among users is cumbersome, and available storage space is fixed; finally, these portals do
not leverage the Cloud's scalability but are rather limited by the capacity of the server where the portal is installed,
posing a limitation on the dataset size that can be processed. \pb
On the other hand, centralized web portals for large-scale data analysis of bioinformatic datasets have been developed by
well-funded institutions, including IMG/M \cite{Grigoriev2012}, CAMERA \cite{Altintas2010}, EBI \cite{Hunter2011}
and MG-RAST \cite{Aziz2010}. While these portals are backed by considerable compute resources and data storage,
their centralized nature is what eventually becomes a bottleneck. First, for those of the centralized portals that provide
access to their resources, researchers are required to apply for an account, in order to receive allocation of computational
capacity for performing data analysis. Second, given the constantly increasing scale of genomic datasets, despite their
capacity these portals cannot possibly support all small laboratories that purchase low-cost, benchtop sequencers and
generate sequence data. Finally, a major drawback is that the software for most of these sites is not open-source, while
researchers often have to perform multiple submissions of their datasets, since each site offers a different sequence data
analysis pipeline. \pb
An alternative to these centralized services are Cloud-based, scalable bioinformatics data analysis systems such as Galaxy
\cite{Goecks2010}, CloVR \cite{Angiuoli2011}, Cloud BioLinux \cite{Krampis2012} and BioKepler \cite{Altintas2011}.
These systems are open-source and accessible to any laboratory or research group through Amazon EC2, but also are
available for download and execution on private compute Clouds \cite{awsec2}. As an example, the Galaxy bioinformatics
workbench includes a range of tools, from simple scripts that extract entries from sequence files, to complex algorithms
for processing next-generation sequence data. Furthermore, Galaxy is a complete platform including a web portal software
stack that provides the user interface for executing the bioinformatics tools, in addition to an intuitive, drag and drop
canvas for composing workflows and data analysis pipelines with the available tools. Finally, it provides a standardized
method for easy deployment on the portal of command-line only bioinformatics software, by editing simple configuration
files to specify the interface design \cite{galaxywiki}. Through the Galaxy-Cloudman \cite{Afgan2010}
framework, compute clusters for parallel data processing on Cloud services such as Amazon EC2 and private cloud can
be instantiated. \pb
Another community-centered, public access offering for computing on the Cloud is through our own work on Cloud Biolinux
\cite{Krampis2012}, \cite{cloudbio}. This offering provides on-demand bioinformatics computing and a set of pre-configured
sequence analysis tools within a high-performance Virtual Machine (VM) server that runs on a host of Cloud and virtualization
platforms. The project is targeted to researchers that do not have access to large-scale informatics infrastructures for
sequencing data analysis, but can instead rent on demand computational capacity from the Cloud. Users can access
the tools by starting the Cloud BioLinux VM through the Amazon console web page \cite{console}, and easily perform
large-scale data analysis as we have demonstrated for example with the 1000 Human genomes \cite{1000tube1},
\cite{1000tube2}, \cite{clarke2012}. Furthermore, the Cloud BioLinux VM is open-source, can be downloaded and modified,
while advanced users can install and run it on a private instance of the Eucalyptus \cite{euca} or Openstack \cite{openstack} Cloud platforms. A diverse
community of researchers from both the US (Massachusetts General Hospital, Harvard School of Public Health, Emory
University) and Europe (National Environmental Research Center, King's College London, Denmark Technical University,
Netherlands Wageningen University) has already been established around the project \cite{googlegroup}. Finally, we have
recently expanded Cloud BioLinux by adding support for software developers through a framework for building
and distributing bioinformatics VMs, that essentially provides a toolkit for implementation of customized, Cloud-based
bioinformatics data analysis solutions. The framework includes a software management system that automates
building a VM with a set of bioinformatics tools specified by the user and seamlessly deploys it across different
Cloud platforms, and is freely available from the GitHub code repository \cite{fabric}. The overall goal is
to offer a platform for maintaining a range of specialized VM setups for serving different computing needs
within the bioinformatics community, and allow researchers to focus on the next challenges of providing data,
documentation, and the development of scalable analysis pipelines. \pb
More tool suites on the Cloud have become available recently, including the non-profit, open-source GenomeSpace
by the Broad Institute \cite{genomespace} and SeqWare \cite{d2010seqware}, in addition to commercial offerings such as Illumina's
BaseSpace \cite{basespace}, DNAnexus \cite{dnanexus} and Nimbus Informatics \cite{nimbusit}. GenomeSpace essentially integrates
a set of tools and databases developed at the Broad Institute through a unified graphical interface for end-users, in addition to
offering Application Programming Interfaces (API) for programmatic access by developers, backed by Cloud VMs and storage. Users
can access the public GenomeSpace instance or create their local instance by retrieving the source code, in addition to accessing
Virtual Machines (VM) with the complete system pre-installed and ready to execute with minimal configuration on the Amazon EC2
cloud \cite{Afgan2010}. \pb Similarly to GenomeSpace, SeqWare provides source code and an Amazon Cloud VM
with everything pre-installed and ready to execute with only minimal configuration. DNANexus currently includes tools for ChiPseq,
RNAseq, 3'-end sequencing for expression quantification (3SEQ) and enzyme restriction analysis. DNAnexus runs on the Amazon
Elastic Compute Cloud (EC2, online ref. 4), which provides on-demand virtual servers with various compute capacities. \pb
The solutions presented above provide public access to scalable sequence data analysis resources for the genomic
community, through which users can get access to pre-configured software and on-demand computing using
Cloud infrastructures. Nonetheless, specialized, high-performance bioinformatics applications and data pipelines
implemented by bioinformatics core teams at large institutions, are usually coupled with specific hardware and
the informatics infrastructure at each institution. As a result, significant effort might be required to refactor
data analysis pipelines to run at a different site from where they were originally developed or port them on the cloud
\cite{Wilkening2009}. Finally, while the current Cloud-based solutions are great for smaller laboratories that lack
informatics resources and in addition the VM servers provide enhanced portability across sites, they are simply a
sophisticated container for bioinformatics software that in most cases has a monolithic design and does not leverage
the distributed computing characteristics of the Cloud. \pb
\subsection*{Unique characteristics of Cloud Computing versus traditional Bioinformatics Infrastructures}
One of the building blocks of cloud computing technology is virtualization \cite{Uhlig2005}, that allows entire
compute servers including the operating system and all the necessary software packages for data analysis
to be encapsulated within a Virtual Machine (VM). A VM is an emulation of a compute server, with virtual
processors, memory and storage capacity, in the form of a single binary file that executes
independently of the underlying hardware architecture, on both Cloud and desktop computers.
Cloud services such as Amazon EC2 \cite{awsec2} provide high-performance computer hardware with a
virtualization layer, on top of which users run VM servers. Since all software components and dependencies
are encapsulated within the VM, it is possible to distribute data analysis pipelines, databases, website portals,
and all their required code libraries and configuration files in a ready to execute, compact and easy to
download format. This approach can remove many of the technical roadblocks encountered when
performing complex installations of open-source bioinformatics software, and consecutively make bioinformatics
tools more accessible to the research community. \pb
In our experience with development of bioinformatics projects, it is difficult to provide long-term software
support or maintain web portals that provide online access to data analysis tools and databases, especially
for projects funded by government grants that have an expiration date. Alternatively, by using Cloud VM servers
to build and maintain a bioinformatics system and subsequently create Whole System Snapshots (WSSE,
\cite{Dudley2010, Krampis2012}) of the VM servers, bioinformatics web portals and online databases that
are built on the Cloud can be preserved in their precise state when the snapshot was created. A snapshot
essentially is a compressed, exact replica of a VM server, capturing all of the software configuration, bioinformatics
pipelines, input data and sequence assemblies, genome annotations and all other sequence data analysis results.
A snapshot is an executable binary file like the original VM, and by using it as a template the
virtualization layer of a Cloud platform can instantiate multiple replicas of the original VM server \cite{ebs}.
Finally, a researcher can set her snapshots to be publicly accessible or share them only with specific users
within the same Cloud, therefore providing access for collaborators to both data and software in a ready to execute
and compact format. \pb
Regarding costs, the Amazon EC2 Cloud \cite{awsec2} charges for VM snapshots \$0.01 US per GigaByte (GB)
of storage used per month. Such low costs can allow researchers to allocate a relatively small amount compared
to their overall informatics budgets for sequencing data analysis, and maintain a VM server snapshot for a number
of years past the end of a funding cycle. For projects involving data release online though a web portal that is
usually decommissioned shortly after funding for a project ends, using VM snapshots for archiving the portal on
a Cloud platform, enables other researchers to lease compute time on the Cloud, and create fully-functional
instances of the original VM server from the snapshots. Therefore, using VM technology and the Cloud for building
bioinformatics systems and then creating compressed snapshots to reduce costs for long-term storage, offers
an economical and flexible solution throughout and past the life cycle of a research project. \pb
Finally, use of virtualization and VM technology can provide two additional advantages: first, by
depositing data and pre-configured software on a publicly accessible, Cloud-based VM, allows for
reproducibility, provenance and openness of the bioinformatics research. For example, following publication
of assembly and annotation results from a genome sequencing project, researchers in the community
might require to re-run the assembly with additional data generated at their own laboratory or to change algorithmic
parameters and fine tune outputs such as gene predictions. Furthermore, lowering the barrier to access high-
performance informatics infrastructures required for working with next-generation sequencing datasets, is key
for allowing researchers in the community to extract value from data released from publicly funded projects, while
also to add value as similar studies take place. Second, by using the Cloud researchers have the capability to scale
computational resources on-demand according to the amount of data generated from a sequencing project,
through provision of the appropriate number of VM servers. With this approach usage of resources can be
adjusted accordingly during the different analysis phases: while initially extensive computing resources will be
necessary to perform assembly and annotation of the sequence data, computational resources (defined by
the number of running VM servers), can be scaled down for less computationally demanding tasks such as
visualization and browsing of the sequence annotations. After funding has ended, a lab can further lower the
usage of the Cloud's computational resources and cut its informatics costs by archiving the VMs using snapshots
\cite{Dudley2010}. \pb
\subsection*{What Public, Private, Open-Source or Commercial Cloud Solutions Are Available to Biologists Today?}
\subsubsection*{Accessing Computational Cycles on The Cloud}
Amazon Web Services (AWS, \cite{aws}) is one of the better established Cloud computing vendors, running on a
similar infrastructure with the one that powers Amazon.com's e-commerce web portals. Cloud computing services
offered by this vendor that are most applicable to bioinformatics, include the Amazon Elastic Compute Cloud (EC2, \cite{awsec2}),
Elastic Block Store (EBS, \cite{ebs}) and Simple Storage Service (S3, \cite{s3}), while a complete list of the available
services can be found at \cite{aws}. These options provide respectively compute cycles through Virtual Machine (VM)
servers, multiple virtual hard drives up to 1 TeraByte (TB) that can be attached to a running VM, and web-accessible
data storage. None of these options is tied to any specific operating system or programming model and each comes
with affordable pricing; for example, a large capacity VM server with 64GB memory and 8 processor (CPU) cores
that would suffice for many different types of bioinformatic analysis costs \$2 US per hour to rent (for a complete price
list see \cite{ec2price}). A large software developer community with a lot of expertise has formed during the past few
years around the Amazon Web Services discussion forums \cite{awsforums}, and based on our experience questions
regarding the different services or requests for technical advice, are always answered in a day or less. \pb
Using the Cloud for bioinformatic data analysis is not limited only to the Amazon EC2 platform, since researchers
with access to a local computing cluster at their home institution have the option to run VM servers (and VM
snapshots downloaded from Amazon) on a private Cloud, such as Eucalyptus \cite{euca} or OpenStack \cite{openstack}.
While OpenStack is the official Cloud of the Ubuntu Linux operating system \cite{ubuntucloud} and is included
by default on a compute cluster that runs this particular Linux flavor, it can also be installed on clusters running other
Linux versions \cite{openstackother}, and similarly for Eucalyptus \cite{eucalyptusother}. These Cloud platforms
are essentially open-source replicas of Amazon EC2 and offer identical Application Programming Interfaces (API),
meaning that applications developed on Eucalyptus or OpenStack will work seamlessly on the Amazon Cloud and vice-versa.
This allows seamless transfer of VM server snapshots, such as for example the Cloud BioLinux VM \cite{Krampis2012},
across installations of these Cloud platforms, providing researchers with ready-to-execute bioinformatics tools and data
analysis pipelines pre-configured and installed on the VM. \pb
While Amazon Web Services \cite{aws} was the first vendor to offer public Cloud computing access at a large scale,
many alternative platforms became available during recent years including the Google App Engine \cite{appengine}, Microsoft
Azure \cite{azure}, GoGrid \cite{gogrid}, FlexiScale \cite{flexiscale} and the IBM SmartCloud \cite{smartcloud}. Multiple
technical differences exist between these offerings, but they can be categorized into two broad groups: the first includes
Cloud platforms that provide users with access to Virtual Machines (VM) that are no different than standard Unix servers, offered
by Amazon, GoGrid, FlexiScale and IBM. On the other hand, Clouds that users access as abstracted computational resources that
run software, without need for logging into Unix filesystems or provisioning VMs, are found on the Google and Microsoft offerings.
This approach might sound as a better solution for smaller research groups with limited software engineering and informatics
expertise, but is not without drawbacks. Specifically, the seamless execution of code and automatic scalability stems from the
fact that the Google and Microsoft Clouds require software developers to implement their applications using programming
frameworks based on Python \cite{python} and .NET \cite{net} respectively. While this might be an option for the development of
new software or web-based applications that are most suitable for these frameworks, the majority of existing
bioinformatics software is designed to run on standard Unix servers and filesystems. In this case, Clouds that provide direct
access to VMs are a better choice, while additionally a framework-specific Cloud implementation despite its advantages could
eventually lead to vendor lock-in. \pb
Overall, Cloud platforms that provide access to VM servers can be a better choice for deploying existing software, in order
to make it accessible to groups that do not have local informatics infrastructure but could instead rent compute time from
the Cloud. In addition, for groups that have a longer-term vision for building Cloud-based bioinformatics infrastructures
outside of traditional in-house data centers, portability across Cloud platforms should be a top criterion. While standalone
software can run on VMs that can be easily converted and ported across public and private Clouds that provide a virtualization
layer \cite{Krampis2012}, Cloud informatics infrastructures are essentially encoded in scripts that setup VM-based compute
clusters and storage by issuing directives to the Cloud's API (\cite{Afgan2010}, \cite{scriptaws}). Therefore, for achieving
portability of the complete infrastructure across different Cloud platforms with minimal software re-engineering effort,
compatible and inter-operable APIs across the platforms is a requirement. Currently, this is only fulfilled amongst the open-source
Eucalyptus / OpenStack Clouds, and the commercial Amazon Cloud. This fact should be carefully considered before choosing
a Cloud provider for the longer-term, especially if instances of the infrastructure might be required to exist on both a private
and a publicly-accessible, commercial Cloud. \pb
For researchers who do not have access to a compute cluster, nor have the available funds to lease computing time from
Amazon, the government-funded Magellan Cloud \cite{magellan} provides an OpenStack cluster where researchers can apply
for a user account. In addition, a number of academic computing centers in both the US and other countries have similar clusters
with OpenStack installed \cite{openstackinstalls}, where scientists could access a Cloud platform. Finally, an option for users is
to run the VM servers on a desktop computer, using virtualization software such as VirtualBox \cite{vbox}, that is also
open-source and can be installed in a single step on Windows, Mac or Linux computers. The Cloud BioLinux project for example,
provides VM server snapshots that run on both private clouds and VirtualBox \cite{cblorg}. \pb
Finally, professors that teach courses on Cloud computing, can apply for the Amazon Web Services educational grants
\cite{awsgrants}, that provide free computing and data storage resources for educational use. Furthermore, the Amazon
Cloud has established a program \cite{awspublicdata} that hosts free of charge a variety of large-scale public datasets that
have significant value for the scientific community. For example, genomic datasets available through this program include
the 1000 human genomes data (\cite{Clarke2012}, \cite{aws1000gen}) the NCBI flu genomes \cite{awsflu} and Ensembl
human genome annotation database \cite{awsensembl}. Users of the Amazon Cloud can access, copy, and perform computation
on the data using VM servers, and just pay for the compute and storage resources they lease. \pb
\subsubsection*{Accessing Storage Capacity on The Cloud}
Cloud data storage services provide the advantage of data centers distributed across the globe, and therefore can make available
connection endpoints in various geographic regions that reduce network transfer latency for data exchange among remotely
located research sites. The Amazon S3 storage for example \cite{s3} has data centers located on the US East and West coast,
European Union (Ireland), South America (Brazil), and Asia-Pacific (Japan, Singapore and Australia). A researcher working with
large-scale sequencing datasets can choose to upload data to their closest data center, then initiate replication across the
different regions through the S3 CloudFront service \cite{cloudfront}, and subsequently allow collaborators worldwide
to retrieve the data from the nearest location. Furthermore, data stored using this service are protected from physical disasters,
since they are replicated by default across three different regions. Note that triple-replication is not synonymous with backup that
prevents accidental deletion by the user, but instead refers to protection from permanent loss of a single data center. The S3
storage model deviates from that of POSIX-compliant hard drives \cite{mathur2007new}, but instead uses data objects that have unique Uniform Resource
Locator (URL) identifiers across the Amazon Cloud and are also web browser-accessible. In detail, S3 is organized using a two-level
namespace with top-level folders called "buckets", and while each Amazon account may have up to 100 buckets, a bucket can store
an unlimited number of data objects. A file on S3 with open access permissions can be accessed by simply pasting its URL on a web
browser, but for upload, modification, bulk operations or for files that require authentication, one of the two Application Programming
Interfaces (API) must be used to access the service: the first is based on the Representational State Transfer protocol (REST,
\cite{fielding2000}) and the second on the Simple Object Access Protocol (SOAP, \cite{soap}), both with a rich set of programmatic
access code libraries available for developers \cite{s3developer}. Alternatively, users can interact with the service through various
desktop client applications that have graphical front-ends, available for all operating systems (\cite{cyberduck}, \cite{s3browse},
\cite{s3fox}). \pb
A second storage option on the Amazon Cloud is Elastic Block Store (EBS,\cite{ebs}), that provides POSIX-compliant \cite{mathur2007new}
hard drives up to 1 TeraByte (TB), multiples of which can be attached to a running VM. While these data volumes persist after a Virtual
Machine (VM) shutdown and can be re-attached to a new VM booted at a later time, data stored on EBS are not triple-replicated and are
stored only on a single Amazon data center. This makes them prone to loss during physical disasters or hardware failures. Another
option for data storage is the transient virtual hard-drive available as the file system of a running VM, but given that a VM would be
used by a cost-conscious researcher only during execution of software (unless it is a 24/7 uptime web server), its storage should
only be used as a temporary holding for the software's data outputs. \pb
Overall, storage costs can be reduced by maintaining on high-availability, low-latency storage such as Amazon S3 only non-processed
data for which value will be generated following compute. Alternatively, for data that have already been processed and whose loss is not
critical, an option is the S3 Reduced Redundancy Storage \cite{reduced}, that allows users to cut down on the costs by storing
non-critical, reproducible data at lower levels of redundancy and disaster-protection than Amazon S3's standard storage. Finally, if
data preservation is important but the data are rarely accessed, the Amazon Glacier service \cite{glacier} provides an archival storage
option at one-tenth of the cost. \pb
\subsection*{Which Factors Challenge Adoption of Cloud-Based Solutions for Bioinformatics?}
In the case of Cloud computing, and similarly to large-scale institutional server clusters, besides availability of
compute resources that in both of these cases should not be a concern, three major factors affect users planning
to access the computational system for data analysis: first, flexibility of moving data inputs and outputs to and
from the system; second, available interfaces for the user interacting with the system; and third, since both the
Cloud and institutional clusters are multi-tenant systems, what are the mechanisms safeguarding each user's data
integrity, privacy and isolation. While many studies are available in the literature that review available large-scale
institutional clusters based on these criteria (ref,ref,ref), here we attempt to provide an overview of the Cloud
in these respects, using as basis the Amazon EC2 platform.
Another important concern for Cloud-based bioinformatic tools is related to the data transfer bottleneck from
the local sequencing machines to the Cloud servers. According to the data published for the Amazon Cloud
platform (online ref.11), 600GB of data would require approximately one week to upload onto the remote Cloud
servers, when using an average broadband connection of 10Mbps. With a faster T3 connection which is usually
easily obtainable even at small research institutions, within one week 2TB of data can be uploaded or
approximately 600GB in 2 days. Solutions addressing this issue are available both as software that maximizes
data transfer over the network compared to traditional File Transfer Protocol (FTP), or physical disk drive
import/export services offered by the Cloud provider to its customers. Aspera's server (online ref. 12) has
been recently integrated to NCBI's infrastructure, and researchers can download a free client that allows
increased upload speeds to the Short Read Archive (online ref. 13). Through the Aspera software, transfer
bandwidth between NCBI and the European Bioinformatics Institute for data sharing in the 1000 Genomes Project,
has been increased from 20Mbps to 1000Mbps (see online ref. 14).
Finally, Amazon offers the option for its users to physically ship disk drives to the company's offices
and have the data copied to their servers (online ref. 11). With only \$80 import cost for disk drives up to 4TB
of data (4000GB), this is the most efficient method if we take into account the charge by Amazon of \$0.10 per
GB of bandwidth consumed, which would add up to \$60 for a 600GB data upload. In addition to that cost, the
expense for obtaining a high-bandwidth internet connection for the data upload should be taken into account.
We expect the Microsoft Azure Cloud platform to offer a similar service in the near future, given the requests
on the Azure developer forums and the immediate consideration of the matter by Microsoft (online ref. 15).
For researchers that would like to leverage the advantages of a VM with the pre-installed assembly portal for
working with the completed assemblies but consider the public Cloud as not secure option, we will offer the
alternative of returning to them by mail an external hard drive with a VM containing the portal and
assembly data. Users will then be able to load and execute the VM on a local computer cluster with a
Eucalyptus/OpenStack Cloud or on a PC using Virtualbox. We are currently offering a similar solution with
Cloud BioLinux [3], where the project's VM is available for download and execution on a local Cloud or a PC
from our website [4]. Upon local execution of a VM users will simply need to point their browser to the
portal's Internet or local IP address [55] assigned automatically by either the Cloud or Virtualbox (Fig.1B).
The IP address is available through each Cloud platform's or the Virtualbox software administrative
interface, and we will provide extensive documentation (see subsequent paragraph) on uploading, running and
accessing a local VM on the different platforms by extending the available Cloud BioLinux project
documentation.
An intuitive video tutorial is also available online \cite{youtube}. For researchers to execute
our VM with the pre-installed tools on the Amazon EC2 Cloud, they will only need to follow four simple steps
through their web browser: visit the Amazon Cloud website and create a new account, start the VM execution
wizard through the Cloud's control console [58], choose computational capacity for the VM (memory,
processor, cores, storage capacity), and specify username and password credentials for accessing the running
VM. Each running VM receives a unique web address, and by using their web browser to access the address, a
researcher can login to the portal interface with the assembly tools. These four steps are described in detail
in our Cloud BioLinux publication and the project's documentation [59].
In the Cloud BioLinux work we combined the convenience of SaaS for end-users with the power of cloud computing,
in order to bring pre-installed specialized bioinformatics applications which need large computational capacity,
such as those for genome assembly, while simplifying the way users can get onto the cloud.
A user can start and access the OSMF Frame VM instance in three simple steps by using the Amazon EC2 cloud console
graphical user interface that is accessible via a web browser: first the user signs up for an Amazon EC2 account and
after she obtains the credentials logs in to the cloud console (http://aws.amazon.com/console); within the Amazon
console the user clicks the ``Launch Instance Wizard'' button and specifies the OSMF Frame VM volume identifier
(our project website will provide the VM identifier for the most recent update, but the latest VM will be also identifiable
by the meta-data added to the volume); following the steps of the wizard within the web browser the user selects
computational capacity and storage for the OSMF VM, and specifies a username and password for the OSMF
WebInterface login (Fig.1, additional users can be created after the initial login); finally, once the wizard steps are
complete and the VM status shows ``running'', the user copies the assigned URL address of the VM from the Amazon
cloud console in a new web browser window in order to access the OSMF interface. Through the URL users can get access
to CloudMan and Galaxy (ref Enis and Brad). The process of starting a VM on
the cloud and connecting to it has been documented for the JCVI Cloud BioLinux VM instances (REF), but nonetheless
more detailed documentation, video tutorials and user support will be available from the proposed project's website
and discussion forum (see Education & Outreach section).
\subsection*{Bioinformatics Computing and Science as a Service on the Cloud?}
Science as a Service (ScaaS) for bioinformatics, can be defined along similar lines with the Software as a Service
computing model (SaaS, \cite{papazoglou2003}), where software is running on remote datacenters. In this model,
users access the software through a web browser or a desktop client application and there is no requirement other
than a desktop computer with Internet connection, since the SaaS service provisions and manages the computing
infrastructure, in addition to setting up the software and all its dependencies. \pb
A Science as a Service (ScaaS) model could be of benefit to the bioinformatics community, where currently a unified approach is
not available for researchers to access software or datasets, most of which have been generated as a result of federal grants
awarded to individual investigators \cite{Stein2010}. The different approaches range from websites created
by small laboratories and which provide online access to specialized bioinformatic tools, to centralized web portals
developed by large institutions such as NCBI \cite{ncbi} where web-based versions of mainstream applications including BLAST
\cite{altchul} are available. Nonetheless, in most cases datasets and software as source code are simply made available for
download from FTP sites. In every case there are significant drawbacks including restrictions on the size of datasets that can be
processed on the websites provided by small laboratories, as they are backed by compute servers with limited computational
capacity; on the other hand, web portals set up by large institutions are also restricted on the computational resources that
they can offer to the public (for example, users cannot upload multiple Gbp of sequence to the NCBI-BLAST website), while also
NCBI cannot possibly provide online access to all available bioinformatic tools; finally, in the case of software available only as
source code, provisioning informatics infrastructure and the technical expertise for performing specialized installation
procedures can be a burden for non computationally-savvy investigators, with a prime example being genome assembly software. \pb
% TODO: Edit section on additional ScaaS (tool suites on Cloud), that has been moved to the tool suites on Cloud section.
For users of the Cloud who require more control and additional flexibility to customize the computational infrastructure
where their software is running, an alternative computing model is Infrastructure as a Service (IaaS, \cite{bhardwaj2010cloud}).
The Amazon Elastic Compute Cloud (EC2) is one of the most popular providers with this model, and essentially has become
a standard for IaaS service providers followed on the open-source replicas OpenStack and Eucalyptus (\cite{openstack},
\cite{euca}). The Amazon Cloud uses Virtual Machine (VM) servers as the basic unit for computational resource allocation,
that are available in different capacities \cite{instancetypes} allowing users the option to lease a portion of the underlying
physical compute server capacity according to their budget and data processing needs. The specific term for the Amazon VMs is
Amazon Machine Images (AMIs), and while these run on top of a virtualization layer, user interaction is no different than
accessing a physical server with a full operating system, processors and memory depending on the VM capacity selected. In
a few recent studies VM performance characteristics such as read-write speed of the virtual hard drives, processor speeds and
network latency for inter-communication of nodes of a cluster instantiated using EC2 VMs, were found to lag behind those of
in-house built compute clusters and specialized network interconnects such as Myrinet or Infiniband (\cite{jackson2010},
\cite{hill2009quantitative}, \cite{boden1995myrinet}, \cite{infiniband2000infiniband}). Nonetheless, users can rent at higher
cost specialized VMs \cite{instancetypes} that are connected within the Cloud through high-speed network or connect to
virtual hard drives that are physically backed by Solid State Disks (SSD), and which have shown promising results \cite{jackson2010}.
Overall, other than scientific applications that require specialized networking and configuration within a cluster,
the on-demand availability of the Amazon EC2 Cloud service can provide a viable alternative to dedicated clusters, as it is
similar to commodity hardware clusters often built in smaller labs without the expense or labor the build process requires. \pb
%%%%%%%%%%%%%%%%%%%%%%
\section*{Conclusions} Text for this section \ldots
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section*{Authors contributions} Text for this section \ldots
%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section*{Acknowledgements} \ifthenelse{\boolean{publ}}{\small}{} Text for this section \ldots
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% The Bibliography %%
%% %%
%% Bmc_article.bst will be used to %%
%% create a .BBL file for submission, which includes %%
%% XML structured for BMC. %%
%% %%
%% %%
%% Note that the displayed Bibliography will not %%
%% necessarily be rendered by Latex exactly as specified %%
%% in the online Instructions for Authors. %%
%% %%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
{\ifthenelse{\boolean{publ}}{\footnotesize}{\small} \bibliographystyle{bmc_article} % Style BST file
\bibliography{bmc_article} } % Bibliography file (usually '*.bib')
%%%%%%%%%%%
\ifthenelse{\boolean{publ}}{\end{multicols}}{}
\end{bmcformat} \end{document}