-
Notifications
You must be signed in to change notification settings - Fork 1
/
Learning-R.r
1627 lines (1383 loc) · 55.8 KB
/
Learning-R.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
# Author Information ----------------------------------------------------------
# AUTHOR: David Geeraerts
# EMAIL: geeraerd@evergreen.edu
# LOCATION: Olympia, Washington U.S.
# TITLE: Learning R
# Edition: 74
# Copyleft --------------------------------------------------------------------
# Copyright License, Creative Commons:
# Attribution-NonCommercial-ShareAlike 3.0 Unported (CC BY-NC-SA 3.0)
# http://creativecommons.org/licenses/by-nc-sa/3.0/
# Version control with GitHub
# Access the latest version, or submit contributions
# https://github.com/Octopoida/Kraken-R
# Purpose ---------------------------------------------------------------------
# Purpose: Kraken-R is a monolithic R script for learning R.
# The script style is as a cheatsheet of sorts to help remember how to do
# certain things, and strives to provide better examples than what is
# available in R help or vignettes.
# It uses The Evergreen State College,
# Computer Applications Lab (CAL) for Scientific Computing,
# HeadCount to learn R, as well as some other datasets.
# Single file to keep it simple,
# even though using Project Template & LCFD file model as project workflows is a good idea.
# LCFD project workflow by Josh Reich
# LCFD: Load, Clean, Function, Do --each as a separate script.
# [http://stackoverflow.com/questions/1429907/workflow-for-statistical-analysis-and-report-writing/1434424#1434424]
# R Package for LCFD is 'makeProject'
# Project Template information:
# [http://projecttemplate.net/]
# R Workflow (LCFD Model) -----------------------------------------------------
# makeProject()
# https://cran.r-project.org/
# The LCFD model breaks an R script into its [4] component parts:
# load.r getting and loading the data
# clean.r cleaning, munging, and all around tidying of data
# function.r custom functions or processes can be stored in this file
# do.r main processing of the data, including output (i.e. graphs)
# Package Template ------------------------------------------------------------
# [http://projecttemplate.net/]
# R package which defines the folder structure; used for reproducible data
# install.packages('ProjectTemplate')
# library('ProjectTemplate')
#
# Package Projects ------------------------------------------------------------
# https://cran.r-project.org/web/packages/projects/readme/README.html
# Research publishing focus
# Google's R Style Guide ------------------------------------------------------
# Google's R Style Guide
# (main thing is to be consistent)
# https://google.github.io/styleguide/Rguide.xml
# Defensive Programming in R, by Chris von Csefalvay
# https://bitsandbugs.io/2018/07/27/defensive-programming-in-r/#8
# R Cheat Sheets --------------------------------------------------------------
# A collection of printable R cheat sheets
# https://www.rstudio.com/resources/cheatsheets/
# R SWIRL ---------------------------------------------------------------------
# Learn to program with an interactive R console; it's a learning environment.
# install.packages('swirl')
# library('swirl')
# swirl()
# for a list of additional content:
# https://github.com/swirldev/swirl_courses#swirl-courses
# To install additional content:
# install_from_swirl("")
# install_from_swirl("Getting_and_Cleaning_Data")
# install_from_swirl("Data_Analysis")
# install_from_swirl("Statistical_Inference")
# R Books for learning
# R Programming for Data Science by Roger D. Peng
# https://bookdown.org/rdpeng/rprogdatascience/
# R Startup Sequence ----------------------------------------------------------
# to find R_Home: R.home()
# File Variable Location
# 1. Renviron.site R_ENVIRON R_Home/etc/Rprofile.site
# 2. .Renviron R_ENVIRON_USER If set for user
# 3. Rprofile.site R_PROFILE system file
# 4. .Rprofile R_PROFILE one for system (R_Home/library/base/r/)
# 5. .Rprofile R_PROFILE_USER user profile
# 6. .RData found in the working directory
# 7. .First may exist for a project, loads functions; defined in .Rprofile or .Rprofile.sys
# 8. .First.sys load default packages
# 9. .Rhistory R_HISTFILE load history file
# Data structures -------------------------------------------------------------
# Types of structures
# vector --one dimension --1D
# factor --used for categorical/qualitative variables
# matrix --two dimensions with single atomic data type --2D
# array --three or more dimensions | is.array()
# list --different R classes
# data frame --multiple vectors with possible different classes
# NA --missing values
# Notes:
# factors encode as integer vectors for memory efficiency
# factors can be nominal or ordinal, using ordered = TRUE/FALSE parameter
# nominal (=, ≠)
# ordinal (>, <)
# interval (+, -)
# ratio (x, ÷)
# data frame elements in the same column should be of the same data type
# data frame columns should be of equal length, meaning same number of rows/records/observations
# data frame subsetting with [] returns a dataframe
# data frame subsetting with [[]]/$ returns a vector or a matrix
# Types of Data or modes ------------------------------------------------------
# Logical is.logical() as.logical() {TRUE, FALSE, NA}
# vector is.atomic()
# vector is.vector() as.vector()
# Numeric is.integer() as.integer() #or use L : 1L, 2L 3L
# Numeric is.double() as.double()
# Text is.character() as.character()
# Factor is.factor() as.factor()
# Function is.function() as.function()
# List is.list() as.list()
# Complex is.complex() as.complex() #imaginary value i.
# NA is.na()
# Vector Chart ----------------------------------------------------------------
# (or class)
# typeof Mode storage.mode
# logical logical logical
# integer numeric integer
# double numeric double
# complex complex complex
# character character character
# list list list
# raw raw raw
# *when creating vectors, use the c() function --concatenate function
# *double has a precision of 16 digits (64 bits)
# *integer has a precision of 32 bits [+/- 2*10^9]
# Notes -----------------------------------------------------------------------
# !! R IS CASE SENSITIVE !!
# R is case sensitive. Can use tolower() or toupper() for munging data.
# \\ to escape special characters
# variables should only use alpha numeric characters, . (period),
# and _ (underscore); recommend to only use underscore "_"
# for variable objects.
# Periods are used with functions, such as "data.frame".
# A variable can not start with a number!
# A variable can not use these special symbols: ^ ! $ @ + - / *
# Using ' (single quote) or the " (double quote) treats as string;
# to get at a character function such as + - / ? < > etc.,
# the ` (tick) needs to be used: `+`
# Notable Packages ------------------------------------------------------------
# 'readr' read tabular data. Replaces read.csv
# 'random' uses random.org to generate non-deterministic random numbers
# 'plyr' Tools for splitting, applying and combining data
# 'reshape2' modern data wrangling package
# 'stringr' string manipulation
# 'glue' Interpreted string literals
# 'sig' Print function signatures
# 'XML' scraping tool for html & XML pages
# 'XML2R' XML parse
# 'httr' working with HTTP connections
# 'RMySQL' for MySQL connections
# 'bigmemory' for handling large datasets too big for RAM
# 'knitr' enables R Markdown files --> .rmd file
# 'futile.logger' logging package
# PDF ----------------------------------------------
# Package for working with PDF's [pdftools](https://docs.ropensci.org/pdftools/)
# 'pdftools' extracting text and metadata from pdf
# includes utility pdf_text("PDFFile.pdf") #converts pdf to text
# Basic graphing with R ----------------------------------------------
# plot() #most basic
# hist() #histogram
# barplot() #barplot
# boxplot() #boxplot
# pie() #pie chart
# pairs() #matrix of scatterplots
# par() #change a plotting parameter
# par(col = "blue") #change color to blue
# par()$col #check the plotting parameter for color
# Packages for advanced graphing ----------------------------------------------
# ggplot2 based on grammar graphics
# igraph creating undirected and directed graphs
# Packages for Spatial analysis -----------------------------------------------
# 'ggmap' Use Google maps API, OpenStreet map API.
# 'OpenStreetMap' use OpenStreetMap raster images
# 'maps' provides some basic world maps
# library(maps)
# map("state", boundary = FALSE, col="gray", add = TRUE)
# 'usmap' mapping the USA
# 'UScensus2010' US Census 2010 shape files and additional demographic data
# 'raster' working with raster data
# 'rgdal' Geospatial Data Abstraction Library
# Google Earth Engine API for R
# https://developers.google.com/earth-engine
# https://github.com/r-spatial/rgee
# 'rgee' Google Earth Engine for R
# Parallel & Distributed R ----------------------------------------------------
# Best to run parallel on GNU/Linux, as Windows has limitations --with some workarounds.
# CRAN Task View: High-Performance and Parallel computing in R
# http://cran.r-project.org/web/views/HighPerformanceComputing.html
# http://www.r-bloggers.com/how-to-go-parallel-in-r-basics-tips/
# https://www.r-bloggers.com/2021/11/running-r-clusters-on-an-amd-threadripper-3990x-in-windows-11-update/
# Packages in R (two types: shared memory & distributed)
# Shared Memory Distributed
# 'foreach' 'rmpi'
# 'parallel' 'pbdr' (programming with big data in R)
# 'snow' 'rhipe'
# 'multicore' 'rhadoop'
# 'mclapply'
#
# High Performance Computing with R -----------------------------------------
# [OpenMPI](https://www.open-mpi.org/) implementation of MPI.
# R package for MPI: Rmpi() --https://cran.r-project.org/web/packages/Rmpi/index.html
# R package for OpenMP: Rcpp() --https://cran.r-project.org/web/packages/Rcpp/index.html
#
# OpenMPI implements MPI, the message-passing protocol used between nodes in a cluster for parallel processing.
# OpenMP is the API for shared-memory parallelism between the threads/cores within a single node.
# Tips & Tricks ---------------------------------------------------------------
# TAB for command completion
# Esc interrupt current command (i.e. when console is waiting for input by showing +)
# Ctrl+up command history
# Ctrl+L clear console
# Mathematical Operators -------------------------------------------------------------------
# + Addition
# - Subtraction
# * Multiplication
# / Division
# ^ Exponentiation
# %% Modulo (finds the remainder after division of one number by another)
# Relational Operators ----------------------------------------------------------
# (Conditional checking)
# (contain text in quotes, i.e. "text" == "text") #will return TRUE
# == equals condition
# != not equal condition
# < less than
# > greater than
# >= greater than or equal
# <= less than or equal
# ! not
# & and (use single inside index [] for subsetting)
# | or (use single inside index [] for subsetting)
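# && and (scalar, short-circuit; for if()/while() conditions; errors on length > 1 input since R 4.3)
# || or (scalar, short-circuit; for if()/while() conditions; errors on length > 1 input since R 4.3)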
# all logical test that all values are true
# any logical test that some values are true
# examples
# var_numeric <- c(0, 1, 2, 3, 4, 5, 6, 7, 8, 9)
# var_sub <- var_numeric > 5
## Notes
## Use the (exclamation) ! for NOT, i.e. !is.logical = is not logical
# Conditional Statements ------------------------------------------------------
# if statement
# if(condition) {expression}
# if(condition) {expression} else {expression}
# else if condition
# if(condition) {expression} else if(condition) {expression} else {expression}
# notes:
# the "else" or "else if" must be on the same line as the closing bracket; i.e.
# if(condition) {
# print("expression 1")
# } else if (condition) {
# print("expression 2")
# }
# while loop
# while(condition) {expression}
# with multiple expressions
# while(condition) {
# expression 1;
# expression 2;
# }
# using break
# while(condition) {expression
# if(condition) {break}
# expression
# }
# for loop
# for(variable in sequence) {expression}
# Basic loop: walk through File_list and print each entry in turn
for (item in File_list) {
  print(item)
}
# using break
# for(variable in sequence) {
# if(condition) {break}
# expression}
# using next
# for(variable in sequence) {
# if(condition) {next}
# }
# for loop with index
# for(i in 1:length(var)) {
# print(var[i])
# }
# Nested for loop
# for(variable in sequence) {
# for(variable in sequence) {
# expression
# }
# }
# example using 3 x 3 matrix
# var_matrix <- matrix(c("A", "B", "C", "D", "E", "F", "G", "H", "I"), nrow = 3, ncol = 3)
# for(i in 1:nrow(var_matrix)) {
# for(j in 1:ncol(var_matrix)) {
# print(paste("Matrix Position:", i, j,"Value:",var_matrix[i,j]))
# }
# }
# R built-in (for) functions that loop through objects {vectors, lists}
# lapply() applied over list or vectors
# sapply() wrapper for lapply; returns a vector, matrix, or array
# vapply() wrapper for sapply; returns specific value
# rapply() wrapper for lapply; recursive version of lapply
# dapply() wrapper for lapply; returns a data.frame
# mapply() wrapper for sapply; multivariate version; returns multiple list or vector
# Subsetting data basics ------------------------------------------------------
# Working with R index/indices
# [] single bracket used to extract an object of the same class, and more than one element.
# [[]] double bracket used to extract from a list or data frame, and only a single element.
# use double bracket when using computed indices within an index:
# var_z <- matrix(c("zero", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"), nrow = 10, ncol = 10)
# var_f <- var_z[ ,10]
# var_i <- var_f[[1]]
# $ used to extract elements from a list or data frame.
# - use - "minus" to remove an element from the vector:
# var_numbers <- c("zero", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine")
# var_numbers <- var_numbers[-1] ## this will remove the first element in the index, "zero".
# Working with index either by using numeric index call [1] or using labels with the index vector ["odd"]
# var_numbers <- c("zero", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine")
# var_numbers[c(2,4,6)] #use c() when selecting multiple index's
# var_numbers[2:10] #select a sequential series
# names(var_numbers) <- c("zero", "odd", "even", "odd", "even", "odd", "even", "odd", "even", "odd")
# Conditional selection
# var_numeric <- c(0, 1, 2, 3, 4, 5, 6, 7, 8, 9)
# var_numeric_sub <- var_numeric[var_numeric > 5] #will select all elements greater than 5
# '[[' used as a function, such as in *apply() functions
# lapply(x, '[[', ("label"))
# Regular Expression ------------------------------------------------------
# Website for RegX testing
# https://regex101.com/
# logical return {TRUE, FALSE}
# grepl(pattern = "<regex>" , x = <object>)
# look for the numeral 2 at the beginning of the date
# (the pattern must be a quoted string; a bare ^2 does not parse)
grepl(pattern = "^2", x = Sys.Date())
# is there a 2 at the end of the date?
grepl(pattern = "2$", x = Sys.Date())
# grep returns the index of each matching element
# grep(pattern = "<regex>" , x = <object>)
grep(pattern = "^2", x = Sys.Date())
# Replacement
# Sub replaces the first match only
# sub(pattern, replacement, x, ignore.case = FALSE)
# gsub replaces all matches
#gsub(pattern, replacement, x, ignore.case = FALSE)
# CRANberries -----------------------------------------------------------------
# is a website that keeps track of all CRAN package updates
# http://dirk.eddelbuettel.com/cranberries/
# Help ------------------------------------------------------------------------
# Great help resource
# http://en.wikibooks.org/wiki/R_Programming
# One of the better help resources
# http://www.rdocumentation.org/
#
# see the latest R version changes
# View() opens a data viewer, so only call it from an interactive session
if (interactive()) View(news())
# help on functions
help("c")
help("install.packages")
# getting help on a package
help(package = "stringr")
# Example code: runs the Examples section of a help topic
example("help")
# open the package overview as a document
library(help = "psych")
# vignette, for examples of functions; with no argument it lists the available vignettes
vignette()
# Get arguments/parameters for a function
args(help)
# R Information & Options -----------------------------------------------------
# simple: just the version number of the running R
getRversion()
# Returns the version details as a list of named values
R.Version()
# System information
Sys.info()
# more session information
sessionInfo()
# another session report, from the devtools package
# requireNamespace() tests whether devtools is installed without attaching it,
# and the call is namespace-qualified so no attach is needed
if (requireNamespace("devtools", quietly = TRUE)) {
  devtools::session_info()
}
# Type of OS
.Platform$OS.type
# can easily setup a conditional check and do something.
if (.Platform$OS.type == "windows") {
  getRversion()
}
# get list of options
options()
options("digits")
# to change an option for the session
options(digits = 15)
# change the default continue character from "+" to "..."
options(continue = "...")
# Upgrading R -----------------------------------------------------------------
# using installr package
# install it only when it is missing; requireNamespace() checks availability
# and signals nothing more than FALSE when the package is absent
if (!requireNamespace("installr", quietly = TRUE)) {
  install.packages("installr")
}
# attach the package, then start the R updater it provides
library(installr)
updateR()
# Package Management ----------------------------------------------------------
# Repositories
# CRAN
# http://cran.r-project.org/
# BioConductor
# http://bioconductor.org/biocLite.R
# OMEGA
# http://www.omegahat.org/R
# Package search with documentation (CRAN + Bioconductor + GitHub)
# http://www.rdocumentation.org/
# Package Use
# packrat() can package up all dependent packages for a project with specific versions in a sandbox area that will not affect the local package installation.
# drat() build a local repository
# biocLite() bioinformatics repository source("http://bioconductor.org/biocLite.R")
# devtools() install_github("user/repo") access Github packages
# what is currently attached to the search path (packages and environments)
search()
# the file-system paths behind each entry of the search path
searchpaths()
# library directories where packages are stored
.libPaths()
# the same function can also set the directory location of packages
# .libPaths("C:/ProgramData/R/library")
# matrix describing every installed package
utils::installed.packages()
# load a package
# library('packageName')
# require('packageName')
# install multiple packages
# install.packages(c('packageName', 'packageName'))
# Install example
install.packages("psych")
# loads the package
library(psych)
# Install a package with any dependencies --depend on/link to/import/suggest
install.packages("ggplot2", dependencies = TRUE)
# install package(s) with a defined repository --needed for commandline
# List of repositories: https://cran.r-project.org/mirrors.html
install.packages("ggplot2", dependencies = TRUE, repos = "https://ftp.osuosl.org/pub/cran/")
library(ggplot2)
# Install a package only if needed, i.e. it's not already installed.
# requireNamespace() is the availability test; library() then attaches the
# package and stops with an error if it still cannot be loaded.
if (!requireNamespace("psych", quietly = TRUE)) {
  install.packages("psych")
}
library(psych)
# CRAN Task Views
# http://cran.r-project.org/web/views/
# have to install and load ctv first
install.packages("ctv")
library("ctv")
available.views()
# install all of the packages associated with a Task View
install.views("TimeSeries")
# update any packages associated with a Task View
update.views("TimeSeries")
# unload a package
detach("package:ctv", unload = TRUE)
# uninstall a package
remove.packages("ctv")
# System Utilities ------------------------------------------------------------
# additional functions (the R.utils package must already be installed)
library("R.utils")
# very powerful feature of R is to leverage Windows command-shell or GNU/Linux BASH
# GNU/Linux BASH
system("hostname")
system("whoami")
#OR
# Windows: shell() exists only in Windows builds of R, so guard the call to
# keep the script runnable on other platforms
if (.Platform$OS.type == "windows") {
  shell("hostname")
}
# shell.exec() can be used to open a file
# shell.exec("D:/Workspace/file.txt")
# shell.exec("D:\\Workspace\\file.txt")
# System2
# it is now recommended that all system commands use the system2() function.
# Will work for: Windows, GNU/Linux, Mac
system2("whoami")
# System variables ------------------------------------------------------------
# every environment variable visible to this R session
Sys.getenv()
# look up individual variables by name
Sys.getenv("R_HOME")
Sys.getenv("R_LIBS_USER")
# memory limit, memory in use, and the difference between the two
# NOTE(review): memory.limit()/memory.size() are Windows-oriented and recent R
# releases reportedly return Inf with a warning -- confirm on the target version
memory.limit()
memory.size()
memory.limit() - memory.size() # available memory
memory.profile()
# get memory usage for an object
# (presumably `date` is the base function date() here -- no variable of that
# name is created in the visible part of the script)
object.size(date)
# garbage collection
# should happen automatically, but can be called manually. Good for getting memory usage.
gc()
gc(verbose = TRUE)
# System Date & Time ----------------------------------------------------------
## POSIXct reference constant: January 1st 1970 12:00 am 0 UTC
Sys.Date()
Sys.time()
Sys.timezone()
today <- Sys.Date() # store current date in variable
now <- Sys.time() # store current time
unclass(today) # to see the numeric value based on calculation against internal reference date
unclass(now) # to see the numeric value based on calculation against internal reference time
# expressions can be grouped together with the use of: exp();exp()...
Sys.time();Sys.timezone()
# built-in function returns date & time as character string
date()
# ensure that a variable is stored as date or time
var_reference_date <- as.Date("1970-01-01")
class(var_reference_date) # it's a Date class
# ensure that a variable is stored as time
var_reference_time <- c("00:00:00") # class will be character
var_reference_time <- as.POSIXct("1970-01-01 00:00:00") # timezone (tz) not specified; will default to system tz.
# conditional checking
# inherits() is the class test; is.numeric.POSIXt() cannot serve as one because
# it reports FALSE for every input
is.vector(date()) && is.character(date()) && inherits(date(), "POSIXt") # returns FALSE: not a POSIXt, just a character string.
# POSIX format
as.POSIXlt(Sys.time(), tz = "GMT")
as.POSIXct(Sys.time())
format(Sys.time(), "%a %b %d %X %Y %Z")
# long format
format(Sys.time(), "%A %B %d %Y %X %Z")
# time with milliseconds
format(Sys.time(), "%H:%M:%OS3")
# Get the current day of the week
format(Sys.time(), "%A")
#
# see that Sys.Date() is class Date
# {numeric, logical, character, list, matrix, array, factor, data.frame}
class(Sys.Date())
# see what storage mode Sys.Date() is
# {logical, numeric, complex, character, list (only for object list), or raw}
mode(Sys.Date())
storage.mode(Sys.Date())
# Time Series packages for advanced analysis
# Packages: lubridate, chron
# (the "ts" class and ts() come from the 'stats' package that ships with R,
#  so there is no separate 'ts' package to install)
# install only when missing, then attach so the functions are on the search path
if (!requireNamespace("chron", quietly = TRUE)) {
  install.packages("chron")
}
library(chron)
if (!requireNamespace("lubridate", quietly = TRUE)) {
  install.packages("lubridate")
}
library(lubridate)
# Time Execution (Benchmarking) -----------------------------------------------
# Timing a script: take a timestamp at the start and at the end,
# the elapsed run time is the difference between the two
var_script_start <- Sys.time()
var_script_end <- Sys.time()
var_script_run <- difftime(var_script_end, var_script_start)
# same idea with proc.time(): subtract the stored reading from a fresh one
startTimer <- proc.time()
proc.time() - startTimer
# system.time() wraps one expression and reports how long evaluating it took
system.time(
  pie(rep(1, times = 12), col = rainbow(12))
)
# Pause for specified time (in seconds)
Sys.sleep(10)
# Can use lapply and mclapply to compare differences for parallel processing
system.time((lapply))
# mclapply lives in the 'parallel' package, which is not attached by this
# script, so it has to be namespace-qualified to be found
system.time((parallel::mclapply))
# Microbenchmarking: performance on a small piece of code
# install the package only when it is missing, then attach it
if (!requireNamespace("microbenchmark", quietly = TRUE)) {
  install.packages("microbenchmark")
}
library(microbenchmark)
# repeatedly evaluates the expression and reports the timing distribution
microbenchmark(pie(rep(1, 12), col = rainbow(12)))
# R Profiler for performance analysis
# start the sampling profiler; samples go to "Rprof.out" in the working directory
Rprof()
# ... the code to be profiled goes here; a short computation so there is something to sample
invisible(sort(runif(1e6)))
# stop profiling before reading the results
Rprof(NULL)
# summarise the samples collected in "Rprof.out"
summaryRprof()
# Write to a file -------------------------------------------------------------
# not the best: writes the values; the variable names are passed through `label =`
# NOTE(review): sep = "\r" is a carriage return rather than a newline, and cat()'s
# documented argument is `labels` -- confirm both are intended
cat(
  paste(Sys.getenv()),
  file = "System.variables.txt",
  append = FALSE,
  sep = "\r",
  fill = TRUE,
  label = paste(names(Sys.getenv()))
)
# best way and formatted: one "NAME = value" string per variable
cat(
  paste(names(Sys.getenv()), "=", Sys.getenv()),
  file = "System.variables.txt",
  append = FALSE,
  sep = "\r",
  fill = TRUE,
  label = NULL
)
# Another method is to use capture.output(), which stores what would have been printed
capture.output(memory.limit(), file = "System.variables.txt", append = TRUE)
# Add txt content to appended file using c()
capture.output(
  c("MaxMem:", memory.limit()),
  file = "System.variables.txt",
  append = TRUE
)
# Output Controls -------------------------------------------------------------
# Console Message
message("text")
# warning message
warning()
# Output to a log file using sink()
# see if there are any open connections
sink.number()
# start a sink connection to a file
sink("R-Output.log.txt", append = TRUE, split = FALSE)
# close the connection
sink()
# Check connection is closed
sink.number()
# Output to... {pdf, png, svg, jpg, etc.}
# display list of graphical devices
?Devices
# example: svg(file="myImageFile.svg", height=4,width=8)
# ...followed by creating a plot/graph.
# close device connection(s)
# device 1 is the null device and cannot be shut down, so only call dev.off()
# when a real graphics device is actually open
if (dev.cur() > 1L) {
  dev.off()
}
# Assigning values (best practice is to use <-)
# x <- 1
# assign("x", c(x, y, z))
# Ask the user a yes/no question and echo the selection
om <- "Your Selection:" # output message
x <- readline(prompt = "User Input (Y or N) : ")
# accept the answer in either case; anything else counts as "No"
if (toupper(x) == "Y") {
  cat(c(om, "Yes"))
} else {
  cat(c(om, "No"))
}
# Working with directories ----------------------------------------------------
# Old school way, which requires explicit paths
# get working directory
getwd()
# Set the working directory (Windows OS syntax)
# backslashes must be doubled inside an R string; forward slashes also work
setwd("D:\\Workspace\\R")
setwd("D:/Workspace/R")
#
# Executing an R script from file
source("Week_Days.r")
source("Months.r", local = FALSE, echo = TRUE, verbose = FALSE)
# Can be a URL
# source("http://www.awebsite.info")
# New school way, works with relative paths; requires RStudio
# Using the <Project>.Rproj file
# (kept as a comment: a bare Markdown link is not valid R and stops the file from parsing)
# [.Rproj](https://support.posit.co/hc/en-us/articles/200526207-Using-RStudio-Projects)
# Working with PDF ----------------------------------------------------
# Where to output to
output_file <- "FileName.txt"
# read the text of the PDF; the file name has to be a quoted string
# (pdf_text() comes from the pdftools package, which must be installed)
PDF_FileName <- pdftools::pdf_text("pdf_file.pdf")
# writeLines provides the best formatting to make it look like the original pdf
# write the extracted text held in PDF_FileName to the output file
writeLines(PDF_FileName, output_file)
# Objects ---------------------------------------------------------------------
# Return the names of all objects that have been instantiated
objects()
# character() and numeric() are constructors: called without arguments they
# return an empty vector of that type -- they do not list existing objects
character()
numeric()
# to list the objects of a given type, filter the object names instead
Filter(function(nm) is.character(get(nm)), objects()) # any objects that are character
Filter(function(nm) is.numeric(get(nm)), objects()) # any objects that are numeric
# get object information
# each of these needs an object to inspect; the built-in dataset "cars" is used here
attributes(cars) # attributes of object, if any
dim(cars) # dimensions
class(cars) # class type of object
# bulk execution on an object
# uses built-in dataset "cars"
# returns the class for each variable
lapply(cars, class)
# clearing objects/variables
# list all instantiated objects
ls()
# remove a variable
## creating a variable first
var_Name <- 100
## remove a variable
rm(var_Name)
## remove multiple variables
# rm (var_Name1, var_Name2)
# List variables by size
# done before the workspace is cleared so there is something to list; vapply()
# always returns a numeric vector, so sort() also works on an empty workspace
sort(
  vapply(ls(), function(x) as.numeric(object.size(get(x))), numeric(1)),
  decreasing = TRUE
)
# quickly remove all instantiated objects/variables
rm(list = ls())
# Coercion --------------------------------------------------------------------
# Coercion converts a value to another data type through the as.*() family:
# as.logical(), as.numeric(), as.character(), etc.
var <- 1
var_log <- as.logical(var)
# Naming vectors, applying labels
# Example uses playing cards: the label is given together with each value
var_suite <- c(spade = "s", heart = "h", club = "c", diamond = "d")
# days of the week are often represented numerically;
# setNames() attaches the labels in the same step that creates the vector
var_days <- setNames(
  1:7,
  c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday")
)
# or keep the labels in a vector of their own and reuse it
days_vector <- c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday")
names(var_days) <- days_vector
# Finding the number of elements in a vector
length(var_days)
# Built in R data sets --------------------------------------------------------
# list the data sets available in the attached packages
utils::data()
# Historical Data
# install.packages('HistData')
# library(HistData)
# Matrix ----------------------------------------------------------------------
# simple example: a single row with two columns
var_myMatrix <- matrix(1:2, ncol = 2, byrow = TRUE)
# 2nd simple example using vectors
# instantiate vectors
# Parity even
var_even <- c(0, 2, 4, 6, 8)
# Parity odd
var_odd <- c(1, 3, 5, 7, 9)
# create matrix with vectors
# fill column by column (byrow = FALSE) so that column 1 holds var_even and
# column 2 holds var_odd; filling by row would interleave the two vectors and
# the column labels below would no longer describe the contents
var_myMatrix <- matrix(c(var_even, var_odd), ncol = 2, byrow = FALSE)
# label rows, columns
rownames(var_myMatrix) <- c("rownumber1", "rownumber2", "rownumber3", "rownumber4", "rownumber5")
colnames(var_myMatrix) <- c("parity_even", "parity_odd")
# Get totals on the rows:
var_myMatrix_rowSum <- rowSums(var_myMatrix)
# quick ways to get sums in a matrix using colSums()/rowSums()
var_myMatrix_colSum <- colSums(var_myMatrix)
# add a row with rbind() or a column with cbind(); such as calculated columns
var_myMatrix_row_total <- cbind(var_myMatrix, var_myMatrix_rowSum)
# Factors ---------------------------------------------------------------------
# nominal factor: categories with no inherent order
var_myVector <- c("Bacteria", "Protozoa", "Chromista", "Plantae", "Fungi", "Animalia")
var_myFactor <- as.factor(var_myVector)
# ordinal factor: the levels are given in rank order and the factor is marked
# as ordered; labels supply the display name for each level
var_bioClad_vector <- c("domain", "kingdom", "phylum", "class", "order", "family", "genus", "species")
var_bioClad_factor <- ordered(
  var_bioClad_vector,
  levels = var_bioClad_vector,
  labels = c("Domain", "Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species")
)
# Lists -----------------------------------------------------------------------
# simple example of lists and using names
var_myVector <- 2:10
var_myMatrix <- matrix(1:10, ncol = 2)
var_myFactor <- factor(
  c("J", "Q", "K", "A"),
  ordered = TRUE,
  levels = c("J", "Q", "K", "A"),
  labels = c("Jack", "Queen", "King", "Ace")
)
# list with names
var_myList <- list(numbercards = var_myVector, oddandeven = var_myMatrix, facecards = var_myFactor)
str(var_myList)
# subsetting with lists
var_myList[1] # returns the first list, in this case numbercards
var_myList[[1]][[1]] # returns the first list, AND the first element
var_myList$numbercards[[4]] # returns the fourth element in the named list numbercards, in this case 5
# logical subsetting: TRUE keeps a list, FALSE omits it
# (spelled out in full because T and F are ordinary variables that can be reassigned)
var_myList[c(TRUE, FALSE, TRUE)]
# add another list to a list
var_myList <- c(var_myList, list(Card_Company = "Bicycle", Card_Series = c("Red", "Blue")))
str(var_myList)
# Manual data frame -----------------------------------------------------------
# an example of creating a data frame manually
# uses CAL-Usage-Audit data
# https://docs.google.com/spreadsheets/d/1e8FRS3INSldjHo329_F_H0acE7sTGeuZTJ_cK2gGvy4
# Column variables, populate data.
# Every column must have the same number of values (10 here), otherwise
# data.frame() stops with "arguments imply differing number of rows".
# Calendar years are stored as plain integers.
year <- c(2012L, 2012L, 2013L, 2013L, 2013L, 2014L, 2014L, 2014L, 2015L, 2015L)
# nominal data is stored as a factor
quarter <- as.factor(c("Spring", "Fall", "Winter", "Spring", "Fall", "Winter", "Spring", "Fall", "Winter", "Spring"))
# another way of creating quarter with rep (replicate): the data starts with a
# lone "Spring", so prepend it to three full Fall/Winter/Spring cycles to get
# the same 10 values as above, this time as an ordered factor
quarter <- factor(
  c("Spring", rep(c("Fall", "Winter", "Spring"), times = 3)),
  levels = c("Fall", "Winter", "Spring"),
  ordered = TRUE
)
total_reservations <- c(128, 152, 125, 153, 127, 149, 168, 182, 169, 227)
# Durations are written as hours:minutes:seconds and kept as text;
# stringsAsFactors = FALSE below keeps them character instead of factor.
total_hours_scheduled <- c("306:15:00", "505:00:00", "411:00:00", "545:30:00", "395:00:00", "511:30:00", "518:30:00", "549:15:00", "459:00:00", "697:00:00")
total_hours_used <- c("231:15:00", "386:15:00", "308:30:00", "374:45:00", "241:05:00", "377:15:00", "428:30:00", "374:30:00", "361:00:00", "508:15:00")
no_show_count <- c(34, 28, 26, 37, 33, 32, 29, 45, 24, 61)
# create the data frame
faculty_usage <- data.frame(
  year,
  quarter,
  total_reservations,
  total_hours_scheduled,
  total_hours_used,
  no_show_count,
  stringsAsFactors = FALSE
)
faculty_usage
# view in table format; the data viewer is only opened in interactive sessions
if (interactive()) {
  View(faculty_usage)
}
# structure of the data frame: shows the type of every column
str(faculty_usage)
# head: show only the first # of records --quick view of the data
head(faculty_usage, 5) # first 5 records
# tail: show only the last # of records
tail(faculty_usage, 5)
# dimensions for the data frame: rows, columns
dim(faculty_usage)
# adding additional columns; can also use cbind()
# computed from the data frame's own columns so it stays aligned with its rows
faculty_usage$percent_no_show <-
  (faculty_usage$no_show_count / faculty_usage$total_reservations) * 100
round(faculty_usage$percent_no_show, digits = 1)
faculty_usage$percent_no_show <- round(faculty_usage$percent_no_show, digits = 1)
# naming/renaming columns, after the fact
# "%_no_show" is not a syntactic name: access it with faculty_usage[["%_no_show"]]
names(faculty_usage)[names(faculty_usage) == "percent_no_show"] <- "%_no_show"
# ordering a selection, such as column
## sort() returns only the sorted values (a vector), not their positions
# order() returns the row positions that put the values in ascending order
order(faculty_usage$total_reservations)
# best way to use order(): index the rows of the data frame with it
faculty_usage[order(faculty_usage$total_reservations, decreasing = FALSE), ]
# subsetting example
## define logical vector for condition
faculty_usage_fall <- faculty_usage$quarter == "Fall"
### use condition to subset
faculty_usage[faculty_usage_fall, ]
#### OR, simply
faculty_usage[faculty_usage$quarter == "Fall", ]
#### OR, using subset()
subset(faculty_usage, subset = quarter == "Fall")
# Building a function ---------------------------------------------------------
# A deck of cards serves as the worked example for writing a function() {}.
#
# shuffle(x): draw `x` distinct cards at random from a standard 52-card deck.
#   x        number of cards to draw (defaults to 1)
#   returns  an ordered factor of length `x`. Levels run from "2 of spades" up
#            to "Ace of diamonds": rank first, then spades < hearts < clubs <
#            diamonds within a rank, so a hand can be sorted.
shuffle <- function(x = 1) {
  rank_codes <- c(2:10, "J", "Q", "K", "A")
  rank_names <- c(2:10, "Jack", "Queen", "King", "Ace")
  # the deck itself is laid out suit by suit: clubs, spades, hearts, diamonds
  cards <- paste0(
    rep(rank_codes, times = 4),
    rep(c("C", "S", "H", "D"), each = 13)
  )
  # the levels go rank by rank, cycling through the four suits inside each rank
  level_codes <- paste0(
    rep(rank_codes, each = 4),
    rep(c("S", "H", "C", "D"), times = 13)
  )
  level_labels <- paste(
    rep(rank_names, each = 4),
    "of",
    rep(c("spades", "hearts", "clubs", "diamonds"), times = 13)
  )
  deck <- factor(
    cards,
    levels = level_codes,
    labels = level_labels,
    ordered = TRUE
  )
  # draw without replacement so no card can appear twice
  sample(deck, size = x, replace = FALSE)
}
# Inspect the formal arguments of a function with args():
# shuffle takes a single argument, x, which defaults to 1.
args(name = shuffle)
shuffle() # use the function with its default: one card
shuffle(x = 5) # return 5 cards
# draw 5 cards and list them in descending order
sort(x = shuffle(x = 5), decreasing = TRUE)
# Working with web connections -------------------------------------------
# Example, Connection
# Working with Connections, such as [geo]JSON feed
# uses RJSONIO package
# Sample data from USGS Earthquake feed
# Install RJSONIO only when it is missing, so re-running the script does not
# reinstall the package every time.
if (!requireNamespace("RJSONIO", quietly = TRUE)) {
  install.packages("RJSONIO")
}
library(RJSONIO)
# USGS API Documentation -Earthquake Catalog
# https://earthquake.usgs.gov/fdsnws/event/1/#extensions
URI <- "https://earthquake.usgs.gov/earthquakes/feed/v1.0/summary/significant_week.geojson"
# save the feed to the working directory, then read it through a connection
download.file(URI, "USGS_Quakes.json")
con <- file("USGS_Quakes.json")
# parse the JSON into nested R lists; `finally` releases the connection even
# when parsing fails
USGS_Quakes <- tryCatch(fromJSON(con), finally = close(con))
head(USGS_Quakes)
tail(USGS_Quakes)
# Web scraping -------------------------------------------
# Each install below runs only when the package is missing, so re-running the
# script does not download and reinstall packages every time.
# The easiest way to get rvest is to install the whole tidyverse:
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse")
}
# Alternatively, install just rvest:
if (!requireNamespace("rvest", quietly = TRUE)) {
  install.packages("rvest")
}
# CAL Analytics, Working with -------------------------------------------
# Get the data from Google docs
# Source File
# https://docs.google.com/spreadsheets/d/1e8FRS3INSldjHo329_F_H0acE7sTGeuZTJ_cK2gGvy4/edit?usp=sharing
# "Summary" worksheet as a csv file.
# URI for csv file
# https://docs.google.com/spreadsheets/d/1e8FRS3INSldjHo329_F_H0acE7sTGeuZTJ_cK2gGvy4/export?format=csv&id=1e8FRS3INSldjHo329_F_H0acE7sTGeuZTJ_cK2gGvy4&gid=305606125
# Load the CSV file into a variable; read.table() is common for text files {.txt}; scan() is the most primitive form of reading data from a file into a variable.
# Record the moment the data is loaded (Sys.time() already returns POSIXct)
LastLoadTimestamp_CalData <- Sys.time()
# Elapsed time between the last load and now, as a difftime
difftime(Sys.time(), LastLoadTimestamp_CalData)
# assumes a project template layout
# Best to use .Rproject in RStudio
# setwd("D:/Workspace/R/CAL")
# or to automate the process: download the "Summary" worksheet as a csv file
URI <- "https://docs.google.com/spreadsheets/d/1e8FRS3INSldjHo329_F_H0acE7sTGeuZTJ_cK2gGvy4/export?format=csv&id=1e8FRS3INSldjHo329_F_H0acE7sTGeuZTJ_cK2gGvy4&gid=305606125"
download.file(url = URI, destfile = "CAL-Analytics.csv")
# Read in the data (the readr package is an alternative for tabular data).
# For a tab-separated file the separator would be sep = "\t".
CalData <- read.csv(file = "CAL-Analytics.csv", header = TRUE, sep = ",")
# Check the last few records of the dataset
tail(x = CalData, n = 10)
# Remove the Validity.Check column by position (column #7)
# NOTE(review): relies on Validity.Check being the 7th column -- confirm
# against the sheet layout.
rmtCalData <- CalData[, -7] # keep the trimmed data under a new name
# or overwrite CalData itself with the trimmed version
CalData <- rmtCalData
# How was the data loaded? read.csv() returns a data frame; typeof() reports
# "list" because a data frame is stored as a list of columns.
typeof(CalData)
is.matrix(CalData)
is.list(CalData)
# as.data.frame() makes sure CalData is a data frame before going further
CalData <- as.data.frame(CalData)
is.data.frame(CalData)
# First things to do when looking at a new dataset
# Look at the dimensions of the data:
# number of records, number of columns
dim(CalData)
# any missing values?
# is.na(CalData) prints one TRUE/FALSE per cell, which is hard to read
sum(is.na(CalData)) # total count of missing cells: easier to read
table(is.na(CalData)) # counts of non-missing (FALSE) vs missing (TRUE)
# look at the data in table format