## R is a language used in a variety of fields ## (bio-*, finance, big data-*) to carry out ## statistical analyses of complex data sets. ## You may have to work with it some day... ## ... engineer your code! ## Say you need a vector of double containing ## all the squares from 1 to n. As an imperative ## programmer seeped in Java, how do you approach ## this task? ## Start with a function contract, then write tests! ## sqn : double -> double[] ## Given: a size n ## Returns: a vector of squares x such that x[i] == i * i sqn <- function(n) { x <- 1:n for (i in 1:n) x[i] <- i * i return(x) } ## Let's define functions for testing to make our life easier. ## testEq: T, T -> void ## Given: two values ## Returns: prints an error message if different testEq<- function(a,b) if (a != b) print("Error!") ## ... and now write one test. Silence means assent. testEq(sqn(100)[100],100*100) ## ... make sure that the test is doing something ## forcing a failure testEq(sqn(100)[100],100*100+1) ## Ok this fails ## Would this be a good exhaustive test? It's nice and short. testSqn <- function(f, n) for (i in 1:n) testEq(f(i)[i], i*i) testSqn(sqn, 10^3) ## and for bigger arrays our patience is tested... testSqn(sqn, 10^4) ## The test is faulty because: we are creating a new array for ## each value of i and we are always looking at the last ## value of each array and nothing else. This is a bad test. ## A better test: testSqn2 <- function(f, n) { x <- f(n) for (i in 1:n) testEq(x[i], i*i) } ## Why do we take a function as argument? Because we want to ## demonstrate that R support *higher-order* values (i.e. functions ## as values) and we want to write a test that is parameterized ## by the function being tested. This will come in handy. testSqn2(sqn, 10^4) # Runs fine... testSqn2(sqn, 10^5) # Two error messages... strange. ## The warnings() function will print the last few messages that ## were logged by R. warnings() ## At this point it is unclear where the error lies. Everything ## seems to work fine for small array but when we go to 10^5 ## we see errors. ## Proceeding, by elimination: Is it a problem of the test function? x <- sqn(10^5) ## Yes, the test function generates some of the errors but there are ## also warnings comming from sqn warnings() ## Let's get to the bottom of this. Something strange ## is going on with multiplication. We can define our own ## square function and test it. ## sq: double -> double ## Given: a double ## Returns: the square of the value sq <- function(i) i * i ## Now check on the extremal value (on the theory ## that if there is overflow it should show up there) testEq(sq(10^5), 1e+10) ## this is fine. No error. ## Are we crazy? Let's refactor sqn to use sq (since sq "works") sqn2 <- function(n) { x <- 1:n for (i in 1:n) x[i] <- sq(i) x ## btw: we don't need an explicit return } ## Test it again... testSqn2(sqn2, 10^5) ## Still the same errors. ## Maybe it is something to do with assignment? ## Let's refactor it in its own function ## set : double[], double -> void ## Given: a vector of double v and a double i ## Returns: nothing; but sets v[i] to the square of i. set <- function (v, i) { v[i] <- sq(i) } ## And now use the function set sqn3 <- function(n) { x <- 1:n for (i in 1:n) set(x,i) x } ## Test again... testSqn2(sqn3, 10^2) ## This time there are errors printed from the test. This is a different problem. ## Someone told us that all R functions should always return ## something. Let's try that: ## set: double[], double -> double[] ## Given: a vector of double v and double i ## Return: a vector that was update at location i set <- function (v, i) { v[i] <- sq(i) v } ## In this version of the function we recover the result ## of set() and store it into x sqn4 <- function(n) { x <- 1:n for (i in 1:n) x <- set(x,i) x } ## and now the test pass... testSqn2(sqn4, 10^2) ## but not for the bigger arrays. testSqn2(sqn4, 10^5) ## and they take even longer than before. ## Obviously we are driving blind trying things without having a real clue what ## is going on. Let's switch gears and look at the R the language. Perhaps ## we will figure out what is going on. ### ## R is a general purpose programming language with most of the data types one would expect x <- c(11, 12) # vectors of doubles x[2] # subsetting 42[1] # that's a vector too 42[3] # of size one y <- x # assignment typeof(y) # check the type of a value x <- 1:100 # another way to create a vector typeof(x) # and also integers v <- 1L # 32 bit integer value typeof(v) # with a range of +/-2*10^9 ### Aparte': INTEGERS! ######################### x <- 100000L x * x # Aha! 1:100 ## this creates integers because: (1) it is often used in loops to ## to create indicies into vectors and 32-bit integers are more efficient ## in memory and time than doubles! as.double(x) * x ## forces the computation to be doubles. ## Here is the version that is not causing errors. sqn0 <- function(n) { x <- 1:n for (i in 1:n) x[i] <- as.double(i) * i x } testSqn(sqn0, 10^5) ## the test runs fine ## but is still slow, way too slow ## Why? ################################################### ## back to the language... x2 <- x[ (x %% 2) == 0] # powerful subsetting operations x2 # a shorter vector with only even numbers z <- c("vector","of","strings") # vector of char typeof(z) z <- c(1, "one") # vectors are monomorphic z z[100] <- "hundred" # and they grow on the demand z u <- list(1,2) # a list of double typeof(u) unlist(u) # lists can be turned to vectors w <- list("hi", 2, u) # lists are polymorphic unlist(w) # unlisting coerces to the most general type b <- c(TRUE, FALSE) # logicals typeof(b) sex <- c("M", "F", "M", "M2F") # vector of characters sex <- factor(sex) # factors sex as.integer(sex) # the vector's values are represented by integers typeof(sex) # If sex is a vector of integers, where are the level stored? for (i in 1:100000) sex <- c("M", "F", "M", "M2F") # Looping for (i in 1:100000) sex <- factor(c("M", "F", "M", "M2F")) # creating factor takes time x <- 1 # x is a vector of length 1 for (i in 1:100000) x[i] <- i # initialize the vector with increasing values for (i in 1:100000) x[i] <- i+1 # do the same (faster?) x <- 1 # again for (i in 1:100000) x[i] <- i # (slow?) x <- 1 x[100000] <- 0 # extend for (i in 1:100000) x[i] <- i # (fast) for (i in 1:10000000) 1 ## ok for (i in 1:10000000) ((((((1)))))) ## slow # Performance of R code is often obscure, let's look into it for a bit system.time( 1 ) # time is given in seconds system.time ( ((((((1))))))) # hmm, no difference here ## That is because the accuracy of the timer (which returns values in seconds) ## is not sufficient to measure the time it takes for R to evaluate the number 1. system.time(for (i in 1:10000) 1) # iterate until we get past clock resolution system.time(for (i in 1:10000) ((((((1))))))) ## slower ## Why? ( 1 ) # '(' is a function call in R `(` <-function( x ) { ## overwrite the default () to do something print ( paste("Saw " ,x )) x } ( 1 ) # what did we see? ## Now every time we use a parenthesis around an expression we will print something ## That sounds like a bad idea, but it explains the speed difference; ## there were many additional function calls. remove( `(` ) ## definitions can be removed. Back to normal. # The problem is that most function take very small amounts of time when called # once, can we find an automatic way to run them "enough" times? # A better version of system.time... that would invoke its argument sufficiently # and return a cost per invocation # tm : T -> double # Given: any R expression # Return: the execution time in milliseconds ## use as: tm( 1 + 1 ) # undefined, wait for it ## How do we run an expression passed as argument multiple times? #... but first we must mention that R is lazy programming language print(1) # outputs "1" on the terminal f<-function(a,b) {} # a helper, nevermind f( print(1) , print(2) ) # what would this do in Java or in C temp <-print(1);temp2 <- print(2); f(temp, temp2) # same thing? # Odd? In Java, the two snippets would be equivalent f1 <- function( a, b, c ) { # Consider the following, not ver useful function x <- f(b,b) x <- c x <- c return(a) } f1( print(1), print(2), print(3) ) # What do you recon this will print? # The order in which the values are printed are jumbled f2 <- function( a, b, c ) { # here is another function x <- b x <- c f3(a) } f3 <- function( a) { } # with a silly helper function f2( print(1), print(2), print(3) ) # what will we print? # All right, "lazy" means do not evaluate arguments to functions but # evaluate everything else. # Question: Can you imagine another defintion of lazyness? _tm_( 1 + 1 ) # have we made any progress? ## Sort of... let's try something tm1 <- function( exp ) { ### missing some code for recording the time for( i in 1:10) exp # Use lazyness and just evaluate the } tm1( print(1) ) # same expression many times, which for( i in 1:10) print(1) # should be the same as this. ## But that does not work, in R expressions are only evaluated once; ## if used many times, their value is remembered. # Ok, lazy is not enough; we need one new trick... x<- substitute( 1 + 1 ) # R can turn code into values x typeof(x) # here a new value, for code deparse(x) # And we have got text again y<-deparse(x) eval(parse(text=y)) # and back into code again parse(text=y) typeof(parse(text=y)) # and another new data type # Back to the plot _tm_( 1 + 1 ) ## This function will run the expression 1000 times... tm2 <- function( exp ) for( i in 1:1000) eval( parse(text= deparse( substitute( exp )))) tm2(1+1) # Almost there, we need to run just the right number of times to run # and add the timer code tm <- function( exp ) { x<- deparse(substitute( exp )) ## decompile the expression t <- 1 ## repetion counter (we run exp 10 * t) time = 0; ## reset timer while ( time < 1 ) { ## continue looking until the code runs long enough y <- x ## code for( i in 1:9 ) y <- paste(y,";",x) ## duplicate the code 10 times y<- paste(" system.time ( for (i in 1:",t,") {", y, ## wrap the code with a loop that runs t times "}, gcFirst= FALSE)") r <- eval(parse(text=y)) ## execute time <- r[3] ## grab the elasped time t <- t * 10 ## try with a bigger t } (time / (t * 10)) * 1000 ## return time in milliseconds (default is in s) } tm( (((1))) ) *1000 tm( (( 1)) ) *1000 tm( 1 ) *1000 # All right, we have established that in R, extra parentheses can cost you # dearly and shown how to write a small performance utility that generates code # on the fly. # One thing that R does a lot, is calling functions tryme<-function( aaaa00_=0,aaaa01_=1, aaaa02_=2,aaaa03_=3, aaaa04_=4,aaaa05_=5,aaaa06_=6 ) { } tm( tryme(0, 1, 2, 3, 4, 5, 6 ) ) # arguments can be passed by position tm( tryme(0) ) # or omitted, and default will be used # or passed by name tm( tryme(aaaa00_=0,aaaa01_=1,aaaa02_=2,aaaa03_=3)) tm( tryme(aaaa00_=0,aaaa01_=1,aaaa02_=2,aaaa03_=3,aaaa04_=4,aaaa05_=5,aaaa06_=6)) # Debugging time! x <- deparse(substitute(tryme(aaaa00_=0,aaaa01_=1,aaaa02_=2,aaaa03_=3,aaaa04_=4,aaaa05_=5,aaaa06_=6))) x ## it looks like deparse breaks lines at some width... ouch! x <- paste(x,collapse='') ## paste has an option to glue thing back together x ## is that enough? y <- x ## let's try build longer and longer strings y <- paste(y,x, sep=";") y <- paste(y,x, sep=";") y for( i in 1:2 ) y <- paste(y,x,sep=";" ) y ## looking good... # here is another try ## tm: T, double -> double ## Given: an expresssion and a double ## Returns: the time it takes to run that expression in milliseconds ## or at a higher percision if a different unit is passed tm <- function( exp , unit = 1) { x<- paste(deparse(substitute( exp )), collapse='') t <- 1 time = 0; while ( time < 1 ) { y <- x for( i in 1:9 ) y <- paste(y,x,sep=';') y<- paste("system.time(for(i in 1:",t,"){",y, "}, gcFirst= FALSE)") r <- eval(parse(text=y)) time <- r[3] t <- t * 10 } r <- (time / (t * 10)) * 1000 * unit attributes(r)<-NULL r } # Back to the plot... ( switching to times in microsecs ) tm( tryme(0, 1, 2, 3, 4, 5, 6 ), 1000) tm( tryme(0) , 1000) tm( tryme() , 1000) tm( tryme(aaaa00_=0,aaaa01_=1,aaaa02_=2,aaaa03_=3), 1000) tm( tryme(aaaa00_=0,aaaa01_=1,aaaa02_=2,aaaa03_=3,aaaa04_=4,aaaa05_=5,aaaa06_=6), 1000) # by partial name tm( tryme(aaaa00=0,aaaa01=1,aaaa02=2,aaaa03=3,aaaa04=4,aaaa05=5,aaaa06=6),1000) # in random order tm( tryme(aaaa02=2,aaaa01=1,aaaa04=4,aaaa05=5,aaaa03=3,aaaa00=0,aaaa06=6),1000) # R is a functional language with referential transparency # Consider this function... tryme3<-function( aaaa00, aaab01, aaac02 ) { x <- aaac02 y <- x z <- y w <- z u <- w u[ 1 ] <- 2 u[ 2 ] <- aaab01 u[ 3 ] <- aaab01 u[ 4 ] <- aaab01 u } v <- 1:3000000 # let's create a vector u <- tryme3( 1, -2, v ) # call our function tm( tryme3( 1, -2, v ) ) # time it all(u == v) # In Java u and v would the same, what about R? v[1:10] # Hmm? Unchanged? u[1:10] # updated! # Referential transparency entails that arguments of a function call are not # changed by the evaluation of that call. ## Values are copied on assignment. tryme2<-function( aaaa00, aaab01, aaac02 ) { x <- aaac02 y <- x z <- y y[ 1 ] <- 2 w <- z z[ 2 ] <- aaab01 u <- w w[ 3 ] <- aaab01 u[ 4 ] <- aaab01 u } u2 <- tryme2( 1, -2, v ) tm( tryme2( 1, -2, v ) ) # time it again... and find something funny u[1:10] u2[1:10] # the vectors are quite different. Why? # Every assignment is "logically" a copy # Every vector update only affects the particular logical copy # For performance reasons, some copies are avoided. # ... but this makes for surprising performance profiles. # Okay to sum up: R is a lazy functional language where expressions are not # evaluated when passed as argument to functions and assignments perform # logical copies. Good? FU <- function() print("FU") # A function BAR <- function( FU = print("BAR")) FU() # Another function with a default argument BAR() # call bar will print ? # Huh? How could we print FU as R has lexical scoping? Why do we print BAR? # R has a best effort function lookup rule that interact with lazyness. # When calling a function it will look for a function of the same name # To know if a name is bound to a function will evaluate promises. # Is R expressive enough to do anything? # What about implementing an abstraction of a counter counter <- function() { x <- x + 1; x} counter() # oops, we need to define x x<-0 counter() counter() # Hmm... not quite # Super assignment to the rescue counter <- function() { x <<- x + 1; x} # <<- assigns in the parent env counter() counter() # now we have a counter # What if we don't want to agree on a name but instead pass # a value that must be modified? Can't do it! # Environments to the rescue! map <- new.env(hash=T, parent=emptyenv()) # create a new environment key <- 'ping' # define a key name map$key <- 'pong' # map the key to the value map$key # read the value mod <- function( hm ) hm$key <- 'noob' # this function updates its argument mod( map ) # call it. Recall "referential transparency" map$key # not so much. Environments are mutable # Programming in R requires a good understanding of the data structures that # the language provides: # - vector -- we have seen that and discussed it # - list -- polymporphic vectors # - matrix -- a 2-dimensional vector # - array -- a multi-dimensional vector # - data frames -- lists of vectors # - time series -- vector/matrix + dates # - objects # All the magic is done by one additional feature of the language: attributes. # An attribute is a key value pair associated to a value x <- 1 # here is a value y <- x # and the same value attr(x, "ssn") <- 123456 x # now 1 has a social security number y # but y hasn't, remember the logical copy rule on assignment x <- c(1,2,3,4) # is a vector attr(x, "dim") <- c(2,2) # and through the magic of attributes x # c is now a matrix # Can we write code that does different things depending what attributes are attached to an object? # Enter OOP with S3... # Suppose we have cowgirls and pens and would like to write a function draw that does # the right thing when passed either of these values. x<- 1 attr(x, "class" ) <- "pen" # class is just an attribute like any other x y<- 1 class(y ) <- "cowboy" # there is a dedicate function to set y class(y) <- c("cowboy","human") # one can be several things draw <- function( this ) UseMethod("draw", this) # this introduces the generic method draw.cowboy <- function(this) "Bang" # different variants for different classes draw.pen <- function(this) "Scribble" draw.default <- function(this) "Huh?" # default version when nothing else applies draw(x) draw(y) draw(1) # Of course this is a bit weak for Java programmers... there is no inheritance, objects # have no fields. # Enter OOP with S4... setClass("Person", # Create an new class with two slots slots = list(name = "character", age = "numeric")) setClass("Employee", # Extends the Person class with another class slots = list(boss = "Person"), contains = "Person") # declare an inheritance relationship # create a few people alice <- new("Person", name = "Alice", age = 40) john <- new("Employee", name = "John", age = 20, boss = alice) # john's got a boss jack <- new("Person", name = "Jack", age = 42) alice@age # ask an impolite question slot(john, "boss") # same # Let's define methods setGeneric("hire", function(x,y) standardGeneric("hire") ) # The first will work only for persons hiring employees setMethod("hire", c(x = "Person", y = "Employee"), function(x, y) { print(paste(x@name," hired ",y@name)) return(setBoss(y)<-x) # change the boss slot } ) # another method for setting fields setGeneric("setBoss<-",function(object,value){standardGeneric("setBoss<-")}) setReplaceMethod(f="setBoss", signature="Employee", definition=function(object,value) return(object@boss <-value) ) hire(alice, john) # this should work print(hire(jack, john)) # and this too, changing the boss hire(john, john) # this should work as an employee is person hire(alice, alice) # this should fails as an employee isn't a person # Getting more information about time: profiling Rprof() tm( 1+ 1 ) Rprof(NULL) summaryRprof() Rprof() a<-1:1000 for(i in 1:1000) { a <- a +a - a } Rprof(NULL) summaryRprof() # Performance tips... # pre-allocated vectors test <- function(k, N){ x <- 1 #rep(NA, k) for(i in 1:N) x[i] <- rnorm(1) } system.time(test(20000, 20000)) ## back to the past, remember Sqn? tm(sqn(10^4)) tm(sqn2(10^4)) tm(sqn3(10^4)) sqn <- function(n) { x <- 1:n for (i in 1:n) x[i] <- i * i return(x) } sq <- function(i) i * i sqn2 <- function(n) { x <- 1:n for (i in 1:n) x[i] <- sq(i) x } set <- function (v, i) { v[i] <- sq(i) } sqn3 <- function(n) { x <- 1:n for (i in 1:n) set(x,i) x } ## Remember? testSqn(sqn3,10) set <- function (v, i) { v[i] <- sq(i) v } sqn4 <- function(n) { x <- 1:n for (i in 1:n) x <- set(x,i) x } testSqn(sqn4,10) tm(sqn4(10^4)) # > 20 millis x <- sqn4(10^5) ## still slow ## and wrong sq <- function(i) i * as.double(i) x <- sqn4(10^5) ## slow but correct tm(sqn4(10^4)) # > 40 ms sqn5 <- function(n) { x <- 1:n for (i in 1:n) x[i] <- as.double(i) * i x } tm(sqn5(10^4)) # ~ 1ms ## Here is the right answer in R sq0 <- function(n) { x<- as.double(1:n) x * x } ## Rember in R everything is a vector and all operations know how to deal with them. tm(sqn0(10^4)) # ~1ms