## R is a language used in a variety of fields 
## (bio-*, finance, big data-*) to carry out 
## statistical analyses of complex data sets.
## You may have to work with it some day...
## ... engineer your code!

## Say you need a vector of double containing 
## all the squares from 1 to n. As an imperative
## programmer seeped in Java, how do you approach
## this task?

## Start with a function contract, then write tests!

## sqn : double -> double[]
## Given: a size n
## Returns: a vector of squares x such that x[i] == i * i
sqn <- function(n) {
 x <- 1:n
 for (i in 1:n)
   x[i] <- i * i
 return(x)
}

## Let's define functions for testing to make our life easier.

## testEq: T, T -> void
## Given: two values
## Returns: prints an error message if different
testEq<- function(a,b) if (a != b) print("Error!")

## ... and now write one test. Silence means assent.
testEq(sqn(100)[100],100*100)

## ... make sure that the test is doing something
## forcing a failure
testEq(sqn(100)[100],100*100+1)
## Ok this fails

## Would this be a good exhaustive test? It's nice and short.
testSqn <- function(f, n) for (i in 1:n) testEq(f(i)[i], i*i)

testSqn(sqn, 10^3)

## and for bigger arrays our patience is tested...
testSqn(sqn, 10^4)

## The test is faulty because: we are creating a new array for
## each value of i and we are always looking at the last
## value of each array and nothing else.  This is a bad test.

## A better test:
testSqn2 <- function(f, n) {
   x <- f(n)
   for (i in 1:n) testEq(x[i], i*i)
}

## Why do we take a function as argument? Because we want to 
## demonstrate that R support *higher-order* values (i.e. functions
## as values) and we want to write a test that is parameterized
## by the function being tested.  This will come in handy.

testSqn2(sqn, 10^4)   # Runs fine...

testSqn2(sqn, 10^5)   # Two error messages... strange.

## The warnings() function will print the last few messages that 
## were logged by R.
warnings()

## At this point it is unclear where the error lies. Everything
## seems to work fine for small array but when we go to 10^5
## we see errors.  

## Proceeding, by elimination: Is it a problem of the test function?
x <- sqn(10^5)

## Yes, the test function generates some of the errors but there are 
## also warnings comming from sqn
warnings()

## Let's get to the bottom of this. Something strange
## is going on with multiplication.  We can define our own
## square function and test it.

## sq: double -> double
## Given: a double
## Returns: the square of the value
sq <- function(i) i * i

## Now check on the extremal value (on the theory 
## that if there is overflow it should show up there)
testEq(sq(10^5), 1e+10)   ## this is fine. No error.

## Are we crazy?  Let's refactor sqn to use sq (since sq "works")
sqn2 <- function(n) {
 x <- 1:n
 for (i in 1:n)
   x[i] <- sq(i)
 x ## btw: we don't need an explicit return
}

## Test it again...
testSqn2(sqn2, 10^5)
## Still the same errors.

## Maybe it is something to do with assignment?
## Let's refactor it in its own function

## set : double[], double -> void
## Given:  a vector of double v and a double i
## Returns: nothing;  but sets v[i] to the square of i.
set <- function (v, i) {
 v[i] <- sq(i)
}

## And now use the function set
sqn3 <- function(n) {
 x <- 1:n
 for (i in 1:n)
   set(x,i)
 x
}

## Test again...
testSqn2(sqn3, 10^2)

## This time there are errors printed from the test.  This is a different problem.

## Someone told us that all R functions should always return
## something. Let's try that:

## set: double[], double -> double[]
## Given: a vector of double v and double i
## Return: a vector that was update at location i
set <- function (v, i) {
 v[i] <- sq(i)
 v
}

## In this version of the function we recover the result 
## of set() and store it into x
sqn4 <- function(n) {
 x <- 1:n
 for (i in 1:n)
   x <- set(x,i)
 x
}

## and now the test pass...
testSqn2(sqn4, 10^2)

## but not for the bigger arrays.
testSqn2(sqn4, 10^5)
## and they take even longer than before.

## Obviously we are driving blind trying things without having a real clue what
## is going on. Let's switch gears and look at the R the language. Perhaps
## we will figure out what is going on.

###

## R is a general purpose programming language with most of the data types one would expect

x <- c(11, 12)                  # vectors of doubles
x[2]                            # subsetting
42[1]                           # that's a vector too
42[3]                           # of size one
y <- x                          # assignment
typeof(y)                       # check the type of a value
x <- 1:100                      # another way to create a vector
typeof(x)                       # and also integers
v <- 1L                         # 32 bit integer value
typeof(v)                       # with a range of +/-2*10^9


### Aparte':  INTEGERS!  #########################
x <- 100000L
x * x              # Aha!

1:100 ## this creates integers because: (1) it is often used in loops to
## to create indicies into vectors and 32-bit integers are more efficient
## in memory and time than doubles!

as.double(x) * x   ## forces the computation to be doubles.

## Here is the version that is not causing errors.
sqn0 <- function(n) {
 x <- 1:n
 for (i in 1:n)
   x[i] <- as.double(i) * i
 x
}

testSqn(sqn0, 10^5)  ## the test runs fine
## but is still slow, way too slow

## Why?

###################################################

## back to the language...

x2 <- x[ (x %% 2) == 0]         # powerful subsetting operations
x2                              # a shorter vector with only even numbers
z <- c("vector","of","strings") # vector of char
typeof(z)
z <- c(1, "one")                # vectors are monomorphic
z
z[100] <- "hundred"             # and they grow on the demand
z
u <- list(1,2)                  # a list of double
typeof(u)
unlist(u)                       # lists can be turned to vectors
w <- list("hi", 2, u)           # lists are polymorphic
unlist(w)                       # unlisting coerces to the most general type
b <- c(TRUE, FALSE)             # logicals
typeof(b)
sex <- c("M", "F", "M", "M2F")  # vector of characters
sex <- factor(sex)              # factors
sex
as.integer(sex)                 # the vector's values are represented by integers
typeof(sex)                     # If sex is a vector of integers, where are the level stored?
for (i in 1:100000) sex <- c("M", "F", "M", "M2F")  # Looping
for (i in 1:100000) sex <- factor(c("M", "F", "M", "M2F"))  # creating factor takes time

x <- 1                            # x is a vector of length 1
for (i in 1:100000)  x[i] <- i    # initialize the vector with increasing values
for (i in 1:100000)  x[i] <- i+1  # do the same (faster?)
x <- 1                            # again
for (i in 1:100000)  x[i] <- i    # (slow?)
x <- 1
x[100000] <- 0                    # extend
for (i in 1:100000)  x[i] <- i    # (fast)

for (i in 1:10000000) 1               ## ok
for (i in 1:10000000) ((((((1))))))   ## slow

# Performance of R code is often obscure, let's look into it for a bit

system.time( 1 )                # time is given in seconds
system.time ( ((((((1)))))))    # hmm, no difference here

## That is because the accuracy of the timer (which returns values in seconds)
## is not sufficient to measure the time it takes for R to evaluate the number 1.


system.time(for (i in 1:10000) 1)  # iterate until we get past clock resolution
system.time(for (i in 1:10000) ((((((1)))))))   ## slower

## Why?

( 1 )  #    '(' is a function call in R

`(` <-function( x ) {  ## overwrite the default () to do something
 print ( paste("Saw " ,x ))
 x
}

( 1 )     # what did we see?

## Now every time we use a parenthesis around an expression we will print something

## That sounds like a bad idea, but it explains the speed difference;
## there were many additional function calls.

remove( `(` )     ## definitions can be removed.  Back to normal.

# The problem is that most function take very small amounts of time when called
# once, can we find an automatic way to run them "enough" times?
# A better version of system.time... that would invoke its argument sufficiently
# and return a cost per invocation

# tm : T -> double
# Given: any R expression
# Return: the execution time in milliseconds 

## use as:
tm( 1 + 1 )      # undefined, wait for it

## How do we run an expression passed as argument multiple times?

#... but first we must mention that R is lazy programming language

print(1)   # outputs "1" on the terminal

f<-function(a,b) {}  # a helper, nevermind

f( print(1) , print(2) )   # what would this do in Java or in C

temp <-print(1);temp2 <- print(2); f(temp, temp2)  # same thing?

# Odd? In Java, the two snippets would be equivalent

f1 <- function( a, b, c ) { # Consider the following, not ver useful function
x <- f(b,b)
x <- c
x <- c
return(a)
}

f1( print(1), print(2), print(3) ) # What do you recon this will print?

# The order in which the values are printed are jumbled

f2 <- function( a, b, c ) { # here is another function
x <- b
x <- c
f3(a)
}

f3 <- function( a) { } # with a silly helper function

f2( print(1), print(2), print(3) ) # what will we print?

# All right, "lazy" means do not evaluate arguments to functions but
# evaluate everything else.
# Question: Can you imagine another defintion of lazyness?

_tm_( 1 + 1 )    # have we made any progress?

## Sort of... let's try something

tm1 <- function( exp ) {
  ### missing some code for recording the time
  for( i in 1:10)  exp                       # Use lazyness and just evaluate the 
}                                
tm1( print(1) )                               # same expression many times, which
for( i in 1:10) print(1)                      # should be the same as this.

## But that does not work, in R expressions are only evaluated once;
## if used many times, their value is remembered.

# Ok, lazy is not enough; we need one new trick...

x<- substitute( 1 + 1 )   # R can turn code into values
x
typeof(x)                 # here a new value, for code
deparse(x)                # And we have got text again
y<-deparse(x)
eval(parse(text=y))       # and back into code again
parse(text=y)           
typeof(parse(text=y))     # and another new data type

# Back to the plot

_tm_( 1 + 1 )

## This function will run the expression 1000 times...
tm2 <- function( exp ) 
 for( i in 1:1000) 
   eval( parse(text= deparse( substitute( exp ))))

tm2(1+1)

# Almost there, we need to run just the right number of times to run
# and add the timer code

tm <- function( exp ) {
x<- deparse(substitute( exp ))   ## decompile the expression
t <- 1                           ## repetion counter (we run exp 10 * t)
time = 0;                        ## reset timer
while ( time < 1 ) {             ## continue looking until the code runs long enough
 y <- x                          ## code
 for( i in 1:9 ) y <- paste(y,";",x)  ## duplicate the code 10 times
 y<- paste(" system.time ( for (i in 1:",t,") {", 
           y,                    ## wrap the code with a loop that runs t times
           "}, gcFirst= FALSE)")
 r <- eval(parse(text=y))        ## execute
 time <- r[3]                    ## grab the elasped time
 t <- t * 10                     ## try with a bigger t
}
(time / (t * 10)) * 1000         ## return time in milliseconds (default is in s)
}

tm( (((1))) ) *1000
tm( (( 1)) )  *1000
tm(  1 )      *1000

# All right, we have established that in R, extra parentheses can cost you 
# dearly and shown how to write a small performance utility that generates code
# on the fly.

# One thing that R does a lot, is calling functions
tryme<-function( aaaa00_=0,aaaa01_=1,
                aaaa02_=2,aaaa03_=3,
                aaaa04_=4,aaaa05_=5,aaaa06_=6 ) { }

tm( tryme(0, 1, 2, 3, 4, 5, 6 ) )  # arguments can be passed by position
tm( tryme(0) )      # or omitted, and default will be used
# or passed by name
tm( tryme(aaaa00_=0,aaaa01_=1,aaaa02_=2,aaaa03_=3)) 
tm( tryme(aaaa00_=0,aaaa01_=1,aaaa02_=2,aaaa03_=3,aaaa04_=4,aaaa05_=5,aaaa06_=6)) 

# Debugging time!
x <- deparse(substitute(tryme(aaaa00_=0,aaaa01_=1,aaaa02_=2,aaaa03_=3,aaaa04_=4,aaaa05_=5,aaaa06_=6)))
x          ## it looks like deparse breaks lines at some width... ouch!
x <- paste(x,collapse='')    ## paste has an option to glue thing back together
x                            ## is that enough?
y <- x                       ## let's try build longer and longer strings
y <- paste(y,x, sep=";")
y <- paste(y,x, sep=";")
y
for( i in 1:2 ) y <- paste(y,x,sep=";" )
y                            ## looking good...


# here is another try
## tm: T, double -> double
## Given:  an expresssion and a double
## Returns: the time it takes to run that expression in milliseconds 
##          or at a higher percision if a different unit is passed
tm <- function( exp , unit = 1) {
 x<- paste(deparse(substitute( exp )), collapse='')
 t <- 1
 time = 0;
 while ( time < 1 ) {
   y <- x
   for( i in 1:9 ) y <- paste(y,x,sep=';')
   y<- paste("system.time(for(i in 1:",t,"){",y, "}, gcFirst= FALSE)")
   r <- eval(parse(text=y))
   time <- r[3]
   t <- t * 10
 }
 r <- (time / (t * 10)) * 1000 * unit
 attributes(r)<-NULL
 r
}

# Back to the plot...  ( switching to times in microsecs )

tm( tryme(0, 1, 2, 3, 4, 5, 6 ), 1000)
tm( tryme(0) , 1000)
tm( tryme() , 1000)
tm( tryme(aaaa00_=0,aaaa01_=1,aaaa02_=2,aaaa03_=3), 1000)
tm( tryme(aaaa00_=0,aaaa01_=1,aaaa02_=2,aaaa03_=3,aaaa04_=4,aaaa05_=5,aaaa06_=6), 1000)
# by partial name
tm( tryme(aaaa00=0,aaaa01=1,aaaa02=2,aaaa03=3,aaaa04=4,aaaa05=5,aaaa06=6),1000)
# in random order
tm( tryme(aaaa02=2,aaaa01=1,aaaa04=4,aaaa05=5,aaaa03=3,aaaa00=0,aaaa06=6),1000)

# R is a functional language with referential transparency 

# Consider this function...
tryme3<-function( aaaa00, aaab01, aaac02 ) { 
x <- aaac02
y <- x  
z <- y
w <- z
u <- w
u[ 1 ] <- 2
u[ 2 ] <- aaab01
u[ 3 ] <- aaab01
u[ 4 ] <- aaab01
u
}

v <- 1:3000000             # let's create a vector
u <- tryme3( 1, -2, v )     # call our function
tm( tryme3( 1, -2, v ) )    # time it 
all(u == v)               # In Java u and v would the same, what about R?
v[1:10]                      # Hmm? Unchanged?
u[1:10]                      # updated!

# Referential transparency entails that arguments of a function call are not
# changed by the evaluation of that call.

## Values are copied on assignment. 

tryme2<-function( aaaa00, aaab01, aaac02 ) { 
x <- aaac02
y <- x	
z <- y
y[ 1 ] <- 2
w <- z
z[ 2 ] <- aaab01
u <- w
w[ 3 ] <- aaab01
u[ 4 ] <- aaab01
u
}
u2 <- tryme2( 1, -2, v )
tm( tryme2( 1, -2, v ) )  # time it again... and find something funny
u[1:10]  
u2[1:10]                  # the vectors are quite different. Why?

# Every assignment is "logically" a copy
# Every vector update only affects the particular logical copy
# For performance reasons, some copies are avoided.
# ... but this makes for surprising performance profiles.

# Okay to sum up: R is a lazy functional language where expressions are not
# evaluated when passed as argument to functions and assignments perform
# logical copies.  Good?

FU <- function() print("FU")                # A function

BAR <- function( FU = print("BAR"))  FU()   # Another function with a default argument

BAR()   # call bar will print ?

# Huh? How could we print FU as R has lexical scoping? Why do we print BAR?

# R has a best effort function lookup rule that interact with lazyness.
# When calling a function it will look for a function of the same name
# To know if a name is bound to a function will evaluate promises.

# Is R expressive enough to do anything?
# What about implementing an abstraction of a counter
counter <- function() { x <- x + 1; x}
counter()             # oops, we need to define x
x<-0
counter()
counter()             # Hmm... not quite

# Super assignment to the rescue
counter <- function() { x <<- x + 1; x}  # <<- assigns in the parent env
counter()
counter()             # now we have a counter

# What if we don't want to agree on a name but instead pass 
# a value that must be modified?   Can't do it!

# Environments to the rescue!
map <- new.env(hash=T, parent=emptyenv())  # create a new environment
key <- 'ping'         # define a key name
map$key <- 'pong'     # map the key to the value
map$key               # read the value

mod <- function( hm ) hm$key <- 'noob'   # this function updates its argument

mod( map )   # call it.  Recall "referential transparency"

map$key      # not so much.  Environments are mutable


# Programming in R requires a good understanding of the data structures that
# the language provides:
#     - vector       -- we have seen that and discussed it
#     - list         -- polymporphic vectors
#     - matrix       -- a 2-dimensional vector
#     - array        -- a multi-dimensional vector
#     - data frames  -- lists of vectors
#     - time series  -- vector/matrix + dates
#     - objects

# All the magic is done by one additional feature of the language: attributes.

# An attribute is a key value pair associated to a value

x <-  1     # here is a value
y <-  x     # and the same value
attr(x, "ssn") <- 123456  
x           # now 1 has a social security number
y           # but y hasn't, remember the logical copy rule on assignment

x <- c(1,2,3,4)  # is a vector
attr(x, "dim") <- c(2,2)  # and through the magic of attributes
x                # c is now a matrix

# Can we write code that does different things depending what attributes are attached to an object?

# Enter OOP with S3...

# Suppose we have cowgirls and pens and would like to write a function draw that does 
# the right thing when passed either of these values.

x<- 1
attr(x, "class" ) <- "pen"      # class is just an attribute like any other
x
y<- 1
class(y ) <- "cowboy"           # there is a dedicate function to set
y
class(y) <- c("cowboy","human") # one can be several things

draw <- function( this )  UseMethod("draw", this)  # this introduces the generic method
draw.cowboy <- function(this)  "Bang"      # different variants for different classes
draw.pen <- function(this) "Scribble"
draw.default <- function(this) "Huh?"     # default version when nothing else applies

draw(x)
draw(y)
draw(1)

# Of course this is a bit weak for Java programmers... there is no inheritance, objects
# have no fields.

# Enter OOP with S4...

setClass("Person",   # Create an new class with two slots
     slots = list(name = "character", age = "numeric"))

setClass("Employee",  # Extends the Person class with another class
     slots = list(boss = "Person"), 
     contains = "Person") # declare an inheritance relationship

# create a few people
alice <- new("Person", name = "Alice", age = 40)
john <- new("Employee", name = "John", age = 20, boss = alice) # john's got a boss
jack <- new("Person", name = "Jack", age = 42)

alice@age              # ask an impolite question
slot(john, "boss")     # same

# Let's define methods
setGeneric("hire",  function(x,y) standardGeneric("hire") )
# The first will work only for persons hiring employees
setMethod("hire", 
      c(x = "Person", y = "Employee"), 
      function(x, y) {
        print(paste(x@name," hired ",y@name)) 
        return(setBoss(y)<-x)  # change the boss slot
      }
)

# another method for setting fields
setGeneric("setBoss<-",function(object,value){standardGeneric("setBoss<-")})
setReplaceMethod(f="setBoss",
             signature="Employee",
             definition=function(object,value) return(object@boss <-value)
             )

hire(alice, john)   # this should work

print(hire(jack, john))  # and this too, changing the boss

hire(john, john)  # this should work as an employee is person
hire(alice, alice) # this should fails as an employee isn't a person

# Getting more information about time: profiling

Rprof()
tm( 1+ 1 )
Rprof(NULL)
summaryRprof()


Rprof()
a<-1:1000
for(i in 1:1000) { a <- a +a - a } 
Rprof(NULL)
summaryRprof()


# Performance tips...

# pre-allocated vectors
test <- function(k, N){
x <- 1 #rep(NA, k)
for(i in 1:N) x[i] <- rnorm(1)
}
system.time(test(20000, 20000))

## back to the past, remember Sqn?

tm(sqn(10^4))
tm(sqn2(10^4))
tm(sqn3(10^4))

sqn <- function(n) {
 x <- 1:n
 for (i in 1:n)
   x[i] <- i * i
 return(x)
}
sq <- function(i) i * i
sqn2 <- function(n) {
 x <- 1:n
 for (i in 1:n)
   x[i] <- sq(i)
 x
}
set <- function (v, i) {
 v[i] <- sq(i)
}
sqn3 <- function(n) {
 x <- 1:n
 for (i in 1:n)
   set(x,i)
 x
}

## Remember?
testSqn(sqn3,10)

set <- function (v, i) {
 v[i] <- sq(i)
 v
}
sqn4 <- function(n) {
 x <- 1:n
 for (i in 1:n)
   x <- set(x,i)
 x
}

testSqn(sqn4,10)

tm(sqn4(10^4))   # > 20 millis

x <- sqn4(10^5)

## still slow
## and wrong

sq <- function(i) i * as.double(i)

x <- sqn4(10^5)

## slow but correct

tm(sqn4(10^4))  # > 40 ms

sqn5 <- function(n) {
 x <- 1:n
 for (i in 1:n)
   x[i] <- as.double(i) * i
 x
}

tm(sqn5(10^4))  # ~ 1ms

## Here is the right answer in R
sq0 <- function(n) {
	x<- as.double(1:n)
	x * x
}

## Rember in R everything is a vector and all operations know how to deal with them.

tm(sqn0(10^4))  # ~1ms