Advanced Programming with R (session 2)

Exercise 2.1

The function my_mean() by using the R recommendation style guide

my_mean <- function(x) {
  
  # verification
  if (!is.numeric(x))
    stop("x must be a numeric vector")
  
  # initialization
  n <- length(x) 
  res <- 0
  
  for (k in 1:n) {
    res <- res + x[k]
  }
  
  # return res
  return(res / n)
}

Exercise 2.2.a

We consider the following simulated vector of size \(10000\):

set.seed(1)
x <- rnorm(10000)

m <- x[1]
for (i in 2:length(x)) {
  if (x[i] > m) {
    m <- x[i]
  }
}
m

## [1] 3.810277

max(x)

## [1] 3.810277

The algorithm to find the maximum is:

initialisation: \(m=x[1]\)
for \(i\) from 2 to \(n\), do

\(~~~~~~\) if (x[i] > m) do

\(~~~~~~~~~\) m = x[i]

\(~~~~~~\) end if

end for

Program in R this algorithm

 M <- x[1]
  
  for (i in 1:length(x)) {
    if (M < x[i]) {
      M <- x[i]
    }
  }
M

## [1] 3.810277

my_max <- function(x) { 
  # initialization 
  i <- 1
  while (is.na(x[i])) {
    i <- i + 1  
  }
  M <- x[i]
  # Loop
  for (j in (i + 1):length(x)) {
    if (is.na(x[j])) {
      next
    }
    if (M < x[j]) {
      M <- x[j]
    }
  }
 return(M)
}

x <- runif(10000)
x[2] <- NA
my_max(x)

## [1] 0.9999035

x[1] <- NA
my_max(x)

## [1] 0.9999035

y <- rbinom(10000, 10, 0.5)
my_max(y)

## [1] 10

We compare with the function max()

max(x)

## [1] NA

Exercise 2.2.b

Create a function that permits to compute the maximum for any vector of numeric. It must take into account the possibility that there exists some missing values.

my_max <- function(x) {
  # verification
  stopifnot(is.numeric(x))
  # initialisation
  init <- 1
  while (is.na(x[init])) {
  init <- init + 1
  }
  M <- x[init]
  
  for (i in init:length(x)) {
    if (is.na(x[i])) {
      next
    }
    if (M < x[i]) {
      M <- x[i]
    }
  }
  return(M)  
}

To test your function, execute the following codes:

x1 <- c(1000, 10, 6, NA)
x2 <- c(NA, 1000, 10, 6)
x3 <- c(NA, NA, 1000, 10, 6)
my_max(x1)

## [1] 1000

my_max(x2)

## [1] 1000

my_max(x3)

## [1] 1000

Exercise 2.3

To get the computational time between the three expressions, we use microbenchmark() function, and to plot the result, we use autoplot()

n <- 10 ^ 4

mbm <- microbenchmark::microbenchmark(
  {x <- numeric(n) # expression 1
  for (k in 1:n) 
    x[k] <- (5 == sample(1:10, 1))
  mean(x)},  
  {x <- NULL       # expression 2
  for (k in 1:n) 
    x <- c(x, (5 == sample(1:10, 1)))
  mean(x)},
  {x <- 0         # expression 3
  for (k in 1:n)  
    x <- x + (5 == sample(1:10, 1))
  x / n}
)

ggplot2::autoplot(mbm)

## Coordinate system already present. Adding new coordinate system, which will replace the existing one.

In the expression 3, we do not store the result in a vector because we compute directly the result step by step. We can see that the computational time is nearly identical to the 1st case. In other term, use a vector is costly in memory but it does not take time when you fill it.

Exercise 2.4

We use the formula \(\sqrt{\frac{1}{n-1}(\sum x_i^2 - (\sum x_i)^2/n}\)

my_sd <- function(x) {
  # verification
  stopifnot(is.numeric(x))
  
  # initialisation
  n <- length(x)
  sum_x <- 0
  sum_square_x <- 0
  
  # we need to compute the mean and the 
  for (k in 1:n) {
    sum_x <- sum_x + x[k]
    sum_square_x <- sum_square_x + x[k] ^ 2
  }
  
  return(sqrt(1 / (n - 1) * (sum_square_x - sum_x ^ 2 / n )))
}

We compare the computational time on a small example:

vec <- rnorm(1000)
ggplot2::autoplot(
  microbenchmark::microbenchmark(
    my_sd(vec),
    sd(vec), times = 10L))

## Coordinate system already present. Adding new coordinate system, which will replace the existing one.

Exercise 2.5

The function in C++ looks like:

#include <Rcpp.h>
using namespace Rcpp;

// [[Rcpp::export]]
double my_sd_cpp(NumericVector x) {
  double res;
  double   sum_x = 0;
  double sum_square_x = 0;
  int n = x.size();
  
  for(int i = 0; i < n; i++) {
    sum_x = sum_x + x(i);
    sum_square_x = sum_square_x + pow(x(i), 2.0);
  }
  res = sqrt((sum_square_x - pow(sum_x,  2.0) / n ) / (n - 1));
  return res;
}

vec <- rnorm(1000)
ggplot2::autoplot(
  microbenchmark::microbenchmark(
    my_sd(vec),
    sd(vec),
    my_sd_cpp(vec),
    times = 10L))

## Coordinate system already present. Adding new coordinate system, which will replace the existing one.

Exercise 2.6

Simulate a list xs of 5 samples each of size 10 distributed under a \(U_{[0,1]}\) (use if possible function replicate()).

xs <- replicate(5, runif(10), simplify = FALSE)

Simulate a vector ws of size 5 distributed under a binomial \(\mathcal{B}(10, 0.5)\) (use function rbinom()).

ws <- rbinom(5, 10, 0.5)

compute the sum of each element of xs and multiply it by the element of ws (use for loop and mapply()).

# solution 1 
res <- numeric(5)
for (k in 1:5)
  res[k] <- sum(xs[[k]]) * ws[k]
res

## [1] 29.91883 19.92242 21.59394 24.41046 31.67674

# solution 2
mapply(function(x, y) sum(x) * y, xs, ws)

## [1] 29.91883 19.92242 21.59394 24.41046 31.67674

Exercise 2.7

Solution 1: we use for loop:

hist_extrm.1 <- function(n, B, ...) { 
  res <- 0 # au début le phenomene s est produi 0 fois
  extremes <- NULL # ici, on ne sait pas à l'avance la taille finale du vecteur
  for (k in 1:B) {
    x <- rnorm(n)
    if (any(abs(x)>1.96)) {
      res <- res + 1
      extremes <- c(extremes, x[abs(x)>1.96])
    }
  }
  print(paste("Le phenomene s'est produit", res/B*100, "% de fois")) 
  hist(extremes,...)
}  # fin de la fonction

hist_extrm.1(100, 1000)

## [1] "Le phenomene s'est produit 99.3 % de fois"

Solution 2: we do not use for loop

hist_extrm.2 <- function(n, B, ...) {
  # on créé une fonction qu'on va appliquer à replicate()
  my.func <- function() {
    x <- rnorm(n)
    x[abs(x)>1.96]
  }
  res.ech <- replicate(B, my.func(), simplify = FALSE)
  extremes <- unlist(res.ech) # on récupère les valeurs concernées
  res <- sapply(res.ech,  function(x) length(x) > 0) # on compte le nombre de fois où l'événement s'est produit
  print(paste("Le phenomene s'est produit", mean(res)*100, "% de fois")) 
  hist(extremes,...)
 }  # fin de la fonction