my_mean <- function(x) {
# verification
if (!is.numeric(x))
stop("x must be a numeric vector")
# initialization
n <- length(x)
res <- 0
for (k in 1:n) {
res <- res + x[k]
}
# return res
return(res / n)
}
We consider the following simulated vector of size \(10000\):
set.seed(1)
x <- rnorm(10000)
m <- x[1]
for (i in 2:length(x)) {
if (x[i] > m) {
m <- x[i]
}
}
m
## [1] 3.810277
max(x)
## [1] 3.810277
The algorithm to find the maximum is:
initialisation: \(m=x[1]\)
for \(i\) from 2 to \(n\), do
\(~~~~~~\) if (x[i] > m) do
\(~~~~~~~~~\) m = x[i]
\(~~~~~~\) end if
end for
Program in R this algorithm
M <- x[1]
for (i in 1:length(x)) {
if (M < x[i]) {
M <- x[i]
}
}
M
## [1] 3.810277
my_max <- function(x) {
# initialization
i <- 1
while (is.na(x[i])) {
i <- i + 1
}
M <- x[i]
# Loop
for (j in (i + 1):length(x)) {
if (is.na(x[j])) {
next
}
if (M < x[j]) {
M <- x[j]
}
}
return(M)
}
x <- runif(10000)
x[2] <- NA
my_max(x)
## [1] 0.9999035
x[1] <- NA
my_max(x)
## [1] 0.9999035
y <- rbinom(10000, 10, 0.5)
my_max(y)
## [1] 10
We compare with the function max()
max(x)
## [1] NA
Create a function that permits to compute the maximum for any vector of numeric. It must take into account the possibility that there exists some missing values.
my_max <- function(x) {
# verification
stopifnot(is.numeric(x))
# initialisation
init <- 1
while (is.na(x[init])) {
init <- init + 1
}
M <- x[init]
for (i in init:length(x)) {
if (is.na(x[i])) {
next
}
if (M < x[i]) {
M <- x[i]
}
}
return(M)
}
To test your function, execute the following codes:
x1 <- c(1000, 10, 6, NA)
x2 <- c(NA, 1000, 10, 6)
x3 <- c(NA, NA, 1000, 10, 6)
my_max(x1)
## [1] 1000
my_max(x2)
## [1] 1000
my_max(x3)
## [1] 1000
To get the computational time between the three expressions, we use microbenchmark() function, and to plot the result, we use autoplot()
n <- 10 ^ 4
mbm <- microbenchmark::microbenchmark(
{x <- numeric(n) # expression 1
for (k in 1:n)
x[k] <- (5 == sample(1:10, 1))
mean(x)},
{x <- NULL # expression 2
for (k in 1:n)
x <- c(x, (5 == sample(1:10, 1)))
mean(x)},
{x <- 0 # expression 3
for (k in 1:n)
x <- x + (5 == sample(1:10, 1))
x / n}
)
ggplot2::autoplot(mbm)
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.
In the expression 3, we do not store the result in a vector because we compute directly the result step by step. We can see that the computational time is nearly identical to the 1st case. In other term, use a vector is costly in memory but it does not take time when you fill it.
We use the formula \(\sqrt{\frac{1}{n-1}(\sum x_i^2 - (\sum x_i)^2/n}\)
my_sd <- function(x) {
# verification
stopifnot(is.numeric(x))
# initialisation
n <- length(x)
sum_x <- 0
sum_square_x <- 0
# we need to compute the mean and the
for (k in 1:n) {
sum_x <- sum_x + x[k]
sum_square_x <- sum_square_x + x[k] ^ 2
}
return(sqrt(1 / (n - 1) * (sum_square_x - sum_x ^ 2 / n )))
}
We compare the computational time on a small example:
vec <- rnorm(1000)
ggplot2::autoplot(
microbenchmark::microbenchmark(
my_sd(vec),
sd(vec), times = 10L))
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.
The function in C++ looks like:
#include <Rcpp.h>
using namespace Rcpp;
// [[Rcpp::export]]
double my_sd_cpp(NumericVector x) {
double res;
double sum_x = 0;
double sum_square_x = 0;
int n = x.size();
for(int i = 0; i < n; i++) {
sum_x = sum_x + x(i);
sum_square_x = sum_square_x + pow(x(i), 2.0);
}
res = sqrt((sum_square_x - pow(sum_x, 2.0) / n ) / (n - 1));
return res;
}
vec <- rnorm(1000)
ggplot2::autoplot(
microbenchmark::microbenchmark(
my_sd(vec),
sd(vec),
my_sd_cpp(vec),
times = 10L))
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.
xs <- replicate(5, runif(10), simplify = FALSE)
ws <- rbinom(5, 10, 0.5)
# solution 1
res <- numeric(5)
for (k in 1:5)
res[k] <- sum(xs[[k]]) * ws[k]
res
## [1] 29.91883 19.92242 21.59394 24.41046 31.67674
# solution 2
mapply(function(x, y) sum(x) * y, xs, ws)
## [1] 29.91883 19.92242 21.59394 24.41046 31.67674
Solution 1: we use for loop:
hist_extrm.1 <- function(n, B, ...) {
res <- 0 # au début le phenomene s est produi 0 fois
extremes <- NULL # ici, on ne sait pas à l'avance la taille finale du vecteur
for (k in 1:B) {
x <- rnorm(n)
if (any(abs(x)>1.96)) {
res <- res + 1
extremes <- c(extremes, x[abs(x)>1.96])
}
}
print(paste("Le phenomene s'est produit", res/B*100, "% de fois"))
hist(extremes,...)
} # fin de la fonction
hist_extrm.1(100, 1000)
## [1] "Le phenomene s'est produit 99.3 % de fois"
Solution 2: we do not use for loop
hist_extrm.2 <- function(n, B, ...) {
# on créé une fonction qu'on va appliquer à replicate()
my.func <- function() {
x <- rnorm(n)
x[abs(x)>1.96]
}
res.ech <- replicate(B, my.func(), simplify = FALSE)
extremes <- unlist(res.ech) # on récupère les valeurs concernées
res <- sapply(res.ech, function(x) length(x) > 0) # on compte le nombre de fois où l'événement s'est produit
print(paste("Le phenomene s'est produit", mean(res)*100, "% de fois"))
hist(extremes,...)
} # fin de la fonction