require(dplyr)
require(sf)
# Read the saved workspace; the code below expects it to provide the object
# `contours_occitanie`.
load("R/data_cantons.RData")
# Keep only the rows where left, right and others are all non-zero, then
# express each of the three columns as a share of their sum.
contours_occitanie_no0 <- contours_occitanie %>%
  filter(left != 0 & right != 0 & others != 0) %>%
  mutate(
    percent_left = left / (left + right + others),
    percent_right = right / (left + right + others),
    percent_others = others / (left + right + others)
  )
# ILR coordinates of the (left, right, others) composition. ilr() is called
# without a V argument, on the attribute table with the geometry removed.
y_ilr <- ilr(
  st_set_geometry(contours_occitanie_no0, NULL)[, c("left", "right", "others")]
)
# Store the two ILR coordinates as plain numeric columns.
contours_occitanie_no0 <- contours_occitanie_no0 %>%
  mutate(
    y_ilr_1 = as.numeric(y_ilr[, 1]),
    y_ilr_2 = as.numeric(y_ilr[, 2])
  )

# Impute income_rate with missForest, using unemp_rate, employ_evol and
# owner_rate as the companion variables of the imputation.
# library() is used rather than require(): require() only warns and returns
# FALSE when the package is missing, which would postpone the failure to the
# missForest() call below.
library("missForest")
data.to.imp <- contours_occitanie_no0 %>%
  st_set_geometry(NULL) %>%
  select(unemp_rate, employ_evol, owner_rate, income_rate)
# Recode NaN values of income_rate as NA before running the imputation.
data.to.imp$income_rate[is.nan(data.to.imp$income_rate)] <- NA
# NOTE(review): missForest is based on random forests, so the imputed values
# presumably vary from run to run unless a seed is set beforehand -- confirm
# whether reproducibility matters here.
data.imp <- missForest(data.to.imp)
# Only the imputed income_rate is written back; the other three columns of
# data.imp$ximp are not used.
contours_occitanie_no0$income_rate <- data.imp$ximp[, "income_rate"]


# Contrast matrices used to map compositions into ILR space. They are stored
# one contrast per row and are transposed where they are passed to ilr().

# 3-part composition: 2 contrasts.
V_dip <- matrix(
  c(
    1 / sqrt(6),  1 / sqrt(6), -2 / sqrt(6),
    1 / sqrt(2), -1 / sqrt(2),            0
  ),
  nrow = 2, byrow = TRUE
)

# 5-part composition: 4 contrasts.
V_employ <- matrix(
  c(
    rep(-1 / 2 / sqrt(5), 4), 2 / sqrt(5),
    rep(-sqrt(3) / 6, 3), sqrt(3 / 4), 0,
    rep(-sqrt(2 / 3) / 2, 2), sqrt(2 / 3), 0, 0,
    sqrt(1 / 2), -sqrt(1 / 2), 0, 0, 0
  ),
  nrow = 4, byrow = TRUE
)

# Build the independent variables: three compositions taken from the
# attribute table (geometry dropped).

# Age, minors excluded: the five source classes are merged into three parts
# named age_1840, age_4064 and age_65.
age <- contours_occitanie_no0 %>%
  st_set_geometry(NULL) %>%
  select(age_1824, age_2540, age_4055, age_5564, age_65)
age_65 <- age[, "age_65"]
age_4064 <- age[, "age_4055"] + age[, "age_5564"]
age_1840 <- age[, "age_1824"] + age[, "age_2540"]
age3 <- as.data.frame(cbind(age_1840, age_4064, age_65))

# Diploma: no_diplom and capbep are summed into No_CAPBEP; the parts are
# ordered (diplom_sup, bac, No_CAPBEP).
diplome <- contours_occitanie_no0 %>%
  st_set_geometry(NULL) %>%
  select(no_diplom, capbep, bac, diplom_sup)
No_CAPBEP <- diplome[, "no_diplom"] + diplome[, "capbep"]
diplome3 <- cbind(diplome[, c("diplom_sup", "bac")], No_CAPBEP)

# Employment: the five columns AZ, BE, FZ, GU and OQ are used as they are
# (presumably activity-sector shares -- confirm against the data dictionary).
employ <- contours_occitanie_no0 %>%
  st_set_geometry(NULL) %>%
  select(AZ, BE, FZ, GU, OQ)

# Assemble the matrix of independent variables, column-wise:
#   - ILR coordinates of age3 (ilr() called without V),
#   - ILR coordinates of diplome3 and employ with the contrasts V_dip and
#     V_employ, transposed so that each contrast is a column,
#   - five covariates taken as they are,
#   - the log of NBRINS.
x2 <- cbind(
  ilr(age3),
  ilr(diplome3, V = t(V_dip)),
  ilr(employ, V = t(V_employ)),
  contours_occitanie_no0 %>%
    st_set_geometry(NULL) %>%
    select(unemp_rate, employ_evol, owner_rate, income_rate, foreign) %>%
    as.matrix(),
  log(contours_occitanie_no0$NBRINS)
)

# Label the 14 columns; "voters" is the label of the log(NBRINS) column.
colnames(x2) <- c(
  "age3_ilr1", "age3_ilr2",
  "diplome3_ilr1", "diplome3_ilr2",
  "employ_ilr1", "employ_ilr2", "employ_ilr3", "employ_ilr4",
  "unemp_rate", "employ_evol", "owner_rate", "income_rate", "foreign", "voters"
)

x2_df <- data.frame(x2)

# Remove the intermediate objects; x2_df is kept.
rm(list = c(
  "age3", "age", "diplome", "employ", "x2",
  "age_1840", "age_4064", "age_65"
))