library(FactoMineR)
library(factoextra)
library(corrplot)
library(gridExtra)
# Set the `repr.plot.width` / `repr.plot.height` options to 15 x 9 so that
# figures are drawn wide (presumably read by the notebook front-end that
# renders the plots -- TODO confirm units and consumer).
options(repr.plot.width=15, repr.plot.height=9)

Loading required package: ggplot2

Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

corrplot 0.84 loaded


# Read the data set from its remote CSV file.
d <- read.csv(
  "https://auder.net/miage/B_seance3-4_ACP/data/owid-covid-data_EDIT_13-10-2020.csv"
)
# The first column (country codes) is used as row labels: copy it into the
# row names, then drop it from the table.
rownames(d) <- d[[1]]
d <- d[, -1]


# Compute the PCA; columns 1 and 2 of d are passed as supplementary
# qualitative variables.
res.pca <- PCA(d, quali.sup = c(1, 2), graph = FALSE)


# Inertia of each axis (column 2 of the eigenvalue table) and cumulated
# inertia (column 3, drawn with colour 2), both on a common 0-100 scale.
par(mar = c(5.1, 5.1, 2.1, 2.1))
plot(
  res.pca$eig[, 2],
  type = "l", lwd = 2, ylim = c(0, 100),
  xlab = "Dimension", ylab = "Inertie [cumulée]",
  cex.axis = 1.5, cex.lab = 2
)
# par(new = TRUE) lets the next plot() draw on top of the current figure;
# its axes and labels are suppressed so that only the curve is added.
par(new = TRUE)
plot(
  res.pca$eig[, 3],
  type = "l", lwd = 2, col = 2, ylim = c(0, 100),
  xaxt = "n", yaxt = "n", xlab = "", ylab = ""
)


# Cumulated inertia for the first four axes
res.pca$eig[, 3][1:4]


# Individuals cloud on the first factorial plane
plot(res.pca, choix = "ind", axes = c(1, 2), habillage = 1, label = "ind")


# Locate Monaco (column 2 of d holds the country name) and compare it with
# the column means of all the other countries on a few numeric variables.
# which() drops NA comparisons instead of producing NA indices.
monaco_idx <- which(d[, 2] == "Monaco")
# Rows to keep are expressed as a positive index: with a negative index,
# `d[-integer(0), ]` would select *no* row at all if Monaco were absent.
other_rows <- setdiff(seq_len(nrow(d)), monaco_idx)
rbind(
  d[monaco_idx, 3:ncol(d)],
  AVG = colMeans(d[other_rows, 3:ncol(d)])
)[, c(2, 3, 4, 7, 9, 11)]


# The six rows with the largest gdp_per_capita
sort_gdp <- sort(d[["gdp_per_capita"]], decreasing = TRUE, index.return = TRUE)
d[sort_gdp$ix[1:6], c(2, 4, 5, 6, 9, 11, 13)]


# Remove Monaco from the data set
d <- d[other_rows, ]


# Refresh the PCA on the current content of d
res.pca <- PCA(d, quali.sup = c(1, 2), graph = FALSE)


# Cumulated inertia for the first four axes
res.pca$eig[, 3][1:4]


# Individuals cloud: the picture is now easier to read
plot(res.pca, choix = "ind", axes = c(1, 2), habillage = 1, label = "ind")


# Ellipses at level 0.99 for the "continent" variable, individuals hidden
plotellipses(
  res.pca,
  keepvar = "continent",
  invisible = c("ind"),
  level = 0.99
)


# Correlation circle on the first factorial plane
plot(res.pca, choix = "var", axes = c(1, 2), habillage = "cos2")


# Second graphical check of the redundancies between variables:
# correlation matrix of the numeric columns (3 to the last).
corrplot(cor(d[, 3:ncol(d)]))


# Re-run the analysis after dropping the two variables aged_70_older and
# life_expectancy.
d <- d[, !(colnames(d) %in% c("aged_70_older", "life_expectancy"))]


# Refresh the PCA
res.pca <- PCA(d, quali.sup = c(1, 2), graph = FALSE)


# Cumulated inertia for the first four axes
res.pca$eig[, 3][1:4]


# Individuals cloud: no upheaval; Singapore and Korea move down a little,
# for instance.
plot(res.pca, choix = "ind", axes = c(1, 2), habillage = 1, label = "ind")


# Projection quality of the individuals (left) and the 20 individuals
# contributing the most (right), side by side.
grid.arrange(
  plot(res.pca, choix = "ind", axes = c(1, 2), habillage = "cos2", label = "ind"),
  plot(res.pca, choix = "ind", axes = c(1, 2), habillage = 1,
       select = "contrib 20", label = "ind"),
  ncol = 2
)

Warning message:
“ggrepel: 11 unlabeled data points (too many overlaps). Consider increasing max.overlaps”


# Correlation circles for planes 1-2 and 3-4, side by side:
# no notable change on plane 1-2.
grid.arrange(
  plot(res.pca, choix = "var", axes = c(1, 2), habillage = "cos2"),
  plot(res.pca, choix = "var", axes = c(3, 4), habillage = "cos2"),
  ncol = 2
)

Warning message:
“ggrepel: 4 unlabeled data points (too many overlaps). Consider increasing max.overlaps”


# Check on the individuals cloud for axes 3-4 (about 23% of inertia)
plot(res.pca, axes = c(3, 4), choix = "ind", habillage = 1, label = "ind")


# Singapore is an extreme point on this cloud: leave it out of the plotting
# range only (it stays in the data and in the PCA).
# which() avoids NA indices, and the rows to keep are expressed as a positive
# index because `coord[-integer(0), ]` would select no row if Singapore were
# absent, making range() fail.
singapour_idx <- which(d[, 2] == "Singapore")
kept_rows <- setdiff(seq_len(nrow(d)), singapour_idx)
# NOTE(review): the range spans coordinates 3 *and* 4 although it is only
# used as the y-limit -- confirm this is intended.
ylim <- range(res.pca$ind$coord[kept_rows, 3:4])
plot(res.pca, axes = c(3, 4), choix = "ind", habillage = 1, label = "ind", ylim = ylim)

Warning message:
“Removed 1 rows containing missing values (geom_point).”
Warning message:
“Removed 1 rows containing missing values (geom_text).”
Warning message:
“Removed 1 rows containing missing values (geom_point).”


# Check with two extreme points and a central one: population density
show_idx <- which(rownames(d) %in% c("USA", "IDN", "BHR"))
d[show_idx, c("location", "population_density")]
range(d[["population_density"]])


# Then, on the other axis, diabetes prevalence
show_idx <- which(rownames(d) %in% c("NCL", "UZB", "SMR"))
d[show_idx, c("location", "diabetes_prevalence")]
range(d[["diabetes_prevalence"]])


# Finally, total deaths per million
show_idx <- which(rownames(d) %in% c("PER", "BMU", "SGP"))
d[show_idx, c("location", "total_deaths_per_million")]
range(d[["total_deaths_per_million"]])


# Recompute the PCA, this time with columns 3 and 4 as supplementary
# quantitative variables (columns 1 and 2 stay supplementary qualitative).
res.pca <- PCA(d, quali.sup = c(1, 2), quanti.sup = c(3, 4), graph = FALSE)


# Individuals clouds: same look as before on plane 1-2, changes on plane 3-4
grid.arrange(
  plot(res.pca, choix = "ind", axes = c(1, 2), habillage = 1, label = "ind"),
  plot(res.pca, choix = "ind", axes = c(3, 4), habillage = 1, label = "ind"),
  ncol = 2
)


# Cumulated inertia for the first four axes
res.pca$eig[, 3][1:4]


# Hard zoom to see a little more clearly (a few countries fall outside)
plot(
  res.pca,
  choix = "ind", axes = c(1, 2), habillage = 1, label = "ind",
  xlim = c(-4, 4), ylim = c(-2.5, 2.5)
)

Warning message:
“Removed 5 rows containing missing values (geom_point).”
Warning message:
“Removed 5 rows containing missing values (geom_text).”
Warning message:
“Removed 5 rows containing missing values (geom_point).”


# Correlation circles for planes 1-2 and 3-4, side by side
grid.arrange(
  plot(res.pca, choix = "var", axes = c(1, 2), habillage = "cos2"),
  plot(res.pca, choix = "var", axes = c(3, 4), habillage = "cos2"),
  ncol = 2
)


# Numeric check of the correlations between three variables
cor(d[, c("human_development_index",
          "total_deaths_per_million",
          "total_cases_per_million")])


# Download the complete OWID data set into the working directory
system("wget https://covid.ourworldindata.org/data/owid-covid-data.csv")
# Put the header in a separate file
system("head -n1 owid-covid-data.csv > 13oct")
# Keep only lines referring to October 13
system("grep '.*,.*,.*,2020-10-13' owid-covid-data.csv >> 13oct")
# The character "'" seemingly has an issue with R CSV reading, so the country
# name containing it is replaced.
# The sed script is wrapped in double quotes for the shell: an apostrophe
# cannot appear inside a single-quoted shell string, so the previous
# 's/Cote d'Ivoire/.../' form left an unterminated quote and the command failed.
system("sed -i \"s/Cote d'Ivoire/Ivory Coast/\" 13oct")


full_d <- read.csv("13oct")
# Remove redundant variables.
# The mask must be built from full_d's own column names: a mask built from
# colnames(d) has the wrong length and is all TRUE (these columns were already
# dropped from d), so nothing would be removed.
full_d <- full_d[, !(colnames(full_d) %in% c("aged_70_older", "life_expectancy")),
                 drop = FALSE]
# Some countries might be removed from the source (Russia, Afghanistan...).
# Also, some new countries might have appeared (they are ignored).
# The first column of full_d is matched against the row names of d.
keep_idx <- rownames(d) %in% full_d[, 1]
d <- d[keep_idx, ]
# Reorder rows, in alphabetic (lexicographic) order
sorted_idx <- order(rownames(d))
d <- d[sorted_idx, ]
# Restrict full_d to the countries available in d
keep_idx <- full_d[, 1] %in% rownames(d)
full_d <- full_d[keep_idx, ]
# Also sort full_d rows, as in d, so that both tables are aligned row by row
sorted_idx <- order(full_d[, 1])
full_d <- full_d[sorted_idx, ]


# Temperature is not an available variable (although potentially
# interesting), so the following variables are appended to d instead.
# They are copied from full_d one column at a time, in this order.
extra_vars <- c(
  "cardiovasc_death_rate",
  "total_tests_per_thousand",
  "male_smokers",
  "female_smokers"
)
d2 <- d
for (extra_var in extra_vars) {
  d2[[extra_var]] <- full_d[[extra_var]]
}


# Drop every row of d2 having a missing value among columns 4..ncol(d2)
# (a row sum is NA as soon as one of its terms is).
# A logical mask replaces the former negative index: when no row contains an
# NA, `d2[-integer(0), ]` selects zero rows and would empty the data set.
# NOTE(review): column 3 is numeric too but is not checked here -- confirm
# whether the range should start at 3.
has_missing <- is.na(rowSums(d2[, 4:ncol(d2)]))
d2 <- d2[!has_missing, ]


# PCA on the extended data set; columns 1 and 2 are supplementary
# qualitative variables.
res.pca2 <- PCA(d2, quali.sup = c(1, 2), graph = FALSE)


# Correlation circle on the first factorial plane
plot(res.pca2, choix = "var", axes = c(1, 2), habillage = "cos2")


# Numeric correlations between four of the variables
cor(d2[, c("total_cases_per_million",
           "gdp_per_capita",
           "female_smokers",
           "aged_65_older")])


# Individuals cloud on the first factorial plane
plot(res.pca2, choix = "ind", axes = c(1, 2), habillage = 1, label = "ind")


# Median age and human development index for four countries
show_idx <- which(rownames(d2) %in% c("MEX", "PAN", "CUB", "USA"))
d2[show_idx, c("location", "median_age", "human_development_index")]


# Correlation circle on plane 3-4
plot(res.pca2, choix = "var", axes = c(3, 4), habillage = "cos2")


# Correlation matrix of columns 3 to 6 of d2
cor(d2[, 3:6])


# cos2 of the variables in rows 2 and 3, summed over dimensions 3 and 4
rowSums(res.pca2$var$cos2[2:3, 3:4])


# Individuals cloud on plane 3-4
plot(res.pca2, choix = "ind", axes = c(3, 4), habillage = 1, label = "ind")


# Cardiovascular death rate, diabetes prevalence and male smokers for three
# countries
show_idx <- which(rownames(d2) %in% c("AUS", "ISR", "UKR"))
d2[show_idx, c("location", "cardiovasc_death_rate",
               "diabetes_prevalence", "male_smokers")]


# Deaths, density, tests and male smokers for Panama and the Maldives
show_idx <- which(rownames(d2) %in% c("PAN", "MDV"))
d2[show_idx, c("location", "total_deaths_per_million",
               "population_density", "total_tests_per_thousand", "male_smokers")]

Variable	Signification
iso_code	Code ISO du pays
continent	...Continent :-)
location	Nom du pays
total_cases_per_million	Somme des cas enregistrés jusqu'au 13 octobre inclus, par million d'habitants
total_deaths_per_million	Nombre de décès jusqu'au 13 octobre inclus, par million d'habitants
population_density	Nombre d'habitants par kilomètre carré en moyenne
median_age	Âge médian : 50% de la population est plus jeune, l'autre moitié est plus vieille
aged_65_older	Pourcentage de +65 ans
aged_70_older	Pourcentage de +70 ans
gdp_per_capita	PIB par habitant
diabetes_prevalence	Pourcentage de diabétiques
hospital_beds_per_thousand	Nombre de lits d'hôpital pour 1000 habitants
life_expectancy	...Espérance de vie :-)
human_development_index	Indice de développement basé sur le PIB, l'espérance de vie et l'éducation

	total_deaths_per_million	population_density	median_age	gdp_per_capita	hospital_beds_per_thousand	human_development_index
	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>
MCO	25.4820	19347.5000	53.80000	185741.28	13.800000	0.9560000
AVG	130.7234	216.4659	30.79259	19801.88	2.834921	0.7156138

	location	total_deaths_per_million	population_density	median_age	gdp_per_capita	hospital_beds_per_thousand	human_development_index
	<chr>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>
MCO	Monaco	25.482	19347.500	53.8	185741.28	13.800	0.956
LIE	Liechtenstein	26.221	237.012	43.4	165028.25	2.397	0.916
QAT	Qatar	76.361	227.322	31.9	116935.60	1.200	0.856
LUX	Luxembourg	212.468	231.447	39.7	94277.96	4.510	0.904
SGP	Singapore	4.786	7915.731	42.4	85535.38	2.400	0.932
BRN	Brunei	6.857	81.347	32.4	71809.25	2.700	0.853

	location	population_density
	<chr>	<dbl>
BHR	Bahrain	1935.907
IDN	Indonesia	145.725
USA	United States	35.608

	location	diabetes_prevalence
	<chr>	<dbl>
NCL	New Caledonia	23.36
SMR	San Marino	5.64
UZB	Uzbekistan	7.57

Question 0

Question 1

Analyse descriptive

Tentative avec variables supplémentaires

Question 2

Conclusion

	human_development_index	total_deaths_per_million	total_cases_per_million
human_development_index	1.0000000	0.3749394	0.4216821
total_deaths_per_million	0.3749394	1.0000000	0.6305615
total_cases_per_million	0.4216821	0.6305615	1.0000000

	total_cases_per_million	gdp_per_capita	female_smokers	aged_65_older
total_cases_per_million	1.00000000	0.4996729	0.02316017	-0.1163874
gdp_per_capita	0.49967294	1.0000000	0.31217884	0.3434384
female_smokers	0.02316017	0.3121788	1.00000000	0.8095403
aged_65_older	-0.11638738	0.3434384	0.80954031	1.0000000

	total_cases_per_million	total_deaths_per_million	population_density	median_age
total_cases_per_million	1.00000000	0.46565828	0.33610223	0.08293562
total_deaths_per_million	0.46565828	1.00000000	-0.06884444	0.18571445
population_density	0.33610223	-0.06884444	1.00000000	-0.03368264
median_age	0.08293562	0.18571445	-0.03368264	1.00000000

A data.frame: 4 × 3
	location	median_age	human_development_index
	<chr>	<dbl>	<dbl>
CUB	Cuba	43.1	0.777
MEX	Mexico	29.3	0.774
PAN	Panama	29.7	0.789
USA	United States	38.3	0.924

A data.frame: 3 × 4
	location	cardiovasc_death_rate	diabetes_prevalence	male_smokers
	<chr>	<dbl>	<dbl>	<dbl>
AUS	Australia	107.791	5.07	16.5
ISR	Israel	93.320	6.74	35.4
UKR	Ukraine	539.849	7.11	47.4

A data.frame: 2 × 5
	location	total_deaths_per_million	population_density	total_tests_per_thousand	male_smokers
	<chr>	<dbl>	<dbl>	<dbl>	<dbl>
MDV	Maldives	64.750	1454.433	318.617	55.0
PAN	Panama	579.869	55.133	127.560	9.9