data <- read.csv("../data/google_review_ratings.csv")
data <- data[,-ncol(data)]
dim(data)
head(data)


data <- data[,-1]


for (i in seq_len(ncol(data))) print(paste(i, class(data[,i])))

[1] "1 numeric"
[1] "2 numeric"
[1] "3 numeric"
[1] "4 numeric"
[1] "5 numeric"
[1] "6 numeric"
[1] "7 numeric"
[1] "8 numeric"
[1] "9 numeric"
[1] "10 numeric"
[1] "11 character"
[1] "12 numeric"
[1] "13 numeric"
[1] "14 numeric"
[1] "15 numeric"
[1] "16 numeric"
[1] "17 numeric"
[1] "18 numeric"
[1] "19 numeric"
[1] "20 numeric"
[1] "21 numeric"
[1] "22 numeric"
[1] "23 numeric"
[1] "24 numeric"


# Recherche d'une chaîne non conforme.
# Pas besoin de tenir compte d'éventuels exposants : on sait que les notes sont entre 0 et 5.
grep("^[0-9]+.?[0-9]*$", data[,11], invert=TRUE)


data[2713,11]


data <- data[-2713,]
data[,11] <- as.double(data[,11])


#data[which(!complete.cases(data)),] #montre la ligne où category24 manque
data <- na.omit(data)


as.matrix(dist(data[1:6,]))


data[1:2,which(data[1,] != data[2,])]


# Je ne sais pas quelle valeur choisir pour k, alors profitant du fait que le k-means est rapide, balayons :
library(clusterCrit)
mdata <- as.matrix(data) #nécessaire pour clusterCrit
silhouette <- c()
for (k in 2:20) {
    km <- kmeans(mdata, k, nstart=5)
    silhouette <- c(silhouette, intCriteria(mdata, km$cluster, "Silhouette")$silhouette)
}


options(repr.plot.width=15, repr.plot.height=10)
plot(silhouette, type="l")
(2:20)[which.max(silhouette)]


# Juste pour indiquer que factoextra implémente les trois méthodes précitées :
#library(factoextra)
#fviz_nbclust(data, kmeans, method = "wss")
#fviz_nbclust(data, kmeans, method = "silhouette")
#fviz_nbclust(data, kmeans, method = "gap_stat")

# Voir aussi le package NbClust référencé sur dananovia.com


km <- kmeans(mdata, 7, nstart=5)
library(factoextra)
fviz_cluster(km, data=data)

Loading required package: ggplot2

Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa


km$centers
matplot(t(km$centers), type="l", lty=1, lwd=2, col=rainbow(7))


library(nngd)
o <- nng(data, k=20, mutual=TRUE) #attention k = nombre de voisins ici
dists_graph <- igraph::distances(o$graph, weights=o$distances)


library(cluster)
# pamonce=1 permet de diminuer le temps d'exécution ; cf. aide ?pam
p <- pam(as.dist(dists_graph), k=7, pamonce=1)


fviz_cluster(list(data=data, cluster=p$clustering))


range(dists_graph)
# ...Ok on tient l'explication ! Graphe probablement non connexe.
igraph::count_components(o$graph)
# 86 composantes connexes => on voit juste les 6 les plus denses ici.


k = 100
o <- nng(data, k=k, mutual=TRUE)
igraph::count_components(o$graph) #3


k = 150
o <- nng(data, k=k, mutual=TRUE)
igraph::count_components(o$graph) #2


k = 200
o <- nng(data, k=k, mutual=TRUE)
igraph::count_components(o$graph) #1 !


dists_graph <- igraph::distances(o$graph, weights=o$distances)
p <- pam(as.dist(dists_graph), k=7, pamonce=1)


fviz_cluster(list(data=data, cluster=p$clustering))


data[p$medoids,]
extCriteria(km$cluster, p$clustering, "Rand")
matplot(t(data[p$medoids,]), type="l", lty=1, lwd=2, col=rainbow(7))

	User	Category.1	Category.2	Category.3	Category.4	Category.5	Category.6	Category.7	Category.8	Category.9	⋯	Category.15	Category.16	Category.17	Category.18	Category.19	Category.20	Category.21	Category.22	Category.23	Category.24
	<chr>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	⋯	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>
1	User 1	0	0.0	3.63	3.65	5	2.92	5	2.35	2.33	⋯	1.74	0.59	0.5	0	0.5	0	0	0	0	0
2	User 2	0	0.0	3.63	3.65	5	2.92	5	2.64	2.33	⋯	1.74	0.59	0.5	0	0.5	0	0	0	0	0
3	User 3	0	0.0	3.63	3.63	5	2.92	5	2.64	2.33	⋯	1.74	0.59	0.5	0	0.5	0	0	0	0	0
4	User 4	0	0.5	3.63	3.63	5	2.92	5	2.35	2.33	⋯	1.74	0.59	0.5	0	0.5	0	0	0	0	0
5	User 5	0	0.0	3.63	3.63	5	2.92	5	2.64	2.33	⋯	1.74	0.59	0.5	0	0.5	0	0	0	0	0
6	User 6	0	0.0	3.63	3.63	5	2.92	5	2.63	2.33	⋯	1.74	0.59	0.5	0	0.5	0	0	0	0	0

	1	2	3	4	5	6
1	0.0000000	0.29017236	0.29068884	0.5012983	0.29068884	0.28124722
2	0.2901724	0.00000000	0.02236068	0.5792236	0.02236068	0.02645751
3	0.2906888	0.02236068	0.00000000	0.5787918	0.00000000	0.02000000
4	0.5012983	0.57922362	0.57879185	0.0000000	0.57879185	0.57358522
5	0.2906888	0.02236068	0.00000000	0.5787918	0.00000000	0.02000000
6	0.2812472	0.02645751	0.02000000	0.5735852	0.02000000	0.00000000

	Category.8	Category.10
	<dbl>	<dbl>
1	2.35	2.64
2	2.64	2.65

	Category.1	Category.2	Category.3	Category.4	Category.5	Category.6	Category.7	Category.8	Category.9	Category.10	⋯	Category.15	Category.16	Category.17	Category.18	Category.19	Category.20	Category.21	Category.22	Category.23	Category.24
1	1.3044910	1.706437	2.666766	3.436986	4.205689	4.043733	4.130579	2.481916	2.790469	2.234162	⋯	1.249721	0.9264471	0.7330040	0.5506986	0.6215170	0.6260379	0.7333433	0.8400499	1.5356188	1.7131637
2	1.1522175	2.627542	2.342726	2.273404	2.464986	3.365918	4.616455	3.242062	4.522895	3.470565	⋯	4.610777	0.8410593	0.6642938	0.6414124	0.7149153	0.8230508	0.8296610	0.9908192	0.9326271	1.0369633
3	1.6493014	2.245205	3.137466	4.299137	4.076219	3.181137	2.713178	2.176151	2.644425	2.679493	⋯	1.169356	1.2070137	0.8722466	0.6713699	0.6324247	0.9896438	1.0362740	4.7830137	2.5126849	1.7733151
4	1.2233749	1.666942	1.704417	2.154190	2.235527	2.570079	3.711755	3.670600	4.554428	4.498822	⋯	1.281336	1.0907814	0.6640883	0.3932843	0.3657418	0.4466704	0.5853454	1.2301359	1.0751416	1.1847225
5	0.7911018	1.039883	1.650718	1.638564	1.657613	1.668164	3.004257	1.970284	2.741636	2.864958	⋯	4.003139	1.4760267	1.1929215	0.9225543	1.1884808	1.0036394	0.7162938	0.7200167	0.6912020	0.7922538
6	1.5343158	4.728752	3.603068	3.471865	3.777383	3.276737	3.218902	2.549895	2.937609	2.575353	⋯	1.603143	1.1503609	0.5736090	0.5051880	0.7400451	0.9784361	0.8165865	0.8690226	1.0789925	1.4309624
7	2.3510381	2.543218	2.381096	2.158201	1.988547	1.817728	1.930150	1.582134	1.737209	1.610750	⋯	2.383645	1.7161015	1.9075202	2.0232987	2.5014072	2.1606228	1.9572895	2.7805075	2.5792272	2.6456863

	Category.1	Category.2	Category.3	Category.4	Category.5	Category.6	Category.7	Category.8	Category.9	Category.10	⋯	Category.15	Category.16	Category.17	Category.18	Category.19	Category.20	Category.21	Category.22	Category.23	Category.24
	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	⋯	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>
4922	1.30	1.71	1.35	2.76	2.80	2.81	5.00	2.46	3.00	3.01	⋯	3.57	0.67	0.62	0.62	0.63	0.67	0.57	0.57	1.12	1.18
4032	0.88	5.00	3.02	4.10	5.00	2.87	5.00	2.79	2.77	2.79	⋯	2.11	0.58	0.55	0.55	0.55	0.58	0.62	0.82	0.83	0.85
2625	2.10	2.11	2.14	2.00	1.99	2.01	2.03	1.67	1.80	1.63	⋯	1.56	1.45	1.40	1.41	1.44	2.12	0.93	2.06	2.05	2.07
492	1.47	1.48	1.51	5.00	5.00	4.45	1.79	1.77	1.79	1.81	⋯	1.32	1.34	0.56	0.57	0.55	0.51	0.53	4.46	0.92	1.45
2071	2.16	2.15	2.17	5.00	3.05	3.27	3.27	3.02	3.04	3.34	⋯	1.05	1.00	0.97	0.69	0.66	0.65	0.67	3.06	5.00	1.63
1976	1.19	1.22	1.75	1.79	1.71	3.14	5.00	3.92	5.00	5.00	⋯	1.03	0.57	0.55	0.54	0.55	0.58	0.91	1.28	1.19	1.18
4322	0.57	0.56	1.54	1.51	1.50	1.51	1.54	1.55	2.64	3.22	⋯	5.00	0.76	0.75	0.75	0.78	1.01	0.53	0.51	0.50	0.51

Exercice 1¶

Exercice 2¶