This is the dataset with my variables for analysis:
# Sample data for the question, in dput() form: a data.frame with 20 rows
# (row.names = c(NA, -20L)). All columns from session_price to corr_UP are
# numeric; pam_german.clustering is an integer column holding the cluster
# number assigned to each row.
# Note: flight_kind, flight_duration_min, trip_duration_min and
# price_duration_min contain NA values.
clys<-structure(list(session_price = c(18824.7664, 35584.4106, 21084.4035,
9907.5856, 30806.5486, 15788.1279, 10147.7593, 11977.5904, 11734.3553,
53484.8698, 27788.9949, 11072.0588, 29241.0885, 5676.2372, 14007.0981,
34964.85, 14668.6735, 9425.9294, 16577.845, 153147.2272), flight_type = c(1.2462,
1.1691, 1.0601, 1.2909, 1.5488, 1.1279, 1.166, 1.3862, 1.2936,
1.0195, 1.0451, 1.2904, 1.6684, 1.2786, 1.1358, 1.2958, 1.05,
1.1522, 1.0561, 1.6795), adults_count = c(1.1793, 1.0821, 1.1156,
1.2565, 1.2742, 1.2283, 1.3237, 1.1494, 1.2904, 1.3525, 1.0814,
1.3644, 1.5781, 1.1816, 1.2604, 1.1732, 1.4088, 1.3959, 1.0959,
1.4726), children_count = c(0.2432, 0.0338, 0.1573, 0.0517, 0.0769,
0.0365, 0.1494, 0.0408, 0.1177, 0.128, 0.0579, 0.2749, 0.4045,
0.0823, 0.0943, 0.0677, 0.2088, 0.3009, 0.0817, 0.2353), infants_count = c(0.0152,
0.0048, 0.0731, 0.0259, 0.0129, 0.0046, 0.0954, 0.014, 0.0141,
0.0152, 0.0121, 0.0667, 0.0365, 0.0174, 0.0679, 0.0111, 0.0441,
0.0818, 0.0313, 0.0446), meta_flight_type = c(0.2918, 0.4686,
0.1425, 0.43, 0.3924, 0.6575, 0.6349, 0.0583, 0.2747, 0.167,
0.6179, 0.22, 0.5573, 0.2165, 0.3623, 0.6272, 0.3853, 0.1468,
0.255, 0.4604), flight_kind = c(0.4528, 0.1379, 3.6497, 0.2331,
0.3969, 0.1519, 0.098, NA, 0.6111, NA, 0.1086, 0.1061, NA, NA,
0.8571, 1.3472, NA, 0.0243, 3.3273, 1.1279), service_class_id = c(2,
1.9952, 2, 2, 1.9986, 2, 2, 1.9977, 2, 1.9913, 1.9968, 1.9985,
1.9983, 2, 2, 1.9994, 2, 1.9979, 1.9986, 1.9939), UI_profit = c(249.9766,
210.7159, 121.1932, 46.7757, 202.5403, 58.3467, 35.375, 0, 63.4536,
0, 116.4613, 41.2356, 0, 0, 72.0427, 131.8692, 0, 24.3831, 75.1906,
53), leg_price = c(9807.4805, 23253.6651, 15805.3328, 6148.6305,
15574.0215, 11339.653, 5964.4419, 7846.2151, 6910.2812, 35607.4389,
23953.2572, 5411.9416, 9544.5809, 3568.1491, 9463.4491, 23276.3196,
8357.9574, 4977.6056, 13331.1196, 54673.0944), flight_duration_min = c(307.2136,
269.9225, 439.2894, 143.2841, 197.8477, 110.2875, 114.3542, NA,
173.47, NA, 236.4197, 160.9437, NA, NA, 216.9208, 624.4288, NA,
162.4991, 190.5408, 776.6839), trip_duration_min = c(504.257,
531.7625, 967.9167, 261.4497, 265.9794, 138.0625, 163.9792, NA,
325.6778, NA, 459.6784, 166.7464, NA, NA, 462.5097, 949.2419,
NA, 162.6241, 478.7249, 1346.6982), price_duration_min = c(27.8457,
78.404, 35.9824, 38.95, 56.0142, 102.8833, 49.24, NA, 33.841,
NA, 96.4814, 29.3607, NA, NA, 43.4476, 33.1768, NA, 28.893, 76.8556,
45.4329), days_to_flight = c(27.8068, 23.0823, 23.7821, 12.4188,
26.8415, 19.6586, 24.6713, 16.6704, 13.9125, 10.1796, 13.2141,
18.1858, 119.3786, 12.5782, 20.3807, 31.856, 37.4516, 6.9034,
21.6605, 43.7275), days_RT = c(12.8218, 8.904, 23.2507, 4.585,
8.5987, 13.0174, 7.6805, 6.4065, 4.219, 19.984, 11.874, 8.8732,
14.4032, 4.9503, 11.9996, 12.5172, 4.9677, 8.0309, 12.8996, 15.5516
), mobile_share = c(0.538, 0.5845, 0.7576, 0.5409, 0.5279, 0.6119,
0.6017, 0.5344, 0.5133, 0.7007, 0.7336, 0.7531, 0.5156, 0.6429,
0.7208, 0.6033, 0.7118, 0.8446, 0.6328, 0.6268), desktop_share = c(0.4559,
0.4155, 0.2424, 0.3556, 0.4687, 0.3881, 0.3983, 0.4656, 0.4757,
0.2993, 0.2626, 0.2382, 0.4844, 0.3571, 0.2792, 0.3924, 0.2882,
0.1519, 0.3643, 0.3732), iphone_share = c(0.2128, 0.2947, 0.1443,
0.3103, 0.3459, 0.3379, 0.1618, 0.2882, 0.2308, 0.4707, 0.2606,
0.4327, 0.1892, 0.277, 0.2453, 0.2805, 0.1853, 0.478, 0.1882,
0.3834), android_share = c(0.307, 0.2657, 0.6087, 0.2274, 0.1697,
0.2694, 0.4274, 0.2322, 0.2779, 0.2115, 0.4685, 0.3165, 0.3177,
0.361, 0.4755, 0.3196, 0.5176, 0.3668, 0.4432, 0.2414), multi_share = c(0.2888,
0.0676, 0.8825, 0.1078, 0.0807, 0.0365, 0.1411, 0.0292, 0.2229,
0.0412, 0.1354, 0.1619, 0.0972, 0.1538, 0.1585, 0.3809, 0.1324,
0.0902, 0.473, 0.211), CR_session_to_popup = c(0.1185, 0.1159,
0.0879, 0.2295, 0.1276, 0.2374, 0.1162, 0.1097, 0.1695, 0.1605,
0.1062, 0.2189, 0.0226, 0.2356, 0.166, 0.1383, 0.1118, 0.2994,
0.0874, 0.0467), CR_session_to_booking = c(0.1155, 0.1063, 0.0703,
0.2392, 0.1208, 0.2237, 0.1079, 0.1995, 0.1648, 0.1844, 0.082,
0.1826, 0.0313, 0.2339, 0.1472, 0.1141, 0.1118, 0.2515, 0.0739,
0.0548), corr_winter = c(0.2635, 0.1983, 0.2513, 0.1867, 0.106,
0.4188, 0.0534, 0.1589, 0.2498, 0.4775, 0.4858, 0.3605, 0.0688,
0.318, 0.3394, 0.223, 0.3281, 0.3985, 0.173, 0.112), corr_spring = c(0.3036,
0.2772, 0.2602, 0.2209, 0.3627, 0.1332, 0.4484, 0.2793, 0.2526,
0.506, 0.0814, 0.2088, 0.6824, 0.2407, 0.1407, 0.326, 0.3228,
0.0654, 0.0897, 0.3196), corr_summer = c(0.2673, 0.1791, 0.258,
0.2894, 0.2856, 0.099, 0.2358, 0.2793, 0.276, 0.0165, 0.2525,
0.2087, 0.2488, 0.4413, 0.2477, 0.2744, 0.3491, 0.2917, 0.5926,
0.0861), corr_autumn = c(0.1656, 0.3454, 0.2304, 0.3029, 0.2458,
0.349, 0.2625, 0.2826, 0.2216, 0, 0.1803, 0.222, 0, 0, 0.2722,
0.1766, 0, 0.2444, 0.1447, 0.4823), corr_BL = c(0.4759, 0.5444,
0.4952, 0.4392, 0.4586, 0.4146, 0.4011, 0.4722, 0.4244, 0.4542,
0.4742, 0.4467, 0.4652, 0.4293, 0.4412, 0.4423, 0.4811, 0.4583,
0.496, 0.4882), corr_UP = c(0.5241, 0.4556, 0.5048, 0.5608, 0.5414,
0.5854, 0.5989, 0.5278, 0.5756, 0.5458, 0.5258, 0.5533, 0.5348,
0.5707, 0.5588, 0.5577, 0.5189, 0.5417, 0.504, 0.5118), pam_german.clustering = c(1L,
2L, 2L, 3L, 4L, 4L, 5L, 6L, 1L, 7L, 7L, 8L, 6L, 9L, 8L, 9L, 9L,
8L, 2L, 2L)), class = "data.frame", row.names = c(NA, -20L))
`pam_german.clustering` is the number of the cluster to which each observation (row) belongs.
For every variable from `session_price` to `corr_UP`, how can I draw a histogram of that variable's distribution across all clusters?
I am only just learning ggplot2, so I can't do it myself. To explain the result I need, I can draw it in Paint.
For example, a histogram of session price across clusters:
Or perhaps you are wanting columns of averages per cluster with an error bar representing the range?
library(tidyverse)
# Column chart of the mean session_price per cluster, with an error bar
# spanning the min-max range of session_price within each cluster.
clys %>%
  group_by(pam_german.clustering) %>%
  # max/min are computed before session_price is overwritten by its mean,
  # so they still see the raw per-row values.
  summarize(max = max(session_price),
            min = min(session_price),
            session_price = mean(session_price),
            # The cluster id is constant within a group, so its mean is the
            # id itself; factor() makes the x axis discrete.
            cluster = factor(mean(pam_german.clustering))) %>%
  # ggplot layers must be chained with `+`; without it only the empty
  # ggplot() canvas is drawn and the remaining calls are discarded.
  ggplot(aes(x = cluster, y = session_price, fill = session_price)) +
  geom_col() +
  geom_errorbar(aes(ymin = min, ymax = max), width = 0.5, size = 0.2) +
  # NOTE(review): `option` is documented as a character code/name
  # (e.g. "D" / "viridis"); confirm what the numeric 7 resolves to.
  scale_fill_viridis_c(option = 7) +
  theme_light(base_size = 16) +
  labs(y = 'Session Price') +
  guides(fill = guide_none())
Certainly, a set of histograms is possible, but really doesn't work very well with this data set due to the lack of data points, and trying to fit too many facets across a single dimension of the plot:
# Histogram of session_price, one facet (column) per cluster, each facet
# with its own x-axis range.
clys %>%
  # ggplot layers must be chained with `+`; without it only the empty
  # ggplot() canvas is drawn and the remaining calls are discarded.
  ggplot(aes(x = session_price)) +
  geom_histogram() +
  facet_grid(.~pam_german.clustering, scales = 'free_x') +
  theme_light(base_size = 16)