I need to get correlation coefficient using two dataframes.
- First Data Frame
## ML
generate1 <- seq(ymd_h("2021-11-01-00"), ymd_h("2021-11-01-03"), by = "hours")
datex1 <- date(generate1)
generate2 <- seq(ymd_h("2021-11-02-00"), ymd_h("2021-11-02-03"), by = "hours")
datex2 <- date(generate2)
hourx <- hour(generate1)
method <- c(rep("ARIMA",8),rep("LSTM",8))
books <- c(390,154,154,153,352,170,229,124,458,224,196,485,492,235,139,116)
shirts <- c(312,397,119,357,464,444,453,155,484,454,282,288,141,262,148,258)
shoes <- c(306,274,480,330,143,190,213,477,141,323,316,473,269,149,333,145)
hats <- c(107,101,363,436,282,377,435,381,427,102,100,471,475,134,479,250)
data.predicted <- data.frame(datex = c(datex1,datex2,datex1,datex2),
hour = rep(hourx,4), method = method,
books, shirts, shoes, hats)
#data.predicted
# datex hour method books shirts shoes hats
#1 2021-11-01 0 ARIMA 390 312 306 107
#2 2021-11-01 1 ARIMA 154 397 274 101
#3 2021-11-01 2 ARIMA 154 119 480 363
#4 2021-11-01 3 ARIMA 153 357 330 436
#5 2021-11-02 0 ARIMA 352 464 143 282
#6 2021-11-02 1 ARIMA 170 444 190 377
#7 2021-11-02 2 ARIMA 229 453 213 435
#8 2021-11-02 3 ARIMA 124 155 477 381
#9 2021-11-01 0 LSTM 458 484 141 427
#10 2021-11-01 1 LSTM 224 454 323 102
#11 2021-11-01 2 LSTM 196 282 316 100
#12 2021-11-01 3 LSTM 485 288 473 471
#13 2021-11-02 0 LSTM 492 141 269 475
#14 2021-11-02 1 LSTM 235 262 149 134
#15 2021-11-02 2 LSTM 139 148 333 479
#16 2021-11-02 3 LSTM 116 258 145 250
- Second Data Frame
## real
generate <- seq(ymd_h("2021-11-01-00"), ymd_h("2021-11-01-03"), by = "hours")
datex <- date(generate)
hourx <- hour(generate)
books <- c(220,120,150,114)
shirts <- c(319,400,130,360)
shoes <- c(300,280,300,330)
hats <- c(120,140,370,400)
data.real <- data.frame(datex, hourx, books, shirts, shoes, hats)
#data.real
# datex hourx books shirts shoes hats
#1 2021-11-01 0 220 319 300 120
#2 2021-11-01 1 120 400 280 140
#3 2021-11-01 2 150 130 300 370
#4 2021-11-01 3 114 360 330 400
I want to have the result like this data frame. The correlation is based on real data, if the real data only has 1 day data, so predicted data is adjusted.
## Result
metrics <-c("books","books","shirts","shirts","shoes","shoes","hats","hats")
method <-c("ARIMA","LSTM","ARIMA","LSTM","ARIMA","LSTM","ARIMA","LSTM")
correlation <- c(0.946898292,0.294308358,0.999957355,0.535718183,
0.167424749,0.547561054,0.993560612,0.085661117)
result.cor <- data.frame(metrics, method, correlation)
#result.cor
# metrics method correlation
#1 books ARIMA 0.94689829
#2 books LSTM 0.29430836
#3 shirts ARIMA 0.99995736
#4 shirts LSTM 0.53571818
#5 shoes ARIMA 0.16742475
#6 shoes LSTM 0.54756105
#7 hats ARIMA 0.99356061
#8 hats LSTM 0.08566112
We can see value of ARIMA is 0.94689829, it is from
ARIMA.pred <- subset(data.predicted, method == "ARIMA" & datex == "2021-11-01")
#ARIMA.pred
# datex hour method books shirts shoes hats
#1 2021-11-01 0 ARIMA 390 312 306 107
#2 2021-11-01 1 ARIMA 154 397 274 101
#3 2021-11-01 2 ARIMA 154 119 480 363
#4 2021-11-01 3 ARIMA 153 357 330 436
data.real$books
#220 120 150 114
cor(ARIMA.pred$books, data.real$books)
#0.9468983
How do i create funtion to simplify and get the result?.
CodePudding user response:
library(lubridate)
#>
#> Attaching package: 'lubridate'
#> The following objects are masked from 'package:base':
#>
#> date, intersect, setdiff, union
library(tidyverse)
generate1 <- seq(ymd_h("2021-11-01-00"), ymd_h("2021-11-01-03"), by = "hours")
datex1 <- date(generate1)
generate2 <- seq(ymd_h("2021-11-02-00"), ymd_h("2021-11-02-03"), by = "hours")
datex2 <- date(generate2)
hourx <- hour(generate1)
method <- c(rep("ARIMA", 8), rep("LSTM", 8))
books <- c(390, 154, 154, 153, 352, 170, 229, 124, 458, 224, 196, 485, 492, 235, 139, 116)
shirts <- c(312, 397, 119, 357, 464, 444, 453, 155, 484, 454, 282, 288, 141, 262, 148, 258)
shoes <- c(306, 274, 480, 330, 143, 190, 213, 477, 141, 323, 316, 473, 269, 149, 333, 145)
hats <- c(107, 101, 363, 436, 282, 377, 435, 381, 427, 102, 100, 471, 475, 134, 479, 250)
data.predicted <- data.frame(
datex = c(datex1, datex2, datex1, datex2),
hour = rep(hourx, 4), method = method,
books, shirts, shoes, hats
)
generate <- seq(ymd_h("2021-11-01-00"), ymd_h("2021-11-01-03"), by = "hours")
datex <- date(generate)
hourx <- hour(generate)
books <- c(220, 120, 150, 114)
shirts <- c(319, 400, 130, 360)
shoes <- c(300, 280, 300, 330)
hats <- c(120, 140, 370, 400)
data.real <- data.frame(datex, hour = hourx, books, shirts, shoes, hats)
data.real
#> datex hour books shirts shoes hats
#> 1 2021-11-01 0 220 319 300 120
#> 2 2021-11-01 1 120 400 280 140
#> 3 2021-11-01 2 150 130 300 370
#> 4 2021-11-01 3 114 360 330 400
data.predicted
#> datex hour method books shirts shoes hats
#> 1 2021-11-01 0 ARIMA 390 312 306 107
#> 2 2021-11-01 1 ARIMA 154 397 274 101
#> 3 2021-11-01 2 ARIMA 154 119 480 363
#> 4 2021-11-01 3 ARIMA 153 357 330 436
#> 5 2021-11-02 0 ARIMA 352 464 143 282
#> 6 2021-11-02 1 ARIMA 170 444 190 377
#> 7 2021-11-02 2 ARIMA 229 453 213 435
#> 8 2021-11-02 3 ARIMA 124 155 477 381
#> 9 2021-11-01 0 LSTM 458 484 141 427
#> 10 2021-11-01 1 LSTM 224 454 323 102
#> 11 2021-11-01 2 LSTM 196 282 316 100
#> 12 2021-11-01 3 LSTM 485 288 473 471
#> 13 2021-11-02 0 LSTM 492 141 269 475
#> 14 2021-11-02 1 LSTM 235 262 149 134
#> 15 2021-11-02 2 LSTM 139 148 333 479
#> 16 2021-11-02 3 LSTM 116 258 145 250
covariates <- c("books", "shirts", "shoes", "hats")
methods <- c("ARIMA", "LSTM")
list(
data.real %>% mutate(type = "real"),
data.predicted %>% mutate(type = "predicted")
) %>%
bind_rows() %>%
nest(-datex) %>%
expand_grid(
covariate = covariates,
method = methods
) %>%
mutate(cor = list(data, covariate, method) %>% pmap_dbl(possibly(~ {
real <- .x %>%
filter(type == "real") %>%
pluck(.y)
predicted <- .x %>%
filter(method == ..3) %>%
pluck(.y)
cor(real, predicted)
}, NA))) %>%
select(-data)
#> Warning: All elements of `...` must be named.
#> Did you want `data = c(hour, books, shirts, shoes, hats, type, method)`?
#> # A tibble: 16 x 4
#> datex covariate method cor
#> <date> <chr> <chr> <dbl>
#> 1 2021-11-01 books ARIMA 0.947
#> 2 2021-11-01 books LSTM 0.294
#> 3 2021-11-01 shirts ARIMA 1.00
#> 4 2021-11-01 shirts LSTM 0.536
#> 5 2021-11-01 shoes ARIMA 0.167
#> 6 2021-11-01 shoes LSTM 0.548
#> 7 2021-11-01 hats ARIMA 0.994
#> 8 2021-11-01 hats LSTM 0.0857
#> 9 2021-11-02 books ARIMA NA
#> 10 2021-11-02 books LSTM NA
#> 11 2021-11-02 shirts ARIMA NA
#> 12 2021-11-02 shirts LSTM NA
#> 13 2021-11-02 shoes ARIMA NA
#> 14 2021-11-02 shoes LSTM NA
#> 15 2021-11-02 hats ARIMA NA
#> 16 2021-11-02 hats LSTM NA
Created on 2021-11-25 by the reprex package (v2.0.1)
CodePudding user response:
A data.table approach
Convert it to data.table object using setDT()
library(data.table)
setDT(data.predicted)
setDT(data.real)
First step is to melt data.predicted and data.real.
data.predicted <- melt(data.predicted, id.vars = c('datex', 'hourx', 'method'), measure.vars = c('books', 'shirts', 'shoes', 'hats'), variable.name = 'metrics', value.name = 'Value_Pred')
data.real <- melt(data.real, id.vars = c('datex', 'hourx'), measure.vars = c('books', 'shirts', 'shoes', 'hats'), variable.name = 'metrics', value.name = 'Value_Real')
Next step is to join both the dataset on datex, hourx and metrics. Applying left join with an assumption data.predicted contains all the data points with respect to date and hour present in data.real.
data.predicted <- merge(data.predicted, data.real, by = c('datex', 'hourx', 'metrics'), all.x = TRUE)
Last step is to take correlation for each datex, metrics and method values.
data.predicted <- data.predicted[,.(Cor_Col = cor(Value_Pred, Value_Real)), by = .(datex, metrics, method)]
data.predicted
datex metrics method Cor_Col
1: 2021-11-01 books ARIMA 0.94689829
2: 2021-11-01 books LSTM 0.29430836
3: 2021-11-01 shirts ARIMA 0.99995736
4: 2021-11-01 shirts LSTM 0.53571818
5: 2021-11-01 shoes ARIMA 0.16742475
6: 2021-11-01 shoes LSTM 0.54756105
7: 2021-11-01 hats ARIMA 0.99356061
8: 2021-11-01 hats LSTM 0.08566112
9: 2021-11-02 books ARIMA NA
10: 2021-11-02 books LSTM NA
11: 2021-11-02 shirts ARIMA NA
12: 2021-11-02 shirts LSTM NA
13: 2021-11-02 shoes ARIMA NA
14: 2021-11-02 shoes LSTM NA
15: 2021-11-02 hats ARIMA NA
16: 2021-11-02 hats LSTM NA