Get Correlation coefficient from Two Dataframes Using R-CodePudding

I need to get correlation coefficient using two dataframes.

First Data Frame

## ML
generate1 <- seq(ymd_h("2021-11-01-00"), ymd_h("2021-11-01-03"), by = "hours")
datex1 <- date(generate1)
generate2 <- seq(ymd_h("2021-11-02-00"), ymd_h("2021-11-02-03"), by = "hours")
datex2 <- date(generate2)
hourx <- hour(generate1)
method <- c(rep("ARIMA",8),rep("LSTM",8))
books <- c(390,154,154,153,352,170,229,124,458,224,196,485,492,235,139,116)
shirts <- c(312,397,119,357,464,444,453,155,484,454,282,288,141,262,148,258)
shoes <- c(306,274,480,330,143,190,213,477,141,323,316,473,269,149,333,145)
hats <- c(107,101,363,436,282,377,435,381,427,102,100,471,475,134,479,250)

data.predicted <- data.frame(datex = c(datex1,datex2,datex1,datex2),
                             hour = rep(hourx,4), method = method, 
                             books, shirts, shoes, hats)

#data.predicted
#        datex hour method books shirts shoes hats
#1  2021-11-01    0  ARIMA   390    312   306  107
#2  2021-11-01    1  ARIMA   154    397   274  101
#3  2021-11-01    2  ARIMA   154    119   480  363
#4  2021-11-01    3  ARIMA   153    357   330  436
#5  2021-11-02    0  ARIMA   352    464   143  282
#6  2021-11-02    1  ARIMA   170    444   190  377
#7  2021-11-02    2  ARIMA   229    453   213  435
#8  2021-11-02    3  ARIMA   124    155   477  381
#9  2021-11-01    0   LSTM   458    484   141  427
#10 2021-11-01    1   LSTM   224    454   323  102
#11 2021-11-01    2   LSTM   196    282   316  100
#12 2021-11-01    3   LSTM   485    288   473  471
#13 2021-11-02    0   LSTM   492    141   269  475
#14 2021-11-02    1   LSTM   235    262   149  134
#15 2021-11-02    2   LSTM   139    148   333  479
#16 2021-11-02    3   LSTM   116    258   145  250

Second Data Frame

## real
generate <- seq(ymd_h("2021-11-01-00"), ymd_h("2021-11-01-03"), by = "hours")
datex <- date(generate)
hourx <- hour(generate)
books <- c(220,120,150,114)
shirts <- c(319,400,130,360)
shoes <- c(300,280,300,330)
hats <- c(120,140,370,400)
data.real <- data.frame(datex, hourx, books, shirts, shoes, hats)

#data.real
#       datex hourx books shirts shoes hats
#1 2021-11-01     0   220    319   300  120
#2 2021-11-01     1   120    400   280  140
#3 2021-11-01     2   150    130   300  370
#4 2021-11-01     3   114    360   330  400

I want to have the result like this data frame. The correlation is based on real data, if the real data only has 1 day data, so predicted data is adjusted.

## Result
metrics <-c("books","books","shirts","shirts","shoes","shoes","hats","hats")
method  <-c("ARIMA","LSTM","ARIMA","LSTM","ARIMA","LSTM","ARIMA","LSTM")
correlation <- c(0.946898292,0.294308358,0.999957355,0.535718183,
                 0.167424749,0.547561054,0.993560612,0.085661117)
result.cor <- data.frame(metrics, method, correlation)

#result.cor
#  metrics method correlation
#1   books  ARIMA  0.94689829
#2   books   LSTM  0.29430836
#3  shirts  ARIMA  0.99995736
#4  shirts   LSTM  0.53571818
#5   shoes  ARIMA  0.16742475
#6   shoes   LSTM  0.54756105
#7    hats  ARIMA  0.99356061
#8    hats   LSTM  0.08566112

We can see value of ARIMA is 0.94689829, it is from

ARIMA.pred <- subset(data.predicted, method == "ARIMA" & datex == "2021-11-01")

#ARIMA.pred
#       datex hour method books shirts shoes hats
#1 2021-11-01    0  ARIMA   390    312   306  107
#2 2021-11-01    1  ARIMA   154    397   274  101
#3 2021-11-01    2  ARIMA   154    119   480  363
#4 2021-11-01    3  ARIMA   153    357   330  436

data.real$books
#220 120 150 114

cor(ARIMA.pred$books, data.real$books)
#0.9468983

How do i create funtion to simplify and get the result?.

CodePudding user response：

library(lubridate)
#> 
#> Attaching package: 'lubridate'
#> The following objects are masked from 'package:base':
#> 
#>     date, intersect, setdiff, union
library(tidyverse)

generate1 <- seq(ymd_h("2021-11-01-00"), ymd_h("2021-11-01-03"), by = "hours")
datex1 <- date(generate1)
generate2 <- seq(ymd_h("2021-11-02-00"), ymd_h("2021-11-02-03"), by = "hours")
datex2 <- date(generate2)
hourx <- hour(generate1)
method <- c(rep("ARIMA", 8), rep("LSTM", 8))
books <- c(390, 154, 154, 153, 352, 170, 229, 124, 458, 224, 196, 485, 492, 235, 139, 116)
shirts <- c(312, 397, 119, 357, 464, 444, 453, 155, 484, 454, 282, 288, 141, 262, 148, 258)
shoes <- c(306, 274, 480, 330, 143, 190, 213, 477, 141, 323, 316, 473, 269, 149, 333, 145)
hats <- c(107, 101, 363, 436, 282, 377, 435, 381, 427, 102, 100, 471, 475, 134, 479, 250)

data.predicted <- data.frame(
  datex = c(datex1, datex2, datex1, datex2),
  hour = rep(hourx, 4), method = method,
  books, shirts, shoes, hats
)
generate <- seq(ymd_h("2021-11-01-00"), ymd_h("2021-11-01-03"), by = "hours")
datex <- date(generate)
hourx <- hour(generate)
books <- c(220, 120, 150, 114)
shirts <- c(319, 400, 130, 360)
shoes <- c(300, 280, 300, 330)
hats <- c(120, 140, 370, 400)
data.real <- data.frame(datex, hour = hourx, books, shirts, shoes, hats)

data.real
#>        datex hour books shirts shoes hats
#> 1 2021-11-01    0   220    319   300  120
#> 2 2021-11-01    1   120    400   280  140
#> 3 2021-11-01    2   150    130   300  370
#> 4 2021-11-01    3   114    360   330  400

data.predicted
#>         datex hour method books shirts shoes hats
#> 1  2021-11-01    0  ARIMA   390    312   306  107
#> 2  2021-11-01    1  ARIMA   154    397   274  101
#> 3  2021-11-01    2  ARIMA   154    119   480  363
#> 4  2021-11-01    3  ARIMA   153    357   330  436
#> 5  2021-11-02    0  ARIMA   352    464   143  282
#> 6  2021-11-02    1  ARIMA   170    444   190  377
#> 7  2021-11-02    2  ARIMA   229    453   213  435
#> 8  2021-11-02    3  ARIMA   124    155   477  381
#> 9  2021-11-01    0   LSTM   458    484   141  427
#> 10 2021-11-01    1   LSTM   224    454   323  102
#> 11 2021-11-01    2   LSTM   196    282   316  100
#> 12 2021-11-01    3   LSTM   485    288   473  471
#> 13 2021-11-02    0   LSTM   492    141   269  475
#> 14 2021-11-02    1   LSTM   235    262   149  134
#> 15 2021-11-02    2   LSTM   139    148   333  479
#> 16 2021-11-02    3   LSTM   116    258   145  250


covariates <- c("books", "shirts", "shoes", "hats")
methods <- c("ARIMA", "LSTM")

list(
  data.real %>% mutate(type = "real"),
  data.predicted %>% mutate(type = "predicted")
) %>%
  bind_rows() %>%
  nest(-datex) %>%
  expand_grid(
    covariate = covariates,
    method = methods
  ) %>%
  mutate(cor = list(data, covariate, method) %>% pmap_dbl(possibly(~ {
    real <- .x %>%
      filter(type == "real") %>%
      pluck(.y)
    predicted <- .x %>%
      filter(method == ..3) %>%
      pluck(.y)
    cor(real, predicted)
  }, NA))) %>%
  select(-data)
#> Warning: All elements of `...` must be named.
#> Did you want `data = c(hour, books, shirts, shoes, hats, type, method)`?
#> # A tibble: 16 x 4
#>    datex      covariate method     cor
#>    <date>     <chr>     <chr>    <dbl>
#>  1 2021-11-01 books     ARIMA   0.947 
#>  2 2021-11-01 books     LSTM    0.294 
#>  3 2021-11-01 shirts    ARIMA   1.00  
#>  4 2021-11-01 shirts    LSTM    0.536 
#>  5 2021-11-01 shoes     ARIMA   0.167 
#>  6 2021-11-01 shoes     LSTM    0.548 
#>  7 2021-11-01 hats      ARIMA   0.994 
#>  8 2021-11-01 hats      LSTM    0.0857
#>  9 2021-11-02 books     ARIMA  NA     
#> 10 2021-11-02 books     LSTM   NA     
#> 11 2021-11-02 shirts    ARIMA  NA     
#> 12 2021-11-02 shirts    LSTM   NA     
#> 13 2021-11-02 shoes     ARIMA  NA     
#> 14 2021-11-02 shoes     LSTM   NA     
#> 15 2021-11-02 hats      ARIMA  NA     
#> 16 2021-11-02 hats      LSTM   NA

^{Created on 2021-11-25 by the reprex package (v2.0.1)}

CodePudding user response：

A data.table approach

Convert it to data.table object using setDT()

library(data.table)

setDT(data.predicted)
setDT(data.real)

First step is to melt data.predicted and data.real.

data.predicted <- melt(data.predicted, id.vars = c('datex', 'hourx', 'method'), measure.vars = c('books', 'shirts', 'shoes', 'hats'), variable.name = 'metrics', value.name = 'Value_Pred')

data.real <- melt(data.real, id.vars = c('datex', 'hourx'), measure.vars = c('books', 'shirts', 'shoes', 'hats'), variable.name = 'metrics', value.name = 'Value_Real')

Next step is to join both the dataset on datex, hourx and metrics. Applying left join with an assumption data.predicted contains all the data points with respect to date and hour present in data.real.

data.predicted <- merge(data.predicted, data.real, by = c('datex', 'hourx', 'metrics'), all.x = TRUE)

Last step is to take correlation for each datex, metrics and method values.

data.predicted <- data.predicted[,.(Cor_Col = cor(Value_Pred, Value_Real)), by = .(datex,  metrics, method)]

data.predicted

         datex metrics method    Cor_Col
 1: 2021-11-01   books  ARIMA 0.94689829
 2: 2021-11-01   books   LSTM 0.29430836
 3: 2021-11-01  shirts  ARIMA 0.99995736
 4: 2021-11-01  shirts   LSTM 0.53571818
 5: 2021-11-01   shoes  ARIMA 0.16742475
 6: 2021-11-01   shoes   LSTM 0.54756105
 7: 2021-11-01    hats  ARIMA 0.99356061
 8: 2021-11-01    hats   LSTM 0.08566112
 9: 2021-11-02   books  ARIMA         NA
10: 2021-11-02   books   LSTM         NA
11: 2021-11-02  shirts  ARIMA         NA
12: 2021-11-02  shirts   LSTM         NA
13: 2021-11-02   shoes  ARIMA         NA
14: 2021-11-02   shoes   LSTM         NA
15: 2021-11-02    hats  ARIMA         NA
16: 2021-11-02    hats   LSTM         NA