Home > Blockchain >  Differing number of rows in R
Differing number of rows in R

Time:04-18

I'm trying to merge four data frames into one but I haven't been able to since there seems to be an error in the number of rows. Each data frame contains information about stock price from 2021-02-28 to 2022-04-01.

#Loading stock data
install.packages("quantmod")
library(quantmod)


#Getting stock data for: Toyota (TYO), Renault (RNO.PA), Honda (HMC), Hyundai (HYMTF)
 
 getSymbols(c("HMC", "HYMTF", "RNO.PA", "TYO" ), na.rm = TRUE)


#Creating individual data frames. 
 
# Stock Honda. 
stock_honda <- expand.grid("HMC" = HMC$HMC.Close) %>%
  mutate(Date = row.names(as.data.frame(HMC))) %>%
  mutate(across(Date, ~ . %>% str_remove("^X") %>% ymd())) %>% 
  subset(Date >"2021-02-28" & Date < "2022-04-01") %>%
  rename(Close = HMC)
  
head(stock_honda)


#Stock Toyota
stock_tyo <- expand.grid("Toyota" = TYO$TYO.Close) %>%
mutate(Date = row.names(as.data.frame(TYO))) %>%
  mutate(across(Date, ~. %>% str_remove("^X") %>% ymd()))%>%
  subset(Date > "2021-02-28" & Date < "2022-04-01") %>%
  rename(Close = Toyota)


head(stock_tyo)


# Stock Renault
stock_rno <- expand.grid("Rneault" = RNO.PA$RNO.PA.Close) %>%
  mutate(Date = row.names(as.data.frame(RNO.PA))) %>%
  mutate(across(Date, ~. %>% str_remove("^X") %>% ymd()))%>%
  subset(Date > "2021-02-28" & Date < "2022-04-01") 

head(stock_rno)


#Stock Hyundai
stock_hyundai <- expand.grid("HYMTF" = HYMTF$HYMTF.Close) %>%
  mutate(Date = row.names(as.data.frame(HYMTF))) %>%
  mutate(across(Date, ~ . %>% str_remove("^X") %>% ymd())) %>% 
  subset(Date >"2021-02-28" & Date < "2022-04-01") %>%
  rename(Close = HYMTF)

head(stock_hyundai)


#Merging stocks data into one data frame
stocks <- data.frame("Honda" = stock_honda, "Hyundai" = stock_hyundai, 
                     "Renault" = stock_rno, "Toyota" = stock_tyo)

head(stocks)

Which gives me the error below:

Error in data.frame(Honda = stock_honda, Hyundai = stock_hyundai, Renault = stock_rno,  : 
  arguments imply differing number of rows: 276, 282
```

CodePudding user response:

The number of rows differ. It may be better to join by the 'Date' column as data.frame requires all columns to be of same length. It could also be done by padding NA at the end, but that can lead to bugs as we assume that all the datasets have the same sequence of dates without any interruption. Instead, a join will make sure that we get the rows corresponding to the same 'Date' and if not present, it gets filled by NA

library(zoo)
library(dplyr)
library(purrr)
library(stringr)
library(quantmod)
# keep the datasets in a list
out <- list(HMC, HYMTF, RNO.PA, TYO) %>%
  # loop over the list with `map`
  # convert each of the zoo objects to data.frame with `fortify.zoo`
   map(~ fortify.zoo(.x) %>% 
          # select the Index and Close columns
          select(Date = Index, ends_with('Close')) %>% 
          # remove the suffix Close if needed
          rename_with(~ str_remove(.x, "\\.Close"), ends_with("Close")) %>%
          # filter the rows based on the Date column
          filter(between(Date, as.Date("2021-02-28"), 
                               as.Date("2022-04-01")))) %>% 
  # finally reduce the list of data.frames to a single data.frame by joining
  reduce(full_join, by = 'Date')

-output

> str(out)
'data.frame':   284 obs. of  5 variables:
 $ Date  : Date, format: "2021-03-01" "2021-03-02" "2021-03-03" "2021-03-04" ...
 $ HMC   : num  28.5 28.4 28.9 28.6 29.4 ...
 $ HYMTF : num  49.3 48 48.9 46.7 45.7 ...
 $ RNO.PA: num  37.6 37.4 39.4 39.2 38.5 ...
 $ TYO   : num  8.94 8.8 9 9.1 9.22 9.29 9.15 9.1 9.07 9.26 ...

CodePudding user response:

A join is the most straightforward option here, but a cast to wide formight might also do the trick..

A data.table approach

library(data.table)
library(tibble)
# put everyting into a list, tibble::lst() uses the names 
#  of the objects added to the list as it's names... this comes
#  in handy when we rowbind the list two code-lines down...
L <- tibble::lst(stock_honda, stock_hyundai, stock_rno, stock_tyo)
# convert to data.tables
L <- lapply(L, as.data.table)
# rowbind together
DT <- data.table::rbindlist(L, use.names = FALSE, idcol = "stock")
# cast to wide
final <- dcast(DT, Date ~ stock, value.var = "Close")

#          Date stock_honda stock_hyundai stock_rno stock_tyo
# 1: 2021-03-01       28.48         49.26    37.635      8.94
# 2: 2021-03-02       28.40         48.03    37.420      8.80
# 3: 2021-03-03       28.88         48.89    39.370      9.00
# 4: 2021-03-04       28.60         46.70    39.175      9.10
# 5: 2021-03-05       29.39         45.74    38.550      9.22
# 6: 2021-03-08       29.48         45.50    40.075      9.29
# ...
  • Related