Home > Back-end >  for loop to merge two data frames with common column R
for loop to merge two data frames with common column R

Time:11-21

I need to add some missing values in a dataset based on a code list. I thought to do this by running a loop combined with a common column merge on lists.

Could be a duplicate of Merge in loop R, or a special case.


#load data
data("mtcars")
#add car names
mtcars <- cbind(cars = rownames(mtcars), mtcars)
rownames(mtcars) <- 1:nrow(mtcars)
#add dates and arrange
date <- rep(seq(as.Date("2015-01-02"), by = "month", length.out = 4),times = 8),
mtcars <- cbind(date = date, mtcars)
mtcars <- mtcars %>% 
  arrange(., date)
#add additional cars
add_cars <- c("renault", "dacia", "benz", "ferrari",
                "AC", "Acura", "Aixam", "Alfa",
                "Bertone", "Bestune", "Chevrolet",
                "Chrysler", "Haima", "Haval", "Hawtai", "Hennessey")
total_cars <- as_tibble(c(unique(mtcars$cars), add_cars))
colnames(total_cars) <-  "cars"
#split data on dates, list total cars
car_dates <- split(mtcars, f= mtcars$date)
total_cars <- as.list(total_cars)

#execute loop
results <- vector(mode = "integer", length = length(car_dates))
mylist <- list()

for (i in 1:length(car_dates)){
  g <- nrow(car_dates[[i]])
  results[i] <- g
  if (results[i] < 144){
    res <- list(merge(x = car_dates[[i]], y= total_cars,
                      by = c("cars"), all = T))
    mylist <- c(mylist, res)
    mydata_full <- as.data.frame(mylist)
  } 
}


This loop harvest is a data frame with 48 obs. of 52 variables. Which is partially what I am aiming for. I got the loop to add the missing observations to each date, but it spread the dataset. Now for each date, the initial 13 variables are repeated.

I am stuck here, I only want the initial 13 variables, not long data.


mydata_full <- as_tibble(mydata_full)
head(mydata_full)
# A tibble: 6 x 52
  cars     date         mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb cars.1  date.1     mpg.1 cyl.1
  <chr>    <date>     <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>   <date>     <dbl> <dbl>
1 AC       NA            NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA AC      NA            NA    NA
2 Acura    NA            NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA Acura   NA            NA    NA
3 Aixam    NA            NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA Aixam   NA            NA    NA
4 Alfa     NA            NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA Alfa    NA            NA    NA
5 AMC Jav~ NA            NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA AMC Ja~ NA            NA    NA
6 benz     NA            NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA benz    NA            NA    NA
# ... with 35 more variables: disp.1 <dbl>, hp.1 <dbl>, drat.1 <dbl>, wt.1 <dbl>, qsec.1 <dbl>, vs.1 <dbl>,
#   am.1 <dbl>, gear.1 <dbl>, carb.1 <dbl>, cars.2 <chr>, date.2 <date>, mpg.2 <dbl>, cyl.2 <dbl>, disp.2 <dbl>,
#   hp.2 <dbl>, drat.2 <dbl>, wt.2 <dbl>, qsec.2 <dbl>, vs.2 <dbl>, am.2 <dbl>, gear.2 <dbl>, carb.2 <dbl>,
#   cars.3 <chr>, date.3 <date>, mpg.3 <dbl>, cyl.3 <dbl>, disp.3 <dbl>, hp.3 <dbl>, drat.3 <dbl>, wt.3 <dbl>,
#   qsec.3 <dbl>, vs.3 <dbl>, am.3 <dbl>, gear.3 <dbl>, carb.3 <dbl>


I am sure this could be done with a more simple full_join, I tried but succeeded only to full_join separately on each date, what am I missing?

#after rearranging the classes to tibble

mtcars_short <- mtcars %>%
  filter(date == "2015-02-02") %>%
  full_join(total_cars, by= c("cars"))

> print(mtcars_short)
# A tibble: 48 x 13
   date       cars                mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
   <date>     <chr>             <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
 1 2015-02-02 Mazda RX4 Wag      21       6 160     110  3.9   2.88  17.0     0     1     4     4
 2 2015-02-02 Valiant            18.1     6 225     105  2.76  3.46  20.2     1     0     3     1
 3 2015-02-02 Merc 280           19.2     6 168.    123  3.92  3.44  18.3     1     0     4     4
 4 2015-02-02 Merc 450SLC        15.2     8 276.    180  3.07  3.78  18       0     0     3     3
 5 2015-02-02 Fiat 128           32.4     4  78.7    66  4.08  2.2   19.5     1     1     4     1
 6 2015-02-02 Dodge Challenger   15.5     8 318     150  2.76  3.52  16.9     0     0     3     2
 7 2015-02-02 Fiat X1-9          27.3     4  79      66  4.08  1.94  18.9     1     1     4     1
 8 2015-02-02 Ferrari Dino       19.7     6 145     175  3.62  2.77  15.5     0     1     5     6
 9 NA         Mazda RX4          NA      NA  NA      NA NA    NA     NA      NA    NA    NA    NA
10 NA         Hornet Sportabout  NA      NA  NA      NA NA    NA     NA      NA    NA    NA    NA

I would like to have a df of 192 obs. and 13 variables. Meaning for each unique date (4) I want all the observations (48).


# A tibble: 48 x 52
   cars    date         mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb 
   <chr>   <date>     <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> 
 1 AC      2015-01-02  21       6  160    110  3.9   2.62  16.5     0     1     4     4
 2 Acura   2015-01-02  18.7     8  360    175  3.15  3.44  17.0     0     0     3     2 
 3 Aixam   2015-01-02  22.8     4  141.    95  3.92  3.15  22.9     1     0     4     2 
 4 Alfa    2015-01-02  17.3     8  276.   180  3.07  3.73  17.6     0     0     3     3 
 5 AMC Ja~ 2015-01-02  14.7     8  440    230  3.23  5.34  17.4     0     0     3     4 
 6 benz    .            .    .    .    .    .    .    .    .    .    .    .     .     .
 7 Bertone .            .    .    .    .    .    .    .    .    .    .    .     .     .
 8 Bestune .            .    .    .    .    .    .    .    .    .    .    .     .     . 
 9 Cadill~ .            .    .    .    .    and so on    .    .    .     .     .  
10 Camaro~ .            .    .    .    .    .    .    .    .    .    .    .     .     .
.          date2
.          .
.          date3
.          etc.
.
192

Any input would be appreciated!

CodePudding user response:

After some more hours of digging a solution came across, and it is wonderful!

I found it in the Q: Convert a list to a data frame. Thanks to the comment of @mflo-ByeSE , I found the solution here: https://www.r-bloggers.com/2014/06/concatenating-a-list-of-data-frames/

I modified the loop so the list elements would be given the date names by adding

names(res) <- names(car_dates[i])

in the loop

and I kept the output as a list deleting

mydata_full <- as.data.frame(mylist)

improved loop and solution below


#loop
results <- vector(mode = "integer", length = length(car_dates))
mylist <- list()

for (i in 1:length(car_dates)){
  g <- nrow(car_dates[[i]])
  results[i] <- g
  if (results[i] < 144){
    res <- list(merge(x = car_dates[[i]], y= total_cars,
                      by = c("cars"), all = T))
    names(res) <- names(car_dates[i])
    mylist <- c(mylist, res)
  } 
}

#then
mydata_full <- as_tibble(plyr::ldply(mylist, rbind))


Cheers

CodePudding user response:

A simple join can solve this. Create a dataframe with two columns. One containing all the distinct car names repeated by number same as unique dates and the other containing distinct dates repeated each by number of distinct cars.

Above dataframe will look like this:

           date              cars
  1: 2015-01-02         Mazda RX4
  2: 2015-01-02     Mazda RX4 Wag
  3: 2015-01-02        Datsun 710
  4: 2015-01-02    Hornet 4 Drive
  5: 2015-01-02 Hornet Sportabout
  ---                             
188: 2015-04-02          Chrysler
189: 2015-04-02             Haima
190: 2015-04-02             Haval
191: 2015-04-02            Hawtai
192: 2015-04-02         Hennessey

Then we can perform a left join on this table with the mtcars data on date and cars as the joining key.

Below is the tried code

data("mtcars")
#add car names
mtcars <- cbind(cars = rownames(mtcars), mtcars)
rownames(mtcars) <- 1:nrow(mtcars)

date <- rep(seq(as.Date("2015-01-02"), by = "month", length.out = 4),times = 8)
mtcars <- cbind(date = date, mtcars)

#add additional cars
add_cars <- c("renault", "dacia", "benz", "ferrari",
          "AC", "Acura", "Aixam", "Alfa",
          "Bertone", "Bestune", "Chevrolet",
          "Chrysler", "Haima", "Haval", "Hawtai", "Hennessey")
total_cars <- c(unique(mtcars$cars), add_cars)

total_cars <- data.frame(date = rep(sort(unique(mtcars$date)), each = length(total_cars)), cars = rep(total_cars, length(unique(mtcars$date))))

total_cars <- merge(total_cars, mtcars, by = c('date', 'cars'), all.x = TRUE)

Sample output rows

          date             cars  mpg cyl  disp  hp drat    wt qsec vs am gear carb
183 2015-04-02       Merc 450SE 16.4   8 275.8 180 3.07 4.070 17.4  0  0    3    3
184 2015-04-02       Merc 450SL   NA  NA    NA  NA   NA    NA   NA NA NA   NA   NA
185 2015-04-02      Merc 450SLC   NA  NA    NA  NA   NA    NA   NA NA NA   NA   NA
186 2015-04-02 Pontiac Firebird   NA  NA    NA  NA   NA    NA   NA NA NA   NA   NA
187 2015-04-02    Porsche 914-2   NA  NA    NA  NA   NA    NA   NA NA NA   NA   NA
188 2015-04-02          renault   NA  NA    NA  NA   NA    NA   NA NA NA   NA   NA
189 2015-04-02   Toyota Corolla 33.9   4  71.1  65 4.22 1.835 19.9  1  1    4    1
190 2015-04-02    Toyota Corona   NA  NA    NA  NA   NA    NA   NA NA NA   NA   NA
191 2015-04-02          Valiant   NA  NA    NA  NA   NA    NA   NA NA NA   NA   NA
192 2015-04-02       Volvo 142E 21.4   4 121.0 109 4.11 2.780 18.6  1  1    4    2
  • Related