Home > Software engineering >  Perform functions on two lists of dataframes with one similar column
Perform functions on two lists of dataframes with one similar column

Time:12-21

I have two separate lists of dataframes:

Example Data

#Example columns
Label <- c("Blue_001_Series009", "Blue_001_Series009", "Blue_001_Series009", "Blue_001_Series009","Red_001_Series008", "Red_001_Series008","Red_001_Series008","Red_001_Series008","Blue_002_Series009", "Blue_002_Series009","Blue_002_Series009","Blue_002_Series009")
Pred <- c("Pear", "Orange", "Apple", "Peach", "Pear", "Orange", "Apple", "Peach", "Pear", "Orange", "Apple", "Peach")
n <- c(10, 223, 890, 34, 78, 902, 34, 211, 1007,209, 330, 446)

#make example data frame
data <- data.frame(Label, Pred, n)

#Split dataframe into a list of dataframes
df <- split(data, f = data$Label)  


#Second dataframe example columns

Label1 <- c("Red_001_Series008","Blue_001_Series009", "Blue_002_Series009")
TotalArea <- c(1904, 578, 7092)

#Make dataframe
data1 <- data.frame(Label1, TotalArea)
#Split dataframe into a list of dataframes
df1 <- split(data1, f = data1$Label1)   

Problem

Each list of dataframes consists of the same labels, however they are in a different order. I would like to.

  1. Match df to df1 based on their label

  2. Divide the n column in df by the TotalArea column in df1, based on their label.

For example. A snippet of df:

Label                  Pred   n
1 Blue_001_Series009   Pear  10
2 Blue_001_Series009 Orange 223
3 Blue_001_Series009  Apple 890
4 Blue_001_Series009  Peach  34

Snippet of df1:

             Label1      TotalArea
2   Blue_001_Series009       578

I would like to get:

Blue_001_Series009 Pear / Blue_001_Series009 TotalArea
10 / 578 = 0.0173

Blue_001_Series009 Orange / Blue_001_Series009 TotalArea
223 / 578 = 0.3858

Etc...

And this must occur in each matched dataframe in each list of dataframes. In reality I have hundreds of dataframes inside a list, so must be able to be able to handle mass amounts of data processing.

I haven't been able to find anything similar online and am finding it hard to work with lists of dataframes.

CodePudding user response:

You can use a for loop iterating over the names:

for (nm in names(df)) {
  df[[nm]]$new <- df[[nm]]$n / df1[[nm]]$TotalArea
}

df
$Blue_001_Series009
               Label   Pred   n        new
1 Blue_001_Series009   Pear  10 0.01730104
2 Blue_001_Series009 Orange 223 0.38581315
3 Blue_001_Series009  Apple 890 1.53979239
4 Blue_001_Series009  Peach  34 0.05882353

$Blue_002_Series009
                Label   Pred    n        new
9  Blue_002_Series009   Pear 1007 0.14199098
10 Blue_002_Series009 Orange  209 0.02946983
11 Blue_002_Series009  Apple  330 0.04653130
12 Blue_002_Series009  Peach  446 0.06288776

$Red_001_Series008
              Label   Pred   n        new
5 Red_001_Series008   Pear  78 0.04096639
6 Red_001_Series008 Orange 902 0.47373950
7 Red_001_Series008  Apple  34 0.01785714
8 Red_001_Series008  Peach 211 0.11081933

CodePudding user response:

You can use Map, which is kinda multivariate version of lapply.

Map(\(x, y) x$n/y$TotalArea, df, df1)
# $Blue_001_Series009
# [1] 0.01730104 0.38581315 1.53979239 0.05882353
# 
# $Blue_002_Series009
# [1] 0.14199098 0.02946983 0.04653130 0.06288776
# 
# $Red_001_Series008
# [1] 0.04096639 0.47373950 0.01785714 0.11081933

Case, you want to add it as new column to df:

Map(\(x, y) {x$n2 <- x$n/y$TotalArea; x}, df, df1)
# $Blue_001_Series009
#                Label   Pred   n         n2
# 1 Blue_001_Series009   Pear  10 0.01730104
# 2 Blue_001_Series009 Orange 223 0.38581315
# 3 Blue_001_Series009  Apple 890 1.53979239
# 4 Blue_001_Series009  Peach  34 0.05882353
# 
# $Blue_002_Series009
#                 Label   Pred    n         n2
# 9  Blue_002_Series009   Pear 1007 0.14199098
# 10 Blue_002_Series009 Orange  209 0.02946983
# 11 Blue_002_Series009  Apple  330 0.04653130
# 12 Blue_002_Series009  Peach  446 0.06288776
# 
# $Red_001_Series008
#               Label   Pred   n         n2
# 5 Red_001_Series008   Pear  78 0.04096639
# 6 Red_001_Series008 Orange 902 0.47373950
# 7 Red_001_Series008  Apple  34 0.01785714
# 8 Red_001_Series008  Peach 211 0.11081933

Note, that df and df1 need to be in same order, but that seems to be the case.


Data:

df <- list(Blue_001_Series009 = structure(list(Label = c("Blue_001_Series009", 
"Blue_001_Series009", "Blue_001_Series009", "Blue_001_Series009"
), Pred = c("Pear", "Orange", "Apple", "Peach"), n = c(10, 223, 
890, 34)), row.names = c(NA, 4L), class = "data.frame"), Blue_002_Series009 = structure(list(
    Label = c("Blue_002_Series009", "Blue_002_Series009", "Blue_002_Series009", 
    "Blue_002_Series009"), Pred = c("Pear", "Orange", "Apple", 
    "Peach"), n = c(1007, 209, 330, 446)), row.names = 9:12, class = "data.frame"), 
    Red_001_Series008 = structure(list(Label = c("Red_001_Series008", 
    "Red_001_Series008", "Red_001_Series008", "Red_001_Series008"
    ), Pred = c("Pear", "Orange", "Apple", "Peach"), n = c(78, 
    902, 34, 211)), row.names = 5:8, class = "data.frame"))


df1 <- list(Blue_001_Series009 = structure(list(Label1 = "Blue_001_Series009", 
    TotalArea = 578), row.names = 2L, class = "data.frame"), 
    Blue_002_Series009 = structure(list(Label1 = "Blue_002_Series009", 
        TotalArea = 7092), row.names = 3L, class = "data.frame"), 
    Red_001_Series008 = structure(list(Label1 = "Red_001_Series008", 
        TotalArea = 1904), row.names = 1L, class = "data.frame"))
  • Related