Home > other >  Add a column to one data frame based on multiple values in another by index number
Add a column to one data frame based on multiple values in another by index number

Time:04-23

I have two very large data frames. The first data frame has a list of county names and their associated fip codes. The second data set only has their fip codes.

I want to add two columns to the second data frame with their associated county names.

Suppose this is df1

df1 = data.frame(countyname = c("Archuleta County, CO","Baca County, CO","Cheyenne County, CO","Kiowa County, CO","Cimarron County, OK","Rio Arriba County, NM","Conejos County, CO"),
                 fipscounty = c(8007,8009,8017,8061,35039,40025,8021))

countyname fipscounty
1  Archuleta County, CO       8007
2       Baca County, CO       8009
3   Cheyenne County, CO       8017
4      Kiowa County, CO       8061
5   Cimarron County, OK      35039
6 Rio Arriba County, NM      40025
7    Conejos County, CO       8021

EDIT: And this is df2

df2 = data.frame(county1=c(8007,8007,8009,8017),
                 distance=c(4,3,2,1),
                 county2=c(35039,8021,40025,8061))
  county1 distance county2
1    8007        4   35039
2    8007        3    8021
3    8009        2   40025
4    8017        1    8061

EDIT: I want the end result to look like this:

            countyname fipscounty distance        countyneighbor fipscounty2
1 Archuleta County, CO       8007        4   Cimarron County, OK       35039
2 Archuleta County, CO       8007        3    Conejos County, CO        8021
3      Baca County, CO       8009        2 Rio Arriba County, NM       40025
4  Cheyenne County, CO       8017        1      Kiowa County, CO        8061

I want to transfer the county names from df1 to df2 using the fips codes from df1 and df2. Since they don't have the same column names, I'll likely have to do this using index number. But, I do not want to transfer the entire row, otherwise I'd have duplicate fips columns.

I tried this, but of course it errored

df2 <- left_join(df1,df2, by= df1[2])

How would I do this?

CodePudding user response:

Using match.

m <- match(df2$county1, df1$fipscounty)
res <- cbind(df1[m, ], df1[match(df2$county2, df1$fipscounty), ])
names(res)[c(2, 4)] <- names(df2)[c(1, 3)]
res
#               countyname county1            countyname county2
# 1   Archuleta County, CO    8007   Cimarron County, OK   35039
# 1.1 Archuleta County, CO    8007    Conejos County, CO    8021
# 2        Baca County, CO    8009 Rio Arriba County, NM   40025
# 3    Cheyenne County, CO    8017      Kiowa County, CO    8061

Edit

Based on your edit, you could include merge and append as tools.

m1 <- merge(df1a, df2a, by.x='fipscounty', by.y='county1')[c(2, 1, 3:4)]
append(m1, 
       list(countyneighbor=df1a[match(m1$county2, df1a$fipscounty), 
                                'countyname']), 3) |>
  as.data.frame()

#             countyname fipscounty distance        countyneighbor county2
# 1 Archuleta County, CO       8007        4   Cimarron County, OK   35039
# 2 Archuleta County, CO       8007        3    Conejos County, CO    8021
# 3      Baca County, CO       8009        2 Rio Arriba County, NM   40025
# 4  Cheyenne County, CO       8017        1      Kiowa County, CO    8061

Note: R >= 4.1 used.


Data:

df1 <- structure(list(countyname = c("Archuleta County, CO", "Baca County, CO", 
"Cheyenne County, CO", "Kiowa County, CO", "Cimarron County, OK", 
"Rio Arriba County, NM", "Conejos County, CO"), fipscounty = c(8007, 
8009, 8017, 8061, 35039, 40025, 8021)), class = "data.frame", row.names = c(NA, 
-7L))

df2 <- structure(list(county1 = c(8007, 8007, 8009, 8017), county2 = c(35039, 
8021, 40025, 8061)), class = "data.frame", row.names = c(NA, 
-4L))

df1a <- structure(list(countyname = c("Archuleta County, CO", "Baca County, CO", 
"Cheyenne County, CO", "Kiowa County, CO", "Cimarron County, OK", 
"Rio Arriba County, NM", "Conejos County, CO"), fipscounty = c(8007, 
8009, 8017, 8061, 35039, 40025, 8021)), class = "data.frame", row.names = c(NA, 
-7L))

df2a <- structure(list(county1 = c(8007, 8007, 8009, 8017), distance = c(4, 
3, 2, 1), county2 = c(35039, 8021, 40025, 8061)), class = "data.frame", row.names = c(NA, 
-4L))
  • Related