Home > Mobile >  How do I merge and filter dataframe based on multiple conditions?
How do I merge and filter dataframe based on multiple conditions?

Time:03-19

I want to merge the d and ann dataframes but only keep the FID and IID from d dataframe and also the Profile from ann dataframe. I only want BP and Unaffected control profiles. I also want to change the Profile column name to Phenotype.

My code:

library(dplyr)
pheno <- d %>% merge(ann %>% filter(Profile=="BP"|Profile=="Unaffected control"), by=c("FID"="Database_ID"))
colnames(pheno)[Profile] <- "Phenotype"
FID IID Phenotype
AC13 1 BP
AC14 1 BP
AC15 1 Unaffected control

d dataframe

> dput(d)
structure(list(FID = c("AC10", "AC11", "AC12", "AC13", "AC14", 
"AC15", "AC17", "AC18", "AC19", "AC1", "AC20", "AC21", "AC22", 
"AC23", "AC24", "AC25", "AC26", "AC27", "AC29", "AC2", "AC30", 
"AC31", "AC32", "AC33", "AC34", "AC35", "AC36", "AC37", "AC38", 
"AC39", "AC3", "AC40", "AC41", "AC42", "AC43", "AC45", "AC46", 
"AC47", "AC48", "AC49", "AC50", "AC51", "AC52", "AC53", "AC54", 
"AC55", "AC56", "AC57", "AC58", "AC5", "AC60", "AC61", "AC62", 
"AC63", "AC64", "AC65", "AC66", "AC67", "AC69", "AC6", "AC70", 
"AC71", "AC72", "AC73", "AC74", "AC75", "AC76", "AC77", "AC78", 
"AC79", "AC7", "AC80", "AC81", "AC82", "AC83", "AC84", "AC86", 
"AC87", "AC88", "AC89", "AC8", "AC90", "AC91", "AC92", "AC9", 
"AC100", "AC101", "AC102", "AC103", "AC104", "AC105", "AC16", 
"AC68", "AC93", "AC94", "AC95", "AC96", "AC97", "AC99", "DE10", 
"DE12", "DE13", "DE14", "DE15", "DE16", "DE17", "DE18", "DE19", 
"DE1", "DE20", "DE21", "DE22", "DE23", "DE25", "DE26", "DE27", 
"DE2", "DE33", "DE34", "DE35", "DE36", "DE37", "DE38", "DE39", 
"DE3", "DE40", "DE41", "DE42", "DE44", "DE45", "DE46", "DE47", 
"DE48", "DE49", "DE4", "DE50", "DE51", "DE52", "DE53", "DE54", 
"DE55", "DE56", "DE57", "DE58", "DE59", "DE60", "DE7", "DE9", 
"DE29", "DE30", "DE32", "DE43", "DE5"), IID = c(1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L), SOL = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L), C1 = c(-0.00385609, 0.0101138, -0.0146168, -0.0218236, -0.0134745, 
-0.017089, 0.0152448, 0.0134359, 0.00540102, -0.0125389, 0.00463956, 
-0.00416079, -0.000325898, 0.0132781, 0.0130666, 0.00718399, 
-0.0051912, -0.0227934, 0.0364974, -0.0180301, -0.0226556, -0.00585266, 
0.0258924, -0.00994298, -0.00380612, 0.0187883, 0.0103367, 0.00747272, 
0.0191431, -0.00501846, -0.00118336, 0.0361201, 0.00830498, 0.00380194, 
0.00667686, -0.000441697, -0.00170991, -0.0281008, -0.00424591, 
0.0213412, 0.00261405, 0.016154, 0.0098956, 0.0141544, 0.0367203, 
0.0144693, 0.0256731, -0.00218851, 0.0204603, -0.000603019, -0.00504176, 
-0.00917368, 0.00237875, 0.0175946, 0.0188388, 0.0368965, -0.00408476, 
0.00871812, -0.00851917, 0.0252035, -0.00915532, 0.0223745, 0.016866, 
0.026825, 0.0366276, 0.0540474, 0.0386237, 0.0029996, 0.0207176, 
0.0177353, -0.0066377, 0.0343811, 0.0282509, 0.00526683, 0.0459516, 
0.00976286, 0.0259005, -0.00104822, -0.012696, 0.0134071, 0.0231658, 
0.00359455, 0.0194968, -0.000936478, -0.0029218, -0.0058512, 
-0.000837274, -0.0129465, -0.0102079, -0.00559039, 0.0118966, 
0.00147658, 0.0120396, -0.0104779, -0.0315149, -0.0115454, -0.0122457, 
-6.72242e-05, 0.00370599, -0.0164126, -0.0107853, -0.0271741, 
-0.0212005, -0.0445118, -0.0387773, -0.025109, -0.0321735, -0.0398603, 
-0.0266408, -0.0260984, -0.0296337, -0.0185381, -0.0403944, 0.0197937, 
-0.0176322, -0.013238, -0.0071666, -7.27277e-05, 0.00397489, 
0.0335056, -0.00604706, -0.00926438, 0.00706601, -0.0156982, 
-0.0275085, -0.00864179, -0.0247967, -0.030564, -0.00767327, 
-0.0235161, 0.00649758, -0.0329062, -0.0016138, -0.00701695, 
0.00819454, 0.0100377, 0.0250199, -0.0493141, -0.0216641, -0.0244709, 
-0.00466616, 0.016751, -0.0191688, -0.00492488, -0.0162364, -0.0167085, 
-0.0113427, 0.000422333, 0.030274, 0.0317995, 0.00237194, -0.00693838, 
-0.0100835), C2 = c(0.000865365, -0.001752, 0.0189917, -0.023343, 
-0.0340531, -0.0258976, -0.00794043, 0.0173163, 0.00639341, -0.0343077, 
0.01083, -0.0402179, 0.0158751, -0.00262893, -0.0216757, -0.00261259, 
-0.00542089, -0.00515714, 0.0105216, -0.0193606, 0.00692795, 
-0.0117295, -0.0235627, -0.00850041, -0.0156109, -0.00871875, 
-0.0163218, 0.0227143, -0.0161961, -0.0176719, -0.0070994, 0.0262932, 
0.00164033, -0.00969917, -0.0197631, -0.0154387, -0.0194608, 
0.00442207, -0.0234804, 0.00822342, -0.00657274, -0.0092332, 
0.0130892, -0.0345162, -0.0114187, -0.0129497, -0.00306092, 0.0417858, 
0.0262002, -0.0188849, -0.0184154, -0.0109956, -0.0151195, -0.00414531, 
0.010064, 0.0308816, -0.0153337, 0.0157867, -0.0289866, -0.0106713, 
0.000112714, -0.00152177, 0.0184509, 0.0112357, 0.00097954, 0.032083, 
0.0190258, -0.0371498, -0.0307498, -0.00947645, -0.00198995, 
0.015845, -0.0240248, -0.0122369, -0.00107049, -0.0144661, 0.0207883, 
-0.0418619, -0.0123712, -0.0212721, -0.00667244, -0.028512, -0.00522357, 
-0.018842, -0.0123026, -0.00511655, 0.0188473, 0.00739189, 0.0321578, 
-0.015449, 0.0214631, -0.00995001, -0.00144645, 0.00934907, 0.0344757, 
-0.0220224, 0.0121403, -0.00615057, -0.0208969, 0.0313899, -0.0251011, 
0.011635, 0.00536455, 0.0233033, -0.0019204, 0.0273593, 0.00844028, 
0.00181444, 0.02824, 0.0255231, 0.00266055, -0.00850383, -0.0129938, 
0.0268634, 0.0195986, 0.0320615, -0.0026514, 0.0127147, 0.014279, 
0.0553434, -0.020963, 0.00629119, -0.0244099, -0.0080923, 0.0173508, 
0.0485753, -0.00666049, 0.0501603, 0.0029162, 0.0267363, 0.0066606, 
0.00857736, 0.0172693, -0.00827586, -0.0117478, -0.00336638, 
0.00954265, -0.00889617, 0.00290055, 0.0229832, 0.0504569, 0.025979, 
-0.00795356, -0.0135421, -0.00359528, 0.0150037, -0.0105817, 
0.0167827, 0.0110882, 0.00200862, -0.00597284, -0.0188371, -0.00827599
)), class = "data.frame", row.names = c(NA, -153L))

The Database_ID and Profile columns in the ann dataframe (there are other columns in the ann dataframe...refer to below)

> dput(ann[c("Database_ID","Profile")])
structure(list(Database_ID = c("AC10", "AC11", "AC12", "AC13",
"AC14", "AC15", "AC17", "AC18", "AC19", "AC1", "AC20", "AC21",
"AC22", "AC23", "AC24", "AC25", "AC26", "AC27", "AC29", "AC2",
"AC30", "AC31", "AC32", "AC33", "AC34", "AC35", "AC36", "AC37",
"AC38", "AC39", "AC3", "AC40", "AC41", "AC42", "AC43", "AC45",
"AC46", "AC47", "AC48", "AC49", "AC50", "AC51", "AC52", "AC53",
"AC54", "AC55", "AC56", "AC57", "AC58", "AC5", "AC60", "AC61",
"AC62", "AC63", "AC64", "AC65", "AC66", "AC67", "AC69", "AC6",
"AC70", "AC71", "AC72", "AC73", "AC74", "AC75", "AC76", "AC77",
"AC78", "AC79", "AC7", "AC80", "AC81", "AC82", "AC83", "AC84",
"AC86", "AC87", "AC88", "AC89", "AC8", "AC90", "AC91", "AC92",
"AC9", "AC100", "AC101", "AC102", "AC103", "AC104", "AC105",
"AC16", "AC68", "AC93", "AC94", "AC95", "AC96", "AC97", "AC99",
"DE10", "DE12", "DE13", "DE14", "DE15", "DE16", "DE17", "DE18",
"DE19", "DE1", "DE20", "DE21", "DE22", "DE23", "DE25", "DE26",
"DE27", "DE2", "DE33", "DE34", "DE35", "DE36", "DE37", "DE38",
"DE39", "DE3", "DE40", "DE41", "DE42", "DE44", "DE45", "DE46",
"DE47", "DE48", "DE49", "DE4", "DE50", "DE51", "DE52", "DE53",
"DE54", "DE55", "DE56", "DE57", "DE58", "DE59", "DE60", "DE7",
"DE9", "DE29", "DE30", "DE32", "DE43", "DE5"), Profile = c("Schiz.",
"Schiz.", "Schiz.", "BP", "BP", "Unaffected control", "Schiz.",
"BP", "Unaffected control", "BP", "BP", "BP", "Schiz.", "BP",
"Unaffected control", "Schiz.", "Schiz.", "Unaffected control",
"Unaffected control", "BP", "Unaffected control", "Schiz.", "BP",
"Unaffected control", "BP", "Unaffected control", "BP", "Schiz.",
"Unaffected control", "Schiz.", "Schiz.", "Schiz.", "Schiz.",
"BP", "Unaffected control", "Schiz.", "BP", "Schiz.", "BP", "Unaffected control",
"BP", "Unaffected control", "Unaffected control", "Unaffected control",
"Unaffected control", "Schiz.", "Unaffected control", "BP", "BP",
"BP", "Unaffected control", "BP", "BP", "BP", "BP", "Unaffected control",
"Schiz.", "Unaffected control", "BP", "BP", "Unaffected control",
"Unaffected control", "BP", "Schiz.", "BP", "Schiz.", "BP", "Unaffected control",
"Schiz.", "Unaffected control", "Schiz.", "Unaffected control",
"Schiz.", "Schiz.", "Unaffected control", "Unaffected control",
"Unaffected control", "Schiz.", "Schiz.", "BP", "BP", "Unaffected control",
"Unaffected control", "Schiz.", "Schiz.", "Schiz.", "Schiz.",
"BP", "Unaffected control", "BP", "Unaffected control", "BP",
"Schiz.", "Schiz.", "Schiz.", "Unaffected control", "Unaffected control",
"Schiz.", "Unaffected control", "Unaffected control", "Unaffected control",
"BP", "BP", "Dep.", "Unaffected control", "Unaffected control",
"Dep.", "Dep.", "Dep.", "Dep.", "Dep.", "Unaffected control",
"Unaffected control", "Schiz.", "Dep.", "BP", "Dep.", "Schiz.",
"Schiz.", "Schiz.", "BP", "Unaffected control", "Unaffected control",
"Unaffected control", "BP", "BP", "Dep.", "Schiz.", "Dep.", "BP",
"Unaffected control", "Unaffected control", "Schiz.", "Schiz.",
"Unaffected control", "Unaffected control", "BP", "BP", "Schiz.",
"Dep.", "BP", "Dep.", "BP", "Schiz.", "Unaffected control", "Dep.",
"BP", "Schiz.", "Dep.", "Dep.", "BP", "BP", "Schiz.")), row.names = c("1",
"2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13",
"14", "15", "16", "17", "18", "19", "20", "21", "22", "23", "24",
"25", "26", "27", "28", "29", "30", "31", "32", "33", "34", "35",
"36", "37", "38", "39", "40", "41", "42", "43", "44", "45", "46",
"47", "48", "49", "50", "51", "52", "53", "54", "55", "56", "57",
"58", "59", "60", "61", "62", "63", "64", "65", "66", "67", "68",
"69", "70", "71", "72", "73", "74", "75", "76", "77", "78", "79",
"80", "81", "82", "83", "84", "85", "86", "87", "88", "89", "90",
"91", "92", "93", "94", "95", "96", "97", "98", "99", "100",
"101", "102", "103", "104", "105", "106", "107", "108", "109",
"110", "111", "112", "113", "114", "115", "116", "117", "118",
"119", "120", "121", "122", "123", "124", "125", "126", "127",
"128", "129", "130", "131", "132", "133", "134", "135", "136",
"137", "138", "139", "140", "141", "142", "143", "144", "145",
"146", "147", "148", "149", "150", "151", "152", "153"), class = "data.frame")

There are other columns in ann, which I don't need.

> dput(head(ann))
structure(list(Database_ID = c("AC10", "AC11", "AC12", "AC13",
"AC14", "AC15"), Collection_type = c("Array", "Array", "Array",
"Array", "Array", "Array"), Age = c(40L, 51L, 19L, 49L, 48L,
44L), Round_Age = c(40L, 50L, 20L, 50L, 50L, 45L), sex = c("M",
"M", "M", "F", "F", "F"), race = c("White", "White", "White",
"White", "White", "White"), Profile = c("Schiz.", "Schiz.", "Schiz.",
"BP", "BP", "Unaffected control"), DSM_IV = c(295.9, 295.9, 295.9,
296.44, 296.7, NA), Axis_I_Primary_Dx = c("SCHIZ UNDIFF", "SCHIZ UNDIFF",
"SCHIZ UNDIFF", "BP I", "BP I", "CONTROL"), PMI = c(34L, 43L,
28L, 19L, 18L, 28L), Brain_PH = c(6.18, 6.63, 6.73, 5.87, 6.5,
6.59), Left_Brain = c("Fixed", "Frozen", "Frozen", "Fixed", "Frozen",
"Fixed"), Suicide_Status = c("No", "No", "No", "Yes", "No", "No"
), Psychotic_Feature = c("Yes", "Yes", "Yes", "Yes", "Unknown",
"No"), Rate_Of_Death = c("Possible anoxia at death (e.g., CO poisoning or pneumonia)",
"Sudden", "Sudden", "Sudden", "Sudden", "Sudden"), Exacerbation = c("No evidence of exacerbation",
"No evidence of exacerbation", "No evidence of exacerbation",
"Definite", "No evidence of exacerbation", "Unknown"), Smoking_At_TOD = c("Yes",
"Yes", "No", "Yes", "Yes", "Unknown"), Lifetime_Alcohol_Use = c("Moderate drinking in present (same as above, for recent period)",
"Moderate drinking in present (same as above, for recent period)",
"Social (1?2 drinks/day)", "Little or none (less than 1 drink/day)",
"Social (1?2 drinks/day)", "Moderate drinking in present (same as above, for recent period)"
), Lifetime_Drug_Use = c("Moderate drug use in present", "Social",
"Heavy drug use in present", "Little or none", "Moderate drug use in present",
"Little or none"), Lifetime_Antipsychotics = c(75000L, 130000L,
2500L, 4000L, 0L, 0L)), row.names = c("1", "2", "3", "4", "5",
"6"), class = "data.frame")

CodePudding user response:

Define a vector of wanted profiles, select from data set d the columns to be in the final result, and inner_join with the filtered data set ann.

suppressPackageStartupMessages(library(dplyr))

wanted_profiles <- c("BP", "Unaffected control")

pheno <- d %>%
  select(FID, IID) %>%
  inner_join(ann %>% filter(Profile %in% wanted_profiles), by=c("FID"="Database_ID")) %>%
  rename(Phenotype = Profile)

head(pheno)
#>    FID IID          Phenotype
#> 1 AC13   1                 BP
#> 2 AC14   1                 BP
#> 3 AC15   1 Unaffected control
#> 4 AC18   1                 BP
#> 5 AC19   1 Unaffected control
#> 6  AC1   1                 BP

Created on 2022-03-19 by the reprex package (v2.0.1)


Edit

To keep only ann columns Database_ID and Profile, run

d %>%
  select(FID, IID) %>%
  inner_join(ann %>% 
               select(Database_ID, Profile) %>%
               filter(Profile %in% wanted_profiles), 
             by=c("FID"="Database_ID")) %>%
  rename(Phenotype = Profile)

CodePudding user response:

The solution is to select FID and IID columns from the d dataframe and join it with Database_ID and Profile columns from the ann dataframe by the FID and Database_ID columns in d and ann, respectively. Then, I filter based on the BP and Unaffected control variables in the Profile column of the ann dataframe.

> pheno <- d %>% select(FID, IID) %>% inner_join(ann %>% select("Database_ID", "Profile") %>% filter(Profile %in% c("BP"
, "Unaffected control")), by=c("FID"="Database_ID")) %>% rename(Phenotype=Profile)

Output

> head(pheno)
   FID IID          Phenotype
1 AC13   1                 BP
2 AC14   1                 BP
3 AC15   1 Unaffected control
4 AC18   1                 BP
5 AC19   1 Unaffected control
6  AC1   1                 BP
  • Related