I want to merge the d
and ann
dataframes but only keep the FID
and IID
from d
dataframe and also the Profile
from ann
dataframe. I only want BP
and Unaffected control
profiles. I also want to change the Profile
column name to Phenotype
.
My code:
library(dplyr)
pheno <- d %>% merge(ann %>% filter(Profile=="BP"|Profile=="Unaffected control"), by=c("FID"="Database_ID"))
colnames(pheno)[Profile] <- "Phenotype"
FID | IID | Phenotype |
---|---|---|
AC13 | 1 | BP |
AC14 | 1 | BP |
AC15 | 1 | Unaffected control |
d
dataframe
> dput(d)
structure(list(FID = c("AC10", "AC11", "AC12", "AC13", "AC14",
"AC15", "AC17", "AC18", "AC19", "AC1", "AC20", "AC21", "AC22",
"AC23", "AC24", "AC25", "AC26", "AC27", "AC29", "AC2", "AC30",
"AC31", "AC32", "AC33", "AC34", "AC35", "AC36", "AC37", "AC38",
"AC39", "AC3", "AC40", "AC41", "AC42", "AC43", "AC45", "AC46",
"AC47", "AC48", "AC49", "AC50", "AC51", "AC52", "AC53", "AC54",
"AC55", "AC56", "AC57", "AC58", "AC5", "AC60", "AC61", "AC62",
"AC63", "AC64", "AC65", "AC66", "AC67", "AC69", "AC6", "AC70",
"AC71", "AC72", "AC73", "AC74", "AC75", "AC76", "AC77", "AC78",
"AC79", "AC7", "AC80", "AC81", "AC82", "AC83", "AC84", "AC86",
"AC87", "AC88", "AC89", "AC8", "AC90", "AC91", "AC92", "AC9",
"AC100", "AC101", "AC102", "AC103", "AC104", "AC105", "AC16",
"AC68", "AC93", "AC94", "AC95", "AC96", "AC97", "AC99", "DE10",
"DE12", "DE13", "DE14", "DE15", "DE16", "DE17", "DE18", "DE19",
"DE1", "DE20", "DE21", "DE22", "DE23", "DE25", "DE26", "DE27",
"DE2", "DE33", "DE34", "DE35", "DE36", "DE37", "DE38", "DE39",
"DE3", "DE40", "DE41", "DE42", "DE44", "DE45", "DE46", "DE47",
"DE48", "DE49", "DE4", "DE50", "DE51", "DE52", "DE53", "DE54",
"DE55", "DE56", "DE57", "DE58", "DE59", "DE60", "DE7", "DE9",
"DE29", "DE30", "DE32", "DE43", "DE5"), IID = c(1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L), SOL = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L), C1 = c(-0.00385609, 0.0101138, -0.0146168, -0.0218236, -0.0134745,
-0.017089, 0.0152448, 0.0134359, 0.00540102, -0.0125389, 0.00463956,
-0.00416079, -0.000325898, 0.0132781, 0.0130666, 0.00718399,
-0.0051912, -0.0227934, 0.0364974, -0.0180301, -0.0226556, -0.00585266,
0.0258924, -0.00994298, -0.00380612, 0.0187883, 0.0103367, 0.00747272,
0.0191431, -0.00501846, -0.00118336, 0.0361201, 0.00830498, 0.00380194,
0.00667686, -0.000441697, -0.00170991, -0.0281008, -0.00424591,
0.0213412, 0.00261405, 0.016154, 0.0098956, 0.0141544, 0.0367203,
0.0144693, 0.0256731, -0.00218851, 0.0204603, -0.000603019, -0.00504176,
-0.00917368, 0.00237875, 0.0175946, 0.0188388, 0.0368965, -0.00408476,
0.00871812, -0.00851917, 0.0252035, -0.00915532, 0.0223745, 0.016866,
0.026825, 0.0366276, 0.0540474, 0.0386237, 0.0029996, 0.0207176,
0.0177353, -0.0066377, 0.0343811, 0.0282509, 0.00526683, 0.0459516,
0.00976286, 0.0259005, -0.00104822, -0.012696, 0.0134071, 0.0231658,
0.00359455, 0.0194968, -0.000936478, -0.0029218, -0.0058512,
-0.000837274, -0.0129465, -0.0102079, -0.00559039, 0.0118966,
0.00147658, 0.0120396, -0.0104779, -0.0315149, -0.0115454, -0.0122457,
-6.72242e-05, 0.00370599, -0.0164126, -0.0107853, -0.0271741,
-0.0212005, -0.0445118, -0.0387773, -0.025109, -0.0321735, -0.0398603,
-0.0266408, -0.0260984, -0.0296337, -0.0185381, -0.0403944, 0.0197937,
-0.0176322, -0.013238, -0.0071666, -7.27277e-05, 0.00397489,
0.0335056, -0.00604706, -0.00926438, 0.00706601, -0.0156982,
-0.0275085, -0.00864179, -0.0247967, -0.030564, -0.00767327,
-0.0235161, 0.00649758, -0.0329062, -0.0016138, -0.00701695,
0.00819454, 0.0100377, 0.0250199, -0.0493141, -0.0216641, -0.0244709,
-0.00466616, 0.016751, -0.0191688, -0.00492488, -0.0162364, -0.0167085,
-0.0113427, 0.000422333, 0.030274, 0.0317995, 0.00237194, -0.00693838,
-0.0100835), C2 = c(0.000865365, -0.001752, 0.0189917, -0.023343,
-0.0340531, -0.0258976, -0.00794043, 0.0173163, 0.00639341, -0.0343077,
0.01083, -0.0402179, 0.0158751, -0.00262893, -0.0216757, -0.00261259,
-0.00542089, -0.00515714, 0.0105216, -0.0193606, 0.00692795,
-0.0117295, -0.0235627, -0.00850041, -0.0156109, -0.00871875,
-0.0163218, 0.0227143, -0.0161961, -0.0176719, -0.0070994, 0.0262932,
0.00164033, -0.00969917, -0.0197631, -0.0154387, -0.0194608,
0.00442207, -0.0234804, 0.00822342, -0.00657274, -0.0092332,
0.0130892, -0.0345162, -0.0114187, -0.0129497, -0.00306092, 0.0417858,
0.0262002, -0.0188849, -0.0184154, -0.0109956, -0.0151195, -0.00414531,
0.010064, 0.0308816, -0.0153337, 0.0157867, -0.0289866, -0.0106713,
0.000112714, -0.00152177, 0.0184509, 0.0112357, 0.00097954, 0.032083,
0.0190258, -0.0371498, -0.0307498, -0.00947645, -0.00198995,
0.015845, -0.0240248, -0.0122369, -0.00107049, -0.0144661, 0.0207883,
-0.0418619, -0.0123712, -0.0212721, -0.00667244, -0.028512, -0.00522357,
-0.018842, -0.0123026, -0.00511655, 0.0188473, 0.00739189, 0.0321578,
-0.015449, 0.0214631, -0.00995001, -0.00144645, 0.00934907, 0.0344757,
-0.0220224, 0.0121403, -0.00615057, -0.0208969, 0.0313899, -0.0251011,
0.011635, 0.00536455, 0.0233033, -0.0019204, 0.0273593, 0.00844028,
0.00181444, 0.02824, 0.0255231, 0.00266055, -0.00850383, -0.0129938,
0.0268634, 0.0195986, 0.0320615, -0.0026514, 0.0127147, 0.014279,
0.0553434, -0.020963, 0.00629119, -0.0244099, -0.0080923, 0.0173508,
0.0485753, -0.00666049, 0.0501603, 0.0029162, 0.0267363, 0.0066606,
0.00857736, 0.0172693, -0.00827586, -0.0117478, -0.00336638,
0.00954265, -0.00889617, 0.00290055, 0.0229832, 0.0504569, 0.025979,
-0.00795356, -0.0135421, -0.00359528, 0.0150037, -0.0105817,
0.0167827, 0.0110882, 0.00200862, -0.00597284, -0.0188371, -0.00827599
)), class = "data.frame", row.names = c(NA, -153L))
The Database_ID
and Profile
columns in the ann
dataframe (there are other columns in the ann
dataframe...refer to below)
> dput(ann[c("Database_ID","Profile")])
structure(list(Database_ID = c("AC10", "AC11", "AC12", "AC13",
"AC14", "AC15", "AC17", "AC18", "AC19", "AC1", "AC20", "AC21",
"AC22", "AC23", "AC24", "AC25", "AC26", "AC27", "AC29", "AC2",
"AC30", "AC31", "AC32", "AC33", "AC34", "AC35", "AC36", "AC37",
"AC38", "AC39", "AC3", "AC40", "AC41", "AC42", "AC43", "AC45",
"AC46", "AC47", "AC48", "AC49", "AC50", "AC51", "AC52", "AC53",
"AC54", "AC55", "AC56", "AC57", "AC58", "AC5", "AC60", "AC61",
"AC62", "AC63", "AC64", "AC65", "AC66", "AC67", "AC69", "AC6",
"AC70", "AC71", "AC72", "AC73", "AC74", "AC75", "AC76", "AC77",
"AC78", "AC79", "AC7", "AC80", "AC81", "AC82", "AC83", "AC84",
"AC86", "AC87", "AC88", "AC89", "AC8", "AC90", "AC91", "AC92",
"AC9", "AC100", "AC101", "AC102", "AC103", "AC104", "AC105",
"AC16", "AC68", "AC93", "AC94", "AC95", "AC96", "AC97", "AC99",
"DE10", "DE12", "DE13", "DE14", "DE15", "DE16", "DE17", "DE18",
"DE19", "DE1", "DE20", "DE21", "DE22", "DE23", "DE25", "DE26",
"DE27", "DE2", "DE33", "DE34", "DE35", "DE36", "DE37", "DE38",
"DE39", "DE3", "DE40", "DE41", "DE42", "DE44", "DE45", "DE46",
"DE47", "DE48", "DE49", "DE4", "DE50", "DE51", "DE52", "DE53",
"DE54", "DE55", "DE56", "DE57", "DE58", "DE59", "DE60", "DE7",
"DE9", "DE29", "DE30", "DE32", "DE43", "DE5"), Profile = c("Schiz.",
"Schiz.", "Schiz.", "BP", "BP", "Unaffected control", "Schiz.",
"BP", "Unaffected control", "BP", "BP", "BP", "Schiz.", "BP",
"Unaffected control", "Schiz.", "Schiz.", "Unaffected control",
"Unaffected control", "BP", "Unaffected control", "Schiz.", "BP",
"Unaffected control", "BP", "Unaffected control", "BP", "Schiz.",
"Unaffected control", "Schiz.", "Schiz.", "Schiz.", "Schiz.",
"BP", "Unaffected control", "Schiz.", "BP", "Schiz.", "BP", "Unaffected control",
"BP", "Unaffected control", "Unaffected control", "Unaffected control",
"Unaffected control", "Schiz.", "Unaffected control", "BP", "BP",
"BP", "Unaffected control", "BP", "BP", "BP", "BP", "Unaffected control",
"Schiz.", "Unaffected control", "BP", "BP", "Unaffected control",
"Unaffected control", "BP", "Schiz.", "BP", "Schiz.", "BP", "Unaffected control",
"Schiz.", "Unaffected control", "Schiz.", "Unaffected control",
"Schiz.", "Schiz.", "Unaffected control", "Unaffected control",
"Unaffected control", "Schiz.", "Schiz.", "BP", "BP", "Unaffected control",
"Unaffected control", "Schiz.", "Schiz.", "Schiz.", "Schiz.",
"BP", "Unaffected control", "BP", "Unaffected control", "BP",
"Schiz.", "Schiz.", "Schiz.", "Unaffected control", "Unaffected control",
"Schiz.", "Unaffected control", "Unaffected control", "Unaffected control",
"BP", "BP", "Dep.", "Unaffected control", "Unaffected control",
"Dep.", "Dep.", "Dep.", "Dep.", "Dep.", "Unaffected control",
"Unaffected control", "Schiz.", "Dep.", "BP", "Dep.", "Schiz.",
"Schiz.", "Schiz.", "BP", "Unaffected control", "Unaffected control",
"Unaffected control", "BP", "BP", "Dep.", "Schiz.", "Dep.", "BP",
"Unaffected control", "Unaffected control", "Schiz.", "Schiz.",
"Unaffected control", "Unaffected control", "BP", "BP", "Schiz.",
"Dep.", "BP", "Dep.", "BP", "Schiz.", "Unaffected control", "Dep.",
"BP", "Schiz.", "Dep.", "Dep.", "BP", "BP", "Schiz.")), row.names = c("1",
"2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13",
"14", "15", "16", "17", "18", "19", "20", "21", "22", "23", "24",
"25", "26", "27", "28", "29", "30", "31", "32", "33", "34", "35",
"36", "37", "38", "39", "40", "41", "42", "43", "44", "45", "46",
"47", "48", "49", "50", "51", "52", "53", "54", "55", "56", "57",
"58", "59", "60", "61", "62", "63", "64", "65", "66", "67", "68",
"69", "70", "71", "72", "73", "74", "75", "76", "77", "78", "79",
"80", "81", "82", "83", "84", "85", "86", "87", "88", "89", "90",
"91", "92", "93", "94", "95", "96", "97", "98", "99", "100",
"101", "102", "103", "104", "105", "106", "107", "108", "109",
"110", "111", "112", "113", "114", "115", "116", "117", "118",
"119", "120", "121", "122", "123", "124", "125", "126", "127",
"128", "129", "130", "131", "132", "133", "134", "135", "136",
"137", "138", "139", "140", "141", "142", "143", "144", "145",
"146", "147", "148", "149", "150", "151", "152", "153"), class = "data.frame")
There are other columns in ann
, which I don't need.
> dput(head(ann))
structure(list(Database_ID = c("AC10", "AC11", "AC12", "AC13",
"AC14", "AC15"), Collection_type = c("Array", "Array", "Array",
"Array", "Array", "Array"), Age = c(40L, 51L, 19L, 49L, 48L,
44L), Round_Age = c(40L, 50L, 20L, 50L, 50L, 45L), sex = c("M",
"M", "M", "F", "F", "F"), race = c("White", "White", "White",
"White", "White", "White"), Profile = c("Schiz.", "Schiz.", "Schiz.",
"BP", "BP", "Unaffected control"), DSM_IV = c(295.9, 295.9, 295.9,
296.44, 296.7, NA), Axis_I_Primary_Dx = c("SCHIZ UNDIFF", "SCHIZ UNDIFF",
"SCHIZ UNDIFF", "BP I", "BP I", "CONTROL"), PMI = c(34L, 43L,
28L, 19L, 18L, 28L), Brain_PH = c(6.18, 6.63, 6.73, 5.87, 6.5,
6.59), Left_Brain = c("Fixed", "Frozen", "Frozen", "Fixed", "Frozen",
"Fixed"), Suicide_Status = c("No", "No", "No", "Yes", "No", "No"
), Psychotic_Feature = c("Yes", "Yes", "Yes", "Yes", "Unknown",
"No"), Rate_Of_Death = c("Possible anoxia at death (e.g., CO poisoning or pneumonia)",
"Sudden", "Sudden", "Sudden", "Sudden", "Sudden"), Exacerbation = c("No evidence of exacerbation",
"No evidence of exacerbation", "No evidence of exacerbation",
"Definite", "No evidence of exacerbation", "Unknown"), Smoking_At_TOD = c("Yes",
"Yes", "No", "Yes", "Yes", "Unknown"), Lifetime_Alcohol_Use = c("Moderate drinking in present (same as above, for recent period)",
"Moderate drinking in present (same as above, for recent period)",
"Social (1?2 drinks/day)", "Little or none (less than 1 drink/day)",
"Social (1?2 drinks/day)", "Moderate drinking in present (same as above, for recent period)"
), Lifetime_Drug_Use = c("Moderate drug use in present", "Social",
"Heavy drug use in present", "Little or none", "Moderate drug use in present",
"Little or none"), Lifetime_Antipsychotics = c(75000L, 130000L,
2500L, 4000L, 0L, 0L)), row.names = c("1", "2", "3", "4", "5",
"6"), class = "data.frame")
CodePudding user response:
Define a vector of wanted profiles, select from data set d
the columns to be in the final result, and inner_join
with the filtered data set ann
.
suppressPackageStartupMessages(library(dplyr))
wanted_profiles <- c("BP", "Unaffected control")
pheno <- d %>%
select(FID, IID) %>%
inner_join(ann %>% filter(Profile %in% wanted_profiles), by=c("FID"="Database_ID")) %>%
rename(Phenotype = Profile)
head(pheno)
#> FID IID Phenotype
#> 1 AC13 1 BP
#> 2 AC14 1 BP
#> 3 AC15 1 Unaffected control
#> 4 AC18 1 BP
#> 5 AC19 1 Unaffected control
#> 6 AC1 1 BP
Created on 2022-03-19 by the reprex package (v2.0.1)
Edit
To keep only ann
columns Database_ID
and Profile
, run
d %>%
select(FID, IID) %>%
inner_join(ann %>%
select(Database_ID, Profile) %>%
filter(Profile %in% wanted_profiles),
by=c("FID"="Database_ID")) %>%
rename(Phenotype = Profile)
CodePudding user response:
The solution is to select FID
and IID
columns from the d
dataframe and join it with Database_ID
and Profile
columns from the ann
dataframe by the FID
and Database_ID
columns in d
and ann
, respectively. Then, I filter based on the BP
and Unaffected control
variables in the Profile
column of the ann
dataframe.
> pheno <- d %>% select(FID, IID) %>% inner_join(ann %>% select("Database_ID", "Profile") %>% filter(Profile %in% c("BP"
, "Unaffected control")), by=c("FID"="Database_ID")) %>% rename(Phenotype=Profile)
Output
> head(pheno)
FID IID Phenotype
1 AC13 1 BP
2 AC14 1 BP
3 AC15 1 Unaffected control
4 AC18 1 BP
5 AC19 1 Unaffected control
6 AC1 1 BP