I want to add a Indication
column in dat
dataframe with the variables that correspond to the Profile
column of the ann
dataframe. The Indication
column will have either 0/1
or 1/2
binary categorical variables for Unaffected control
or BP
variables, respectively.
library(dplyr)
phenocov <- dat %>% inner_join(ann %>% select("Database_ID", "Profile") %>% dummycov$PHENOTYPE==encode_binary(ann$Profile), by=c("FID"="Database_ID"))
Expected output
FID | IID | SOL | C1 | C2 | Indication |
---|---|---|---|---|---|
AC13 | 1 | 0 | -0.02851720 | 0.00450319 | 1/2 |
AC14 | 1 | 0 | -0.04220610 | 0.00394058 | 1/2 |
AC15 | 1 | 0 | -0.01351050 | -0.03165270 | 0/1 |
dat
dataframe
> dput(dat)
structure(list(FID = c("AC13", "AC14", "AC15", "AC18", "AC19",
"AC1", "AC20", "AC21", "AC23", "AC24", "AC27", "AC29", "AC2",
"AC30", "AC32", "AC33", "AC34", "AC35", "AC36", "AC38", "AC42",
"AC43", "AC46", "AC48", "AC49", "AC50", "AC51", "AC52", "AC53",
"AC54", "AC56", "AC57", "AC58", "AC5", "AC60", "AC61", "AC62",
"AC63", "AC64", "AC65", "AC67", "AC69", "AC6", "AC70", "AC71",
"AC72", "AC74", "AC76", "AC77", "AC79", "AC80", "AC83", "AC84",
"AC86", "AC89", "AC8", "AC90", "AC91", "AC102", "AC103", "AC104",
"AC105", "AC16", "AC95", "AC96", "AC99", "DE10", "DE12", "DE13",
"DE14", "DE16", "DE17", "DE22", "DE23", "DE27", "DE36", "DE37",
"DE38", "DE39", "DE3", "DE40", "DE45", "DE46", "DE47", "DE4",
"DE50", "DE51", "DE52", "DE55", "DE57", "DE59", "DE7", "DE32",
"DE43"), IID = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L), SOL = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L), C1 = c(-0.0285172, -0.0422061, -0.0135105, 0.0335788,
0.00453814, -0.0196805, 0.00871406, -0.0359998, 0.00776039, -0.0152431,
-0.0157209, 0.0235421, -0.0292564, 0.000164687, -0.0104566, -0.0078923,
-0.011623, 0.0173727, 0.00327281, -0.0101589, -0.00564814, 0.0232102,
0.00209183, -0.0245178, 0.0295213, -0.0012923, 0.00775297, 0.019803,
0.00521905, 0.0143496, 0.0243968, 0.0403667, 0.0420627, -0.0234294,
-0.00923894, 0.00216892, -0.0174861, 0.000902948, 0.0250136,
0.131506, 0.036772, -0.0314508, 0.0014534, 0.0123288, 0.0152248,
0.030759, 0.029896, 0.0257204, -0.0132957, -0.00942661, 0.0286468,
0.0297961, -0.0225471, 0.0185018, -0.0103523, -0.00141674, -0.00553813,
0.0141762, 0.0254432, -0.0124213, -0.0220576, 0.0278842, 0.00114805,
-0.013759, 0.00493565, -0.0146888, -0.00463272, -0.0332577, -0.0200095,
0.00230828, -0.0232427, 0.00433733, -0.0244282, -0.0245909, 0.0155191,
0.00425367, 0.00868522, -0.0347233, 0.00191488, -0.0154211, 0.0154046,
-0.00126067, 0.00826277, -0.0124104, -0.0136281, -0.023651, 0.00853699,
-0.0373163, 0.0147421, -0.0425991, -0.0198727, -0.0197213, -0.0115728,
-0.0127898), C2 = c(0.00450319, 0.00394058, -0.0316527, 0.00310475,
0.016075, 0.0108606, 0.0195024, -0.0109932, 0.02644, 0.00177053,
-0.0180791, -0.0128455, -0.0353571, 0.00868615, 0.0278649, -0.0143205,
0.0198699, 0.00600335, 0.00636933, -0.0245385, 0.0247255, -0.0019047,
0.00418658, -0.0106317, 0.0329249, -0.0490193, -0.0322256, -0.00525161,
0.0127281, -0.0210357, 0.048556, -0.0157994, 0.0225328, 0.0420733,
-0.0722942, 0.0117474, -0.00108231, -0.053874, -0.0351118, 0.000781904,
-0.0188162, -0.0214653, 0.0150199, 0.0339645, -0.0335628, -0.0151206,
0.0163763, -0.015727, -0.0232298, 0.0172519, 0.0348876, -0.026288,
0.0383726, -0.018123, 0.0200251, -0.0246757, 0.0184051, 0.0249351,
-0.00324928, 0.0129067, -0.0143993, -0.00150337, -0.00089652,
-0.0477761, 0.00873251, -0.0184572, 0.00115896, 0.0252723, -0.0188119,
0.0403222, -0.00957213, -0.0280059, 0.0183744, -0.025548, 0.021987,
0.0268481, -0.0267149, 0.00712551, -0.0115199, 0.00559716, 0.00779719,
-0.0181187, -0.0182654, -0.0263084, 0.00126466, 0.0397802, 0.0457278,
0.0098323, -0.00865206, 0.0248558, -0.0163334, 0.00451314, 0.0337946,
0.0267819)), class = "data.frame", row.names = c(NA, -94L))
The Database_ID
and Profile
columns in the ann
dataframe (there are other columns in the ann
dataframe...refer to below)
> dput(ann[c("Database_ID","Profile")])
structure(list(Database_ID = c("AC10", "AC11", "AC12", "AC13",
"AC14", "AC15", "AC17", "AC18", "AC19", "AC1", "AC20", "AC21",
"AC22", "AC23", "AC24", "AC25", "AC26", "AC27", "AC29", "AC2",
"AC30", "AC31", "AC32", "AC33", "AC34", "AC35", "AC36", "AC37",
"AC38", "AC39", "AC3", "AC40", "AC41", "AC42", "AC43", "AC45",
"AC46", "AC47", "AC48", "AC49", "AC50", "AC51", "AC52", "AC53",
"AC54", "AC55", "AC56", "AC57", "AC58", "AC5", "AC60", "AC61",
"AC62", "AC63", "AC64", "AC65", "AC66", "AC67", "AC69", "AC6",
"AC70", "AC71", "AC72", "AC73", "AC74", "AC75", "AC76", "AC77",
"AC78", "AC79", "AC7", "AC80", "AC81", "AC82", "AC83", "AC84",
"AC86", "AC87", "AC88", "AC89", "AC8", "AC90", "AC91", "AC92",
"AC9", "AC100", "AC101", "AC102", "AC103", "AC104", "AC105",
"AC16", "AC68", "AC93", "AC94", "AC95", "AC96", "AC97", "AC99",
"DE10", "DE12", "DE13", "DE14", "DE15", "DE16", "DE17", "DE18",
"DE19", "DE1", "DE20", "DE21", "DE22", "DE23", "DE25", "DE26",
"DE27", "DE2", "DE33", "DE34", "DE35", "DE36", "DE37", "DE38",
"DE39", "DE3", "DE40", "DE41", "DE42", "DE44", "DE45", "DE46",
"DE47", "DE48", "DE49", "DE4", "DE50", "DE51", "DE52", "DE53",
"DE54", "DE55", "DE56", "DE57", "DE58", "DE59", "DE60", "DE7",
"DE9", "DE29", "DE30", "DE32", "DE43", "DE5"), Profile = c("Schiz.",
"Schiz.", "Schiz.", "BP", "BP", "Unaffected control", "Schiz.",
"BP", "Unaffected control", "BP", "BP", "BP", "Schiz.", "BP",
"Unaffected control", "Schiz.", "Schiz.", "Unaffected control",
"Unaffected control", "BP", "Unaffected control", "Schiz.", "BP",
"Unaffected control", "BP", "Unaffected control", "BP", "Schiz.",
"Unaffected control", "Schiz.", "Schiz.", "Schiz.", "Schiz.",
"BP", "Unaffected control", "Schiz.", "BP", "Schiz.", "BP", "Unaffected control",
"BP", "Unaffected control", "Unaffected control", "Unaffected control",
"Unaffected control", "Schiz.", "Unaffected control", "BP", "BP",
"BP", "Unaffected control", "BP", "BP", "BP", "BP", "Unaffected control",
"Schiz.", "Unaffected control", "BP", "BP", "Unaffected control",
"Unaffected control", "BP", "Schiz.", "BP", "Schiz.", "BP", "Unaffected control",
"Schiz.", "Unaffected control", "Schiz.", "Unaffected control",
"Schiz.", "Schiz.", "Unaffected control", "Unaffected control",
"Unaffected control", "Schiz.", "Schiz.", "BP", "BP", "Unaffected control",
"Unaffected control", "Schiz.", "Schiz.", "Schiz.", "Schiz.",
"BP", "Unaffected control", "BP", "Unaffected control", "BP",
"Schiz.", "Schiz.", "Schiz.", "Unaffected control", "Unaffected control",
"Schiz.", "Unaffected control", "Unaffected control", "Unaffected control",
"BP", "BP", "Dep.", "Unaffected control", "Unaffected control",
"Dep.", "Dep.", "Dep.", "Dep.", "Dep.", "Unaffected control",
"Unaffected control", "Schiz.", "Dep.", "BP", "Dep.", "Schiz.",
"Schiz.", "Schiz.", "BP", "Unaffected control", "Unaffected control",
"Unaffected control", "BP", "BP", "Dep.", "Schiz.", "Dep.", "BP",
"Unaffected control", "Unaffected control", "Schiz.", "Schiz.",
"Unaffected control", "Unaffected control", "BP", "BP", "Schiz.",
"Dep.", "BP", "Dep.", "BP", "Schiz.", "Unaffected control", "Dep.",
"BP", "Schiz.", "Dep.", "Dep.", "BP", "BP", "Schiz.")), row.names = c("1",
"2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13",
"14", "15", "16", "17", "18", "19", "20", "21", "22", "23", "24",
"25", "26", "27", "28", "29", "30", "31", "32", "33", "34", "35",
"36", "37", "38", "39", "40", "41", "42", "43", "44", "45", "46",
"47", "48", "49", "50", "51", "52", "53", "54", "55", "56", "57",
"58", "59", "60", "61", "62", "63", "64", "65", "66", "67", "68",
"69", "70", "71", "72", "73", "74", "75", "76", "77", "78", "79",
"80", "81", "82", "83", "84", "85", "86", "87", "88", "89", "90",
"91", "92", "93", "94", "95", "96", "97", "98", "99", "100",
"101", "102", "103", "104", "105", "106", "107", "108", "109",
"110", "111", "112", "113", "114", "115", "116", "117", "118",
"119", "120", "121", "122", "123", "124", "125", "126", "127",
"128", "129", "130", "131", "132", "133", "134", "135", "136",
"137", "138", "139", "140", "141", "142", "143", "144", "145",
"146", "147", "148", "149", "150", "151", "152", "153"), class = "data.frame")
There are other columns in ann
, which I don't need.
> dput(head(ann))
structure(list(Database_ID = c("AC10", "AC11", "AC12", "AC13",
"AC14", "AC15"), Collection_type = c("Array", "Array", "Array",
"Array", "Array", "Array"), Age = c(40L, 51L, 19L, 49L, 48L,
44L), Round_Age = c(40L, 50L, 20L, 50L, 50L, 45L), sex = c("M",
"M", "M", "F", "F", "F"), race = c("White", "White", "White",
"White", "White", "White"), Profile = c("Schiz.", "Schiz.", "Schiz.",
"BP", "BP", "Unaffected control"), DSM_IV = c(295.9, 295.9, 295.9,
296.44, 296.7, NA), Axis_I_Primary_Dx = c("SCHIZ UNDIFF", "SCHIZ UNDIFF",
"SCHIZ UNDIFF", "BP I", "BP I", "CONTROL"), PMI = c(34L, 43L,
28L, 19L, 18L, 28L), Brain_PH = c(6.18, 6.63, 6.73, 5.87, 6.5,
6.59), Left_Brain = c("Fixed", "Frozen", "Frozen", "Fixed", "Frozen",
"Fixed"), Suicide_Status = c("No", "No", "No", "Yes", "No", "No"
), Psychotic_Feature = c("Yes", "Yes", "Yes", "Yes", "Unknown",
"No"), Rate_Of_Death = c("Possible anoxia at death (e.g., CO poisoning or pneumonia)",
"Sudden", "Sudden", "Sudden", "Sudden", "Sudden"), Exacerbation = c("No evidence of exacerbation",
"No evidence of exacerbation", "No evidence of exacerbation",
"Definite", "No evidence of exacerbation", "Unknown"), Smoking_At_TOD = c("Yes",
"Yes", "No", "Yes", "Yes", "Unknown"), Lifetime_Alcohol_Use = c("Moderate drinking in present (same as above, for recent period)",
"Moderate drinking in present (same as above, for recent period)",
"Social (1?2 drinks/day)", "Little or none (less than 1 drink/day)",
"Social (1?2 drinks/day)", "Moderate drinking in present (same as above, for recent period)"
), Lifetime_Drug_Use = c("Moderate drug use in present", "Social",
"Heavy drug use in present", "Little or none", "Moderate drug use in present",
"Little or none"), Lifetime_Antipsychotics = c(75000L, 130000L,
2500L, 4000L, 0L, 0L)), row.names = c("1", "2", "3", "4", "5",
"6"), class = "data.frame")
CodePudding user response:
library(data.table)
# make data.tables
setDT(dat); setDT(ann)
# join by reference
dat[ann, Indication := i.Profile, on = .(FID = Database_ID)]
# FID IID SOL C1 C2 Indication
# 1: AC13 1 0 -0.02851720 0.00450319 BP
# 2: AC14 1 0 -0.04220610 0.00394058 BP
# 3: AC15 1 0 -0.01351050 -0.03165270 Unaffected control
# 4: AC18 1 0 0.03357880 0.00310475 BP
# 5: AC19 1 0 0.00453814 0.01607500 Unaffected control
# 6: AC1 1 0 -0.01968050 0.01086060 BP
# ...
CodePudding user response:
With tidyverse
, we may need to recode
before doing the inner_join
library(dplyr)
ann %>%
select(FID = Database_ID, Indication = Profile) %>%
mutate(Indication = recode(Indication,
"Unaffected control" = "0/1", "BP" = "1/2")) %>%
inner_join(dat, ., by = "FID")
-output
FID IID SOL C1 C2 Indication
1 AC13 1 0 -0.0285172 0.00450319 1/2
2 AC14 1 0 -0.0422061 0.00394058 1/2
3 AC15 1 0 -0.0135105 -0.03165270 0/1