How do I replace a dataframe column with variables from another dataframe?-CodePudding

I want to add a Indication column in dat dataframe with the variables that correspond to the Profile column of the ann dataframe. The Indication column will have either 0/1 or 1/2 binary categorical variables for Unaffected control or BP variables, respectively.

library(dplyr)

phenocov <- dat %>% inner_join(ann %>% select("Database_ID", "Profile") %>% dummycov$PHENOTYPE==encode_binary(ann$Profile), by=c("FID"="Database_ID"))

Expected output

FID	IID	C1	C2	Indication
AC13	1	-0.02851720	0.00450319	1/2
AC14	1	-0.04220610	0.00394058	1/2
AC15	1	-0.01351050	-0.03165270	0/1

dat dataframe

> dput(dat)
structure(list(FID = c("AC13", "AC14", "AC15", "AC18", "AC19",
"AC1", "AC20", "AC21", "AC23", "AC24", "AC27", "AC29", "AC2",
"AC30", "AC32", "AC33", "AC34", "AC35", "AC36", "AC38", "AC42",
"AC43", "AC46", "AC48", "AC49", "AC50", "AC51", "AC52", "AC53",
"AC54", "AC56", "AC57", "AC58", "AC5", "AC60", "AC61", "AC62",
"AC63", "AC64", "AC65", "AC67", "AC69", "AC6", "AC70", "AC71",
"AC72", "AC74", "AC76", "AC77", "AC79", "AC80", "AC83", "AC84",
"AC86", "AC89", "AC8", "AC90", "AC91", "AC102", "AC103", "AC104",
"AC105", "AC16", "AC95", "AC96", "AC99", "DE10", "DE12", "DE13",
"DE14", "DE16", "DE17", "DE22", "DE23", "DE27", "DE36", "DE37",
"DE38", "DE39", "DE3", "DE40", "DE45", "DE46", "DE47", "DE4",
"DE50", "DE51", "DE52", "DE55", "DE57", "DE59", "DE7", "DE32",
"DE43"), IID = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L), SOL = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L), C1 = c(-0.0285172, -0.0422061, -0.0135105, 0.0335788,
0.00453814, -0.0196805, 0.00871406, -0.0359998, 0.00776039, -0.0152431,
-0.0157209, 0.0235421, -0.0292564, 0.000164687, -0.0104566, -0.0078923,
-0.011623, 0.0173727, 0.00327281, -0.0101589, -0.00564814, 0.0232102,
0.00209183, -0.0245178, 0.0295213, -0.0012923, 0.00775297, 0.019803,
0.00521905, 0.0143496, 0.0243968, 0.0403667, 0.0420627, -0.0234294,
-0.00923894, 0.00216892, -0.0174861, 0.000902948, 0.0250136,
0.131506, 0.036772, -0.0314508, 0.0014534, 0.0123288, 0.0152248,
0.030759, 0.029896, 0.0257204, -0.0132957, -0.00942661, 0.0286468,
0.0297961, -0.0225471, 0.0185018, -0.0103523, -0.00141674, -0.00553813,
0.0141762, 0.0254432, -0.0124213, -0.0220576, 0.0278842, 0.00114805,
-0.013759, 0.00493565, -0.0146888, -0.00463272, -0.0332577, -0.0200095,
0.00230828, -0.0232427, 0.00433733, -0.0244282, -0.0245909, 0.0155191,
0.00425367, 0.00868522, -0.0347233, 0.00191488, -0.0154211, 0.0154046,
-0.00126067, 0.00826277, -0.0124104, -0.0136281, -0.023651, 0.00853699,
-0.0373163, 0.0147421, -0.0425991, -0.0198727, -0.0197213, -0.0115728,
-0.0127898), C2 = c(0.00450319, 0.00394058, -0.0316527, 0.00310475,
0.016075, 0.0108606, 0.0195024, -0.0109932, 0.02644, 0.00177053,
-0.0180791, -0.0128455, -0.0353571, 0.00868615, 0.0278649, -0.0143205,
0.0198699, 0.00600335, 0.00636933, -0.0245385, 0.0247255, -0.0019047,
0.00418658, -0.0106317, 0.0329249, -0.0490193, -0.0322256, -0.00525161,
0.0127281, -0.0210357, 0.048556, -0.0157994, 0.0225328, 0.0420733,
-0.0722942, 0.0117474, -0.00108231, -0.053874, -0.0351118, 0.000781904,
-0.0188162, -0.0214653, 0.0150199, 0.0339645, -0.0335628, -0.0151206,
0.0163763, -0.015727, -0.0232298, 0.0172519, 0.0348876, -0.026288,
0.0383726, -0.018123, 0.0200251, -0.0246757, 0.0184051, 0.0249351,
-0.00324928, 0.0129067, -0.0143993, -0.00150337, -0.00089652,
-0.0477761, 0.00873251, -0.0184572, 0.00115896, 0.0252723, -0.0188119,
0.0403222, -0.00957213, -0.0280059, 0.0183744, -0.025548, 0.021987,
0.0268481, -0.0267149, 0.00712551, -0.0115199, 0.00559716, 0.00779719,
-0.0181187, -0.0182654, -0.0263084, 0.00126466, 0.0397802, 0.0457278,
0.0098323, -0.00865206, 0.0248558, -0.0163334, 0.00451314, 0.0337946,
0.0267819)), class = "data.frame", row.names = c(NA, -94L))

The Database_ID and Profile columns in the ann dataframe (there are other columns in the ann dataframe...refer to below)

> dput(ann[c("Database_ID","Profile")])
structure(list(Database_ID = c("AC10", "AC11", "AC12", "AC13",
"AC14", "AC15", "AC17", "AC18", "AC19", "AC1", "AC20", "AC21",
"AC22", "AC23", "AC24", "AC25", "AC26", "AC27", "AC29", "AC2",
"AC30", "AC31", "AC32", "AC33", "AC34", "AC35", "AC36", "AC37",
"AC38", "AC39", "AC3", "AC40", "AC41", "AC42", "AC43", "AC45",
"AC46", "AC47", "AC48", "AC49", "AC50", "AC51", "AC52", "AC53",
"AC54", "AC55", "AC56", "AC57", "AC58", "AC5", "AC60", "AC61",
"AC62", "AC63", "AC64", "AC65", "AC66", "AC67", "AC69", "AC6",
"AC70", "AC71", "AC72", "AC73", "AC74", "AC75", "AC76", "AC77",
"AC78", "AC79", "AC7", "AC80", "AC81", "AC82", "AC83", "AC84",
"AC86", "AC87", "AC88", "AC89", "AC8", "AC90", "AC91", "AC92",
"AC9", "AC100", "AC101", "AC102", "AC103", "AC104", "AC105",
"AC16", "AC68", "AC93", "AC94", "AC95", "AC96", "AC97", "AC99",
"DE10", "DE12", "DE13", "DE14", "DE15", "DE16", "DE17", "DE18",
"DE19", "DE1", "DE20", "DE21", "DE22", "DE23", "DE25", "DE26",
"DE27", "DE2", "DE33", "DE34", "DE35", "DE36", "DE37", "DE38",
"DE39", "DE3", "DE40", "DE41", "DE42", "DE44", "DE45", "DE46",
"DE47", "DE48", "DE49", "DE4", "DE50", "DE51", "DE52", "DE53",
"DE54", "DE55", "DE56", "DE57", "DE58", "DE59", "DE60", "DE7",
"DE9", "DE29", "DE30", "DE32", "DE43", "DE5"), Profile = c("Schiz.",
"Schiz.", "Schiz.", "BP", "BP", "Unaffected control", "Schiz.",
"BP", "Unaffected control", "BP", "BP", "BP", "Schiz.", "BP",
"Unaffected control", "Schiz.", "Schiz.", "Unaffected control",
"Unaffected control", "BP", "Unaffected control", "Schiz.", "BP",
"Unaffected control", "BP", "Unaffected control", "BP", "Schiz.",
"Unaffected control", "Schiz.", "Schiz.", "Schiz.", "Schiz.",
"BP", "Unaffected control", "Schiz.", "BP", "Schiz.", "BP", "Unaffected control",
"BP", "Unaffected control", "Unaffected control", "Unaffected control",
"Unaffected control", "Schiz.", "Unaffected control", "BP", "BP",
"BP", "Unaffected control", "BP", "BP", "BP", "BP", "Unaffected control",
"Schiz.", "Unaffected control", "BP", "BP", "Unaffected control",
"Unaffected control", "BP", "Schiz.", "BP", "Schiz.", "BP", "Unaffected control",
"Schiz.", "Unaffected control", "Schiz.", "Unaffected control",
"Schiz.", "Schiz.", "Unaffected control", "Unaffected control",
"Unaffected control", "Schiz.", "Schiz.", "BP", "BP", "Unaffected control",
"Unaffected control", "Schiz.", "Schiz.", "Schiz.", "Schiz.",
"BP", "Unaffected control", "BP", "Unaffected control", "BP",
"Schiz.", "Schiz.", "Schiz.", "Unaffected control", "Unaffected control",
"Schiz.", "Unaffected control", "Unaffected control", "Unaffected control",
"BP", "BP", "Dep.", "Unaffected control", "Unaffected control",
"Dep.", "Dep.", "Dep.", "Dep.", "Dep.", "Unaffected control",
"Unaffected control", "Schiz.", "Dep.", "BP", "Dep.", "Schiz.",
"Schiz.", "Schiz.", "BP", "Unaffected control", "Unaffected control",
"Unaffected control", "BP", "BP", "Dep.", "Schiz.", "Dep.", "BP",
"Unaffected control", "Unaffected control", "Schiz.", "Schiz.",
"Unaffected control", "Unaffected control", "BP", "BP", "Schiz.",
"Dep.", "BP", "Dep.", "BP", "Schiz.", "Unaffected control", "Dep.",
"BP", "Schiz.", "Dep.", "Dep.", "BP", "BP", "Schiz.")), row.names = c("1",
"2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13",
"14", "15", "16", "17", "18", "19", "20", "21", "22", "23", "24",
"25", "26", "27", "28", "29", "30", "31", "32", "33", "34", "35",
"36", "37", "38", "39", "40", "41", "42", "43", "44", "45", "46",
"47", "48", "49", "50", "51", "52", "53", "54", "55", "56", "57",
"58", "59", "60", "61", "62", "63", "64", "65", "66", "67", "68",
"69", "70", "71", "72", "73", "74", "75", "76", "77", "78", "79",
"80", "81", "82", "83", "84", "85", "86", "87", "88", "89", "90",
"91", "92", "93", "94", "95", "96", "97", "98", "99", "100",
"101", "102", "103", "104", "105", "106", "107", "108", "109",
"110", "111", "112", "113", "114", "115", "116", "117", "118",
"119", "120", "121", "122", "123", "124", "125", "126", "127",
"128", "129", "130", "131", "132", "133", "134", "135", "136",
"137", "138", "139", "140", "141", "142", "143", "144", "145",
"146", "147", "148", "149", "150", "151", "152", "153"), class = "data.frame")

There are other columns in ann, which I don't need.

> dput(head(ann))
structure(list(Database_ID = c("AC10", "AC11", "AC12", "AC13",
"AC14", "AC15"), Collection_type = c("Array", "Array", "Array",
"Array", "Array", "Array"), Age = c(40L, 51L, 19L, 49L, 48L,
44L), Round_Age = c(40L, 50L, 20L, 50L, 50L, 45L), sex = c("M",
"M", "M", "F", "F", "F"), race = c("White", "White", "White",
"White", "White", "White"), Profile = c("Schiz.", "Schiz.", "Schiz.",
"BP", "BP", "Unaffected control"), DSM_IV = c(295.9, 295.9, 295.9,
296.44, 296.7, NA), Axis_I_Primary_Dx = c("SCHIZ UNDIFF", "SCHIZ UNDIFF",
"SCHIZ UNDIFF", "BP I", "BP I", "CONTROL"), PMI = c(34L, 43L,
28L, 19L, 18L, 28L), Brain_PH = c(6.18, 6.63, 6.73, 5.87, 6.5,
6.59), Left_Brain = c("Fixed", "Frozen", "Frozen", "Fixed", "Frozen",
"Fixed"), Suicide_Status = c("No", "No", "No", "Yes", "No", "No"
), Psychotic_Feature = c("Yes", "Yes", "Yes", "Yes", "Unknown",
"No"), Rate_Of_Death = c("Possible anoxia at death (e.g., CO poisoning or pneumonia)",
"Sudden", "Sudden", "Sudden", "Sudden", "Sudden"), Exacerbation = c("No evidence of exacerbation",
"No evidence of exacerbation", "No evidence of exacerbation",
"Definite", "No evidence of exacerbation", "Unknown"), Smoking_At_TOD = c("Yes",
"Yes", "No", "Yes", "Yes", "Unknown"), Lifetime_Alcohol_Use = c("Moderate drinking in present (same as above, for recent period)",
"Moderate drinking in present (same as above, for recent period)",
"Social (1?2 drinks/day)", "Little or none (less than 1 drink/day)",
"Social (1?2 drinks/day)", "Moderate drinking in present (same as above, for recent period)"
), Lifetime_Drug_Use = c("Moderate drug use in present", "Social",
"Heavy drug use in present", "Little or none", "Moderate drug use in present",
"Little or none"), Lifetime_Antipsychotics = c(75000L, 130000L,
2500L, 4000L, 0L, 0L)), row.names = c("1", "2", "3", "4", "5",
"6"), class = "data.frame")

CodePudding user response：

library(data.table)
# make data.tables
setDT(dat); setDT(ann)
# join by reference
dat[ann, Indication := i.Profile, on = .(FID = Database_ID)]
#     FID IID SOL          C1          C2         Indication
# 1: AC13   1   0 -0.02851720  0.00450319                 BP
# 2: AC14   1   0 -0.04220610  0.00394058                 BP
# 3: AC15   1   0 -0.01351050 -0.03165270 Unaffected control
# 4: AC18   1   0  0.03357880  0.00310475                 BP
# 5: AC19   1   0  0.00453814  0.01607500 Unaffected control
# 6:  AC1   1   0 -0.01968050  0.01086060                 BP
# ...

CodePudding user response：

With tidyverse, we may need to recode before doing the inner_join

library(dplyr)
ann %>%
    select(FID = Database_ID, Indication = Profile) %>% 
    mutate(Indication = recode(Indication, 
      "Unaffected control" = "0/1", "BP" = "1/2")) %>% 
    inner_join(dat, ., by = "FID")

-output

   FID IID SOL         C1          C2 Indication
1 AC13   1   0 -0.0285172  0.00450319        1/2
2 AC14   1   0 -0.0422061  0.00394058        1/2
3 AC15   1   0 -0.0135105 -0.03165270        0/1