With given data frames
Table1 <- data.frame (ID = c("ID1", "ID2", "ID3"), Color = c("Yellow","Pink", "Red"), Classification = c("Furniture", "Vehicle", "RTV"), Zone = c("Europe", "Asia", "America") )
Table2 <-data.frame (ID = c("ID1", "ID2"), Product = c("Product1", "Product2"),Color = c("Pink", "Red"))
Table3 <- data.frame (ID = c("ID2", "ID1", "ID3"), Grade = c("Excellent","Mediocre", "Good"), Classification = c("RTV", "Vehicle", "Furniture"), Zone = c("America", "Asia", "Europe") )
Id like to achieve this
If thats available for one column - can I apply it across all my dataframes, so it anonymize accurately any repetitions of data?
CodePudding user response:
I'm not sure if this is what you wanted,
table_list <- list(Table1, Table2, Table3)
key <- data.frame(color = c("Yellow", "Pink", "Red"),
color_new = c("HashColor1", "HashColor2", "HashColor3"),
classification = c("Furniture", "Vehicle", "RTV"),
classification_new = c("HashClass1", "HashClass2", "HashClass3"))
match_replace <- function(x){
if ("Color" %in% names(x)) {
x$Color <- key$color_new[match(x$Color, key$color)]
}
if ("Classification" %in% names(x)) {
x$Classification <- key$classification_new[match(x$Classification, key$classification)]
}
x
}
For Table1
match_replace(Table1)
ID Color Classification Zone
1 ID1 HashColor1 HashClass1 Europe
2 ID2 HashColor2 HashClass2 Asia
3 ID3 HashColor3 HashClass3 America
For every table table_list
,
lapply(table_list, match_replace)
[[1]]
ID Color Classification Zone
1 ID1 HashColor1 HashClass1 Europe
2 ID2 HashColor2 HashClass2 Asia
3 ID3 HashColor3 HashClass3 America
[[2]]
ID Product Color
1 ID1 Product1 HashColor2
2 ID2 Product2 HashColor3
[[3]]
ID Grade Classification Zone
1 ID2 Excellent HashClass3 America
2 ID1 Mediocre HashClass2 Asia
3 ID3 Good HashClass1 Europe
CodePudding user response:
Similar idea to the other answer, but does not require that you know all the colors, classifications, zones, etc. and is agnostic to the columns (will anonymize any columns other than ID). Also, uses a true hash (md5).
The basic idea is this: for every column to be anonymized, we need to know all possible values. To do this we create a "master table" by binding together all the tables row-wise, then create the hashes based on the master table, then dis-assemble the master table back to the original format.
library(data.table)
library(openssl)
base.tables <- list(T1=Table1, T2=Table2, T3=Table3)
base.cols <- lapply(base.tables, colnames)
all.tables <- rbindlist(mapply(\(x, tname) {setDT(x)[, table:=tname]}, base.tables, names(base.tables
)), fill = TRUE)
anonCols <- setdiff(names(all.tables), c('ID', 'table')) # columns to anonymize
all.tables[, c(anonCols):= lapply(.SD, md5), .SDcols=anonCols]
anon.tables <- split(all.tables, all.tables$table)
anon.tables <- mapply(\(x, cols) {x[, cols, with=FALSE]}, anon.tables, base.cols)
base.tables
## $T1
## ID Color Classification Zone
## 1: ID1 Yellow Furniture Europe
## 2: ID2 Pink Vehicle Asia
## 3: ID3 Red RTV America
##
## $T2
## ID Product Color
## 1: ID1 Product1 Pink
## 2: ID2 Product2 Red
##
## $T3
## ID Grade Classification Zone
## 1: ID2 Excellent RTV America
## 2: ID1 Mediocre Vehicle Asia
## 3: ID3 Good Furniture Europe
anon.tables
## $T1
## ID Color Classification Zone
## 1: ID1 51e6cd92b6c45f9affdc158ecca2b8b8 bb3a8b2e390142074e49741a0121d623 912d59cdf1d3f551fae21f6f0062258f
## 2: ID2 8dc5344bc0746e1cc5abf896ca03bbdf d6af9c1eaff2a89ebd3f8d0c542be12b 154a67340e8c14dd5253dc4ff6120197
## 3: ID3 ee38e4d5dd68c4e440825018d549cb47 ad2d89100e475427fae2882f2d5dcc36 088f003833d523d9dccc529e929afdc7
##
## $T2
## ID Product Color
## 1: ID1 82cbf74569427f33b97d3ffcebba9bab 8dc5344bc0746e1cc5abf896ca03bbdf
## 2: ID2 6b27e2dc9afb06f3e433add532882f70 ee38e4d5dd68c4e440825018d549cb47
##
## $T3
## ID Grade Classification Zone
## 1: ID2 fcc2d28a33d2df558f18766e067569c6 ad2d89100e475427fae2882f2d5dcc36 088f003833d523d9dccc529e929afdc7
## 2: ID1 d6f4c79f64a33fd9e11c1e46c7ecadde d6af9c1eaff2a89ebd3f8d0c542be12b 154a67340e8c14dd5253dc4ff6120197
## 3: ID3 0c6ad70beb3a7e76c3fc7adab7c46acc bb3a8b2e390142074e49741a0121d623 912d59cdf1d3f551fae21f6f0062258f