This data set has the name of the questions as columns and scores as the values. But candidates will only answer (2) questions so there are alot of NULL values.
I am trying to make (2) new columns [Question#1] & [Question #2] to show which question the candidate took when there is a score present in the column.
I haven't been successful in trying to make code to run this smoothly. It would be filtering out "N/A" and matching with a number to call back the column name.
Example Output:
DPUT:
structure(list(candidate_id = c(1, 2, 3, 4, 5, 6), `Office Design` = c("N/A",
"N/A", "50", "N/A", "N/A", "N/A"), `Better Compression` = c("N/A",
"N/A", "N/A", "N/A", "N/A", "N/A"), `Simple Customer Support` = c("N/A",
"N/A", "N/A", "N/A", "N/A", "N/A"), `Break a Palindrome` = c("N/A",
"N/A", "N/A", "N/A", "N/A", "N/A"), `Delete Nnodes Greater Than X` = c("N/A",
"0", "N/A", "N/A", "50", "N/A"), `Double Size` = c("N/A", "N/A",
"N/A", "N/A", "N/A", "N/A"), `Game Winner` = c("N/A", "N/A",
"N/A", "N/A", "N/A", "50"), `Hashed Ports` = c("N/A", "N/A",
"N/A", "N/A", "N/A", "N/A"), `Maximum Substring` = c("N/A", "N/A",
"N/A", "50", "N/A", "N/A"), `Packet Stream Player` = c("N/A",
"N/A", "N/A", "N/A", "N/A", "N/A"), `Separate the Files` = c("N/A",
"N/A", "N/A", "N/A", "N/A", "N/A"), `Subarray Sum` = c("N/A",
"N/A", "N/A", "N/A", "N/A", "N/A"), `Tag Identification Number` = c("0",
"N/A", "N/A", "N/A", "N/A", "N/A"), `Threshold Alerts` = c("N/A",
"N/A", "N/A", "N/A", "N/A", "N/A"), `Array Journey` = c("N/A",
"N/A", "N/A", "N/A", "75", "N/A"), `Barter Market` = c("N/A",
"N/A", "N/A", "N/A", "N/A", "N/A"), Encircular = c("N/A", "N/A",
"N/A", "N/A", "N/A", "N/A"), `Find the Substring` = c("N/A",
"N/A", "75", "N/A", "N/A", "N/A"), `Perfect Substring` = c("N/A",
"N/A", "N/A", "N/A", "N/A", "N/A"), `Reductor Array` = c("N/A",
"0", "N/A", "9", "N/A", "N/A"), `Shortest Palindrome` = c("N/A",
"N/A", "N/A", "N/A", "N/A", "N/A"), `Similar Numbers` = c("N/A",
"N/A", "N/A", "N/A", "N/A", "N/A"), `Subarray Products` = c("0",
"N/A", "N/A", "N/A", "N/A", "N/A"), `Minimum Start Value` = c("N/A",
"N/A", "N/A", "N/A", "N/A", "N/A"), `Website Pagination` = c("N/A",
"N/A", "N/A", "N/A", "N/A", "48")), class = c("tbl_df", "tbl",
"data.frame"), row.names = c(NA, -6L))
CodePudding user response:
You can pivot this to long format, change the values to numeric, and filter those that are not NA, make a row id using row_number()
, and then pivot back to wide format.
qs = pivot_longer(df, -1) %>%
mutate(value=as.numeric(value)) %>%
filter(!is.na(value)) %>%
group_by(candidate_id) %>%
mutate(row=row_number()) %>%
ungroup() %>%
pivot_wider(id_cols = candidate_id, names_from=row, values_from = name,names_prefix = "Question_")
qs
looks like this:
candidate_id Question_1 Question_2
<dbl> <chr> <chr>
1 1 Tag Identification Number Subarray Products
2 2 Delete Nnodes Greater Than X Reductor Array
3 3 Office Design Find the Substring
4 4 Maximum Substring Reductor Array
5 5 Delete Nnodes Greater Than X Array Journey
6 6 Game Winner Website Pagination
You can keep it like that, or if you want, join it back to the original, like this:
inner_join(df, qs)
A more concise and a faster approach uses data.table
:
library(data.table)
qs = dcast(
melt(setDT(df),id="candidate_id")[value!="N/A"][, id:=paste0("Question_",1:.N), candidate_id],
candidate_id~id, value.var="variable"
)
df[qs, on =.(candidate_id)]