Here is a screenshot of the data frame that I'm working with.
I'm also including here the dput(head(df, n = 50)).
structure(list(conversion_hash_id = c("0001cd061567445d62e91e9d5c28c004",
"0001cd061567445d62e91e9d5c28c004", "0003bc99a5039e664fd71dab54d7d533",
"0003c6d107ae3dc12174eee321ee4589", "0006ba6c297c921e745ebe624c61b3c2",
"0007dbcbc304375c09d6cd650a144cd1", "00093c82a4a8fb0827833960fabb5e1a",
"000a508ea789bbad8e9e832bb1c2b787", "000d383e749beb47f8c93ec3a05a1b4a",
"00106ff98badafd53709fbb617052bb6", "0010a4d7e4f77d5c0ce684499a529ce2",
"00131c89636a6d0f37ae93fe6da4144e", "001dcc2ff0655c7dd8243b86ad461cff",
"002097d1eb159f92b43a6dc0510d9b08", "0020c1e9b159733d2d3ba9912c795382",
"0020c1e9b159733d2d3ba9912c795382", "00212acae3e183252078bdfd2c3963e3",
"00212acae3e183252078bdfd2c3963e3", "0023ff241f1c71743146300098021297",
"002f421d2a3ad29afd9cb15807ce1f0f", "00316ec40c99e9d8b53b4834386b0476",
"003593dce8e41b1a2a544b16196cc6a1", "00389f0ef15d13e4ec2eec7a1ac03240",
"003c69e077ba1ccbf051a229f5fc627a", "00415d4cd157e3f256976d0e9f5dab19",
"0041723c94fa240eea4e6245513e7213", "0043e1fd2c4ef8c1da1959a3f4dd0362",
"0043e1fd2c4ef8c1da1959a3f4dd0362", "004a2bcbef69fb0d334b675583cdb873",
"004df76075f83c1c6de314ff785aa0d7", "004df76075f83c1c6de314ff785aa0d7",
"005090f2bf5796a2cbe693e43e70d653", "0051d01a74e467cb91d779ed064f7dda",
"0052ef5a56f4a24011abe9a1a7242f49", "00592654753f3ef738a6f93227f3dd61",
"005af6fca539bb39a4c88713b0c9df2b", "005d62d75f8ecf29e7a3c8571b6f9bf6",
"005de06981b347a4af1f104fb3c7c6f2", "005dea5fc43528e8a4f00d172308cbea",
"0060afd14a7edd637d99ded56429ab4a", "00669dae25369a8655199c54aec4c229",
"00679670a8f30a3b59b9dd803c2aa48c", "006a9b088b768bbabb6d77cd6b44f763",
"006af62acaac8b6d8b620a170f09e71b", "006c137bd6884d985e2e0724c45ad2ab",
"006c83b0f6c0a8e3773a63c85f34c3c7", "006fc42950d4a610baaef0f66ded0819",
"00716fcf53da8f347571e387e1beae6d", "00716fcf53da8f347571e387e1beae6d",
"0073cee456597683aaecc9434a18eda3"), tier_1 = c("OTT", "Paid Search",
"OTT", "Email", "Paid Search", "Paid Search", "Email", "Paid Social",
"Paid Social", "Paid Search", "Direct", "Email", "Paid Search",
"Paid Social", "Paid Search", "Paid Social", "Organic Search",
"Paid Search", "Paid Social", "Organic Search", "Direct", "Paid Search",
"Paid Video", "Direct", "Email", "Paid Search", "Affiliate",
"Email", "Paid Search", "OTT", "Paid Search", "Paid Search",
"Email", "Direct", "Direct", "Paid Search", "Organic Search",
"Paid Search", "Paid Search", "Paid Search", "Paid Social", "Direct",
"Organic Search", "Paid Search", "Email", "Paid Search", "Paid Social",
"Paid Search", "Paid Social", "Organic Social"), normalized = c(0.4287711836,
0.5712288164, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0.7237434946,
0.2762565054, 0.3000477093, 0.6999522907, 1, 1, 1, 1, 1, 1, 1,
1, 0.5447557896, 0.4552442104, 1, 0.3493884477, 0.6506115523,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0.700741097,
0.299258903, 0.5210938212)), row.names = c(NA, -50L), groups = structure(list(
conversion_hash_id = c("0001cd061567445d62e91e9d5c28c004",
"0003bc99a5039e664fd71dab54d7d533", "0003c6d107ae3dc12174eee321ee4589",
"0006ba6c297c921e745ebe624c61b3c2", "0007dbcbc304375c09d6cd650a144cd1",
"00093c82a4a8fb0827833960fabb5e1a", "000a508ea789bbad8e9e832bb1c2b787",
"000d383e749beb47f8c93ec3a05a1b4a", "00106ff98badafd53709fbb617052bb6",
"0010a4d7e4f77d5c0ce684499a529ce2", "00131c89636a6d0f37ae93fe6da4144e",
"001dcc2ff0655c7dd8243b86ad461cff", "002097d1eb159f92b43a6dc0510d9b08",
"0020c1e9b159733d2d3ba9912c795382", "00212acae3e183252078bdfd2c3963e3",
"0023ff241f1c71743146300098021297", "002f421d2a3ad29afd9cb15807ce1f0f",
"00316ec40c99e9d8b53b4834386b0476", "003593dce8e41b1a2a544b16196cc6a1",
"00389f0ef15d13e4ec2eec7a1ac03240", "003c69e077ba1ccbf051a229f5fc627a",
"00415d4cd157e3f256976d0e9f5dab19", "0041723c94fa240eea4e6245513e7213",
"0043e1fd2c4ef8c1da1959a3f4dd0362", "004a2bcbef69fb0d334b675583cdb873",
"004df76075f83c1c6de314ff785aa0d7", "005090f2bf5796a2cbe693e43e70d653",
"0051d01a74e467cb91d779ed064f7dda", "0052ef5a56f4a24011abe9a1a7242f49",
"00592654753f3ef738a6f93227f3dd61", "005af6fca539bb39a4c88713b0c9df2b",
"005d62d75f8ecf29e7a3c8571b6f9bf6", "005de06981b347a4af1f104fb3c7c6f2",
"005dea5fc43528e8a4f00d172308cbea", "0060afd14a7edd637d99ded56429ab4a",
"00669dae25369a8655199c54aec4c229", "00679670a8f30a3b59b9dd803c2aa48c",
"006a9b088b768bbabb6d77cd6b44f763", "006af62acaac8b6d8b620a170f09e71b",
"006c137bd6884d985e2e0724c45ad2ab", "006c83b0f6c0a8e3773a63c85f34c3c7",
"006fc42950d4a610baaef0f66ded0819", "00716fcf53da8f347571e387e1beae6d",
"0073cee456597683aaecc9434a18eda3"), .rows = structure(list(
1:2, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L,
14L, 15:16, 17:18, 19L, 20L, 21L, 22L, 23L, 24L, 25L,
26L, 27:28, 29L, 30:31, 32L, 33L, 34L, 35L, 36L, 37L,
38L, 39L, 40L, 41L, 42L, 43L, 44L, 45L, 46L, 47L, 48:49,
50L), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, -44L), class = c("tbl_df",
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"))
I need to create a summary data frame with some logic I don't know how to create.
What I need to do is group every conversion_hash_id that belongs to the exact same set of tier_1 categories and count up the total number of appearances.
For example, if there are 30 different conversion_hash_ids that are different from each other but that are each listed twice, once for OTT and once for Paid Search, then I need to count up the number of times that combination has appeared.
Here is what the new data frame could look like. These are just made up numbers for a couple cases. Maybe there are other possibilities for what the structure of the final table could look like. I didn't list all the cases here.
group count
OTT, Paid Search 30
OTT, Paid Social 25
CodePudding user response:
After grouping by 'conversion_hash_id', summarise
with the sort
ed unique
values of 'tier_1' as a single string by paste
ing (toString
- paste(..., collapse=", ")
) and then get the count
of the 'group' column
library(dplyr)
df %>%
group_by(conversion_hash_id) %>%
summarise(group = toString(sort(unique(tier_1)))) %>%
count(group)