Home > Enterprise >  R create summary data frame that has a proportions column, after grouping of complete combinations f
R create summary data frame that has a proportions column, after grouping of complete combinations f

Time:10-15

Here is a screenshot of the data frame that I'm working with.

I'm also including here the dput(head(df, n = 50)).

enter image description here

structure(list(conversion_hash_id = c("0001cd061567445d62e91e9d5c28c004", 
"0001cd061567445d62e91e9d5c28c004", "0003bc99a5039e664fd71dab54d7d533", 
"0003c6d107ae3dc12174eee321ee4589", "0006ba6c297c921e745ebe624c61b3c2", 
"0007dbcbc304375c09d6cd650a144cd1", "00093c82a4a8fb0827833960fabb5e1a", 
"000a508ea789bbad8e9e832bb1c2b787", "000d383e749beb47f8c93ec3a05a1b4a", 
"00106ff98badafd53709fbb617052bb6", "0010a4d7e4f77d5c0ce684499a529ce2", 
"00131c89636a6d0f37ae93fe6da4144e", "001dcc2ff0655c7dd8243b86ad461cff", 
"002097d1eb159f92b43a6dc0510d9b08", "0020c1e9b159733d2d3ba9912c795382", 
"0020c1e9b159733d2d3ba9912c795382", "00212acae3e183252078bdfd2c3963e3", 
"00212acae3e183252078bdfd2c3963e3", "0023ff241f1c71743146300098021297", 
"002f421d2a3ad29afd9cb15807ce1f0f", "00316ec40c99e9d8b53b4834386b0476", 
"003593dce8e41b1a2a544b16196cc6a1", "00389f0ef15d13e4ec2eec7a1ac03240", 
"003c69e077ba1ccbf051a229f5fc627a", "00415d4cd157e3f256976d0e9f5dab19", 
"0041723c94fa240eea4e6245513e7213", "0043e1fd2c4ef8c1da1959a3f4dd0362", 
"0043e1fd2c4ef8c1da1959a3f4dd0362", "004a2bcbef69fb0d334b675583cdb873", 
"004df76075f83c1c6de314ff785aa0d7", "004df76075f83c1c6de314ff785aa0d7", 
"005090f2bf5796a2cbe693e43e70d653", "0051d01a74e467cb91d779ed064f7dda", 
"0052ef5a56f4a24011abe9a1a7242f49", "00592654753f3ef738a6f93227f3dd61", 
"005af6fca539bb39a4c88713b0c9df2b", "005d62d75f8ecf29e7a3c8571b6f9bf6", 
"005de06981b347a4af1f104fb3c7c6f2", "005dea5fc43528e8a4f00d172308cbea", 
"0060afd14a7edd637d99ded56429ab4a", "00669dae25369a8655199c54aec4c229", 
"00679670a8f30a3b59b9dd803c2aa48c", "006a9b088b768bbabb6d77cd6b44f763", 
"006af62acaac8b6d8b620a170f09e71b", "006c137bd6884d985e2e0724c45ad2ab", 
"006c83b0f6c0a8e3773a63c85f34c3c7", "006fc42950d4a610baaef0f66ded0819", 
"00716fcf53da8f347571e387e1beae6d", "00716fcf53da8f347571e387e1beae6d", 
"0073cee456597683aaecc9434a18eda3"), tier_1 = c("OTT", "Paid Search", 
"OTT", "Email", "Paid Search", "Paid Search", "Email", "Paid Social", 
"Paid Social", "Paid Search", "Direct", "Email", "Paid Search", 
"Paid Social", "Paid Search", "Paid Social", "Organic Search", 
"Paid Search", "Paid Social", "Organic Search", "Direct", "Paid Search", 
"Paid Video", "Direct", "Email", "Paid Search", "Affiliate", 
"Email", "Paid Search", "OTT", "Paid Search", "Paid Search", 
"Email", "Direct", "Direct", "Paid Search", "Organic Search", 
"Paid Search", "Paid Search", "Paid Search", "Paid Social", "Direct", 
"Organic Search", "Paid Search", "Email", "Paid Search", "Paid Social", 
"Paid Search", "Paid Social", "Organic Social"), normalized = c(0.4287711836, 
0.5712288164, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0.7237434946, 
0.2762565054, 0.3000477093, 0.6999522907, 1, 1, 1, 1, 1, 1, 1, 
1, 0.5447557896, 0.4552442104, 1, 0.3493884477, 0.6506115523, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0.700741097, 
0.299258903, 0.5210938212)), row.names = c(NA, -50L), groups = structure(list(
    conversion_hash_id = c("0001cd061567445d62e91e9d5c28c004", 
    "0003bc99a5039e664fd71dab54d7d533", "0003c6d107ae3dc12174eee321ee4589", 
    "0006ba6c297c921e745ebe624c61b3c2", "0007dbcbc304375c09d6cd650a144cd1", 
    "00093c82a4a8fb0827833960fabb5e1a", "000a508ea789bbad8e9e832bb1c2b787", 
    "000d383e749beb47f8c93ec3a05a1b4a", "00106ff98badafd53709fbb617052bb6", 
    "0010a4d7e4f77d5c0ce684499a529ce2", "00131c89636a6d0f37ae93fe6da4144e", 
    "001dcc2ff0655c7dd8243b86ad461cff", "002097d1eb159f92b43a6dc0510d9b08", 
    "0020c1e9b159733d2d3ba9912c795382", "00212acae3e183252078bdfd2c3963e3", 
    "0023ff241f1c71743146300098021297", "002f421d2a3ad29afd9cb15807ce1f0f", 
    "00316ec40c99e9d8b53b4834386b0476", "003593dce8e41b1a2a544b16196cc6a1", 
    "00389f0ef15d13e4ec2eec7a1ac03240", "003c69e077ba1ccbf051a229f5fc627a", 
    "00415d4cd157e3f256976d0e9f5dab19", "0041723c94fa240eea4e6245513e7213", 
    "0043e1fd2c4ef8c1da1959a3f4dd0362", "004a2bcbef69fb0d334b675583cdb873", 
    "004df76075f83c1c6de314ff785aa0d7", "005090f2bf5796a2cbe693e43e70d653", 
    "0051d01a74e467cb91d779ed064f7dda", "0052ef5a56f4a24011abe9a1a7242f49", 
    "00592654753f3ef738a6f93227f3dd61", "005af6fca539bb39a4c88713b0c9df2b", 
    "005d62d75f8ecf29e7a3c8571b6f9bf6", "005de06981b347a4af1f104fb3c7c6f2", 
    "005dea5fc43528e8a4f00d172308cbea", "0060afd14a7edd637d99ded56429ab4a", 
    "00669dae25369a8655199c54aec4c229", "00679670a8f30a3b59b9dd803c2aa48c", 
    "006a9b088b768bbabb6d77cd6b44f763", "006af62acaac8b6d8b620a170f09e71b", 
    "006c137bd6884d985e2e0724c45ad2ab", "006c83b0f6c0a8e3773a63c85f34c3c7", 
    "006fc42950d4a610baaef0f66ded0819", "00716fcf53da8f347571e387e1beae6d", 
    "0073cee456597683aaecc9434a18eda3"), .rows = structure(list(
        1:2, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 
        14L, 15:16, 17:18, 19L, 20L, 21L, 22L, 23L, 24L, 25L, 
        26L, 27:28, 29L, 30:31, 32L, 33L, 34L, 35L, 36L, 37L, 
        38L, 39L, 40L, 41L, 42L, 43L, 44L, 45L, 46L, 47L, 48:49, 
        50L), ptype = integer(0), class = c("vctrs_list_of", 
    "vctrs_vctr", "list"))), row.names = c(NA, -44L), class = c("tbl_df", 
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df", 
"tbl_df", "tbl", "data.frame"))

I need to create a summary data frame with some logic I don't know how to create.

What I need to do is group every conversion_hash_id that belongs to the exact same set of tier_1 categories and count up the total number of appearances.

For example, if there are 30 different conversion_hash_ids that are different from each other but that are each listed twice, once for OTT and once for Paid Search, then I need to count up the number of times that combination has appeared.

Here is what the new data frame could look like. These are just made up numbers for a couple cases. Maybe there are other possibilities for what the structure of the final table could look like. I didn't list all the cases here.

group                   count
OTT, Paid Search         30 
OTT, Paid Social         25

CodePudding user response:

After grouping by 'conversion_hash_id', summarise with the sorted unique values of 'tier_1' as a single string by pasteing (toString - paste(..., collapse=", ")) and then get the count of the 'group' column

library(dplyr)
df %>%
    group_by(conversion_hash_id) %>%
    summarise(group = toString(sort(unique(tier_1)))) %>%
    count(group)
  • Related