I would like to ask for your help with aggregating texts by group (dyad_id) and for each member. For each dyad, alter and ego took turns (sometimes they did not take turns, such as the third observation of dyad 111_222, where 222 initiated a new discussion).
I'm trying to combine all of the writings (with a space between each message) of each person within a focal dyad.
Sample data:
# Sample data frame: five rows covering two dyads (three rows for "111_222 ",
# two for "333_111 "). Note that every string value carries trailing whitespace.
# Each row holds one message (message_original) and the reply to it
# (message_ego_response), plus the two member ids in `alter` and `ego`.
structure(list(dyad_id = c("111_222 ", "111_222 ", "111_222 ",
"333_111 ", "333_111 "), alter = c(111, 222, 222, 333, 111),
ego = c(222, 111, 111, 111, 333), message_original = c("Hello my idea is this ",
"I agree with your point ", "In this essay I would like to ",
"I think he should not ", "Can you tell me more "
), message_ego_response = c("I agree with your point ",
"Same here ", "That's a great idea ",
"Can you tell me more ", "Yes to elaborate "
)), class = "data.frame", row.names = c(NA, -5L), variable.labels = structure(character(0), names = character(0)), codepage = 65001L)
The above sample looks like:
--------- ------- ----- ------------------------------- -------------------------
| dyad_id | alter | ego | message_original | message_ego_response |
--------- ------- ----- ------------------------------- -------------------------
| 111_222 | 111 | 222 | Hello my idea is this | I agree with your point |
--------- ------- ----- ------------------------------- -------------------------
| 111_222 | 222 | 111 | I agree with your point | Same here |
--------- ------- ----- ------------------------------- -------------------------
| 111_222 | 222 | 111 | In this essay I would like to | That's a great idea |
--------- ------- ----- ------------------------------- -------------------------
| 333_111 | 333 | 111 | I think he should not | Can you tell me more |
--------- ------- ----- ------------------------------- -------------------------
| 333_111 | 111 | 333 | Can you tell me more | Yes to elaborate |
--------- ------- ----- ------------------------------- -------------------------
The output I'm looking for:
--------- --------- ----------------------- --------- ------------------------------- ------------------------------------
| dyad_id | member1 | member1's messages | member2 | member2's messages | Note |
--------- --------- ----------------------- --------- ------------------------------- ------------------------------------
| 111_222 | 111 | Hello my idea is this | 222 | I agree with your point | 222's "I agree with your point" is |
| | | Same here | | In this essay I would like to | a duplicate (one in 'message_ego' |
| | | That's a great idea | | | and the other in 'message_alter') |
--------- --------- ----------------------- --------- ------------------------------- ------------------------------------
| 333_111 | 333 | I think he should not | 111 | Can you tell me more | Same here for the duplication |
| | | Yes to elaborate | | | |
--------- --------- ----------------------- --------- ------------------------------- ------------------------------------
CodePudding user response:
Run these steps one by one and note the comments in the code. This will help you understand what the code is doing at each step.
library(tidyverse)
# Remove duplicate words from text, source: https://stackoverflow.com/a/41280102/11907016
#
# Splits every element of `x` on spaces and punctuation (apostrophes are not
# treated as delimiters, so "that's" stays one token), trims and lower-cases
# the tokens, and returns the distinct tokens in first-seen order joined by
# single spaces.
#
# x: character vector. All elements are pooled, so the result is always a
#    single string regardless of length(x).
# Returns: a length-one character vector.
rem_dup.one <- function(x) {
  tokens <- unlist(
    strsplit(x, split = "(?!')[ [:punct:]]", fixed = FALSE, perl = TRUE)
  )
  tokens <- tolower(trimws(tokens))
  # Adjacent delimiters (", ") and whitespace-only pieces ("\n") produce empty
  # tokens; drop them so the result has no doubled or trailing spaces.
  tokens <- tokens[nzchar(tokens)]
  paste(unique(tokens), collapse = " ")
}
# Sample data from the question, piped straight into the pipeline below.
# The pipeline ends with one row per dyad_id holding two member codes and the
# combined text attributed to each of them.
structure(list(dyad_id = c("111_222 ", "111_222 ", "111_222 ",
"333_111 ", "333_111 "), alter = c(111, 222, 222, 333, 111),
ego = c(222, 111, 111, 111, 333), message_original = c("Hello my idea is this ",
"I agree with your point ", "In this essay I would like to ",
"I think he should not ", "Can you tell me more "
), message_ego_response = c("I agree with your point ",
"Same here ", "That's a great idea ",
"Can you tell me more ", "Yes to elaborate "
)), class = "data.frame", row.names = c(NA, -5L), variable.labels = structure(character(0), names = character(0)), codepage = 65001L) %>%
as_tibble() %>%
# Trim and collapse whitespace in every character column
dplyr::mutate_if(is.character,~str_squish(.)) %>%
# Join the two message columns into one string, separated by "_"
# NOTE(review): "_" and "__" are used as separators throughout; a message that
# itself contains "_" would presumably be split in the wrong place - confirm.
tidyr::unite("message_original and message_ego_response", c(message_original, message_ego_response),sep = "_") %>%
# Join alter and ego into one "alter_ego" string, separated by "_"
tidyr::unite("alter_ego",c(alter,ego),sep = "_") %>%
# Group by every column and split, giving one small tibble per distinct row,
# so that each row can be reshaped on its own
dplyr::group_by_all() %>%
dplyr::group_split() %>%
purrr::map_df(~{
.x %>%
# Goal of this block: attach each member code to the message on the same side,
# producing alter = "<alter code>__<message_original>" and
# ego = "<ego code>__<message_ego_response>".
# Stack the two united columns into two rows (codes row, messages row)
tidyr::pivot_longer(cols = !matches("dyad_id"),values_to = "alter_ego_message_original_message_ego_response") %>%
# Split each stacked value at "_" into an `alter` part and an `ego` part,
# keeping the unsplit value as well (remove = F)
tidyr::separate(alter_ego_message_original_message_ego_response,c("alter","ego"),sep = "_",remove = F) %>%
dplyr::select(-name) %>%
# In every column whose name contains "alter" or "ego", paste the rows
# together with "__"; afterwards all rows are identical, so keep the first
dplyr::mutate_at(vars(matches("alter|ego")),~str_c(.,collapse = "__")) %>%
dplyr::slice(1) %>%
# Split the pasted value column back at "__" into the code pair and the message pair
tidyr::separate(alter_ego_message_original_message_ego_response,c("alter_ego","message_original_message_ego_response"),sep = "__")
}) %>%
# At this point every row carries `alter` and `ego` as "<code>__<message>" strings
dplyr::group_by(dyad_id) %>%
# Next: combine the messages per member, one dyad at a time
dplyr::group_split() %>%
purrr::map_df(~{
.x %>%
# Keep dyad_id plus the `alter` and `ego` columns only (the regex excludes
# "alter_ego" and the combined message column)
dplyr::select(matches("dyad|alter$|^ego")) %>%
# Stack alter and ego into a single Code_message column; which side a value
# came from is no longer needed
tidyr::pivot_longer(cols = matches("alter$|^ego"),names_to = "alter/ego",values_to = "Code_message") %>%
dplyr::select(-`alter/ego`) %>%
# Sort by the "<code>__<message>" string, then split it into code and message
dplyr::arrange(Code_message) %>%
tidyr::separate(Code_message,c("Code","message"),sep = "__") %>%
# One row per member code, with all of that member's messages joined by " \n "
dplyr::group_by(Code) %>%
dplyr::mutate(message = str_c(message, collapse = " \n ")) %>%
dplyr::slice(1) %>%
dplyr::ungroup() %>%
# Number the members in row order, which follows the sort above - so member1 is
# the code that sorts first, not necessarily the first half of dyad_id
dplyr::mutate(member = 1:n() %>% str_c("member",.)) %>%
# Round-trip through "<code>_<message>" so pivot_wider can spread one value
# column, then split each member column back into code and message
tidyr::unite(Code_message,c(Code, message),sep = "_") %>%
tidyr::pivot_wider(id_cols = dyad_id,names_from = member,values_from = Code_message) %>%
tidyr::separate(member1,c("member1","member1's message"),sep = "_") %>%
tidyr::separate(member2,c("member2","member2's message"),sep = "_")
}) %>%
# Apply rem_dup.one to both message columns per dyad. That function lower-cases
# the text and keeps each distinct word only once, so a word repeated in
# different messages is dropped as well, not just repeated messages.
dplyr::group_by(dyad_id) %>%
dplyr::mutate_at(vars(matches("message")),~rem_dup.one(.)) %>%
dplyr::ungroup()
Output:
dyad_id member1 `member1's message` member2 `member2's message`
<chr> <chr> <chr> <chr> <chr>
1 111_222 111 "hello my idea is this same here that's a great" 222 i agree with your point in this essay would like to
2 333_111 111 "can you tell me more " 333 i think he should not yes to elaborate
CodePudding user response:
We can use a combination of `data.table` and `tidyverse`. First, I convert to a long format using `data.table`, then we can clean up the surrounding white space (`trimws`), and we can create a new column to note the information about duplicate statements. Then, I collapse the statements for one person (per dyad) using `str_c`. Then, we can pivot back to the wide format and then clean up the column order and names.
library(data.table)
library(tidyverse)
# Reshape to one row per (dyad, member), collapse each member's distinct
# messages into one string, record repeated messages in `notes`, then widen
# back to one row per dyad.
#
# Side effects: the columns of `df` are renamed in place, and setDT() converts
# `df` to a data.table by reference.
names(df) <- c("dyad_id", "member1", "member2", "message1", "message2")
melt(setDT(df), measure = patterns("^member", "^message"),
     value.name = c("member", "message")) %>%
  group_by(dyad_id, member) %>%
  # Trim the values, and keep a message in `notes` only on the rows where it
  # repeats an earlier message of the same member in the same dyad.
  mutate(message = trimws(message),
         dyad_id = trimws(dyad_id),
         notes = ifelse(duplicated(message), message, NA)) %>%
  # One row per (dyad, member): distinct messages joined by a space.
  # TRUE is spelled out because T is an ordinary, reassignable variable.
  summarize(message = str_c(unique(message), collapse = " "),
            notes = max(notes, na.rm = TRUE)) %>%
  # summarize() dropped the `member` grouping level, so row_number() counts
  # the members within each dyad.
  mutate(idx = row_number(),
         notes = ifelse(!is.na(notes),
                        paste0("member", row_number(), " duplicate:", notes),
                        NA)) %>%
  pivot_wider(names_from = "idx", values_from = c("message", "member")) %>%
  # After widening, each dyad still spans several rows with NA in the other
  # member's columns; max() with na.rm = TRUE folds them into one row.
  summarize(across(everything(), ~max(.x, na.rm = TRUE))) %>%
  select(dyad_id, member_1, message_1, member_2, message_2, notes) %>%
  set_names(., c("dyad_id", "member1", "member1's message",
                 "member2", "member2's message", "notes"))
Output
dyad_id member1 `member1's message` member2 `member2's message` notes
<chr> <dbl> <chr> <dbl> <chr> <chr>
1 111_222 111 Hello my idea is this Same here That's a great idea 222 I agree with your point In this essay I would like to member2 duplicate:I agree with your point
2 333_111 111 Can you tell me more 333 I think he should not Yes to elaborate member1 duplicate:Can you tell me more
CodePudding user response:
library(tidyverse)
# Sample data from the question: five rows covering two dyads. Every string
# value (including dyad_id) carries trailing whitespace. In each row
# message_original and message_ego_response are a message and the reply to it;
# `alter` and `ego` hold the two member ids.
df <- structure(list(dyad_id = c("111_222 ", "111_222 ", "111_222 ",
"333_111 ", "333_111 "),
alter = c(111, 222, 222, 333, 111),
ego = c(222, 111, 111, 111, 333),
message_original = c("Hello my idea is this ",
"I agree with your point ",
"In this essay I would like to ",
"I think he should not ",
"Can you tell me more "),
message_ego_response = c("I agree with your point ",
"Same here ",
"That's a great idea ",
"Can you tell me more ",
"Yes to elaborate ")
),
class = "data.frame",
row.names = c(NA, -5L),
variable.labels = structure(character(0),
names = character(0)),
codepage = 65001L)
# Collapse each dyad into one row: the two member ids (taken from dyad_id)
# and everything each member wrote, in order of appearance, separated by a
# single space.
df %>%
  # dyad_id carries trailing whitespace; squish it first so that member2
  # becomes "222" rather than "222 ".
  mutate(dyad_id = str_squish(dyad_id)) %>%
  separate(dyad_id, sep = "_", into = c("member1", "member2")) %>%
  mutate(
    # In every row `alter` wrote message_original and `ego` wrote
    # message_ego_response, so each text is routed to the member who wrote it.
    # Both columns are filled on every row; member2's replies are no longer lost.
    alter_is_member1 = member1 == alter,
    member1_messages = if_else(alter_is_member1, message_original, message_ego_response),
    member2_messages = if_else(alter_is_member1, message_ego_response, message_original)
  ) %>%
  select(member1, member2, member1_messages, member2_messages) %>%
  # Group by BOTH members: grouping by member1 alone would merge the messages
  # of different dyads that share the same first member.
  group_by(member1, member2) %>%
  mutate(
    # A reply often reappears as the next row's message_original; unique()
    # keeps one copy. (It also collapses genuinely repeated identical messages.)
    member1_messages = paste(unique(str_squish(member1_messages)), collapse = " "),
    member2_messages = paste(unique(str_squish(member2_messages)), collapse = " ")
  ) %>%
  ungroup() %>%
  distinct()
# Result for the sample data (one row per dyad, all columns character):
#   member1 = 111, member2 = 222
#     member1_messages: Hello my idea is this Same here That's a great idea
#     member2_messages: I agree with your point In this essay I would like to
#   member1 = 333, member2 = 111
#     member1_messages: I think he should not Yes to elaborate
#     member2_messages: Can you tell me more
CodePudding user response:
- First `group_by` the `dyad_id` column, then assign `member` by splitting `dyad_id` by "_".
- Since all of your columns have trailing white spaces, I removed them with `stringr::str_trim()`.
- Then reorder the messages by the position in `dyad_id` (first two `ifelse()` chunks).
- After that, check if there are duplicates (the other two `ifelse()` chunks).
- If either `Note1` or `Note2` is `NA`, `coalesce` them together to replace the `NA`. If both of them are not `NA`, `paste` them together.
- In the `summarize` part, collapse multiple strings from the same member together.
- Finally, `relocate` the columns to your desired position.
library(dplyr)
library(stringr)
# Build one row per dyad: both member ids, each member's messages collapsed
# into one string, and a note naming a message that occurs more than once.
df %>%
  group_by(dyad_id) %>%
  # across(everything()) skips the grouping column, so dyad_id is trimmed
  # separately with gsub().
  mutate(across(everything(), ~str_trim(.x, "right")),
         dyad_id = gsub(" $", "", dyad_id),
         # All rows of a group share one dyad_id, so split the first element
         # and take its first / second piece. (Indexing the split list with
         # [[2]] picked the group's second ROW and failed with "subscript out
         # of bounds" for a dyad that has only one row.)
         member1 = strsplit(dyad_id, "_")[[1]][1],
         member2 = strsplit(dyad_id, "_")[[1]][2],
         # When alter_ego matches dyad_id, alter is member1 and wrote
         # message_original; otherwise the roles are swapped.
         member_1_message = ifelse(paste0(alter, "_", ego) == dyad_id, message_original, message_ego_response),
         member_2_message = ifelse(paste0(alter, "_", ego) == dyad_id, message_ego_response, message_original),
         # ifelse() with a length-one test returns a single value, so only the
         # first duplicated message of each member ends up in the note.
         Note1 = ifelse(length(member_1_message[duplicated(member_1_message)]) == 0,
                        NA,
                        paste(member1,"'s", member_1_message[duplicated(member_1_message)], "is a duplicate")),
         Note2 = ifelse(length(member_2_message[duplicated(member_2_message)]) == 0,
                        NA,
                        paste(member2,"'s", member_2_message[duplicated(member_2_message)], "is a duplicate")),
         # Keep whichever note exists; join them with ";" when both exist.
         Note = ifelse(is.na(Note1) | is.na(Note2), coalesce(Note1, Note2), paste(Note1, Note2, sep = ";"))) %>%
  # Collapse the distinct values of every member* column into one string per dyad.
  summarize(across(starts_with("member"), ~paste0(unique(.x), collapse = " ")),
            Note = unique(Note)) %>%
  relocate(dyad_id, member1, member_1_message, member2, member_2_message, Note)
# A tibble: 2 × 6
dyad_id member1 member_1_message member2 member_2_message Note
<chr> <chr> <chr> <chr> <chr> <chr>
1 111_222 111 Hello my idea is this Same here That's a great idea 222 I agree with your point In this essay I would like to 222 's I agree with your point is a duplicate
2 333_111 333 I think he should not Yes to elaborate 111 Can you tell me more 111 's Can you tell me more is a duplicate