Replace Row Information by Leading or Lagging Row

I have a large dataframe with rows that have duplicated first three columns (UnionChr, UnionStart, UnionEnd) and the remaining columns differ in values.

UnionChr	UnionStart	UnionEnd	IntersectChr	IntersectStart	IntersectEnd	IntersectLength	IntersectPileup	IntersectName	Overlap	Genotype	PeakType
chr1	3667144	3668013	.	-1	-1	.	.	.	0	WT	DKO Specific
chr1	3667144	3668013	chr1	3667144	3668013	870	20.60	dko_k27_peak_1	869	DKO	N/A
chr1	4478778	4479151	chr1	4478778	4479151	374	22.90	wt_k27_peak_4	373	WT	N/A
chr1	4478778	4479151	.	-1	-1	.	.	.	0	DKO	WT Specific
chr1	4482327	4483301	.	-1	-1	.	.	.	0	WT	DKO Specific
chr1	4482327	4483301	chr1	4482327	4483301	975	22.77	dko_k27_peak_4	974	DKO	N/A
chr1	4483527	4483784	chr1	4483527	4483784	258	24.58	wt_k27_peak_5	257	WT	N/A
chr1	4483527	4483784	.	-1	-1	.	.	.	0	DKO	WT Specific

I ultimately want to just replace the N/A value with WT or DKO specific from the duplicated row and then remove the duplicated row, so my final data should look like this:

UnionChr	UnionStart	UnionEnd	IntersectChr	IntersectStart	IntersectEnd	IntersectLength	IntersectPileup	IntersectName	Overlap	Genotype	PeakType
chr1	3667144	3668013	chr1	3667144	3668013	870	20.60	dko_k27_peak_1	869	DKO	DKO Specific
chr1	4478778	4479151	chr1	4478778	4479151	374	22.90	wt_k27_peak_4	373	WT	WT Specific
chr1	4482327	4483301	chr1	4482327	4483301	975	22.77	dko_k27_peak_4	974	DKO	DKO Specific
chr1	4483527	4483784	chr1	4483527	4483784	258	24.58	wt_k27_peak_5	257	WT	WT Specific

I can't do a search/replace based Genotype and PeakType column because I have other rows that don't have this duplicated problem that also have N/A. An additional problem is that the duplicated row is either leading or lagging, depending on the data set it came from.

I know I should use dplyr, and group by the first three columns, and somehow use lead/lag.

test <- df %>% 
  group_by(UnionChr, UnionStart, UnionEnd) %>% 
  mutate(??)

CodePudding user response：

Here is one method

library(dplyr)
library(tidyr)
df %>% 
   mutate(PeakType = na_if(PeakType, "N/A"),
       indx = is.na(PeakType)) %>%
   group_by(UnionChr, UnionStart, UnionEnd) %>% 
   fill(PeakType, .direction = "downup") %>% 
  filter(indx) %>% 
  ungroup %>%
  select(-indx)

-output

# A tibble: 4 × 12
  UnionChr UnionStart UnionEnd IntersectChr IntersectStart IntersectEnd IntersectLe…¹ Inter…² Inter…³ Overlap Genot…⁴ PeakT…⁵
  <chr>         <int>    <int> <chr>                 <int>        <int> <chr>         <chr>   <chr>     <int> <chr>   <chr>  
1 chr1        3667144  3668013 chr1                3667144      3668013 870           20.60   dko_k2…     869 DKO     DKO Sp…
2 chr1        4478778  4479151 chr1                4478778      4479151 374           22.90   wt_k27…     373 WT      WT Spe…
3 chr1        4482327  4483301 chr1                4482327      4483301 975           22.77   dko_k2…     974 DKO     DKO Sp…
4 chr1        4483527  4483784 chr1                4483527      4483784 258           24.58   wt_k27…     257 WT      WT Spe…
# … with abbreviated variable names ¹IntersectLength, ²IntersectPileup, ³IntersectName, ⁴Genotype, ⁵PeakType

data

df <- structure(list(UnionChr = c("chr1", "chr1", "chr1", "chr1", "chr1", 
"chr1", "chr1", "chr1"), UnionStart = c(3667144L, 3667144L, 4478778L, 
4478778L, 4482327L, 4482327L, 4483527L, 4483527L), UnionEnd = c(3668013L, 
3668013L, 4479151L, 4479151L, 4483301L, 4483301L, 4483784L, 4483784L
), IntersectChr = c(".", "chr1", "chr1", ".", ".", "chr1", "chr1", 
"."), IntersectStart = c(-1L, 3667144L, 4478778L, -1L, -1L, 4482327L, 
4483527L, -1L), IntersectEnd = c(-1L, 3668013L, 4479151L, -1L, 
-1L, 4483301L, 4483784L, -1L), IntersectLength = c(".", "870", 
"374", ".", ".", "975", "258", "."), IntersectPileup = c(".", 
"20.60", "22.90", ".", ".", "22.77", "24.58", "."), IntersectName = c(".", 
"dko_k27_peak_1", "wt_k27_peak_4", ".", ".", "dko_k27_peak_4", 
"wt_k27_peak_5", "."), Overlap = c(0L, 869L, 373L, 0L, 0L, 974L, 
257L, 0L), Genotype = c("WT", "DKO", "WT", "DKO", "WT", "DKO", 
"WT", "DKO"), PeakType = c("DKO Specific", "N/A", "N/A", "WT Specific", 
"DKO Specific", "N/A", "N/A", "WT Specific")), 
class = "data.frame", row.names = c(NA, 
-8L))

CodePudding user response：

Here is a similar way:

library(dplyr)
library(tidyr)

df %>%
  group_by(UnionStart) %>% 
  mutate(PeakType = na_if(PeakType, "N/A")) %>% 
  fill(PeakType, .direction = "downup") %>% 
  filter(!if_any(.col=everything(), .fns = ~ . == "."))

  UnionChr UnionStart UnionEnd IntersectChr IntersectStart IntersectEnd IntersectLength IntersectPileup IntersectName  Overlap Genotype PeakType    
  <chr>         <int>    <int> <chr>                 <int>        <int> <chr>           <chr>           <chr>            <int> <chr>    <chr>       
1 chr1        3667144  3668013 chr1                3667144      3668013 870             20.60           dko_k27_peak_1     869 DKO      DKO Specific
2 chr1        4478778  4479151 chr1                4478778      4479151 374             22.90           wt_k27_peak_4      373 WT       WT Specific 
3 chr1        4482327  4483301 chr1                4482327      4483301 975             22.77           dko_k27_peak_4     974 DKO      DKO Specific
4 chr1        4483527  4483784 chr1                4483527      4483784 258             24.58           wt_k27_peak_5      257 WT       WT Specific