Home > Enterprise >  Remove duplicated from data based on values from another column
Remove duplicated from data based on values from another column

Time:10-15

I have a dataset (named raw.data) with 3 columns - DateTime, Pressure, and Well.

I am not sure if this is possible, but I want to remove duplicated DateTime values for Pressure values under 4000 and leave everything above 4000 alone.

Here is a portion of the data:

> dput(raw.data[16500:17000,c(1,4,5)])
structure(list(Well = structure(c(1L, 2L, 1L, 3L, 3L, 1L, 2L,  2L, 3L, 1L, 2L, 3L, 1L, 3L, 1L, 2L, 1L, 3L, 2L, 1L, 3L, 2L, 1L,  2L, 3L, 2L, 1L, 3L, 3L, 1L, 2L, 1L, 3L, 2L, 1L, 3L, 2L, 2L, 1L, 3L, 2L, 3L, 1L, 3L, 1L, 2L, 1L, 3L, 2L, 1L, 3L, 2L, 1L, 2L, 3L, 2L, 3L, 1L, 1L, 3L, 2L, 3L, 2L, 1L, 3L, 2L, 1L, 3L, 1L, 2L, 2L, 3L, 1L, 2L, 1L, 3L, 1L, 2L, 3L, 2L, 1L, 3L, 3L, 2L, 1L, 1L, 2L, 
3L, 2L, 3L, 1L, 3L, 1L, 2L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 3L, 1L, 2L, 2L, 3L, 1L, 2L, 3L, 1L, 1L, 2L, 3L, 3L, 2L, 1L, 2L, 3L, 1L, 3L, 1L, 2L, 2L, 1L, 3L, 1L, 2L, 3L, 3L, 1L, 2L, 1L, 2L, 
3L, 2L, 3L, 1L, 2L, 1L, 3L, 1L, 2L, 3L, 2L, 3L, 1L, 3L, 1L, 2L, 3L, 1L, 2L, 2L, 3L, 1L, 1L, 2L, 3L, 3L, 1L, 2L, 2L, 3L, 1L, 3L, 2L, 1L, 1L, 2L, 3L, 3L, 1L, 2L, 1L, 3L, 2L, 1L, 3L, 2L, 1L, 3L, 2L, 1L, 2L, 3L, 1L, 2L, 3L, 2L, 1L, 3L, 2L, 1L, 3L, 2L, 1L, 3L, 1L, 3L, 2L, 2L, 3L, 1L, 3L, 2L, 1L, 3L, 2L, 1L, 3L, 2L, 1L, 3L, 1L, 2L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 
3L, 1L, 3L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 2L, 1L, 3L, 2L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 3L, 1L, 2L, 1L, 3L, 2L, 1L, 3L, 2L, 3L, 2L, 1L, 2L, 1L, 3L, 2L, 1L, 
3L, 2L, 3L, 1L, 1L, 3L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 1L, 2L, 3L, 2L, 1L, 3L, 2L, 1L, 3L, 1L, 3L, 2L, 3L, 2L, 1L, 3L, 2L, 1L, 2L, 1L, 3L, 2L, 1L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 
3L, 2L, 3L, 1L, 3L, 2L, 1L, 3L, 2L, 1L, 2L, 1L, 3L, 1L, 3L, 2L, 1L, 3L, 2L, 3L, 2L, 1L, 3L, 2L, 1L, 1L, 3L, 2L, 1L, 3L, 2L, 3L, 1L, 2L, 3L, 2L, 1L, 2L, 1L, 3L, 2L, 3L, 1L, 3L, 1L, 2L, 2L, 1L, 
3L, 3L, 2L, 1L, 1L, 2L, 3L, 2L, 3L, 1L, 3L, 1L, 2L, 3L, 2L, 1L, 2L, 1L, 3L, 2L, 3L, 1L, 3L, 1L, 2L, 1L, 3L, 2L, 3L, 2L, 1L, 2L, 3L, 1L, 1L, 2L, 3L, 3L, 1L, 2L, 2L, 1L, 3L, 3L, 1L, 2L, 3L, 1L, 
2L, 1L, 2L, 3L, 3L, 2L, 1L, 2L, 1L, 3L, 3L, 2L, 1L, 1L, 3L, 2L, 3L, 2L, 1L, 2L, 3L, 1L, 2L, 1L, 3L, 3L, 2L, 1L, 2L, 1L, 3L, 2L, 3L, 1L, 3L, 2L, 1L, 3L, 2L, 1L, 2L, 1L, 3L, 2L, 3L, 1L, 2L, 3L, 1L, 3L, 1L, 2L, 1L, 3L, 2L, 2L, 3L, 1L, 2L, 1L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 3L, 2L, 3L, 2L, 1L, 1L, 2L, 3L, 2L, 1L), .Label = c("FSV-004", 
"FSV-030", "FSV-039"), class = "factor"), Pressure = c(2617, 3228, 2605, 8082, 7999, 2607, 2713, 2701, 7927, 2605, 2675, 8166, 2609, 8037, 2604, 3919, 2610, 7932, 4553, 2599, 7981, 4120, 2604, 3937, 8107, 3445, 2599, 7990, 7985, 2597, 3292, 2605, 8047, 3188, 2604, 8067, 3098, 3015, 2606, 7865, 2962, 7901, 2601, 8055, 2601, 2905, 2603, 7961, 2833, 2592, 7871, 2785, 2599, 2716, 7955, 2652, 7965, 2598, 2598, 7864, 2612, 7834, 2623, 2599, 7978, 2623, 2600, 7949, 2604, 2601, 2615, 7778, 2595, 2606, 2595, 7886, 2593, 2608, 7950, 2607, 2587, 7860, 7817, 2600, 2597, 2594, 2592, 7917, 2585, 7834, 2594, 7816, 2596, 2581, 2592, 2575, 7883, 2594, 2576, 7904, 2595, 2576, 7751, 7790, 2594, 2572, 2562, 7823, 2595, 2574, 7814, 2591, 2596, 2578, 7781, 7957, 2565, 2589, 2557, 7851, 2592, 7836, 2587, 2560, 2556, 2587, 7907, 2590, 2557, 7949, 7790, 2585, 2564, 2591, 2560, 7917, 2549, 7855, 2589, 2548, 2589, 7803, 2594, 2547, 7826, 2553, 7919, 2595, 7800, 2586, 2543, 7807, 2589, 2539, 2540,7859, 2585, 2586, 2538, 7892, 7796, 2584, 2534, 2531, 7885, 2586, 7876, 2530, 2587, 2585, 2525, 7732, 7442, 2585, 2523, 2581, 7792, 2529, 2583, 7442, 2526, 2583, 3589, 2523, 3054, 2517, 3252, 4221, 2515, 3170, 2525, 6109, 3103, 2513, 7511, 3051, 2514, 8398, 3018, 
8802, 2994, 2509, 2520, 2971, 9328, 2956, 2512, 8493, 2941, 2507, 8983, 2929, 2502, 8920, 2921, 8710, 2501, 9349, 2506, 2909, 9214, 2489, 2900, 9194, 2503, 2888, 9105, 2505, 2879, 9016, 2490, 2873, 9014, 2867, 2482, 2863, 8891, 2496, 2859, 8994, 2494, 2856, 8957, 2486, 2854, 8882, 2494, 2846, 2477, 8946, 2846, 2477, 8874, 2483, 2841, 8761, 2478, 2841, 8845, 2473, 2836, 8957, 2826, 8908, 2480, 8801, 2820, 2472, 8914, 2816, 2475, 2810, 2470, 8952, 2467, 8832, 2811, 2472, 8896, 2822, 2471, 2808, 8921, 8859, 2804, 2466, 2797, 8823, 2463, 2791, 8968, 2462, 2786, 8965, 2457, 8774, 2446, 2782, 2453, 8807, 2781, 2450, 8908, 2777, 8861, 2777, 2443, 2773, 2450, 8726, 2769, 2446, 8842, 2450, 8811, 2768, 2444, 8662, 2766, 8606, 2445, 2758, 8754, 2442, 2762, 8722, 2442, 2762, 2444, 2764, 8607, 2758, 2434, 8617, 3083, 2441, 8723, 2438, 8603, 3359, 8546, 3425, 2433, 8672, 3475, 2432, 2869, 2433, 8677, 2823, 2426, 8576, 8557, 2814, 2429, 8712, 4304, 2427, 4398, 8666, 2430, 4327, 2424, 8631, 2428, 8657, 3637, 2424, 3144, 8623, 3074, 8530, 2420, 2422, 8621, 
3008, 2927, 2422, 8600, 8506, 2426, 2859, 2414, 2795, 8485, 2767, 8580, 2416, 2755, 2414, 8476, 2420, 8379, 2728, 2419, 2769, 8501, 2767, 8471, 2411, 8443, 2763, 2410, 2763, 2407, 8532, 2408, 2759, 8550, 8450, 2405, 2755, 2754, 8444, 2404, 2406, 8555, 2749, 2749, 8426, 2401, 2749, 8439, 2400, 8522, 2400, 2744, 2744, 2398, 8482, 2397, 8377, 2741, 2738, 2398, 8479, 8500, 2730, 2398, 2726, 2394, 8337, 2392, 2724, 8415, 2389, 8523, 2726, 2724, 2395, 8343, 2388, 8384, 2720, 2387, 2719, 8503, 2720, 2391, 8451, 2718, 2384, 8346, 2385, 8446, 2717, 2382, 2717, 8477, 2385, 2715, 8336, 2713, 8400, 2385, 8494, 2712, 2383, 2378, 2711, 8350, 2376, 8434, 2710, 8536, 2377, 2708, 8431, 2377, 2707, 8353, 2706, 2374, 2703, 2367, 8273, 7682, 2374, 2702, 2372, 3678), DateTime = structure(c(1632248460, 1632248520, 1632248520, 1632248520, 1632248580, 1632248580, 1632248580, 
1632248640, 1632248640, 1632248640, 1632248700, 1632248700, 1632248700, 1632248760, 1632248760, 1632248760, 1632248820, 1632248820, 1632248820, 1632248880, 1632248880, 1632248880, 1632248940, 1632248940, 1632248940, 1632249000, 1632249000, 1632249000, 1632249060, 1632249060, 1632249060, 
1632249120, 1632249120, 1632249120, 1632249180, 1632249180, 1632249180, 1632249240, 1632249240, 1632249240, 1632249300, 1632249300, 1632249300, 1632249360, 1632249360, 1632249360, 1632249420, 1632249420, 1632249420, 1632249480, 1632249480, 1632249480, 1632249540, 1632249540, 1632249540, 
1632249600, 1632249600, 1632249600, 1632249660, 1632249660, 1632249660, 1632249720, 1632249720, 1632249720, 1632249780, 1632249780, 1632249780, 1632249840, 1632249840, 1632249840, 1632249900, 1632249900, 1632249900, 1632249960, 1632249960, 1632249960, 1632250020, 1632250020, 1632250020, 
1632250080, 1632250080, 1632250080, 1632250140, 1632250140, 1632250140, 1632250200, 1632250200, 1632250200, 1632250260, 1632250260, 1632250260, 1632250320, 1632250320, 1632250320, 1632250380, 1632250380, 1632250380, 1632250440, 1632250440, 1632250440, 1632250500, 1632250500, 1632250500, 
1632250560, 1632250560, 1632250560, 1632250620, 1632250620, 1632250620, 1632250680, 1632250680, 1632250680, 1632250740, 1632250740, 1632250740, 1632250800, 1632250800, 1632250800, 1632250860, 1632250860, 1632250860, 1632250920, 1632250920, 1632250920, 1632250980, 1632250980, 1632250980, 
1632251040, 1632251040, 1632251040, 1632251100, 1632251100, 1632251100, 1632251160, 1632251160, 1632251160, 1632251220, 1632251220, 1632251220, 1632251280, 1632251280, 1632251280, 1632251340, 1632251340, 1632251340, 1632251400, 1632251400, 1632251400, 1632251460, 1632251460, 1632251460, 
1632251520, 1632251520, 1632251520, 1632251580, 1632251580, 1632251580, 1632251640, 1632251640, 1632251640, 1632251700, 1632251700, 1632251700, 1632251760, 1632251760, 1632251760, 1632251820, 1632251820, 1632251820, 1632251880, 1632251880, 1632251880, 1632251940, 1632251940, 1632251940, 
1632252000, 1632252000, 1632252000, 1632252060, 1632252060, 1632252060, 1632252120, 1632252120, 1632252120, 1632252180, 1632252180, 1632252180, 1632252240, 1632252240, 1632252240, 1632252300, 1632252300, 1632252300, 1632252360, 1632252360, 1632252360, 1632252420, 1632252420, 1632252420, 
1632252480, 1632252480, 1632252480, 1632252540, 1632252540, 1632252540, 1632252600, 1632252600, 1632252600, 1632252660, 1632252660, 1632252660, 1632252720, 1632252720, 1632252720, 1632252780, 1632252780, 1632252780, 1632252840, 1632252840, 1632252840, 1632252900, 1632252900, 1632252900, 
1632252960, 1632252960, 1632252960, 1632253020, 1632253020, 1632253020, 1632253080, 1632253080, 1632253080, 1632253140, 1632253140, 1632253140, 1632253200, 1632253200, 1632253200, 1632253260, 1632253260, 1632253260, 1632253320, 1632253320, 1632253320, 1632253380, 1632253380, 1632253380, 
1632253440, 1632253440, 1632253440, 1632253500, 1632253500, 1632253500, 1632253560, 1632253560, 1632253560, 1632253620, 1632253620, 1632253620, 1632253680, 1632253680, 1632253680, 1632253740, 1632253740, 1632253740, 1632253800, 1632253800, 1632253800, 1632253860, 1632253860, 1632253860, 
1632253920, 1632253920, 1632253920, 1632253980, 1632253980, 1632253980, 1632254040, 1632254040, 1632254040, 1632254100, 1632254100, 1632254100, 1632254160, 1632254160, 1632254160, 1632254220, 1632254220, 1632254220, 1632254280, 1632254280, 1632254280, 1632254340, 1632254340, 1632254340, 
1632254400, 1632254400, 1632254400, 1632254460, 1632254460, 1632254460, 1632254520, 1632254520, 1632254520, 1632254580, 1632254580, 1632254580, 1632254640, 1632254640, 1632254640, 1632254700, 1632254700, 1632254700, 1632254760, 1632254760, 1632254760, 1632254820, 1632254820, 1632254820, 
1632254880, 1632254880, 1632254880, 1632254940, 1632254940, 1632254940, 1632255000, 1632255000, 1632255000, 1632255060, 1632255060, 1632255060, 1632255120, 1632255120, 1632255120, 1632255180, 1632255180, 1632255180, 1632255240, 1632255240, 1632255240, 1632255300, 1632255300, 1632255300, 
1632255360, 1632255360, 1632255360, 1632255420, 1632255420, 1632255420, 1632255480, 1632255480,1632255480, 1632255540, 1632255540, 1632255540, 1632255600, 1632255600, 1632255600, 1632255660, 1632255660, 1632255660, 1632255720, 1632255720, 1632255720, 1632255780, 1632255780, 1632255780, 1632255840, 1632255840, 1632255840, 1632255900, 1632255900, 1632255900, 1632255960, 1632255960, 1632255960, 1632256020, 1632256020, 1632256020, 1632256080, 1632256080, 1632256080, 1632256140, 1632256140, 1632256140, 1632256200, 1632256200, 1632256200, 1632256260, 1632256260, 1632256260, 1632256320, 1632256320, 1632256320, 1632256380, 1632256380, 1632256380, 
1632256440, 1632256440, 1632256440, 1632256500, 1632256500, 1632256500, 1632256560, 1632256560, 1632256560, 1632256620, 1632256620, 1632256620, 1632256680, 1632256680, 1632256680, 1632256740, 1632256740, 1632256740, 1632256800, 1632256800, 1632256800, 1632256860, 1632256860, 1632256860, 
1632256920, 1632256920, 1632256920, 1632256980, 1632256980, 1632256980, 1632257040, 1632257040, 1632257040, 1632257100, 1632257100, 1632257100, 1632257160, 1632257160, 1632257160, 1632257220, 1632257220, 1632257220, 1632257280, 1632257280, 1632257280, 1632257340, 1632257340, 1632257340, 
1632257400, 1632257400, 1632257400, 1632257460, 1632257460, 1632257460, 1632257520, 1632257520, 1632257520, 1632257580, 1632257580, 1632257580, 1632257640, 1632257640, 1632257640, 1632257700, 1632257700, 1632257700, 1632257760, 1632257760, 1632257760, 1632257820, 1632257820, 1632257820, 
1632257880, 1632257880, 1632257880, 1632257940, 1632257940, 1632257940, 1632258000, 1632258000, 1632258000, 1632258060, 1632258060, 1632258060, 1632258120, 1632258120, 1632258120, 1632258180, 1632258180, 1632258180, 1632258240, 1632258240, 1632258240, 1632258300, 1632258300, 1632258300, 
1632258360, 1632258360, 1632258360, 1632258420, 1632258420, 1632258420, 1632258480, 1632258480), class = c("POSIXct", "POSIXt"), tzone = "")), 
row.names = 16500:17000, class = "data.frame")

I have tried filtering the data a few ways, mostly with dplyr(), but am having trouble with removal of duplicated DateTime values.

I found this popular post in stack overflow, but it isn't quite what I am looking for: Remove duplicated rows.

I tried things like this to identify the duplicated DateTime points, but they are not working:

#One try
modified <- raw.data %>% 
  group_by(rleid=data.table::rleid(Pressure < 4000)) %>% 
  mutate(Pressure = replace(Pressure, duplicated(DateTime), NA)) %>%
  ungroup()

#Another try
modified <- raw.data %>% filter(Pressure < 4000) %>% which(duplicated(DateTime)

Any help would be appreciated. Thanks.

CodePudding user response:

Why not just a slice,

bellow = raw.data[raw.data$Pressure<4000,]
bellow = bellow[!duplicated(bellow$DateTime), ]
above = raw.data[raw.data$Pressure>4000,]

Then merge the two data frames.

CodePudding user response:

Perhaps this helps

library(dplyr)
raw.data %>%
   mutate(ind = row_number() * NA^(!Pressure < 4000)) %>% 
   filter(!duplicated(DateTime[ind])|Pressure > 4000) %>%
   select(-ind)
  • Related