I have the following dataset where I have replicates of the same ID entry (example: 20103 is repeated twice):
head(Data)
## X ID DOB sector meters Oct Res_FROM Res_TO Exp_FROM Exp_TO
## 1 1 20100 8/24/1979 H38 6400 W 8/15/1979 5/15/1991 8/24/1979 12/31/1988
## 2 2 20101 5/5/1980 B01 1600 NW 5/15/1980 4/15/1991 5/15/1980 12/31/1988
## 3 3 20102 3/17/1979 H04 1600 SW 6/15/1972 8/15/1979 3/17/1979 8/15/1979
## 4 4 20103 11/30/1981 B09 3200 NE 1/15/1982 1/15/1984 1/15/1982 1/15/1984
## 5 5 20103 11/30/1981 B37 8000 N 1/15/1984 4/15/1986 1/15/1984 4/15/1986
## 6 6 20104 9/1/1978 B09 3200 NE 1/15/1982 1/15/1984 1/15/1982 1/15/1984
## Exps_Grp Yr1952 Yr1953 Yr1954 Yr1955 Yr1956 Yr1957 Yr1958 Yr1959 Yr1960
## 1 NA NA NA NA NA NA NA NA NA NA
## 2 NA NA NA NA NA NA NA NA NA NA
## 3 NA NA NA NA NA NA NA NA NA NA
## 4 NA NA NA NA NA NA NA NA NA NA
## 5 NA NA NA NA NA NA NA NA NA NA
## 6 NA NA NA NA NA NA NA NA NA NA
## Yr1961 Yr1962 Yr1963 Yr1964 Yr1965 Yr1966 Yr1967 Yr1968 Yr1969 Yr1970 Yr1971
## 1 NA NA NA NA NA NA NA NA NA NA NA
## 2 NA NA NA NA NA NA NA NA NA NA NA
## 3 NA NA NA NA NA NA NA NA NA NA NA
## 4 NA NA NA NA NA NA NA NA NA NA NA
## 5 NA NA NA NA NA NA NA NA NA NA NA
## 6 NA NA NA NA NA NA NA NA NA NA NA
## Yr1972 Yr1973 Yr1974 Yr1975 Yr1976 Yr1977 Yr1978 Yr1979 Yr1980 Yr1981
## 1 NA NA NA NA NA NA NA 1.082616 0.7834218 0.7834218
## 2 NA NA NA NA NA NA NA NA 0.6825884 1.0937646
## 3 NA NA NA NA NA NA NA 4.673775 NA NA
## 4 NA NA NA NA NA NA NA NA NA NA
## 5 NA NA NA NA NA NA NA NA NA NA
## 6 NA NA NA NA NA NA NA NA NA NA
## Yr1982 Yr1983 Yr1984 Yr1985 Yr1986 Yr1987 Yr1988
## 1 0.7834218 0.7834218 0.7834218 0.7834218 0.7834218 0.7834218 0.1956091
## 2 1.0937646 1.0937646 1.0937646 1.0937646 1.0937646 1.0937646 0.2730972
## 3 NA NA NA NA NA NA NA
## 4 2.7934596 2.8975827 0.1041230 NA NA NA NA
## 5 NA NA 0.5662659 0.5890579 0.1416258 NA NA
## 6 2.7934596 2.8975827 0.1041230 NA NA NA NA
## Yrs_Exp arth_mean median cumulative age
## 1 9.3616438 0.7545599 0.7834218 7.545599 9
## 2 8.6356164 0.9568931 1.0937646 8.612038 8
## 3 0.4136986 4.6737751 4.6737751 4.673775 0
## 4 2.0000000 1.9317218 2.7934596 5.795165 2
## 5 2.2493151 0.4323165 0.5662659 1.296950 4
## 6 2.0000000 1.9317218 2.7934596 5.795165 5
My full data has over 14,000 rows. I want to eliminate the duplicate ID entries; however, when I do this elimination, I need to keep the row that has the highest age. For example, the two rows with ID = 20103, the first age is 2 and the other age is 4. I would only want to keep the row with the highest age, which would be 4. I've used the aggregate function before, but I don't know how to apply it to only eliminate duplicates and keep the row with the highest number for age. I have included a reproducible dataset below.
structure(list(X = 1:9, ID = c(20100L, 20101L, 20102L, 20103L,
20103L, 20104L, 20104L, 20105L, 20105L), DOB = c("8/24/1979",
"5/5/1980", "3/17/1979", "11/30/1981", "11/30/1981", "9/1/1978",
"9/1/1978", "12/3/1980", "12/3/1980"), sector = c("H38", "B01",
"H04", "B09", "B37", "B09", "B37", "B09", "B09"), meters = c(6400L,
1600L, 1600L, 3200L, 8000L, 3200L, 8000L, 3200L, 3200L), Oct = c("W",
"NW", "SW", "NE", "N", "NE", "N", "NE", "NE"), Res_FROM = c("8/15/1979",
"5/15/1980", "6/15/1972", "1/15/1982", "1/15/1984", "1/15/1982",
"1/15/1984", "12/15/1980", "8/15/1983"), Res_TO = c("5/15/1991",
"4/15/1991", "8/15/1979", "1/15/1984", "4/15/1986", "1/15/1984",
"4/15/1986", "8/15/1983", "3/15/1991"), Exp_FROM = c("8/24/1979",
"5/15/1980", "3/17/1979", "1/15/1982", "1/15/1984", "1/15/1982",
"1/15/1984", "12/15/1980", "8/15/1983"), Exp_TO = c("12/31/1988",
"12/31/1988", "8/15/1979", "1/15/1984", "4/15/1986", "1/15/1984",
"4/15/1986", "8/15/1983", "12/31/1988"), Exps_Grp = c(NA, NA,
NA, NA, NA, NA, NA, NA, NA), Yr1952 = c(NA, NA, NA, NA, NA, NA,
NA, NA, NA), Yr1953 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA),
Yr1954 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA), Yr1955 = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA), Yr1956 = c(NA, NA, NA, NA,
NA, NA, NA, NA, NA), Yr1957 = c(NA, NA, NA, NA, NA, NA, NA,
NA, NA), Yr1958 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA),
Yr1959 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA), Yr1960 = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA), Yr1961 = c(NA, NA, NA, NA,
NA, NA, NA, NA, NA), Yr1962 = c(NA, NA, NA, NA, NA, NA, NA,
NA, NA), Yr1963 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA),
Yr1964 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA), Yr1965 = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA), Yr1966 = c(NA, NA, NA, NA,
NA, NA, NA, NA, NA), Yr1967 = c(NA, NA, NA, NA, NA, NA, NA,
NA, NA), Yr1968 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA),
Yr1969 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA), Yr1970 = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA), Yr1971 = c(NA, NA, NA, NA,
NA, NA, NA, NA, NA), Yr1972 = c(NA, NA, NA, NA, NA, NA, NA,
NA, NA), Yr1973 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA),
Yr1974 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA), Yr1975 = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA), Yr1976 = c(NA, NA, NA, NA,
NA, NA, NA, NA, NA), Yr1977 = c(NA, NA, NA, NA, NA, NA, NA,
NA, NA), Yr1978 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA),
Yr1979 = c(1.082616155, NA, 4.673775148, NA, NA, NA, NA,
NA, NA), Yr1980 = c(0.783421772, 0.682588398, NA, NA, NA,
NA, NA, 0.120085751, NA), Yr1981 = c(0.783421772, 1.093764595,
NA, NA, NA, NA, NA, 2.897582683, NA), Yr1982 = c(0.783421772,
1.093764595, NA, 2.793459642, NA, 2.793459642, NA, 2.897582683,
NA), Yr1983 = c(0.783421772, 1.093764595, NA, 2.897582683,
NA, 2.897582683, NA, 1.805844233, 1.09173845), Yr1984 = c(0.783421772,
1.093764595, NA, 0.104123041, 0.566265934, 0.104123041, 0.566265934,
NA, 2.897582683), Yr1985 = c(0.783421772, 1.093764595, NA,
NA, 0.589057923, NA, 0.589057923, NA, 2.897582683), Yr1986 = c(0.783421772,
1.093764595, NA, NA, 0.141625765, NA, 0.141625765, NA, 2.897582683
), Yr1987 = c(0.783421772, 1.093764595, NA, NA, NA, NA, NA,
NA, 2.897582683), Yr1988 = c(0.1956091, 0.27309722, NA, NA,
NA, NA, NA, NA, 0.723484539), Yrs_Exp = c(9.361643836, 8.635616438,
0.41369863, 2, 2.249315068, 2, 2.249315068, 2.665753425,
5.383561644), arth_mean = c(0.754559943, 0.956893087, 4.673775148,
1.931721789, 0.432316541, 1.931721789, 0.432316541, 1.930273838,
2.234258954), median = c(0.783421772, 1.093764595, 4.673775148,
2.793459642, 0.566265934, 2.793459642, 0.566265934, 2.351713458,
2.897582683), cumulative = c(7.545599433, 8.612037782, 4.673775148,
5.795165366, 1.296949622, 5.795165366, 1.296949622, 7.72109535,
13.40555372), age = c(9L, 8L, 0L, 2L, 4L, 5L, 7L, 2L, 8L)), class = "data.frame", row.names = c(NA,
-9L))
CodePudding user response:
Simply arrange by in decreasing order and then use distinct
:
library(dplyr)
df %>%
select(ID, age) %>%
arrange(desc(age)) %>%
distinct(ID, .keep_all = TRUE)
#> ID age
#> 1 20100 9
#> 2 20101 8
#> 3 20105 8
#> 4 20104 7
#> 5 20103 4
#> 6 20102 0
Note that, I have only selected the two columns of interest ID
and age
just for concise output and to focus on the issue given here, using select()
, which obviously you don't need to do.
CodePudding user response:
Option using aggregate
:
aggregate(age~ID,df,function(x) max(x))
#> ID age
#> 1 20100 9
#> 2 20101 8
#> 3 20102 0
#> 4 20103 4
#> 5 20104 7
#> 6 20105 8
Created on 2022-07-10 by the reprex package (v2.0.1)
CodePudding user response:
Here is another dplyr
way, grouping and filtering by condition has the same effect:
library(dplyr)
df %>%
select(ID, age) %>% # just to keep the main columns (remove this)
group_by(ID) %>%
filter(age==max(age)) %>%
ungroup()
ID age
<int> <int>
1 20100 9
2 20101 8
3 20102 0
4 20103 4
5 20104 7
6 20105 8
CodePudding user response:
In base R
df_new <- df[order(df$age, decreasing = T), ]
df_new <- df_new[!duplicated(df_new$ID), ]
output of ID and age columns
> df_new[ , c("ID","age")]
ID age
1 20100 9
2 20101 8
9 20105 8
7 20104 7
5 20103 4
3 20102 0