I have the following dataset:
## ID cumulative sector meters DOB
## 1 20100 41.75948 H38 6400 1979-08-24
## 2 20101 35.16874 B01 1600 1980-05-05
## 3 20102 20.60899 H04 1600 1979-03-17
## 4 20103 38.72266 c("B09", "B37") c(3200, 8000) c("1981-11-30", "1981-11-30")
## 5 20104 38.72266 c("B09", "B37") c(3200, 8000) c("1978-09-01", "1978-09-01")
## 6 20105 116.88668 c("B09", "B09") c(3200, 3200) c("1980-12-03", "1980-12-03")
## Oct Res_FROM Res_TO
## 1 W 1979-08-15 1991-05-15
## 2 NW 1980-05-15 1991-04-15
## 3 SW 1972-06-15 1979-08-15
## 4 c("NE", "N") c("1982-01-15", "1984-01-15") c("1984-01-15", "1986-04-15")
## 5 c("NE", "N") c("1982-01-15", "1984-01-15") c("1984-01-15", "1986-04-15")
## 6 c("NE", "NE") c("1980-12-15", "1983-08-15") c("1983-08-15", "1991-03-15")
## Exp_FROM Exp_TO
## 1 1979-08-24 1988-12-31
## 2 1980-05-15 1988-12-31
## 3 1979-03-17 1979-08-15
## 4 c("1982-01-15", "1984-01-15") c("1984-01-15", "1986-04-15")
## 5 c("1982-01-15", "1984-01-15") c("1984-01-15", "1986-04-15")
## 6 c("1980-12-15", "1983-08-15") c("1983-08-15", "1988-12-31")
## Exps_Grp Yr1952 Yr1953 Yr1954 Yr1955 Yr1956 Yr1957 Yr1958
## 1 fr51>88 NA NA NA NA NA NA NA
## 2 fr51>88 NA NA NA NA NA NA NA
## 3 between NA NA NA NA NA NA NA
## 4 c("between", "between") NA NA NA NA NA NA NA
## 5 c("between", "between") NA NA NA NA NA NA NA
## 6 c("between", "fr51>88") NA NA NA NA NA NA NA
## Yr1959 Yr1960 Yr1961 Yr1962 Yr1963 Yr1964 Yr1965 Yr1966 Yr1967 Yr1968 Yr1969
## 1 NA NA NA NA NA NA NA NA NA NA NA
## 2 NA NA NA NA NA NA NA NA NA NA NA
## 3 NA NA NA NA NA NA NA NA NA NA NA
## 4 NA NA NA NA NA NA NA NA NA NA NA
## 5 NA NA NA NA NA NA NA NA NA NA NA
## 6 NA NA NA NA NA NA NA NA NA NA NA
## Yr1970 Yr1971 Yr1972 Yr1973 Yr1974 Yr1975 Yr1976 Yr1977 Yr1978 Yr1979
## 1 NA NA NA NA NA NA NA NA NA 5.950991
## 2 NA NA NA NA NA NA NA NA NA NA
## 3 NA NA NA NA NA NA NA NA NA 20.608986
## 4 NA NA NA NA NA NA NA NA NA NA
## 5 NA NA NA NA NA NA NA NA NA NA
## 6 NA NA NA NA NA NA NA NA NA NA
## Yr1980 Yr1981 Yr1982 Yr1983 Yr1984 Yr1985 Yr1986
## 1 4.340588 4.340588 4.340588 4.340588 4.340588 4.340588 4.3405881
## 2 2.927725 4.447229 4.447229 4.447229 4.447229 4.447229 4.4472289
## 3 NA NA NA NA NA NA NA
## 4 NA NA 15.365412 16.018407 3.594414 3.052618 0.6918076
## 5 NA NA 15.365412 16.018407 3.594414 3.052618 0.6918076
## 6 0.758267 16.018407 16.018407 16.018407 16.018407 16.018407 16.0184067
## Yr1987 Yr1988 Yrs_Exp arth_mean median cumulative.1 Age
## 1 4.340588 1.083782 9.3616438 4.175948 4.340588 41.75948 9
## 2 4.447229 1.110409 8.6356164 3.907637 4.447229 35.16874 8
## 3 NA NA 0.4136986 20.608986 20.608986 20.60899 9
## 4 NA NA 4.2493151 7.744532 3.594414 38.72266 7
## 5 NA NA 4.2493151 7.744532 3.594414 38.72266 10
## 6 16.018407 3.999565 8.0493151 12.987409 16.018407 116.88668 8
From this information, I want to accomplish a couple things:
I want to calculate how many years that each individual (each ID is one individual) had a mean (which is the arth_mean column) greater than 4. Adding this as a column to my dataset would be ideal.
Coming off of number 2, I want to have each year that each individual exceeded 4 for arth_mean listed out for each person, also in a new column for my dataset. So for example, I may have a column that has 1954, 1966, 1967, etc. listed for each ID.
I'm relatively new to R, so any help or suggestions are appreciated. I have no idea how to get my desired output. I've added reproducible data below.
## structure(list(ID = 20100:20109, cumulative = c(41.75947792,
## 35.16873597, 20.60898553, 38.72265958, 38.72265958, 116.8866785,
## 271.4880918, 134.9473309, 35.91709062, 26.06165412), sector = c("H38",
## "B01", "H04", "c(\"B09\", \"B37\")", "c(\"B09\", \"B37\")", "c(\"B09\", \"B09\")",
## "c(\"B09\", \"B09\")", "H02", "B13", "H38"), meters = c("6400",
## "1600", "1600", "c(3200, 8000)", "c(3200, 8000)", "c(3200, 3200)",
## "c(3200, 3200)", "1600", "4800", "6400"), DOB = c("1979-08-24",
## "1980-05-05", "1979-03-17", "c(\"1981-11-30\", \"1981-11-30\")",
## "c(\"1978-09-01\", \"1978-09-01\")", "c(\"1980-12-03\", \"1980-12-03\")",
## "c(\"1978-04-25\", \"1978-04-25\")", "1979-06-22", "1978-10-09",
## "1982-04-26"), Oct = c("W", "NW", "SW", "c(\"NE\", \"N\")", "c(\"NE\", \"N\")",
## "c(\"NE\", \"NE\")", "c(\"NE\", \"NE\")", "SE", "N", "W"), Res_FROM = c("1979-08-15",
## "1980-05-15", "1972-06-15", "c(\"1982-01-15\", \"1984-01-15\")",
## "c(\"1982-01-15\", \"1984-01-15\")", "c(\"1980-12-15\", \"1983-08-15\")",
## "c(\"1978-04-15\", \"1983-08-15\")", "1979-06-15", "1978-10-15",
## "1982-04-15"), Res_TO = c("1991-05-15", "1991-04-15", "1979-08-15",
## "c(\"1984-01-15\", \"1986-04-15\")", "c(\"1984-01-15\", \"1986-04-15\")",
## "c(\"1983-08-15\", \"1991-03-15\")", "c(\"1983-08-15\", \"2000-01-15\")",
## "1991-04-15", "1983-03-15", "1991-04-15"), Exp_FROM = c("1979-08-24",
## "1980-05-15", "1979-03-17", "c(\"1982-01-15\", \"1984-01-15\")",
## "c(\"1982-01-15\", \"1984-01-15\")", "c(\"1980-12-15\", \"1983-08-15\")",
## "c(\"1978-04-25\", \"1983-08-15\")", "1979-06-22", "1978-10-15",
## "1982-04-26"), Exp_TO = c("1988-12-31", "1988-12-31", "1979-08-15",
## "c(\"1984-01-15\", \"1986-04-15\")", "c(\"1984-01-15\", \"1986-04-15\")",
## "c(\"1983-08-15\", \"1988-12-31\")", "c(\"1983-08-15\", \"1988-12-31\")",
## "1988-12-31", "1983-03-15", "1988-12-31"), Exps_Grp = c("fr51>88",
## "fr51>88", "between", "c(\"between\", \"between\")", "c(\"between\", \"between\")",
## "c(\"between\", \"fr51>88\")", "c(\"between\", \"fr51>88\")",
## "fr51>88", "between", "fr51>88"), Yr1952 = c(NA, NA, NA, NA,
## NA, NA, NA, NA, NA, NA), Yr1953 = c(NA, NA, NA, NA, NA, NA, NA,
## NA, NA, NA), Yr1954 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
## ), Yr1955 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), Yr1956 = c(NA,
## NA, NA, NA, NA, NA, NA, NA, NA, NA), Yr1957 = c(NA, NA, NA, NA,
## NA, NA, NA, NA, NA, NA), Yr1958 = c(NA, NA, NA, NA, NA, NA, NA,
## NA, NA, NA), Yr1959 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
## ), Yr1960 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), Yr1961 = c(NA,
## NA, NA, NA, NA, NA, NA, NA, NA, NA), Yr1962 = c(NA, NA, NA, NA,
## NA, NA, NA, NA, NA, NA), Yr1963 = c(NA, NA, NA, NA, NA, NA, NA,
## NA, NA, NA), Yr1964 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
## ), Yr1965 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), Yr1966 = c(NA,
## NA, NA, NA, NA, NA, NA, NA, NA, NA), Yr1967 = c(NA, NA, NA, NA,
## NA, NA, NA, NA, NA, NA), Yr1968 = c(NA, NA, NA, NA, NA, NA, NA,
## NA, NA, NA), Yr1969 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
## ), Yr1970 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), Yr1971 = c(NA,
## NA, NA, NA, NA, NA, NA, NA, NA, NA), Yr1972 = c(NA, NA, NA, NA,
## NA, NA, NA, NA, NA, NA), Yr1973 = c(NA, NA, NA, NA, NA, NA, NA,
## NA, NA, NA), Yr1974 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
## ), Yr1975 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), Yr1976 = c(NA,
## NA, NA, NA, NA, NA, NA, NA, NA, NA), Yr1977 = c(NA, NA, NA, NA,
## NA, NA, NA, NA, NA, NA), Yr1978 = c(NA, NA, NA, NA, NA, NA, 79.39642441,
## NA, 5.655852488, NA), Yr1979 = c(5.950991161, NA, 20.60898553,
## NA, NA, NA, 59.94484924, 28.50091596, 16.41746701, NA), Yr1980 = c(4.340588078,
## 2.927724588, NA, NA, NA, 0.758267013, 16.01840668, 12.90308755,
## 4.387060214, NA), Yr1981 = c(4.340588078, 4.447228937, NA, NA,
## NA, 16.01840668, 16.01840668, 12.90308755, 4.387060214, NA),
## Yr1982 = c(4.340588078, 4.447228937, NA, 15.36541238, 15.36541238,
## 16.01840668, 16.01840668, 12.90308755, 4.387060214, 3.274931595
## ), Yr1983 = c(4.340588078, 4.447228937, NA, 16.01840668,
## 16.01840668, 16.01840668, 16.01840668, 12.90308755, 0.682590481,
## 4.340588078), Yr1984 = c(4.340588078, 4.447228937, NA, 3.594414445,
## 3.594414445, 16.01840668, 16.01840668, 12.90308755, NA, 4.340588078
## ), Yr1985 = c(4.340588078, 4.447228937, NA, 3.052618478,
## 3.052618478, 16.01840668, 16.01840668, 12.90308755, NA, 4.340588078
## ), Yr1986 = c(4.340588078, 4.447228937, NA, 0.691807598,
## 0.691807598, 16.01840668, 16.01840668, 12.90308755, NA, 4.340588078
## ), Yr1987 = c(4.340588078, 4.447228937, NA, NA, NA, 16.01840668,
## 16.01840668, 12.90308755, NA, 4.340588078), Yr1988 = c(1.083782142,
## 1.110408824, NA, NA, NA, 3.999564755, 3.999564755, 3.221714571,
## NA, 1.083782142), Yrs_Exp = c(9.361643836, 8.635616438, 0.41369863,
## 4.249315068, 4.249315068, 8.049315068, 10.69315068, 9.534246575,
## 4.416438356, 6.687671233), arth_mean = c(4.175947792, 3.907637331,
## 20.60898553, 7.744531916, 7.744531916, 12.98740872, 24.68073562,
## 13.49473309, 5.98618177, 3.723093446), median = c(4.340588078,
## 4.447228937, 20.60898553, 3.594414445, 3.594414445, 16.01840668,
## 16.01840668, 12.90308755, 4.387060214, 4.340588078), cumulative.1 = c(41.75947792,
## 35.16873597, 20.60898553, 38.72265958, 38.72265958, 116.8866785,
## 271.4880918, 134.9473309, 35.91709062, 26.06165412), Age = c(9L,
## 8L, 9L, 7L, 10L, 8L, 10L, 9L, 10L, 6L)), class = "data.frame", row.names = c(NA,
## -10L))
CodePudding user response:
If your 10 row dataset above is dat
, then you can melt the dataset into a long format (Yr columns), restrict to rows where the value in those year columns exceeds 4, count the number of rows as num_yr_above
, and concatenate the year values as years_above
.
Then just join back to dat
library(data.table)
setDT(dat)
dat[melt(dat,id="ID",
measure=patterns("Yr\\d"))[value>4, .(
num_yr_above=.N,
years_above = list(gsub("Yr","",variable))),
by=ID],
on=.(ID)]
Output: (all columns included, but here I show only the new additional columns added to the dataset)
ID num_yr_above years_above
1: 20106 10 1978,1979,1980,1981,1982,1983,...
2: 20108 5 1978,1979,1980,1981,1982
3: 20100 9 1979,1980,1981,1982,1983,1984,...
4: 20102 1 1979
5: 20107 9 1979,1980,1981,1982,1983,1984,...
6: 20101 7 1981,1982,1983,1984,1985,1986,...
7: 20105 7 1981,1982,1983,1984,1985,1986,...
8: 20103 2 1982,1983
9: 20104 2 1982,1983
10: 20109 5 1983,1984,1985,1986,1987