I'm working with a cross-country panel dataset.
I want to create a new variable (let's call it C1
) such that the variable for Argentina in 1999 reflects the average value of inf from the beginning of the sample till the previous year (i.e., 1998).
Likewise, C1 for Turkey in 1991 reflects the average of inf from the beginning of the sample till 1990. Note that the first year for which data is available might be different for each country.
I would like to ideally do this using dplyr
. Any suggestions or leads would be much appreciated.
Below is a snapshot of my data.
# A tibble: 82 x 3
country year inf
<chr> <dbl> <dbl>
1 Argentina 1960 NA
2 Argentina 1961 20.3
3 Argentina 1962 28.9
4 Argentina 1963 25.6
5 Argentina 1964 28.8
6 Argentina 1965 21.2
7 Argentina 1966 25.6
8 Argentina 1967 29.0
9 Argentina 1968 10.3
10 Argentina 1969 7.80
# ... with 72 more rows
structure(list(country = c("Argentina", "Argentina", "Argentina",
"Argentina", "Argentina", "Argentina", "Argentina", "Argentina",
"Argentina", "Argentina", "Argentina", "Argentina", "Argentina",
"Argentina", "Argentina", "Argentina", "Argentina", "Argentina",
"Argentina", "Argentina", "Argentina", "Argentina", "Argentina",
"Argentina", "Argentina", "Argentina", "Argentina", "Argentina",
"Argentina", "Argentina", "Argentina", "Argentina", "Argentina",
"Argentina", "Argentina", "Argentina", "Argentina", "Argentina",
"Argentina", "Argentina", "Argentina", "Turkey", "Turkey", "Turkey",
"Turkey", "Turkey", "Turkey", "Turkey", "Turkey", "Turkey", "Turkey",
"Turkey", "Turkey", "Turkey", "Turkey", "Turkey", "Turkey", "Turkey",
"Turkey", "Turkey", "Turkey", "Turkey", "Turkey", "Turkey", "Turkey",
"Turkey", "Turkey", "Turkey", "Turkey", "Turkey", "Turkey", "Turkey",
"Turkey", "Turkey", "Turkey", "Turkey", "Turkey", "Turkey", "Turkey",
"Turkey", "Turkey", "Turkey"), year = c(1960, 1961, 1962, 1963,
1964, 1965, 1966, 1967, 1968, 1969, 1970, 1971, 1972, 1973, 1974,
1975, 1976, 1977, 1978, 1979, 1980, 1981, 1982, 1983, 1984, 1985,
1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994, 1995, 1996,
1997, 1998, 1999, 2000, 1960, 1961, 1962, 1963, 1964, 1965, 1966,
1967, 1968, 1969, 1970, 1971, 1972, 1973, 1974, 1975, 1976, 1977,
1978, 1979, 1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988,
1989, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
2000), inf = c(NA, 20.3106975252928, 28.8718416517331, 25.5911543481732,
28.7746173195633, 21.2329349235212, 25.6034624263967, 29.0182629356081,
10.2753105574405, 7.79583003660558, 6.46787463937974, 31.2710638825753,
64.2447859181755, 65.5352873661713, 30.6304413715773, 197.697364540376,
438.322779724218, 159.427177705708, 161.372172005774, 147.377044038694,
95.7904246111226, 105.27636637456, 194.535299671424, 380.158528520508,
611.196303889043, 607.447497820506, 77.2922366864323, 127.539918062845,
381.246344411434, 3046.09115199662, 2078.31681803114, 140.502378663632,
16.0719935356945, -3.56109557576575, 2.84933971509302, 3.16512339016516,
-0.0523754965904146, -0.46391313116429, -1.70527962400085, -1.83655839220664,
1.03728710073175, NA, 4.37340043959567, 5.78893779037435, 6.4166944802911,
2.35243651899155, 4.11785566096488, 5.94759067350768, 5.93989052660011,
4.76015624386707, 6.87631710287759, 8.64450444669336, 16.9020871544943,
10.9756760490106, 21.9302187016976, 28.9923473546045, 21.3092400682888,
15.6126574914859, 24.0890232450352, 47.5411467102416, 76.7208667189401,
93.0032247916414, 44.0570704966979, 28.2268344007077, 26.2582454161426,
48.2368326072076, 53.054476874505, 36.0068855504432, 33.6122290331458,
69.0181311101417, 75.4048306122696, 58.2443981066746, 59.1641068909336,
65.1994382769795, 68.3794283559812, 104.749137215122, 86.007542441817,
77.223512216706, 81.4548625820616, 143.639661614673, 54.2799583040521,
49.3607762430324)), row.names = c(NA, -82L), class = c("tbl_df",
"tbl", "data.frame"))
CodePudding user response:
Use cummean
. Since you're first value is NA, you can group_by
NA values and country, and apply the cummean
.
df %>%
group_by(country, na = is.na(inf)) %>%
mutate(C1 = head(c(NA, cummean(inf)), -1)) %>%
ungroup() %>%
select(-na)
# A tibble: 82 x 4
country year inf C1
<chr> <dbl> <dbl> <dbl>
1 Argentina 1960 NA NA
2 Argentina 1961 20.3 NA
3 Argentina 1962 28.9 20.3
4 Argentina 1963 25.6 24.6
5 Argentina 1964 28.8 24.9
6 Argentina 1965 21.2 25.9
7 Argentina 1966 25.6 25.0
8 Argentina 1967 29.0 25.1
9 Argentina 1968 10.3 25.6
10 Argentina 1969 7.80 23.7
# ... with 72 more rows
CodePudding user response:
To deal with the years with missing values, you can compute the running mean in another dataframe with missings dropped, then merge the results back into your original dataframe.
library(dplyr)
avg_calc <- mydata %>%
filter(!is.na(inf)) %>%
group_by(country) %>%
mutate(
C1 = cummean(inf),
year = year 1, # add one to year so merged avg reflects previous year
inf = NULL
) %>%
ungroup()
mydata <- mydata %>%
left_join(avg_calc)
mydata
#> # A tibble: 82 x 4
#> country year inf C1
#> <chr> <dbl> <dbl> <dbl>
#> 1 Argentina 1960 NA NA
#> 2 Argentina 1961 20.3 NA
#> 3 Argentina 1962 28.9 20.3
#> 4 Argentina 1963 25.6 24.6
#> 5 Argentina 1964 28.8 24.9
#> 6 Argentina 1965 21.2 25.9
#> 7 Argentina 1966 25.6 25.0
#> 8 Argentina 1967 29.0 25.1
#> 9 Argentina 1968 10.3 25.6
#> 10 Argentina 1969 7.80 23.7
#> # ... with 72 more rows