I have the following df:
car <- data.frame(stringsAsFactors = FALSE, year = c(2010,2011,2012,2013,2014,2015,2010,2011,2012,2013,2014,2015,2011,2012,2013,2014,2015),
person = c("A","A","A","A","A","A", "B","B","B","B","B","B","C","C","C","C","C"),
car = c("BMW", "BMW", "AUDI", "AUDI", "AUDI", "Mercedes", "Citroen","Citroen", "Citroen", "Toyota", "Toyota", "Peugeot", "Volkswagen","Volkswagen","Mercedes", "Mercedes","Tesla"))
Im trying to reproduce this (i. e. creating the how_long_does_the_person_have_the_car variable)
car <- data.frame(stringsAsFactors = FALSE, year = c(2010,2011,2012,2013,2014,2015,2010,2011,2012,2013,2014,2015,2011,2012,2013,2014,2015),
person = c("A","A","A","A","A","A", "B","B","B","B","B","B","C","C","C","C","C"),
car = c("BMW", "BMW", "AUDI", "AUDI", "AUDI", "Mercedes", "Citroen","Citroen", "Citroen", "Toyota", "Toyota", "Peugeot", "Volkswagen","Volkswagen","Mercedes", "Mercedes","Tesla"),
how_long_does_the_person_have_the_car = c(1,2,1,2,3,1,1,2,3,1,2,1,"0/NA", "0/NA", 1,2,1))
tibble::tribble(
~year, ~person, ~car, ~how_long_does_the_person_have_the_car,
2010, "A", "BMW", "1",
2011, "A", "BMW", "2",
2012, "A", "AUDI", "1",
2013, "A", "AUDI", "2",
2014, "A", "AUDI", "3",
2015, "A", "Mercedes", "1",
2010, "B", "Citroen", "1",
2011, "B", "Citroen", "2",
2012, "B", "Citroen", "3",
2013, "B", "Toyota", "1",
2014, "B", "Toyota", "2",
2015, "B", "Peugeot", "1",
2011, "C", "Volkswagen", "0/NA",
2012, "C", "Volkswagen", "0/NA",
2013, "C", "Mercedes", "1",
2014, "C", "Mercedes", "2",
2015, "C", "Tesla", "1"
The how_long_does_the_person_have_the_car variable is a cumulative sum of the variable car, and gets resetted everytime the car or person changes. The problem is that when i use this formula:
car <- car %>%
group_by(person, car) %>%
mutate( how_long_does_the_person_have_the_car = row_number())
I get this output:
tibble::tribble(
~year, ~person, ~car, ~how_long_does_the_person_have_the_car,
2010, "A", "BMW", 1,
2011, "A", "BMW", 2,
2012, "A", "AUDI", 1,
2013, "A", "AUDI", 2,
2014, "A", "AUDI", 3,
2015, "A", "Mercedes", 1,
2010, "B", "Citroen", 1,
2011, "B", "Citroen", 2,
2012, "B", "Citroen", 3,
2013, "B", "Toyota", 1,
2014, "B", "Toyota", 2,
2015, "B", "Peugeot", 1,
2011, "C", "Volkswagen", 1,
2012, "C", "Volkswagen", 2,
2013, "C", "Mercedes", 1,
2014, "C", "Mercedes", 2,
2015, "C", "Tesla", 1
)
However, as i am missing data from 2010 for person C, i would like to have for 2011 & 2012 for person C a value of 0 or NA or something else, as the data from 2010 is missing, and we do not know if person C has his/her Volkswagen for 1 or 2 years in 2011/2012, or for 2/3 years, as we do not know which car person C had in 2010. In 2013, the cumulative value can start counting from 1 again, as now we now that in 2013 person C bought a Mercedes instead.
CodePudding user response:
One possible solution can be to complete the years by declaring the row number as NA and use that to replace the values, i.e.
library(dplyr)
library(tidyr)
car %>%
group_by(person, car) %>%
mutate(res = row_number()) %>%
ungroup() %>%
complete(year, person) %>%
arrange(person, year, car) %>%
fill(car, .direction = 'updown') %>%
group_by(person, car) %>%
mutate(res1 = replace(res, any(is.na(res)), NA)) %>%
filter(!is.na(res)) %>%
select(-res)
# A tibble: 17 x 4
# Groups: person, car [9]
year person car res1
<dbl> <chr> <chr> <int>
1 2010 A BMW 1
2 2011 A BMW 2
3 2012 A AUDI 1
4 2013 A AUDI 2
5 2014 A AUDI 3
6 2015 A Mercedes 1
7 2010 B Citroen 1
8 2011 B Citroen 2
9 2012 B Citroen 3
10 2013 B Toyota 1
11 2014 B Toyota 2
12 2015 B Peugeot 1
13 2011 C Volkswagen NA
14 2012 C Volkswagen NA
15 2013 C Mercedes 1
16 2014 C Mercedes 2
17 2015 C Tesla 1
CodePudding user response:
Possible option:
library(tidyverse)
car <- data.frame(
stringsAsFactors = FALSE, year = c(2010, 2011, 2012, 2013, 2014, 2015, 2010, 2011, 2012, 2013, 2014, 2015, 2011, 2012, 2013, 2014, 2015),
person = c("A", "A", "A", "A", "A", "A", "B", "B", "B", "B", "B", "B", "C", "C", "C", "C", "C"),
car = c("BMW", "BMW", "AUDI", "AUDI", "AUDI", "Mercedes", "Citroen", "Citroen", "Citroen", "Toyota", "Toyota", "Peugeot", "Volkswagen", "Volkswagen", "Mercedes", "Mercedes", "Tesla")
)
car |>
mutate(min_year = min(year)) |>
group_by(person, car) |>
mutate(how_long = row_number()) |>
group_by(person) |>
mutate(
first_car = first(car),
how_long = if_else(first(year) > min_year & car == first_car, NA_integer_, how_long)
) |>
select(-first_car)
#> # A tibble: 17 × 5
#> # Groups: person [3]
#> year person car min_year how_long
#> <dbl> <chr> <chr> <dbl> <int>
#> 1 2010 A BMW 2010 1
#> 2 2011 A BMW 2010 2
#> 3 2012 A AUDI 2010 1
#> 4 2013 A AUDI 2010 2
#> 5 2014 A AUDI 2010 3
#> 6 2015 A Mercedes 2010 1
#> 7 2010 B Citroen 2010 1
#> 8 2011 B Citroen 2010 2
#> 9 2012 B Citroen 2010 3
#> 10 2013 B Toyota 2010 1
#> 11 2014 B Toyota 2010 2
#> 12 2015 B Peugeot 2010 1
#> 13 2011 C Volkswagen 2010 NA
#> 14 2012 C Volkswagen 2010 NA
#> 15 2013 C Mercedes 2010 1
#> 16 2014 C Mercedes 2010 2
#> 17 2015 C Tesla 2010 1
Created on 2022-06-02 by the reprex package (v2.0.1)
CodePudding user response:
I'm a bit confused by your problem, as you already have the cumulutative number of years the cars are owned, with some missing data. Is it not enought to just convert the missing data to explicit NA's ?
car$how_long_does_the_person_have_the_car[
car$how_long_does_the_person_have_the_car == '0/NA'
] <- NA
You will then have the desired output you want. If it is necessary to cumulate/rank these again, you can rank by the ownership variable (Assuming it is there already), but this will just give you the same result:
car <- car %>% #This is assuming explicit NA's in ranking variable
group_by(person, car) %>%
mutate(how_long_does_the_person_have_the_car = row_number(how_long_does_the_person_have_the_car)) #Rank by column
Both of these give the same output:
# A tibble: 17 × 4
# Groups: person, car [9]
year person car how_long_does_the_person_have_the_car
<dbl> <chr> <chr> <int>
1 2010 A BMW 1
2 2011 A BMW 2
3 2012 A AUDI 1
4 2013 A AUDI 2
5 2014 A AUDI 3
6 2015 A Mercedes 1
7 2010 B Citroen 1
8 2011 B Citroen 2
9 2012 B Citroen 3
10 2013 B Toyota 1
11 2014 B Toyota 2
12 2015 B Peugeot 1
13 2011 C Volkswagen NA
14 2012 C Volkswagen NA
15 2013 C Mercedes 1
16 2014 C Mercedes 2
17 2015 C Tesla 1