Home > Enterprise >  R: Data imputation for percentages
R: Data imputation for percentages

Time:08-03

I have a data set for country-wise 4G internet coverage for a certain period, in percentage. The data looks like this:

> head(new_data, 20)
# A tibble: 20 × 10
   Country             `2012` `2013`             `2014` `2015`             `2016` `2017` `2018` `2019` `2020`            
   <chr>               <chr>  <chr>              <chr>  <chr>              <chr>  <chr>  <chr>  <chr>  <chr>             
 1 Afghanistan         NULL   NULL               0      0                  0      4      7      22     26                
 2 Albania             NA     NA                 0      35                 80.2   85.3   85.5   95     98.38             
 3 Algeria             0      0                  0      0                  3.62   30.49  52.84  53.63  76.180000000000007
 4 Andorra             NA     NA                 NULL   50                 50     85     85     85     85                
 5 Angola              NA     NULL               NULL   7                  8      8      8      18     30                
 6 Antigua and Barbuda 65     78.599999999999994 80     98                 99     99     NA     99     99                
 7 Argentina           NA     NULL               0      65                 85     85     90.76  91.18  97.68             
 8 Armenia             17.54  44                 46     46.5               52.48  90.05  99.1   99.3   100               
 9 Australia           52.21  85                 95     94                 98     99     99.2   99.4   99.5              
10 Austria             31.57  58.41              85     98                 98     98     98     98     98                
11 Azerbaijan          6.68   14.31              49     39                 41     42     49     93     93                
12 Bahamas             NA     NA                 35     95                 95     95     95     95     95                
13 Bahrain             NA     88.55              95.6   95.6               100    100    100    100    100               
14 Bangladesh          NULL   NULL               59     65                 65     67     79     82     97.8              
15 Barbados            NA     NA                 0      0                  20     99.6   99.6   99.6   99.6              
16 Belarus             NA     NA                 6      20.399999999999999 41.5   68.5   75.7   75.7   89.5              
17 Belgium             12.23  51.7               88     99.89              100    100    100    100    100               
18 Belize              NA     NA                 NA     0                  46.94  65     68     70     70                
19 Benin               NA     NA                 NULL   11.52              23.89  39.72  40     45     46                
20 Bhutan              NULL   5                  10.72  40                 53     55     60     78     80                

How do I impute the data row-wise (i.e. country-wise) keeping in mind that the values should be always between 0 and 100?

Edit

> dput(head(new_data, 20))
structure(list(Country = c("Afghanistan", "Albania", "Algeria", 
"Andorra", "Angola", "Antigua and Barbuda", "Argentina", "Armenia", 
"Australia", "Austria", "Azerbaijan", "Bahamas", "Bahrain", "Bangladesh", 
"Barbados", "Belarus", "Belgium", "Belize", "Benin", "Bhutan"
), `2012` = c("NULL", NA, "0", NA, NA, "65", NA, "17.54", "52.21", 
"31.57", "6.68", NA, NA, "NULL", NA, NA, "12.23", NA, NA, "NULL"
), `2013` = c("NULL", NA, "0", NA, "NULL", "78.599999999999994", 
"NULL", "44", "85", "58.41", "14.31", NA, "88.55", "NULL", NA, 
NA, "51.7", NA, NA, "5"), `2014` = c("0", "0", "0", "NULL", "NULL", 
"80", "0", "46", "95", "85", "49", "35", "95.6", "59", "0", "6", 
"88", NA, "NULL", "10.72"), `2015` = c("0", "35", "0", "50", 
"7", "98", "65", "46.5", "94", "98", "39", "95", "95.6", "65", 
"0", "20.399999999999999", "99.89", "0", "11.52", "40"), `2016` = c("0", 
"80.2", "3.62", "50", "8", "99", "85", "52.48", "98", "98", "41", 
"95", "100", "65", "20", "41.5", "100", "46.94", "23.89", "53"
), `2017` = c("4", "85.3", "30.49", "85", "8", "99", "85", "90.05", 
"99", "98", "42", "95", "100", "67", "99.6", "68.5", "100", "65", 
"39.72", "55"), `2018` = c("7", "85.5", "52.84", "85", "8", NA, 
"90.76", "99.1", "99.2", "98", "49", "95", "100", "79", "99.6", 
"75.7", "100", "68", "40", "60"), `2019` = c("22", "95", "53.63", 
"85", "18", "99", "91.18", "99.3", "99.4", "98", "93", "95", 
"100", "82", "99.6", "75.7", "100", "70", "45", "78"), `2020` = c("26", 
"98.38", "76.180000000000007", "85", "30", "99", "97.68", "100", 
"99.5", "98", "93", "95", "100", "97.8", "99.6", "89.5", "100", 
"70", "46", "80")), row.names = c(NA, -20L), class = c("tbl_df", 
"tbl", "data.frame"))

CodePudding user response:

A suggestion to fill NA with nearby values

df %>% 
  pivot_longer(-Country, names_to = "year") %>% 
  mutate(value = value %>% as.numeric()) %>% 
  group_by(Country) %>% 
  fill(value, .direction = "updown") %>% 
  pivot_wider(names_from = year, 
              values_from = value)

# A tibble: 20 x 10
# Groups:   Country [20]
   Country          `2012` `2013` `2014` `2015` `2016` `2017` `2018` `2019` `2020`
   <chr>             <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
 1 Afghanistan        0       0      0      0     0       4      7     22     26  
 2 Albania            0       0      0     35    80.2    85.3   85.5   95     98.4
 3 Algeria            0       0      0      0     3.62   30.5   52.8   53.6   76.2
 4 Andorra           50      50     50     50    50      85     85     85     85  
 5 Angola             7       7      7      7     8       8      8     18     30  
 6 Antigua and Bar~  65      78.6   80     98    99      99     99     99     99  
 7 Argentina          0       0      0     65    85      85     90.8   91.2   97.7
 8 Armenia           17.5    44     46     46.5  52.5    90.0   99.1   99.3  100  
 9 Australia         52.2    85     95     94    98      99     99.2   99.4   99.5
10 Austria           31.6    58.4   85     98    98      98     98     98     98  
11 Azerbaijan         6.68   14.3   49     39    41      42     49     93     93  
12 Bahamas           35      35     35     95    95      95     95     95     95  
13 Bahrain           88.6    88.6   95.6   95.6 100     100    100    100    100  
14 Bangladesh        59      59     59     65    65      67     79     82     97.8
15 Barbados           0       0      0      0    20      99.6   99.6   99.6   99.6
16 Belarus            6       6      6     20.4  41.5    68.5   75.7   75.7   89.5
17 Belgium           12.2    51.7   88     99.9 100     100    100    100    100  
18 Belize             0       0      0      0    46.9    65     68     70     70  
19 Benin             11.5    11.5   11.5   11.5  23.9    39.7   40     45     46  
20 Bhutan             5       5     10.7   40    53      55     60     78     80  
  • Related