I am creating a new column using a vector value matching %in% operator
, but it returns NA
in the created column.
Any thoughts and solutions?
I used this chunk of code:
> head(GddIdwHumRes)
aggre_per slope Intercept r.squared adj.r.squared
1 3_GDD.AS_sum_Season1 4.239 3525 0.048890 -0.019050
2 3_GDD.Ch_sum_Season1 3.438 4004 0.048950 -0.018980
3 3_gridded_idw2_sum_Season1 -1.834 6315 0.136600 0.074970
4 3_hum.AS.mean_Season1 -505.800 6463 0.004094 -0.067040
5 3_hum.Ch.mean_Season1 -1921.000 7535 0.061420 -0.005625
6 3_ndvi.mean_Season1 1562.000 5415 0.158600 0.098480
fstatistic.value pval pearson Wilayah Season
1 0.71960 0.4106 0.22110 I Season1
2 0.72050 0.4102 0.22120 I Season1
3 2.21600 0.1588 -0.36960 I Season1
4 0.05756 0.8139 -0.06399 I Season1
5 0.91610 0.3547 -0.24780 I Season1
6 2.63900 0.1266 0.39820 I Season1
> # Mutate a `WI` column
> GddIdwHumRes <- GddIdwHumRes %>%
mutate(WI = case_when(
aggre_per %in% 'GDD.AS_sum' ~ "GDD_AS_sum",
aggre_per %in% 'GDD.Ch_sum' ~ "GDD_Ch_sum",
aggre_per %in% 'gridded_idw2_sum' ~ "idw2_sum",
aggre_per %in% "hum.AS.mean" ~ "hum_AS_mean",
aggre_per %in% "hum.Ch.mean" ~ "hum_Ch_mean",
aggre_per %in% "ndvi.mean" ~ "ndvi_mean",
aggre_per %in% "spi_1" ~ "spi_1",
aggre_per %in% "spi_3" ~ "spi_3",
aggre_per %in% "spi_6" ~ "spi_6"
))
> head(GddIdwHumRes)
aggre_per slope Intercept r.squared adj.r.squared
1 3_GDD.AS_sum_Season1 4.239 3525 0.048890 -0.019050
2 3_GDD.Ch_sum_Season1 3.438 4004 0.048950 -0.018980
3 3_gridded_idw2_sum_Season1 -1.834 6315 0.136600 0.074970
4 3_hum.AS.mean_Season1 -505.800 6463 0.004094 -0.067040
5 3_hum.Ch.mean_Season1 -1921.000 7535 0.061420 -0.005625
6 3_ndvi.mean_Season1 1562.000 5415 0.158600 0.098480
fstatistic.value pval pearson Wilayah Season WI
1 0.71960 0.4106 0.22110 I Season1 <NA>
2 0.72050 0.4102 0.22120 I Season1 <NA>
3 2.21600 0.1588 -0.36960 I Season1 <NA>
4 0.05756 0.8139 -0.06399 I Season1 <NA>
5 0.91610 0.3547 -0.24780 I Season1 <NA>
6 2.63900 0.1266 0.39820 I Season1 <NA>
And here is link to my dataset: https://drive.google.com/file/d/1nyT1zlUrPRv3CwQpe_n13auKAKoGMJuG/view?usp=sharing
CodePudding user response:
The reason we are getting all NA
is because %in%
looks for a fixed match and not substring match. If we want to match based on substring, either use grepl
(from base R
) or str_detect
(from stringr
) to create logical vectors in case_when
.
> head(GddIdwHumRes$aggre_per)
[1] "3_GDD.AS_sum_Season1" "3_GDD.Ch_sum_Season1" "3_gridded_idw2_sum_Season1" "3_hum.AS.mean_Season1" "3_hum.Ch.mean_Season1" "3_ndvi.mean_Season1"
> head(GddIdwHumRes$aggre_per) %in% 'GDD.AS_sum'
[1] FALSE FALSE FALSE FALSE FALSE FALSE
> grepl('GDD.AS_sum', head(GddIdwHumRes$aggre_per))
[1] TRUE FALSE FALSE FALSE FALSE FALSE
As %in%
returns FALSE
, the case_when
conditions are not met and the default case is returned i.e. NA
Based on the pattern showed, it may not even require case_when
as we could directly extract the substring with str_extract
library(dplyr)
library(stringr)
GddIdwHumRes <- GddIdwHumRes %>%
mutate(WI = str_extract(aggre_per, "(GDD|ndvi|hum|gridded|spi).*(?=_Season)"))
-output
> GddIdwHumRes$WI
[1] "GDD.AS_sum" "GDD.Ch_sum" "gridded_idw2_sum" "hum.AS.mean" "hum.Ch.mean" "ndvi.mean" "spi_1" "spi_3" "spi_6"
[10] "GDD.AS_sum" "GDD.Ch_sum" "gridded_idw2_sum" "hum.AS.mean" "hum.Ch.mean" "ndvi.mean" "spi_1" "spi_3" "spi_6"
[19] "GDD.AS_sum" "GDD.Ch_sum" "gridded_idw2_sum" "hum.AS.mean" "hum.Ch.mean" "ndvi.mean" "spi_1" "spi_3" "spi_6"
[28] "GDD.AS_sum" "GDD.Ch_sum" "gridded_idw2_sum" "hum.AS.mean" "hum.Ch.mean" "ndvi.mean" "spi_1" "spi_3" "spi_6"
[37] "GDD.AS_sum" "GDD.Ch_sum" "gridded_idw2_sum" "hum.AS.mean" "hum.Ch.mean" "ndvi.mean" "spi_1" "spi_3" "spi_6"
[46] "GDD.AS_sum" "GDD.Ch_sum" "gridded_idw2_sum" "hum.AS.mean" "hum.Ch.mean" "ndvi.mean" "GDD.AS_sum" "GDD.Ch_sum" "gridded_idw2_sum"
[55] "hum.AS.mean" "hum.Ch.mean" "ndvi.mean" "GDD.AS_sum" "GDD.Ch_sum" "gridded_idw2_sum" "hum.AS.mean" "hum.Ch.mean" "ndvi.mean"
[64] "spi_1" "spi_3" "spi_6" "GDD.AS_sum" "GDD.Ch_sum" "gridded_idw2_sum" "hum.AS.mean" "hum.Ch.mean" "ndvi.mean"
[73] "spi_1" "spi_3" "spi_6" "GDD.AS_sum" "GDD.Ch_sum" "gridded_idw2_sum" "hum.AS.mean" "hum.Ch.mean" "ndvi.mean"
[82] "spi_1" "spi_3" "spi_6" "GDD.AS_sum" "GDD.Ch_sum" "gridded_idw2_sum" "hum.AS.mean" "hum.Ch.mean" "ndvi.mean"
[91] "spi_1" "spi_3" "spi_6" "GDD.AS_sum" "GDD.Ch_sum" "gridded_idw2_sum" "hum.AS.mean" "hum.Ch.mean" "ndvi.mean"
[100] "spi_1" "spi_3" "spi_6" "GDD.AS_sum" "GDD.Ch_sum" "gridded_idw2_sum" "hum.AS.mean" "hum.Ch.mean" "ndvi.mean"
[109] "GDD.AS_sum" "GDD.Ch_sum" "gridded_idw2_sum" "hum.AS.mean" "hum.Ch.mean" "ndvi.mean" "GDD.AS_sum" "GDD.Ch_sum" "gridded_idw2_sum"
[118] "hum.AS.mean" "hum.Ch.mean" "ndvi.mean" "spi_1" "spi_3" "spi_6" "GDD.AS_sum" "GDD.Ch_sum" "gridded_idw2_sum"
[127] "hum.AS.mean" "hum.Ch.mean" "ndvi.mean" "spi_1" "spi_3" "spi_6" "GDD.AS_sum" "GDD.Ch_sum" "gridded_idw2_sum"
[136] "hum.AS.mean" "hum.Ch.mean" "ndvi.mean" "spi_1" "spi_3" "spi_6" "GDD.AS_sum" "GDD.Ch_sum" "gridded_idw2_sum"
[145] "hum.AS.mean" "hum.Ch.mean" "ndvi.mean" "spi_1" "spi_3" "spi_6" "GDD.AS_sum" "GDD.Ch_sum" "gridded_idw2_sum"
[154] "hum.AS.mean" "hum.Ch.mean" "ndvi.mean" "spi_1" "spi_3" "spi_6" "GDD.AS_sum" "GDD.Ch_sum" "gridded_idw2_sum"
[163] "hum.AS.mean" "hum.Ch.mean" "ndvi.mean" "GDD.AS_sum" "GDD.Ch_sum" "gridded_idw2_sum" "hum.AS.mean" "hum.Ch.mean" "ndvi.mean"
[172] "GDD.AS_sum" "GDD.Ch_sum" "gridded_idw2_sum" "hum.AS.mean" "hum.Ch.mean" "ndvi.mean" "spi_1" "spi_3" "spi_6"
[181] "GDD.AS_sum" "GDD.Ch_sum" "gridded_idw2_sum" "hum.AS.mean" "hum.Ch.mean" "ndvi.mean" "spi_1" "spi_3" "spi_6"
[190] "GDD.AS_sum" "GDD.Ch_sum" "gridded_idw2_sum" "hum.AS.mean" "hum.Ch.mean" "ndvi.mean" "spi_1" "spi_3" "spi_6"
[199] "GDD.AS_sum" "GDD.Ch_sum" "gridded_idw2_sum" "hum.AS.mean" "hum.Ch.mean" "ndvi.mean" "spi_1" "spi_3" "spi_6"
[208] "GDD.AS_sum" "GDD.Ch_sum" "gridded_idw2_sum" "hum.AS.mean" "hum.Ch.mean" "ndvi.mean" "spi_1" "spi_3" "spi_6"
[217] "GDD.AS_sum" "GDD.Ch_sum" "gridded_idw2_sum" "hum.AS.mean" "hum.Ch.mean" "ndvi.mean" "GDD.AS_sum" "GDD.Ch_sum" "gridded_idw2_sum"
[226] "hum.AS.mean" "hum.Ch.mean" "ndvi.mean" "GDD.AS_sum" "GDD.Ch_sum" "gridded_idw2_sum" "hum.AS.mean" "hum.Ch.mean" "ndvi.mean"
[235] "spi_1" "spi_3" "spi_6" "GDD.AS_sum" "GDD.Ch_sum" "gridded_idw2_sum" "hum.AS.mean" "hum.Ch.mean" "ndvi.mean"
[244] "spi_1" "spi_3" "spi_6" "GDD.AS_sum" "GDD.Ch_sum" "gridded_idw2_sum" "hum.AS.mean" "hum.Ch.mean" "ndvi.mean"
[253] "spi_1" "spi_3" "spi_6" "GDD.AS_sum" "GDD.Ch_sum" "gridded_idw2_sum" "hum.AS.mean" "hum.Ch.mean" "ndvi.mean"
[262] "spi_1" "spi_3" "spi_6" "GDD.AS_sum" "GDD.Ch_sum" "gridded_idw2_sum" "hum.AS.mean" "hum.Ch.mean" "ndvi.mean"
[271] "spi_1" "spi_3" "spi_6" "GDD.AS_sum" "GDD.Ch_sum" "gridded_idw2_sum" "hum.AS.mean" "hum.Ch.mean" "ndvi.mean"
[280] "GDD.AS_sum" "GDD.Ch_sum" "gridded_idw2_sum" "hum.AS.mean" "hum.Ch.mean" "ndvi.mean" "GDD.AS_sum" "GDD.Ch_sum" "gridded_idw2_sum"
[289] "hum.AS.mean" "hum.Ch.mean" "ndvi.mean" "spi_1" "spi_3" "spi_6" "GDD.AS_sum" "GDD.Ch_sum" "gridded_idw2_sum"
[298] "hum.AS.mean" "hum.Ch.mean" "ndvi.mean" "spi_1" "spi_3" "spi_6" "GDD.AS_sum" "GDD.Ch_sum" "gridded_idw2_sum"
[307] "hum.AS.mean" "hum.Ch.mean" "ndvi.mean" "spi_1" "spi_3" "spi_6" "GDD.AS_sum" "GDD.Ch_sum" "gridded_idw2_sum"
[316] "hum.AS.mean" "hum.Ch.mean" "ndvi.mean" "spi_1" "spi_3" "spi_6" "GDD.AS_sum" "GDD.Ch_sum" "gridded_idw2_sum"
[325] "hum.AS.mean" "hum.Ch.mean" "ndvi.mean" "spi_1" "spi_3" "spi_6" "GDD.AS_sum" "GDD.Ch_sum" "gridded_idw2_sum"
[334] "hum.AS.mean" "hum.Ch.mean" "ndvi.mean" "GDD.AS_sum" "GDD.Ch_sum" "gridded_idw2_sum" "hum.AS.mean" "hum.Ch.mean" "ndvi.mean"
[343] "GDD.AS_sum" "GDD.Ch_sum" "gridded_idw2_sum" "hum.AS.mean" "hum.Ch.mean" "ndvi.mean" "spi_1" "spi_3" "spi_6"
[352] "GDD.AS_sum" "GDD.Ch_sum" "gridded_idw2_sum" "hum.AS.mean" "hum.Ch.mean" "ndvi.mean" "spi_1" "spi_3" "spi_6"
[361] "GDD.AS_sum" "GDD.Ch_sum" "gridded_idw2_sum" "hum.AS.mean" "hum.Ch.mean" "ndvi.mean" "spi_1" "spi_3" "spi_6"
[370] "GDD.AS_sum" "GDD.Ch_sum" "gridded_idw2_sum" "hum.AS.mean" "hum.Ch.mean" "ndvi.mean" "spi_1" "spi_3" "spi_6"
[379] "GDD.AS_sum" "GDD.Ch_sum" "gridded_idw2_sum" "hum.AS.mean" "hum.Ch.mean" "ndvi.mean" "spi_1" "spi_3" "spi_6"
[388] "GDD.AS_sum" "GDD.Ch_sum" "gridded_idw2_sum" "hum.AS.mean" "hum.Ch.mean" "ndvi.mean" "GDD.AS_sum" "GDD.Ch_sum" "gridded_idw2_sum"
[397] "hum.AS.mean" "hum.Ch.mean" "ndvi.mean" "GDD.AS_sum" "GDD.Ch_sum" "gridded_idw2_sum" "hum.AS.mean" "hum.Ch.mean" "ndvi.mean"
[406] "spi_1" "spi_3" "spi_6" "GDD.AS_sum" "GDD.Ch_sum" "gridded_idw2_sum" "hum.AS.mean" "hum.Ch.mean" "ndvi.mean"
[415] "spi_1" "spi_3" "spi_6" "GDD.AS_sum" "GDD.Ch_sum" "gridded_idw2_sum" "hum.AS.mean" "hum.Ch.mean" "ndvi.mean"
[424] "spi_1" "spi_3" "spi_6" "GDD.AS_sum" "GDD.Ch_sum" "gridded_idw2_sum" "hum.AS.mean" "hum.Ch.mean" "ndvi.mean"
[433] "spi_1" "spi_3" "spi_6" "GDD.AS_sum" "GDD.Ch_sum" "gridded_idw2_sum" "hum.AS.mean" "hum.Ch.mean" "ndvi.mean"
[442] "spi_1" "spi_3" "spi_6" "GDD.AS_sum" "GDD.Ch_sum" "gridded_idw2_sum" "hum.AS.mean" "hum.Ch.mean" "ndvi.mean"
[451] "GDD.AS_sum" "GDD.Ch_sum" "gridded_idw2_sum" "hum.AS.mean" "hum.Ch.mean" "ndvi.mean"
If we want to use case_when
, the str_detect
code would be
GddIdwHumRe <- GddIdwHumRes %>%
mutate(WI = case_when(
str_detect(aggre_per, 'GDD.AS_sum') ~ "GDD_AS_sum",
str_detect(aggre_per, 'GDD.Ch_sum') ~ "GDD_Ch_sum",
str_detect(aggre_per,'gridded_idw2_sum') ~ "idw2_sum",
str_detect(aggre_per, "hum.AS.mean") ~ "hum_AS_mean",
str_detect(aggre_per, "hum.Ch.mean") ~ "hum_Ch_mean",
str_detect(aggre_per,"ndvi.mean")~ "ndvi_mean",
str_detect(aggre_per,"spi_1") ~ "spi_1",
str_detect(aggre_per, "spi_3") ~ "spi_3",
str_detect(aggre_per, "spi_6") ~ "spi_6"
))