Home > Blockchain >  filter dataframes for the first occurrence of any non-zero value per row
filter dataframes for the first occurrence of any non-zero value per row

Time:04-01

These are the types of data frames that I have; the two examples below show how they can differ:

-   -   -   -   -   -   -   -   -   -   -   -   -   -   -   -   -   -   -   -   -   -   -   -   Type1   992.0   4461.0  1.2 38476.0 :1..4473

Second data frame:

-   -   -   -   -   -   -   -   -   -   -   -   -   -   -   -   -   -   -   -   -   -   -   -   Type2   1.0 5131.0  0.4 44433.0 -:1998..7151
-   -   -   -   -   -   -   -   -   -   -   -   -   -   -   -   -   -   -   -   -   -   -   -   Type2   5331.0  845.0   1.3 6672.0  -:1164..2016
Type3   1945.0  91.0    18.7    426.0   -:501..597  Type3   1912.0  91.0    18.7    426.0   -:501..597  -   -   -   -   -   -   -   -   -   -   -   -   -   -   -   -   -   -
Type3   2071.0  196.0   18.9    468.0   -:10..236   Type3   2038.0  196.0   18.9    468.0   -:10..236   Type3   2049.0  141.0   16.3    441.0   -:10..196   Type3   2049.0  141.0   16.3    441.0   -:10..196   Type3   8294.0  151.0   17.2    580.0   -:10..196
-   -   -   -   -   -   -   -   -   -   -   -   -   -   -   -   -   -   -   -   -   -   -   -   Type4   8604.0  1473.0  0.5 13042.0 :1..1471
-   -   -   -   -   -   -   -   -   -   -   -   -   -   -   -   -   -   -   -   -   -   -   -   Type5   9795.0  2114.0  32.0    1971.0  :1296..3439
-   -   -   -   -   -   -   -   -   -   -   -   -   -   -   -   -   -   -   -   -   -   -   -   Type6   10131.0 5684.0  0.3 49063.0 :1455..7113

What I am looking for is code that will extract, for each row, the first value that isn't '-', without relying on that value matching 'Type*'; it can be anything other than '-'. So the output would look like this:

Type1

or

Type2
Type3
Type4
Type5
Type6

I can obviously subset for anything that doesn't equal '-' but I can't figure out how to only get the first occurrence, because I want the output to have the same dimensions. I see a lot of solutions for the first occurrence of any word in an entire dataframe, but this needs to be per row and I just can't seem to get it working.

dput for the first data.frame:

dput(x)
structure(list(V1 = structure(1L, .Label = "-", class = "factor"), 
    V2 = structure(1L, .Label = "-", class = "factor"), V3 = structure(1L, .Label = "-", class = "factor"), 
    V4 = structure(1L, .Label = "-", class = "factor"), V5 = structure(1L, .Label = "-", class = "factor"), 
    V6 = structure(1L, .Label = "-", class = "factor"), V7 = structure(1L, .Label = "-", class = "factor"), 
    V8 = structure(1L, .Label = "-", class = "factor"), V9 = structure(1L, .Label = "-", class = "factor"), 
    V10 = structure(1L, .Label = "-", class = "factor"), V11 = structure(1L, .Label = "-", class = "factor"), 
    V12 = structure(1L, .Label = "-", class = "factor"), V13 = structure(1L, .Label = "-", class = "factor"), 
    V14 = structure(1L, .Label = "-", class = "factor"), V15 = structure(1L, .Label = "-", class = "factor"), 
    V16 = structure(1L, .Label = "-", class = "factor"), V17 = structure(1L, .Label = "-", class = "factor"), 
    V18 = structure(1L, .Label = "-", class = "factor"), V19 = structure(1L, .Label = "-", class = "factor"), 
    V20 = structure(1L, .Label = "-", class = "factor"), V21 = structure(1L, .Label = "-", class = "factor"), 
    V22 = structure(1L, .Label = "-", class = "factor"), V23 = structure(1L, .Label = "-", class = "factor"), 
    V24 = structure(1L, .Label = "-", class = "factor"), V25 = structure(1L, .Label = "Type1", class = "factor"), 
    V26 = 992, V27 = 4461, V28 = 1.2, V29 = 38476, V30 = structure(1L, .Label = ":1..4473", class = "factor")), class = "data.frame", row.names = c(NA, -1L))

The second data frame example:

structure(list(V1 = structure(c(1L, 1L, 2L, 2L, 1L, 1L, 1L), .Label = c("-", 
"Type2"), class = "factor"), V2 = structure(c(1L, 1L, 2L, 3L, 
1L, 1L, 1L), .Label = c("-", "1945.0", "2071.0"), class = "factor"), 
    V3 = structure(c(1L, 1L, 3L, 2L, 1L, 1L, 1L), .Label = c("-", 
    "196.0", "91.0"), class = "factor"), V4 = structure(c(1L, 
    1L, 2L, 3L, 1L, 1L, 1L), .Label = c("-", "18.7", "18.9"), class = "factor"), 
    V5 = structure(c(1L, 1L, 2L, 3L, 1L, 1L, 1L), .Label = c("-", 
    "426.0", "468.0"), class = "factor"), V6 = structure(c(1L, 
    1L, 3L, 2L, 1L, 1L, 1L), .Label = c("-", "-:10..236", "-:501..597"
    ), class = "factor"), V7 = structure(c(1L, 1L, 2L, 2L, 1L, 
    1L, 1L), .Label = c("-", "Type2"), class = "factor"), V8 = structure(c(1L, 
    1L, 2L, 3L, 1L, 1L, 1L), .Label = c("-", "1912.0", "2038.0"
    ), class = "factor"), V9 = structure(c(1L, 1L, 3L, 2L, 1L, 
    1L, 1L), .Label = c("-", "196.0", "91.0"), class = "factor"), 
    V10 = structure(c(1L, 1L, 2L, 3L, 1L, 1L, 1L), .Label = c("-", 
    "18.7", "18.9"), class = "factor"), V11 = structure(c(1L, 
    1L, 2L, 3L, 1L, 1L, 1L), .Label = c("-", "426.0", "468.0"
    ), class = "factor"), V12 = structure(c(1L, 1L, 3L, 2L, 1L, 
    1L, 1L), .Label = c("-", "-:10..236", "-:501..597"), class = "factor"), 
    V13 = structure(c(1L, 1L, 1L, 2L, 1L, 1L, 1L), .Label = c("-", 
    "Type2"), class = "factor"), V14 = structure(c(1L, 1L, 1L, 
    2L, 1L, 1L, 1L), .Label = c("-", "2049.0"), class = "factor"), 
    V15 = structure(c(1L, 1L, 1L, 2L, 1L, 1L, 1L), .Label = c("-", 
    "141.0"), class = "factor"), V16 = structure(c(1L, 1L, 1L, 
    2L, 1L, 1L, 1L), .Label = c("-", "16.3"), class = "factor"), 
    V17 = structure(c(1L, 1L, 1L, 2L, 1L, 1L, 1L), .Label = c("-", 
    "441.0"), class = "factor"), V18 = structure(c(1L, 1L, 1L, 
    2L, 1L, 1L, 1L), .Label = c("-", "-:10..196"), class = "factor"), 
    V19 = structure(c(1L, 1L, 1L, 2L, 1L, 1L, 1L), .Label = c("-", 
    "Type2"), class = "factor"), V20 = structure(c(1L, 1L, 1L, 
    2L, 1L, 1L, 1L), .Label = c("-", "2049.0"), class = "factor"), 
    V21 = structure(c(1L, 1L, 1L, 2L, 1L, 1L, 1L), .Label = c("-", 
    "141.0"), class = "factor"), V22 = structure(c(1L, 1L, 1L, 
    2L, 1L, 1L, 1L), .Label = c("-", "16.3"), class = "factor"), 
    V23 = structure(c(1L, 1L, 1L, 2L, 1L, 1L, 1L), .Label = c("-", 
    "441.0"), class = "factor"), V24 = structure(c(1L, 1L, 1L, 
    2L, 1L, 1L, 1L), .Label = c("-", "-:10..196"), class = "factor"), 
    V25 = structure(c(4L, 4L, 1L, 3L, 5L, 2L, 5L), .Label = c("-", 
    "Type3", "Type2", "Type4", "Type5"), class = "factor"), 
    V26 = structure(c(2L, 4L, 1L, 5L, 6L, 7L, 3L), .Label = c("-", 
    "1.0", "10131.0", "5331.0", "8294.0", "8604.0", "9795.0"), class = "factor"), 
    V27 = structure(c(5L, 7L, 1L, 3L, 2L, 4L, 6L), .Label = c("-", 
    "1473.0", "151.0", "2114.0", "5131.0", "5684.0", "845.0"), class = "factor"), 
    V28 = structure(c(3L, 5L, 1L, 6L, 4L, 7L, 2L), .Label = c("-", 
    "0.3", "0.4", "0.5", "1.3", "17.2", "32.0"), class = "factor"), 
    V29 = structure(c(4L, 7L, 1L, 6L, 2L, 3L, 5L), .Label = c("-", 
    "13042.0", "1971.0", "44433.0", "49063.0", "580.0", "6672.0"
    ), class = "factor"), V30 = structure(c(4L, 3L, 1L, 2L, 5L, 
    6L, 7L), .Label = c("-", "-:10..196", "-:1164..2016", "-:1998..7151", 
    ":1..1471", ":1296..3439", ":1455..7113"), class = "factor")), class = "data.frame", row.names = c(NA, 
-7L))

CodePudding user response:

The following works:

apply(df, MARGIN = 1, FUN = function(row) row[!grepl("-", row)][1])
[1] "Type2" "Type2" "Type3" "Type3" "Type4" "Type5" "Type6"

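apply with MARGIN = 1 acts on rows. The function in FUN uses grepl to flag every element of the row that contains a "-", keeps the elements that do not, and returns the first of them with [1]. Note that grepl("-", row) also excludes values that merely contain a hyphen (such as "-:501..597"); this works here because the first non-"-" value in each row has no hyphen in it. To match only the exact placeholder, use row[row != "-"][1] instead.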

CodePudding user response:

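We can use a vectorized option with max.col. df1 != "-" gives a logical matrix, and max.col(..., "first") returns, for each row, the position of the first column whose value is not "-". We then cbind the sequence of row numbers with those column positions to build a row/column index matrix and use it to extract the values.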

df1[cbind(seq_len(nrow(df1)), max.col(df1 != "-", "first"))]
[1] "Type4" "Type4" "Type2" "Type2" "Type5" "Type3" "Type5"
 x[cbind(seq_len(nrow(x)), max.col(x != "-", "first"))]
[1] "Type1"
  • Related