Identify the first & last non-zero elements/indices within a group in numpy-CodePudding

import numpy as np

# Group label of every element; equal labels are contiguous.
group = np.array([0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2])
# Input values to be processed group by group.
array = np.array([1, 2, 3, 0, 0, 2, 0, 3, 4, 0, 0, 0, 0, 1])
# Expected result for the sample input above.
targt = np.array([1, 1, 1, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, 1])

def func(group: np.array, array: np.array):
    """Placeholder for the wanted transformation.

    Not implemented yet: ``group`` is ignored and ``array`` is handed back
    unchanged (the very same object).
    """
    return array

Step 1: Find the indexes of the first & last non-zero elements for each group, i.e. (0, 2) for group 0, (5, 8) for group 1, (13, 13) for group 2.
Step 2: Replace the slice between each pair of indexes (inclusive) with the first non-zero value within each group, i.e., group 0 [1,2,3,0] -> [1,1,1,0], group 1 [0,2,0,3,4,0] -> [0,2,2,2,2,0], group 2 no change.

How could I do it without splitting arrays or iterating?

[Solutions]

def first_nonzero_index(arr: np.array, axis: int, mode: str = None, invalid_value: float = -1):
    """Locate the first or last non-zero entry of ``arr`` along ``axis``.

    Args:
        arr: Array to search.
        axis: Axis along which to search.
        mode: ``None`` or ``"head"`` looks for the first non-zero entry;
            any other value looks for the last one.
        invalid_value: Result reported where no non-zero entry exists.

    Returns:
        Array of positions with ``axis`` reduced away (0-d for 1-D input);
        ``invalid_value`` where the searched slice holds only zeros.
    """
    is_nonzero = arr != 0
    has_any = is_nonzero.any(axis=axis)
    if mode in (None, "head"):
        position = is_nonzero.argmax(axis=axis)
    else:
        # argmax on the reversed mask gives the distance from the tail;
        # convert it back to an index counted from the front.
        from_tail = np.flip(is_nonzero, axis=axis).argmax(axis=axis)
        position = arr.shape[axis] - from_tail - 1
    return np.where(has_any, position, invalid_value)

def func1(group: np.ndarray, array: np.ndarray):  # my solution
    """Fill each group's first..last non-zero span with its first non-zero value.

    The array is split into one chunk per group label and every chunk is
    processed in a Python loop using ``first_nonzero_index``.

    Args:
        group: 1-D non-negative integer labels in non-decreasing order (the
            chunk boundaries are derived from ``np.bincount(group)``).
        array: 1-D values, same length as ``group``.

    Returns:
        A new 1-D array. Chunks that contain a non-zero value are rebuilt as
        float arrays, so the result is float whenever any chunk is filled.
    """
    group_size = np.bincount(group)[:-1]
    group_idx_end = np.cumsum(group_size)
    array_split = np.split(array, group_idx_end)

    concat_list = []
    for arr in array_split:
        # An empty chunk (label that never occurs) has no non-zero entry;
        # skip the helper there because argmax rejects empty input.
        idx_start = first_nonzero_index(arr, axis=0, mode="head") if arr.size else -1
        if idx_start == -1:
            concat_list.append(arr)
            continue
        # "+ 1" turns the index of the last non-zero entry into an
        # exclusive slice end so that the entry itself is overwritten too.
        idx_end = first_nonzero_index(arr, axis=0, mode="tail") + 1
        arr_ffill_first_nonzero = np.zeros_like(arr, dtype=float)
        arr_ffill_first_nonzero[idx_start:idx_end] = arr[idx_start]
        concat_list.append(arr_ffill_first_nonzero)
    return np.hstack(concat_list)

def fill(val):  # contributor @d.b
    """Overwrite the first..last non-zero span of ``val`` with its first non-zero value.

    Args:
        val: 1-D NumPy array; it is modified in place.

    Returns:
        The same ``val`` object. It is returned untouched when it contains
        no non-zero entry.
    """
    inds = np.where(val != 0)[0]
    if inds.size == 0:
        # Nothing to fill; min()/max() on an empty index list would raise.
        return val
    # np.where yields ascending indices, so the ends are the extremes.
    start, end = inds[0], inds[-1]
    fill_val = val[start]
    # "+ 1" makes the slice include the last non-zero position.
    val[start:end + 1] = fill_val
    return val

def split(val, grp):  # contributor @d.b
    """Cut ``val`` into consecutive runs that share the same label in ``grp``.

    Args:
        val: Non-empty sequence of values.
        grp: Labels aligned with ``val``; a new run starts wherever the label
            differs from the previous one.

    Returns:
        List of NumPy arrays, one per run, in original order.
    """
    runs = []
    pending = [val[0]]
    for pos, item in enumerate(val[1:], start=1):
        if grp[pos] != grp[pos - 1]:
            # Label changed: close the current run and open a new one.
            runs.append(np.array(pending))
            pending = []
        pending.append(item)
    # The trailing run is never empty, so it is always emitted.
    runs.append(np.array(pending))
    return runs

def func2(group, array):  # contributor @d.b
    """Split ``array`` by ``group``, fill every piece, and join the pieces again.

    Relies on the module's ``split`` and ``fill`` helpers.
    """
    filled_pieces = list(map(fill, split(array, group)))
    return np.concatenate(filled_pieces)

def func3(group: np.ndarray, array: np.ndarray):  # my solution
    """Fill each group's first..last non-zero span with its first non-zero value.

    Args:
        group: 1-D non-negative integer labels in non-decreasing order (the
            chunk boundaries are derived from ``np.bincount(group)``).
        array: 1-D values, same length as ``group``. It is not modified.

    Returns:
        A new array with the same dtype as ``array``. Groups without any
        non-zero value are left as they are.
    """
    # Work on a private copy: np.split hands out views, so writing into the
    # chunks would otherwise overwrite the caller's array.
    out = np.array(array, copy=True)
    group_idx_end = np.cumsum(np.bincount(group)[:-1])

    for arr in np.split(out, group_idx_end):
        idx = np.where(arr != 0)[0]
        if idx.size == 0:
            # All-zero (or empty) group: nothing to fill.
            continue
        # Indices come back ascending, so the ends are first/last non-zero.
        start, end = idx[0], idx[-1]
        # "+ 1" makes the slice include the last non-zero position.
        arr[start:end + 1] = arr[start]
    return out

def func4(group, array):  # contributor @d.b
    """Fill each group's first..last non-zero span with its first non-zero value.

    Loop-free approach: build a "difference" array that holds ``+value`` at
    the start of every span and ``-value`` one position past its end, then
    take the cumulative sum.

    Args:
        group: 1-D labels in non-decreasing order (``np.unique`` must return
            the segment starts in ascending order for ``reduceat``).
        array: 1-D numeric values, same length as ``group``. Not modified.

    Returns:
        A new 1-D array; all zeros when ``array`` has no non-zero entry.
    """
    group = np.asarray(group)
    array = np.asarray(array)
    # One spare trailing slot so the "-value" marker of a span that ends on
    # the very last element always has somewhere to go.
    out = np.zeros(len(array) + 1, dtype=array.dtype)

    #STEP 1
    nonzero = (array != 0)
    if nonzero.any():
        _, marker_idx = np.unique(group[nonzero], return_index=True)
        nonzero_idx = np.arange(len(array))[nonzero]
        #STEP 2
        starts = np.minimum.reduceat(nonzero_idx, marker_idx)
        ends = np.maximum.reduceat(nonzero_idx, marker_idx)
        #STEP 3
        values = array[starts]
        # Accumulate instead of assigning: when one span ends right before
        # the next one starts, both markers share a slot and must add up.
        out[starts] += values
        out[ends + 1] -= values
    return np.cumsum(out[:-1])

Output: [1. 1. 1. 0. 0. 2. 2. 2. 2. 0. 0. 0. 0. 1.]

[Test]

# Benchmark fixture: a 15-element pattern repeated n times (30000 elements).
n = 2000
array = np.array([1, 2, 3, 0, 0, 2, 0, 3, 0, 0, 0, 0, 0, 1, 0] * n)
# A 1 marks the first element of a new group; the running sum turns these
# boundary flags into increasing group labels.
group = np.array([0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0] * n).cumsum()

Run 100 times:

func1	func2	func3	func4 (Best solution)
9.70s	2.54s	1.99s	0.03s

CodePudding user response：

# Group label of every element; equal labels are contiguous.
group = np.array([0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2])
# Input values to be processed group by group.
array = np.array([1, 2, 3, 0, 0, 2, 0, 3, 4, 0, 0, 0, 0, 1])
# Expected result for the sample input above.
targt = np.array([1, 1, 1, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, 1])

You can do the following steps:

STEP 1. Find indices of nonzero items of array and mark the startings of new groups

nonzero_idx -> [*0,1,2,/,*/,5,/,7,8,/,*/,/,/,13] (cross out slashes)
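marker_idx -> [0, 3, 6] (positions within the filtered nonzero_idx = [0,1,2,5,7,8,13] at which each group begins; the * marks above show where the groups begin in the original array, at indices 0, 4 and 10)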

STEP 2. Find starting and ending indices for each group, use np.ufunc.reduceat
```
starts -> [ 0,  5, 13]
ends -> [ 2,  8, 13]
```
STEP 3. Think of an out array such that np.cumsum(out) collapses into target array. Like so:
```
[1,0,0,-1,0,2,0,0,0,-2,0,0,0,1] -> [1,1,1,0,0,2,2,2,2,0,0,0,0,1]
```

Now, code:

# Assumes `array` contains at least one non-zero value and `group` is sorted.
#STEP 1
nonzero = (array != 0)
_, marker_idx = np.unique(group[nonzero], return_index=True)
nonzero_idx = np.arange(len(array))[nonzero]
#STEP 2
starts = np.minimum.reduceat(nonzero_idx, marker_idx)
ends = np.maximum.reduceat(nonzero_idx, marker_idx)
#STEP 3
values = array[starts]
out = np.zeros_like(array)
out[starts] = values
# The "-value" markers are subtracted rather than assigned: when one span
# ends right before the next one starts, both markers land on the same slot
# and a plain assignment would wipe out the start marker.
#check the case we can't insert the last negative value
if ends[-1] + 1 == len(array):
    out[ends[:-1] + 1] -= values[:-1]
else:
    out[ends + 1] -= values
>>> np.cumsum(out)
array([1, 1, 1, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, 1], dtype=int32)

No loops needed!

CodePudding user response：

1

import numpy as np
import pandas as pd

def foo(s):
    """Flag every position lying between the first and last positive entry of ``s``.

    Args:
        s: 1-D sequence of numbers containing at least one value ``> 0``.

    Returns:
        List of Python bools, ``True`` from the first positive position
        through the last positive position (both included).
    """
    positive_positions = np.where(s > 0)[0]
    first, last = min(positive_positions), max(positive_positions)
    return [bool(first <= position <= last) for position in range(len(s))]

# Apply the fill group by group: foo(x) flags the positions between the first
# and last positive entry of the group, and mask() replaces the flagged
# positions with the group's first positive value (x[x > 0].iloc[0]).
pd.Series(array).groupby(group).transform(
    lambda x: x.mask(foo(x), x[x > 0].iloc[0])).to_numpy() 
# array([1, 1, 1, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, 1])

2

def split(val, grp):
    """Cut ``val`` wherever the label in ``grp`` changes.

    Args:
        val: Array of values.
        grp: Array of labels aligned with ``val``.

    Returns:
        List of sub-arrays of ``val`` as produced by ``np.array_split``, one
        per run of equal consecutive labels.
    """
    label_changed = grp[1:] != grp[:-1]
    # Prepend False so index i of the flags refers to element i of grp.
    starts_new_run = np.append(False, label_changed)
    cut_points = np.where(starts_new_run)[0]
    return np.array_split(val, cut_points)

def fill(val):
    """Return a copy of ``val`` whose first..last non-zero span holds the first non-zero value.

    Args:
        val: 1-D NumPy array. It is not modified, so passing a view of a
            larger array does not change that array.

    Returns:
        A new array of the same dtype; an unchanged copy when ``val`` has no
        non-zero entry.
    """
    out = np.array(val, copy=True)
    # "!= 0" rather than "> 0": negative entries count as non-zero too.
    inds = np.where(out != 0)[0]
    if inds.size == 0:
        return out
    # np.where yields ascending indices, so the ends are the extremes.
    start, end = inds[0], inds[-1]
    # "+ 1" makes the slice include the last non-zero position.
    out[start:end + 1] = out[start]
    return out

# Split by group, fill every piece, then stitch the pieces back together.
np.concatenate(list(map(fill, split(array, group))))
# array([1, 1, 1, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, 1])