Combine 2 different sized arrays element-wise based on index pairing array-CodePudding

Say, we had 2 arrays of unique values:

a = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])  # any values are possible, 
b = np.array([0, 11, 12, 13, 14, 15, 16, 17, 18, 19])  # sorted values are for demonstration

, where a[0] corresponds to b[0], a[1] to b[1] (value 11), a[2] to b[2] (value 12), etc.

Then, due to some circumstances, we randomly lost some elements from both a and b, and noise elements were inserted into both. Now the 'useful data' in a and b is kind of 'eroded', like this:

a = np.array([0, 1, 313, 2, 3, 4, 5, 934, 6, 8, 9, 730, 241, 521])
b = np.array([112, 514, 11, 13, 16, 955, 17, 18, 112])

The noise elements have a negligible probability of coinciding with any of the 'useful data'. So, by searching for matches, we can find the surviving elements and define the 'index pairing array':

cor_tab = np.array([[1,2], [4,3], [8,4], [9,7]])

which, when applied, yields the remaining pairs of 'useful data':

np.column_stack((a[cor_tab[:,0]], b[cor_tab[:,1]]))    
array([[1, 11],
       [3, 13],
       [6, 16],
       [8, 18]])

The question: Given the 'eroded' a and b, how to combine them into numpy array such that:

values indexed in cor_tab are paired in the same column/row,
lost values are treated as -1,
noise as 'don't care', and

array looks like this:

   [[ -1 112], 
    [  0 514], 
    [  1  11], 
    [313  -1], 
    [  2  -1], 
    [  3  13], 
    [  4  -1], 
    [  5  -1], 
    [934  -1], 
    [  6  16], 
    [ -1 955], 
    [ -1  17], 
    [  8  18], 
    [  9  -1], 
    [730  -1], 
    [241  -1], 
    [521 112]]

, where 'useful data' is at indices: 2, 5, 9, 12?

Initially I solved this in a dubious way:

import numpy as np

def combine(aa, bb, t):
    """Align two 1-D arrays side by side using an index pairing table.

    The arrays are cut at every paired index. Each pair of cut-out segments
    (the elements lying between two consecutive paired positions) is padded
    with -1 to a common length, so that the paired elements always end up in
    the same output column.

    Padding placement:
      * segment before the first pair: -1 is inserted at the front;
      * every later segment, including the tail after the last pair:
        -1 is appended at the end.

    Parameters
    ----------
    aa, bb : 1-D array-like
        The two sequences to align.
    t : array-like of shape (n_pairs, 2)
        Row ``[i, j]`` states that ``aa[i]`` belongs together with ``bb[j]``.
        Both columns are expected to be strictly increasing. An empty table
        is allowed; the inputs are then simply padded at the end.

    Returns
    -------
    numpy.ndarray of shape (2, n)
        Row 0 holds the elements of ``aa``, row 1 the elements of ``bb``;
        missing counterparts are -1. Transpose for one pair per row.

    Raises
    ------
    IndexError
        If a paired index lies outside ``aa`` or ``bb``.
    """
    aa = np.asarray(aa)
    bb = np.asarray(bb)
    pairs = np.asarray(t, dtype=int).reshape(-1, 2)

    def _pad_pair(seg_a, seg_b, pad_front):
        """Return both segments padded with -1 to the same length."""
        width = max(len(seg_a), len(seg_b))
        padded = []
        for seg in (seg_a, seg_b):
            filler = np.full(width - len(seg), -1, dtype=int)
            order = (filler, seg) if pad_front else (seg, filler)
            padded.append(np.concatenate(order))
        return padded

    row_a, row_b = [], []
    prev_a = prev_b = -1  # index of the previously paired element
    for k, (idx_a, idx_b) in enumerate(pairs):
        # Everything strictly between the previous pair and this one.
        gap_a, gap_b = _pad_pair(
            aa[prev_a + 1:idx_a], bb[prev_b + 1:idx_b], pad_front=(k == 0)
        )
        # List-indexing keeps a 1-element array and still raises IndexError
        # for an out-of-range index (a slice would silently return nothing).
        row_a += [gap_a, aa[[idx_a]]]
        row_b += [gap_b, bb[[idx_b]]]
        prev_a, prev_b = idx_a, idx_b

    # Whatever is left after the last pair.
    tail_a, tail_b = _pad_pair(aa[prev_a + 1:], bb[prev_b + 1:], pad_front=False)
    row_a.append(tail_a)
    row_b.append(tail_b)

    return np.array([np.concatenate(row_a), np.concatenate(row_b)])

But below I suggest another solution.

CodePudding user response：

It is difficult to understand what the question wants, but IIUC, we first need to find the column count of the expected array, i.e. the number of distinct values across the two arrays (np.union1d), and then create an array of that size filled with -1 (np.full). Next, np.searchsorted gives the indices at which the values of one array would be inserted into the other array. Values that are not contained in the other array can be identified with np.in1d (np.isin in current NumPy) in invert mode. So we can achieve the goal by indexing as follows:

# Expects `a` and `b` to be sorted 1-D arrays without duplicates
# (np.searchsorted requires sorted input). The sample outputs shown in the
# comments below correspond to
#   a = np.array([0, 1, 2, 3, 4, 5, 6, 8, 9])
#   b = np.array([1, 3, 6, 7, 8])

# One output column per distinct value occurring in either array.
union_ = np.union1d(a, b)
# [0 1 2 3 4 5 6 7 8 9]

# Start with every cell marked as missing (-1).
res = np.full((2, union_.size), -1)
# [[-1 -1 -1 -1 -1 -1 -1 -1 -1 -1]
#  [-1 -1 -1 -1 -1 -1 -1 -1 -1 -1]]

arange_row_ids = np.arange(union_.size)
# [0 1 2 3 4 5 6 7 8 9]

# Insertion points (in `a`) of the values of `b` that `a` does not contain.
# np.isin(..., invert=True) replaces the deprecated np.in1d(..., invert=True).
col_inds = np.searchsorted(a, b)[np.isin(b, a, invert=True)]
# np.searchsorted(a, b)      ---> [1 3 6 7 7]
# np.isin(b, a, invert=True) ---> [False False False  True False]
# [7]

# Each missing value shifts all later columns by one, hence `+ np.arange(...)`.
# The columns that remain after deleting the gaps receive the values of `a`.
res[0, np.delete(arange_row_ids, col_inds + np.arange(col_inds.size))] = a
# np.delete(arange_row_ids, col_inds + np.arange(col_inds.size)) ---> [0 1 2 3 4 5 6 8 9]
# [[ 0  1  2  3  4  5  6 -1  8  9]
#  [-1 -1 -1 -1 -1 -1 -1 -1 -1 -1]]

# Same procedure with the roles of `a` and `b` swapped.
col_inds = np.searchsorted(b, a)[np.isin(a, b, invert=True)]
# np.searchsorted(b, a)      ---> [0 0 1 1 2 2 2 4 5]
# np.isin(a, b, invert=True) ---> [ True False  True False  True  True False False  True]
# [0 1 2 2 5]

res[1, np.delete(arange_row_ids, col_inds + np.arange(col_inds.size))] = b
# np.delete(arange_row_ids, col_inds + np.arange(col_inds.size))  ---> [1 3 6 7 8]
# [[ 0  1  2  3  4  5  6 -1  8  9]
#  [-1  1 -1  3 -1 -1  6  7  8 -1]]

The question is not clear enough to tell whether this answer is the expected one, but I think it is a useful starting point for further modifications as needed.

CodePudding user response：

Here's a partially vectorized solution:

import numpy as np

# This function is from Divakar's answer at https://stackoverflow.com/questions/38619143/convert-python-sequence-to-numpy-array-filling-missing-values, which I use here as a helper function:

def boolean_indexing(v, fill_value=-1, dtype=int):
    """Stack ragged sequences into one right-aligned, padded 2-D array.

    Parameters
    ----------
    v : sequence of sequences
        The rows to stack; they may have different lengths.
    fill_value : scalar, optional
        Value written into the padding cells (default -1).
    dtype : data-type, optional
        dtype of the result (default ``int``). Use e.g. ``float`` or
        ``'<U1'`` (together with a matching ``fill_value``) for other data.

    Returns
    -------
    numpy.ndarray of shape (len(v), max_row_length)
        Row ``k`` holds ``v[k]`` in its rightmost cells; the cells to the
        left of it contain ``fill_value``. An empty ``v`` yields an empty
        array of shape (0, 0).
    """
    if len(v) == 0:
        # lens.max() below is undefined for zero rows.
        return np.full((0, 0), fill_value, dtype=dtype)

    lens = np.array([len(item) for item in v])
    # Comparing against the *reversed* column index marks the last
    # `len(row)` cells of every row as valid, i.e. rows are right-aligned.
    mask = lens[:, None] > np.arange(lens.max())[::-1]
    out = np.full(mask.shape, fill_value, dtype=dtype)
    # Boolean assignment fills the valid cells in row-major order, which is
    # exactly the order of the concatenated rows.
    out[mask] = np.concatenate(v)
    return out

# 2 arrays with eroded useful data and the index pairing array:
a = np.array([0, 1, 313, 2, 3, 4, 5, 934, 6, 8, 9, 730, 241, 521])
b = np.array([112, 514, 11, 13, 16, 955, 17, 18, 112])
cor_tab = np.array([[1, 2], [4, 3], [8, 4], [9, 7]])

# Split each array right *after* every paired index (hence the `+ 1`), so
# that each chunk except the trailing one ends with a paired element:
aa = np.split(a, cor_tab[:, 0] + 1)
bb = np.split(b, cor_tab[:, 1] + 1)

# Initialise 2 flat, empty accumulator arrays:
aaa = np.empty((0), int)
bbb = np.empty((0), int)

# Loop over the corresponding chunks. boolean_indexing() pads the shorter
# chunk of each pair with -1 on the left, which lines up the paired
# elements in the last column of every chunk:
for i, j in zip(aa, bb):
    c = boolean_indexing([i, j])
    aaa = np.append(aaa, c[0])
    bbb = np.append(bbb, c[1])

# One (a, b) pair per row:
ccc = np.array([aaa, bbb]).T

For other types of data, here is another example. Let's take two arrays of letters:

a = np.array(['y', 'w', 'a', 'e', 'i', 'o', 'u', 'y', 'w', 'a', 'e', 'i', 'o', 'u'])
b = np.array(['t', 'h', 'b', 't', 'c', 'n', 's', 'j', 'p', 'z', 'n', 'h', 't', 's', 'm', 'p'])

, and index pairing array:

cor_tab = np.array([[2,0], [3,2], [4,3], [5,5], [6,6], [9,10], [11,12], [13,13]])
np.column_stack((a[cor_tab[:,0]], b[cor_tab[:,1]]))
array([['a', 't'], # useful data
       ['e', 'b'],
       ['i', 't'],
       ['o', 'n'],
       ['u', 's'],
       ['a', 'n'],
       ['i', 't'],
       ['u', 's']], dtype='<U1')

The only correction required is dtype='<U1' in boolean_indexing(). Result is:

   [['y' '-'], 
    ['w' '-'], 
    ['a' 't'], 
    ['-' 'h'], 
    ['e' 'b'], 
    ['i' 't'], 
    ['-' 'c'], 
    ['o' 'n'], 
    ['u' 's'], 
    ['-' 'j'], 
    ['y' 'p'], 
    ['w' 'z'], 
    ['a' 'n'], 
    ['e' 'h'], 
    ['i' 't'], 
    ['o' '-'], 
    ['u' 's'], 
    ['-' 'm'], 
    ['-' 'p']]

It works for floats as well if you change the dtype in boolean_indexing() to float.