I have 2 lists of values (of the same shape),
and a metric (it should work with any metric that can go into the scipy.spatial.distance.cdist
function):
vals_1 = np.array([[0,1], [1,1], [2,1]])
vals_2 = np.array([[0,1], [2,1], [4,1]])
metric = 'euclidean'
I would like to calculate the distances between the corresponding pairs of values.
I can do it like so (with a loop):
dists = []
for val_1, val_2 in zip(vals_1, vals_2):
dists.append(cdist([val_1], [val_2], metric=metric).item())
dists # [0.0, 1.0, 2.0]
Here is one way to vectorize it:
dists = cdist(vals_1, vals_2, metric=metric).diagonal()
dists # array([0., 1., 2.])
It takes too much memory: it calculates all the pairwise distances:
array([[0., 2., 4.],
[1., 1., 3.],
[2., 0., 2.]])
and then selects only the diagonal.
Is there an efficient vectorized way to do this - not calculating the redundant distances?
CodePudding user response:
To calculate the distance between the corresponding position vectors, the corresponding position operation and the operation along the axis of numpy can be vectorized well (reference to Basic array operations), so we can easily write the following code:
def euclidean(XA, XB, *, out=None):
return np.sqrt(np.add.reduce(np.square(XA - XB), 1), out=out)
# ^^^^^^^^^^^^^ is equivalent to np.sum but faster
def sqeuclidean(XA, XB, *, out=None):
return np.add.reduce(np.square(XA - XB), 1, out=out)
def cityblock(XA, XB, *, out=None):
return np.add.reduce(np.abs(XA - XB), 1, out=out)
def chebyshev(XA, XB, *, out=None):
return np.maximum.reduce(np.abs(XA - XB), 1, out=out)
# ^^^^^^^^^^^^^^^^^ is equivalent to np.max but faster
def hamming(XA, XB, *, out=None):
return np.add.reduce(XA != XB, 1, out=out)
def mahalanobis(XA, XB, VI, *, out=None):
delta = XA - XB
return np.sqrt(np.add.reduce(delta @ VI * delta, 1), out=out)
# more functions...
If we need to work like cdist
, we can refer to the implementation of it to write similar code, here are some core parts completed:
_METRIC_INFOS = {
euclidean: ['euclidean', 'euclid', 'eu', 'e'],
sqeuclidean: ['sqeuclidean', 'sqe', 'sqeuclid'],
cityblock: ['manhattan', 'cityblock', 'cblock', 'cb', 'c'],
chebyshev: ['chebychev', 'chebyshev', 'cheby', 'cheb', 'ch'],
hamming: ['matching', 'hamming', 'hamm', 'ha', 'h'],
mahalanobis: ['mahalanobis', 'mahal', 'mah']
}
_METRICS = {metric.__name__: metric for metric in _METRIC_INFOS}
_METRIC_ALIAS = {alias: metric for metric, aka in _METRIC_INFOS.items() for alias in aka}
_METRIC_NAMES = list(_METRICS)
def dist(XA, XB, metric='euclidean', *, out=None, **kwargs):
XA = np.asarray(XA)
XB = np.asarray(XB)
if XA.ndim != 2:
raise ValueError('XA must be a 2-dimensional array.')
if XA.shape != XB.shape:
raise ValueError('XA and XB must have the same shape.')
if callable(metric):
return _dist_callable(XA, XB, out=out, metric=metric, **kwargs)
elif isinstance(metric, str):
metric = metric.lower()
metirc_info = _METRIC_ALIAS.get(metric, None)
if metirc_info is not None:
return metirc_info(XA, XB, out=out, **kwargs)
else:
raise ValueError(f'Unknown Distance Metric: {metric}')
else:
raise TypeError('2nd argument metric must be a string identifier '
'or a function.')
def _dist_callable(XA, XB, *, out, metirc, **kwargs):
mA = XA.shape[0]
if out is None:
out = np.empty(mA)
for i in range(mA):
out[i] = metirc(XA[i], XB[i], **kwargs)
return out
Test:
>>> a = np.arange(20).reshape(-1, 4)
>>> b = a[::-1]
>>> dist(a, b)
array([32., 16., 0., 16., 32.])
>>> dist(a, b, 'cityblock')
array([64, 32, 0, 32, 64])
>>> dist(a, b, 'chebyshev')
array([16, 8, 0, 8, 16])
>>> dist(a, b, 'sqeuclidean')
array([1024, 256, 0, 256, 1024])
>>> dist(a, b, 'hamming')
array([4, 4, 0, 4, 4])
>>> dist(a, b, 'mahalanobis', VI=np.eye(a.shape[1]) * 2)
array([45.254834, 22.627417, 0. , 22.627417, 45.254834])