I have a two-dimensional array in which columns are tournaments and rows are the players' results. I would like to build a results matrix in which every row gives the win frequency of a particular player against each of the other players.
Here is my naive solution to this problem.
import numpy as np
# Sample scores: one row per player, one column per tournament.
input_example = np.array([
    [55.90, 81.50, 76.60, 69.50],  # player 0
    [52.50, 74.60, 74.00, 64.80],  # player 1
    [52.40, 74.90, 78.20, 60.90],  # player 2
    [52.60, 78.90, 77.60, 60.80],  # player 3
])

# Expected win frequencies: entry [n, m] is the share of tournaments in
# which player n scored higher than player m; the diagonal is 0.5.
output_example = np.array([
    [0.50, 1.00, 0.75, 0.75],
    [0.00, 0.50, 0.50, 0.25],
    [0.25, 0.50, 0.50, 0.50],
    [0.25, 0.75, 0.50, 0.50],
])
def results_matrix(tournament_results):
    """Return the pairwise win-frequency matrix for a set of players.

    Parameters
    ----------
    tournament_results : numpy.ndarray
        2-D array with one row per player and one column per tournament.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(players, players)``.  Entry ``[n, m]`` is the
        fraction of tournaments in which player ``n`` scored strictly
        higher than player ``m``.  Diagonal entries are fixed at 0.5.
    """
    players, tournaments = tournament_results.shape
    # The result has one row AND one column per player.  Sizing the second
    # axis by the number of tournaments only works when the input happens to
    # be square.
    res_matrix = np.zeros(shape=(players, players))
    for n in range(players):
        for m in range(players):
            if n == m:
                # A player is never compared against themselves.
                res_matrix[n][m] = 0.50
                continue
            res_matrix[n][m] = (tournament_results[n] > tournament_results[m]).sum() / tournaments
    return res_matrix
# Self-check: when run as a script, the computed matrix must reproduce the
# hand-written expected output exactly.
if __name__ == '__main__':
    res_matrix = results_matrix(tournament_results=input_example)
    assert np.array_equal(output_example, res_matrix)
While it produces the correct results, I would like to turn this into an idiomatic NumPy solution so that it runs fast on large inputs.
You may assume that the results within any one tournament are distinct, so no two players ever have an identical result in the same tournament.
CodePudding user response:
How about this solution?
# Compare every player's row against every other player's row in a single
# broadcast operation instead of looping over player pairs.
players, tournaments = input_example.shape
arr0 = input_example[:, None, :]  # shape (players, 1, tournaments)
arr1 = input_example[None, :, :]  # shape (1, players, tournaments)
# (arr0 > arr1) has shape (players, players, tournaments); counting the True
# values along the last axis gives the number of wins for each pair.
win_counts = (arr0 > arr1).sum(axis=2)
res_matrix = win_counts / tournaments
# A player is never compared against themselves: fix the diagonal at 0.5.
np.fill_diagonal(res_matrix, 0.5)
EDIT: If you don't have enough memory to do it that way (the broadcast comparison `arr0 > arr1` materializes a players × players × tournaments boolean array), there's no escaping loops. But you can simply use numba to make your existing code run faster.
import numpy as np
from numba import jit
# Sample scores: one row per player, one column per tournament.
input_example = np.array([
    [55.90, 81.50, 76.60, 69.50],  # player 0
    [52.50, 74.60, 74.00, 64.80],  # player 1
    [52.40, 74.90, 78.20, 60.90],  # player 2
    [52.60, 78.90, 77.60, 60.80],  # player 3
])

# Expected win frequencies: entry [n, m] is the share of tournaments in
# which player n scored higher than player m; the diagonal is 0.5.
output_example = np.array([
    [0.50, 1.00, 0.75, 0.75],
    [0.00, 0.50, 0.50, 0.25],
    [0.25, 0.50, 0.50, 0.50],
    [0.25, 0.75, 0.50, 0.50],
])
def results_matrix(tournament_results):
    """Build the pairwise win-frequency matrix with plain Python loops.

    ``tournament_results`` is a 2-D array with one row per player and one
    column per tournament.  The returned ``(players, players)`` array holds,
    at ``[row, col]``, the fraction of tournaments in which player ``row``
    scored strictly higher than player ``col``; the diagonal is 0.5.
    """
    n_players, n_tournaments = tournament_results.shape
    win_freq = np.zeros(shape=(n_players, n_players))
    for row in range(n_players):
        for col in range(n_players):
            if row != col:
                wins = (tournament_results[row] > tournament_results[col]).sum()
                win_freq[row, col] = wins / n_tournaments
            else:
                # Self-comparison is defined as an even split.
                win_freq[row, col] = 0.50
    return win_freq
@jit(nopython=True)
def results_matrix_ba(tournament_results):
    """Numba-compiled pairwise win-frequency matrix.

    Takes a 2-D array with one row per player and one column per tournament
    and returns a ``(players, players)`` array whose ``[i, j]`` entry is the
    fraction of tournaments in which player ``i`` scored strictly higher
    than player ``j``; the diagonal is 0.5.
    """
    n_players, n_tournaments = tournament_results.shape
    win_freq = np.zeros(shape=(n_players, n_players))
    for i in range(n_players):
        for j in range(n_players):
            if i != j:
                beaten = (tournament_results[i] > tournament_results[j]).sum()
                win_freq[i, j] = beaten / n_tournaments
            else:
                # Self-comparison is defined as an even split.
                win_freq[i, j] = 0.50
    return win_freq
# Small-input benchmark on the 4x4 example.
# NOTE: `%timeit` is an IPython/Jupyter line magic, so these lines only run
# inside IPython, not as a plain Python script.
# Each timed statement also asserts that the function reproduces
# output_example, so the measured time includes the np.array_equal check.
print('running w/o numba')
%timeit assert np.array_equal(output_example, results_matrix(input_example))
print('running with numba')
# NOTE(review): presumably the first call to the jitted function includes
# compilation time, which would explain a slow outlier run - confirm.
%timeit assert np.array_equal(output_example, results_matrix_ba(input_example))
Output:
running w/o numba
The slowest run took 4.10 times longer than the fastest. This could mean that an intermediate result is being cached.
10000 loops, best of 5: 71.9 µs per loop
running with numba
The slowest run took 5.06 times longer than the fastest. This could mean that an intermediate result is being cached.
100000 loops, best of 5: 7.74 µs per loop
# Larger benchmark: a 900 x 500 array of uniform random values
# (900 rows = players, 500 columns = tournaments), seeded for repeatability.
# This rebinds input_example, replacing the 4x4 sample defined earlier.
# NOTE: `%timeit` is an IPython/Jupyter line magic, not plain Python.
np.random.seed(0)
input_example = np.random.rand(900, 500)
print('running w/o numba')
%timeit results_matrix(input_example)
print('running with numba')
%timeit results_matrix_ba(input_example)
Output:
running w/o numba
1 loop, best of 5: 4.72 s per loop
running with numba
1 loop, best of 5: 407 ms per loop