How to check every combination of 5 rows for a condition in a 2d array using numpy-CodePudding

For several reasons I would like to rewrite the (working!) code below in numpy, but I can't find a good way to do it: #1, I've literally never used numpy before and am generally new to programming; #2, pure Python is too slow; #3, I'd like to output the name column using print(m[:,0]), but I can't because, #4, itertools combinations only yields a 2d list, not a 2d numpy array.

def compCheck(m):
    """Print how many attribute columns the group of rows shares.

    Each row of ``m`` is a sequence whose first two entries are skipped
    (they hold the names); every later entry is an attribute that is either
    filled (truthy) or blank (falsy, e.g. ``""``).  A column counts as a
    match when at least 3 rows have a filled entry in it.

    The total number of matching columns is printed; nothing is returned.
    An empty ``m`` prints ``0``.
    """
    # Guard the column lookup so an empty group does not raise IndexError.
    columnsNum = len(m[0]) if len(m) else 0
    matches = 0            # counts the total number of matches

    for a in range(2, columnsNum):      # offset by 2 to skip the names
        # counts the non empties in this column; the tally has to
        # accumulate over the rows, otherwise it can never reach 3
        sCount = sum(1 for row in m if row[a])
        if sCount >= 3:
            matches += 1
    print(matches)

from itertools import combinations
teamSize = 5    # number of rows in each group

# Run the check on every unique group of teamSize rows from masterList.
for team in combinations(masterList, teamSize):
    compCheck(team)

To explain what this code does (or should do), it creates a list of every unique combination of 5 rows without replacement from a 2d list (called masterList). It looks at each combination and checks the columns (offset by 2 so it doesn't count the names). If at least 3 of the 5 entries in the column are filled, then it counts that column as a match. It then returns the total number of matches and moves to the next combination.

Example of the check should be:

Input: compCheck([["Alex", "Smith", "Chess", "Skiing", "", ""],
["Bob", "Dole", "Chess", "", "", ""],
["Charlie", "Chaplin", "Chess", "", "", ""],
["Daisy", "Buchanon", "", "", "", "Partying"],
["Emily", "Evans", "Chess", "Skiing", "", ""]])

Output: "1 for ['Alex' 'Bob' 'Charlie' 'Daisy' 'Emily']"

An example for the input would be much the same as the list above (but with many more rows), so I'll just post the example of a 6 row list:

from itertools import combinations
teamSize = 5    # number of rows in each group

# Six sample rows: first name, surname, then four attribute columns
# where an empty string means "blank".
masterList = [
    ["Alex", "Smith", "Chess", "Skiing", "", ""],
    ["Bob", "Dole", "Chess", "", "", ""],
    ["Charlie", "Chaplin", "Chess", "", "", ""],
    ["Daisy", "Buchanon", "", "", "", "Partying"],
    ["Emily", "Evans", "Chess", "Skiing", "", ""],
    ["Frank", "Ferdinand", "", "Skiing", "", ""],
]

# Run the check on every unique group of teamSize rows.
for team in combinations(masterList, teamSize):
    compCheck(team)

Output: ["1 for ['Alex' 'Bob' 'Charlie' 'Daisy' 'Emily']",
"1 for ['Alex' 'Bob' 'Charlie' 'Daisy' 'Frank']",
"2 for ['Alex' 'Bob' 'Charlie' 'Emily' 'Frank']",
"2 for ['Alex' 'Bob' 'Daisy' 'Emily' 'Frank']",
"2 for ['Alex' 'Charlie' 'Daisy' 'Emily' 'Frank']",
"1 for ['Bob' 'Charlie' 'Daisy' 'Emily' 'Frank']"]

CodePudding user response：

I think pandas.DataFrame is a better fit in this case.

import pandas as pd

masterList = [
    ["Alex", "Smith", "Chess", "Skiing", "", ""],
    ["Bob", "Dole", "Chess", "", "", ""],
    ["Charlie", "Chaplin", "Chess", "", "", ""],
    ["Daisy", "Buchanon", "", "", "", "Partying"],
    ["Emily", "Evans", "Chess", "Skiing", "", ""],
    ["Frank", "Ferdinand", "", "Skiing", "", ""]
]

# One row per person, indexed by first name.  After the conversion every
# remaining cell is True when the hobby is filled in and False when the
# entry is the empty string.
df = (
    pd.DataFrame(
        masterList,
        columns=['name', 'surname', 'chess', 'skiing', 'whatever', 'partying']
    )
    .drop(columns='surname')
    .set_index('name')
    # Compare against '' instead of applymap(bool): applymap is deprecated
    # in recent pandas releases, while an element-wise comparison works on
    # every version and gives the same flags for string cells.
    .ne('')
)

Here's what the data looks like after conversion:

We must take into account that combinations returns a sequence of tuples, not a 2D list. Therefore, we must convert each combination to a list before extracting the data:

NGroup = 5      # number of people in each combination
minShare = 3    # how many of them must share a hobby for it to count

for combo in combinations(df.index, NGroup):
    # combo is a tuple of index labels; .loc needs a list to select rows.
    members = list(combo)
    # Per-hobby head count, then the number of hobbies reaching minShare.
    shared = (df.loc[members].sum() >= minShare).sum()
    print(f"{shared} for {', '.join(combo)}")

Here's the output:

If numpy is the choice, then this code can be used with the same output:

data = np.array(masterList)
captions = data[:, 0]            # column 0: the label printed for each row
hobbies = data[:, 2:] != ''      # True wherever an entry is not blank

for combo in combinations(range(len(hobbies)), NGroup):
    # combo is a tuple of row numbers; index with a list to pick those rows.
    rows = list(combo)
    # Count filled entries per column, then the columns reaching minShare.
    shared = (hobbies[rows].sum(axis=0) >= minShare).sum()
    print(f"{shared} for {', '.join(captions[rows])}")

CodePudding user response：

You can do this easily with numpy

for instance

def compCheck(m):
    """Return how many attribute columns the group of rows shares.

    Each row of ``m`` is a sequence whose first two entries are skipped
    (they hold the names); every later entry is an attribute that is either
    filled (truthy) or blank (falsy, e.g. ``""`` or ``0``).  A column counts
    as a match when at least 3 rows have a filled entry in it.

    Returns the number of matching columns; an empty ``m`` gives ``0``.
    """
    # Guard the column lookup so an empty group does not raise IndexError.
    columnsNum = len(m[0]) if len(m) else 0
    matches = 0            # counts the total number of matches

    for a in range(2, columnsNum):      # offset by 2 to skip the names
        # counts the non empties in this column; the tally has to
        # accumulate over the rows, otherwise it can never reach 3
        sCount = sum(1 for row in m if row[a])
        if sCount >= 3:
            matches += 1
    return matches

from itertools import combinations
import numpy as np
teamSize = 5    # number of rows in each group

# Ten sample rows: two name strings followed by four 0/1 attribute flags.
masterList = [
    ['A', 'B', 0, 1, 0, 1],
    ['A', 'B', 0, 0, 0, 1],
    ['A', 'B', 1, 1, 0, 0],
    ['A', 'B', 0, 1, 1, 1],
    ['A', 'B', 0, 1, 0, 0],
    ['A', 'B', 1, 1, 0, 1],
    ['A', 'B', 1, 1, 0, 0],
    ['A', 'B', 0, 1, 1, 1],
    ['A', 'B', 0, 0, 1, 1],
    ['A', 'B', 1, 1, 0, 1],
]

for team in combinations(masterList, teamSize):
    # Drop the two name entries so only the flags go into the array.
    flags = np.array([row[2:] for row in team])
    filled_per_column = np.count_nonzero(flags, axis=0)
    # numpy result first, then the pure-Python result for comparison.
    print(np.sum(filled_per_column >= 3))
    print(compCheck(team))

If the matrix is right, the first two values of each row are strings (the names), as you said.

It is probably better to remove the names directly from the masterList:

def compCheck(m):
    """Return how many columns are filled in at least 3 rows of the group.

    Unlike a variant that skips leading name columns, this one inspects
    every column of ``m``, starting at index 0.  An entry counts as filled
    when it is truthy (blank entries are falsy, e.g. ``""`` or ``0``).

    Returns the number of matching columns; an empty ``m`` gives ``0``.
    """
    # Guard the column lookup so an empty group does not raise IndexError.
    columnsNum = len(m[0]) if len(m) else 0
    matches = 0            # counts the total number of matches

    for a in range(columnsNum):
        # counts the non empties in this column; the tally has to
        # accumulate over the rows, otherwise it can never reach 3
        sCount = sum(1 for row in m if row[a])
        if sCount >= 3:
            matches += 1
    return matches

from itertools import combinations
import numpy as np
teamSize = 5    # number of rows in each group

# Ten sample rows: two name strings followed by four 0/1 attribute flags.
masterList = [
    ['A', 'B', 0, 1, 0, 1],
    ['A', 'B', 0, 0, 0, 1],
    ['A', 'B', 1, 1, 0, 0],
    ['A', 'B', 0, 1, 1, 1],
    ['A', 'B', 0, 1, 0, 0],
    ['A', 'B', 1, 1, 0, 1],
    ['A', 'B', 1, 1, 0, 0],
    ['A', 'B', 0, 1, 1, 1],
    ['A', 'B', 0, 0, 1, 1],
    ['A', 'B', 1, 1, 0, 1],
]

# Strip the two name entries once, up front, keeping only the flags.
masterList = np.array([row[2:] for row in masterList])

for team in combinations(masterList, teamSize):
    filled_per_column = np.count_nonzero(np.array(team), axis=0)
    # numpy result first, then the pure-Python result for comparison.
    print(np.sum(filled_per_column >= 3))
    print(compCheck(team))

update

Here is the code with your data:

from itertools import combinations
import numpy as np
teamSize = 5    # number of rows in each group

# First name, surname, then four attribute columns ("" means blank).
masterList = [
    ["Alex", "Smith", "Chess", "Skiing", "", ""],
    ["Bob", "Dole", "Chess", "", "", ""],
    ["Charlie", "Chaplin", "Chess", "", "", ""],
    ["Daisy", "Buchanon", "", "", "", "Partying"],
    ["Emily", "Evans", "Chess", "Skiing", "", ""],
    ["Frank", "Ferdinand", "", "Skiing", "", ""],
]

for team in combinations(masterList, teamSize):
    # Keep only the attribute columns; the two name columns are dropped.
    attributes = np.array([row[2:] for row in team])
    filled_per_column = np.count_nonzero(attributes, axis=0)
    shared = np.sum(filled_per_column >= 3)
    # Groups with no shared attribute at all are not reported.
    if shared:
        first_names = [row[0] for row in team]
        print(shared, ' for ', first_names)

which gives

1  for  ['Alex', 'Bob', 'Charlie', 'Daisy', 'Emily']
1  for  ['Alex', 'Bob', 'Charlie', 'Daisy', 'Frank']
2  for  ['Alex', 'Bob', 'Charlie', 'Emily', 'Frank']
2  for  ['Alex', 'Bob', 'Daisy', 'Emily', 'Frank']
2  for  ['Alex', 'Charlie', 'Daisy', 'Emily', 'Frank']
1  for  ['Bob', 'Charlie', 'Daisy', 'Emily', 'Frank']