only integer scalar arrays can be converted to a scalar index numpy-CodePudding

I found keras tutorial and when was following it got error.

train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

print(f"Total videos for training: {len(train_df)}")
print(f"Total videos for testing: {len(test_df)}")

center_crop_layer = layers.CenterCrop(IMG_SIZE, IMG_SIZE)


def crop_center(frame):
    cropped = center_crop_layer(frame[None, ...])
    cropped = cropped.numpy().squeeze()
    return cropped


# Following method is modified from this tutorial:
# https://www.tensorflow.org/hub/tutorials/action_recognition_with_tf_hub
def load_video(path, max_frames=0):
    cap = cv2.VideoCapture(path)
    frames = []
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            frame = crop_center(frame)
            frame = frame[:, :, [2, 1, 0]]
            frames.append(frame)

            if len(frames) == max_frames:
                break
    finally:
        cap.release()
    return np.array(frames)


def build_feature_extractor():
    feature_extractor = keras.applications.DenseNet121(
        weights="imagenet",
        include_top=False,
        pooling="avg",
        input_shape=(IMG_SIZE, IMG_SIZE, 3),
    )
    preprocess_input = keras.applications.densenet.preprocess_input

    inputs = keras.Input((IMG_SIZE, IMG_SIZE, 3))
    preprocessed = preprocess_input(inputs)

    outputs = feature_extractor(preprocessed)
    return keras.Model(inputs, outputs, name="feature_extractor")


feature_extractor = build_feature_extractor()


# Label preprocessing with StringLookup.
label_processor = keras.layers.StringLookup(
    num_oov_indices=0, vocabulary=np.unique(train_df["tag"]), mask_token=None
)
print(label_processor.get_vocabulary())


def prepare_all_videos(df, root_dir):
    num_samples = len(df)
    video_paths = df["video_name"].values.tolist()
    labels = df["tag"].values
    labels = label_processor(labels[..., None]).numpy()

    # `frame_features` are what we will feed to our sequence model.
    frame_features = np.zeros(
        shape=(num_samples, MAX_SEQ_LENGTH, NUM_FEATURES), dtype="float32"
    )

    # For each video.
    for idx, path in enumerate(video_paths):
        # Gather all its frames and add a batch dimension.
        frames = load_video(os.path.join(root_dir, path))

        # Pad shorter videos.
        if len(frames) < MAX_SEQ_LENGTH:
            diff = MAX_SEQ_LENGTH - len(frames)
            padding = np.zeros((diff, IMG_SIZE, IMG_SIZE, 3))
            frames = np.concatenate(frames, padding)

        frames = frames[None, ...]

        # Initialize placeholder to store the features of the current video.
        temp_frame_features = np.zeros(
            shape=(1, MAX_SEQ_LENGTH, NUM_FEATURES), dtype="float32"
        )

        # Extract features from the frames of the current video.
        for i, batch in enumerate(frames):
            video_length = batch.shape[0]
            length = min(MAX_SEQ_LENGTH, video_length)
            for j in range(length):
                if np.mean(batch[j, :]) > 0.0:
                    temp_frame_features[i, j, :] = feature_extractor.predict(
                        batch[None, j, :]
                    )

                else:
                    temp_frame_features[i, j, :] = 0.0

        frame_features[idx,] = temp_frame_features.squeeze()

    return frame_features, labels

When i call prepare_all_videos and pass train_df or test_df to it, this error occurs:

    81             diff = MAX_SEQ_LENGTH - len(frames)
     82             padding = np.zeros((diff, IMG_SIZE, IMG_SIZE, 3))
---> 83             frames = np.concatenate(frames, padding)
     84 
     85         frames = frames[None, ...]

<__array_function__ internals> in concatenate(*args, **kwargs)

TypeError: only integer scalar arrays can be converted to a scalar index

That's how test_df looks like:

video_name tag 0 v_CricketShot_g01_c01.avi CricketShot
1 v_CricketShot_g01_c02.avi CricketShot
2 v_CricketShot_g01_c03.avi CricketShot
3 v_CricketShot_g01_c04.avi CricketShot
4 v_CricketShot_g01_c05.avi CricketShot
... ... ...
219 v_TennisSwing_g07_c03.avi TennisSwing
220 v_TennisSwing_g07_c04.avi TennisSwing
221 v_TennisSwing_g07_c05.avi TennisSwing
222 v_TennisSwing_g07_c06.avi TennisSwing
223 v_TennisSwing_g07_c07.avi TennisSwing

What's wrong? How to fix it? This tutorial has colab and you can run it if you want.

CodePudding user response：

Most probably it's because frame is returning an empty array, so concat is failing. So add a condition to check the length of the frame,

frames = load_video(os.path.join(root_dir, path))

if len(frames) == 0:
   continue

# Pad shorter videos.
if len(frames) < MAX_SEQ_LENGTH:
   diff = MAX_SEQ_LENGTH - len(frames)
   padding = np.zeros((diff, IMG_SIZE, IMG_SIZE, 3))