Own Dataset: ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type int)-CodePudding

I'm quite new to TensorFlow, and I've tried following a standard introductory example with a slightly different dataset. However, I'm getting an error and am unable to proceed:

ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type int).

along with:

TypeError: Could not build a TypeSpec for 3    0
1    0
4    0
2    0
Name: Parch, dtype: object with type Series

    import tensorflow as tf
    import tensorflow._api.v2.compat.v2.feature_column as fc
    import pandas as pd
    import numpy as np
    
    
    #df = pd.read_csv("train.csv")
    #df = df.drop(columns=['Cabin', 'Name','Ticket','PassengerId'])
    # Five-row in-memory sample used in place of the commented-out train.csv load.
    df = {'Survived': [0, 1, 1, 1, 0], 'Pclass': [3, 1, 3, 1, 3], 'Sex': ['male', 'female', 'female', 'female', 'male'],
          'Age': [22.0, 38.0, 26.0, 35.0, 35.0], 'SibSp': [1, 1, 0, 1, 0], 'Parch': [0, 0, 0, 0, 0], 'Fare': [7.2500,
          71.2833, 7.9250, 53.1000, 8.0500], 'Embarked': ['S', 'C', 'S', 'S', 'S']}
    df = pd.DataFrame(df)
    df.dropna(inplace=True)
    
    # NOTE(review): these casts turn the integer columns into object-dtype
    # Series. The reported errors mention "Unsupported object type int" and
    # "dtype: object", so these lines look like the trigger -- confirm.
    df['Pclass'] = df['Pclass'].astype('object')
    df['SibSp'] = df['SibSp'].astype('object')
    df['Parch'] = df['Parch'].astype('object')
    
    # Shuffle every row, then cut at 80% of the rows into train / test.
    train, test = np.split(df.sample(frac=1), [int(0.8*len(df))])
    # Remove the label column from each frame and keep it as a separate Series.
    y_train_labels = train.pop('Survived')
    y_test_labels = test.pop('Survived')
    
    numerical_columns = ['Age','Fare']
    categorical_columns = ['Sex','Embarked','Pclass','Parch','SibSp']
    
    feature_column = []
    # One vocabulary-list categorical column per categorical feature; the
    # vocabulary is the set of distinct values present in df.
    for feature in categorical_columns:
        vocabulary = df[feature].unique()
        feature_column.append(tf.feature_column.categorical_column_with_vocabulary_list(feature,vocabulary))
    
    # One float32 numeric column per numerical feature.
    for feature in numerical_columns:
        feature_column.append(tf.feature_column.numeric_column(feature, dtype=tf.float32))
    
    def make_input_fn(data_df, label_df, num_epochs=20, shuffle=True, batch_size=32):
        """Build a batched tf.data.Dataset of (feature dict, label) pairs.

        The dataset is created from ``dict(data_df)`` and ``label_df``,
        optionally shuffled with a buffer of 1000, batched by ``batch_size``
        and repeated ``num_epochs`` times.
        """
        def input_function():
            ds = tf.data.Dataset.from_tensor_slices((dict(data_df), label_df))
            if shuffle:
                ds = ds.shuffle(1000)
            ds = ds.batch(batch_size).repeat(num_epochs)
            return ds
        # NOTE(review): this calls input_function() and returns its result
        # (the dataset), not the function itself. The name suggests a
        # callable input function was intended -- confirm against the
        # estimator's input_fn contract.
        return input_function()
    
    # Training input uses the defaults (20 epochs, shuffled); evaluation makes
    # a single unshuffled pass over the test split.
    train_input_fn = make_input_fn(train, y_train_labels)
    eval_input_fn = make_input_fn(test, y_test_labels, num_epochs=1, shuffle=False)
    
    
    # Fit a linear classifier on the feature columns, then evaluate it.
    linear_est = tf.estimator.LinearClassifier(feature_columns=feature_column)
    linear_est.train(train_input_fn)
    result = linear_est.evaluate(eval_input_fn)

I've provided a minimal reproducible example from my dataset. If there are any other possible errors, please let me know.

CodePudding user response：

I think the features Pclass, SibSp, and Parch should be treated as numerical features. Normally, you use categorical_column_with_vocabulary_list to map strings to numerical values, but the three features mentioned above are already numerical. The errors come from the astype('object') casts: they turn those columns into object-dtype Series, which TensorFlow cannot convert to tensors. If you really want to use categorical_column_with_vocabulary_list, either convert those features to strings first or leave them as integers (i.e. drop the astype('object') casts). As documented here:

Use this when your inputs are in string or integer format, and you have an in-memory vocabulary mapping each value to an integer ID.

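Here is an example with numerical features. Note that make_input_fn also returns input_function itself instead of calling it, because the estimator's train and evaluate methods expect a callable input_fn: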

import tensorflow as tf
import tensorflow._api.v2.compat.v2.feature_column as fc
import pandas as pd
import numpy as np

# Five-row in-memory sample of the Titanic training data.
df = pd.DataFrame({
    'Survived': [0, 1, 1, 1, 0],
    'Pclass': [3, 1, 3, 1, 3],
    'Sex': ['male', 'female', 'female', 'female', 'male'],
    'Age': [22.0, 38.0, 26.0, 35.0, 35.0],
    'SibSp': [1, 1, 0, 1, 0],
    'Parch': [0, 0, 0, 0, 0],
    'Fare': [7.2500, 71.2833, 7.9250, 53.1000, 8.0500],
    'Embarked': ['S', 'C', 'S', 'S', 'S'],
})
df.dropna(inplace=True)

# Feed the integer-coded columns to the model as float32 numeric features.
# A dict-based astype converts all three in one call and guarantees the
# resulting columns really carry the new dtype.
df = df.astype({name: np.float32 for name in ('Pclass', 'SibSp', 'Parch')})

# Shuffle every row, then take the first 80% as train and the rest as test.
# Plain positional slicing is used instead of np.split: np.split on a
# DataFrame goes through DataFrame.swapaxes, which pandas deprecated in 2.1.
shuffled = df.sample(frac=1)
split_at = int(0.8 * len(shuffled))
# .copy() gives each split its own data before the label column is popped.
train = shuffled.iloc[:split_at].copy()
test = shuffled.iloc[split_at:].copy()

# Detach the label column from the features of each split.
y_train_labels = train.pop('Survived')
y_test_labels = test.pop('Survived')

numerical_columns = ['Age', 'Fare', 'Pclass', 'SibSp', 'Parch']
categorical_columns = ['Sex', 'Embarked']

# Categorical features come first; each vocabulary is the set of distinct
# values of that column in df.
feature_column = [
    tf.feature_column.categorical_column_with_vocabulary_list(name, df[name].unique())
    for name in categorical_columns
]
# Numerical features follow, each as a float32 numeric column.
feature_column += [
    tf.feature_column.numeric_column(name, dtype=tf.float32)
    for name in numerical_columns
]

def make_input_fn(data_df, label_df, num_epochs=20, shuffle=True, batch_size=32):
    """Return a zero-argument function that builds the input dataset.

    The returned function creates a ``tf.data.Dataset`` of
    (feature dict, label) pairs from ``dict(data_df)`` and ``label_df``,
    shuffles it with a buffer of 1000 when ``shuffle`` is true, batches it
    by ``batch_size`` and repeats it ``num_epochs`` times. Nothing is built
    until the returned function is called.
    """
    def input_function():
        features = dict(data_df)
        dataset = tf.data.Dataset.from_tensor_slices((features, label_df))
        dataset = dataset.shuffle(1000) if shuffle else dataset
        return dataset.batch(batch_size).repeat(num_epochs)

    return input_function

# Training input: defaults (20 epochs, shuffled).
train_input_fn = make_input_fn(data_df=train, label_df=y_train_labels)
# Evaluation input: one unshuffled pass over the test split.
eval_input_fn = make_input_fn(
    data_df=test,
    label_df=y_test_labels,
    num_epochs=1,
    shuffle=False,
)

# Fit a linear classifier on the feature columns, then evaluate it.
linear_est = tf.estimator.LinearClassifier(feature_columns=feature_column)
linear_est.train(input_fn=train_input_fn)
result = linear_est.evaluate(input_fn=eval_input_fn)