I'm quite new to TensorFlow, and I've tried following a standard introductory example with a slightly different dataset. However, I'm getting an error and am unable to proceed:
ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type int).
along with:
TypeError: Could not build a TypeSpec for
3    0
1    0
4    0
2    0
Name: Parch, dtype: object with type Series
import tensorflow as tf
import tensorflow._api.v2.compat.v2.feature_column as fc
import pandas as pd
import numpy as np
# The full dataset would be loaded like this; a five-row inline sample is used instead.
# df = pd.read_csv("train.csv")
# df = df.drop(columns=['Cabin', 'Name','Ticket','PassengerId'])
df = pd.DataFrame({
    'Survived': [0, 1, 1, 1, 0],
    'Pclass': [3, 1, 3, 1, 3],
    'Sex': ['male', 'female', 'female', 'female', 'male'],
    'Age': [22.0, 38.0, 26.0, 35.0, 35.0],
    'SibSp': [1, 1, 0, 1, 0],
    'Parch': [0, 0, 0, 0, 0],
    'Fare': [7.2500, 71.2833, 7.9250, 53.1000, 8.0500],
    'Embarked': ['S', 'C', 'S', 'S', 'S'],
})
df.dropna(inplace=True)

# Store the integer-coded columns with pandas "object" dtype so that they can
# be listed among the categorical features below.
df = df.astype({'Pclass': 'object', 'SibSp': 'object', 'Parch': 'object'})

# Shuffle every row, then cut at the 80% mark into train and test partitions.
shuffled = df.sample(frac=1)
train, test = np.split(shuffled, [int(0.8 * len(df))])

# pop() removes the label column from the features and returns it.
y_train_labels = train.pop('Survived')
y_test_labels = test.pop('Survived')

numerical_columns = ['Age', 'Fare']
categorical_columns = ['Sex', 'Embarked', 'Pclass', 'Parch', 'SibSp']

# Each categorical column takes its vocabulary from the distinct values found
# in the whole frame (train and test together).
feature_column = [
    tf.feature_column.categorical_column_with_vocabulary_list(name, df[name].unique())
    for name in categorical_columns
]
feature_column += [
    tf.feature_column.numeric_column(name, dtype=tf.float32)
    for name in numerical_columns
]
def make_input_fn(data_df, label_df, num_epochs=20, shuffle=True, batch_size=32):
    """Build an input function that feeds features and labels to an estimator.

    Args:
        data_df: Feature table; it is converted with ``dict(data_df)`` into a
            mapping from column name to column values.
        label_df: Labels, sliced in step with the rows of ``data_df``.
        num_epochs: How many times the batched dataset is repeated.
        shuffle: When true, shuffle with a buffer of 1000 before batching.
        batch_size: Number of examples per batch.

    Returns:
        A zero-argument callable that builds and returns the
        ``tf.data.Dataset`` each time it is invoked.
    """
    def input_function():
        ds = tf.data.Dataset.from_tensor_slices((dict(data_df), label_df))
        if shuffle:
            ds = ds.shuffle(1000)
        ds = ds.batch(batch_size).repeat(num_epochs)
        return ds

    # Hand back the function itself rather than its result: the estimator's
    # train()/evaluate() take a callable input_fn and invoke it themselves.
    return input_function
# Training uses make_input_fn's defaults; evaluation makes a single,
# unshuffled pass over the held-out rows.
train_input_fn = make_input_fn(data_df=train, label_df=y_train_labels)
eval_input_fn = make_input_fn(
    data_df=test,
    label_df=y_test_labels,
    num_epochs=1,
    shuffle=False,
)

# Fit a linear classifier on the feature columns, then evaluate it on the test split.
linear_est = tf.estimator.LinearClassifier(feature_columns=feature_column)
linear_est.train(input_fn=train_input_fn)
result = linear_est.evaluate(input_fn=eval_input_fn)
I've provided a minimal reproducible example from my dataset. If there are any other possible errors, please let me know.
CodePudding user response:
I think the features `Pclass`, `SibSp`, and `Parch` should be treated as numerical features. Normally, you use `categorical_column_with_vocabulary_list` to map strings to numerical values, but the three features mentioned above are already numerical.
If you really want to use `categorical_column_with_vocabulary_list` for them, then first convert these features to strings, or leave them as integers instead of casting them to `object`. As documented here:
Use this when your inputs are in string or integer format, and you have an in-memory vocabulary mapping each value to an integer ID.
Here is an example with numerical features:
import tensorflow as tf
import tensorflow._api.v2.compat.v2.feature_column as fc
import pandas as pd
import numpy as np
# Five-row inline sample of the dataset, so the example is self-contained.
df = pd.DataFrame({
    'Survived': [0, 1, 1, 1, 0],
    'Pclass': [3, 1, 3, 1, 3],
    'Sex': ['male', 'female', 'female', 'female', 'male'],
    'Age': [22.0, 38.0, 26.0, 35.0, 35.0],
    'SibSp': [1, 1, 0, 1, 0],
    'Parch': [0, 0, 0, 0, 0],
    'Fare': [7.2500, 71.2833, 7.9250, 53.1000, 8.0500],
    'Embarked': ['S', 'C', 'S', 'S', 'S'],
})
df.dropna(inplace=True)

# Keep the integer-coded columns numeric (float32) so that they can be fed to
# numeric_column below.
df = df.astype({'Pclass': np.float32, 'SibSp': np.float32, 'Parch': np.float32})

# Shuffle every row, then cut at the 80% mark. Slicing with iloc avoids calling
# np.split on a DataFrame, which relies on the deprecated DataFrame.swapaxes.
# The copies make train and test independent frames, so pop() below does not
# operate on views of the shuffled frame.
shuffled = df.sample(frac=1)
split_at = int(0.8 * len(shuffled))
train = shuffled.iloc[:split_at].copy()
test = shuffled.iloc[split_at:].copy()

# pop() removes the label column from the features and returns it.
y_train_labels = train.pop('Survived')
y_test_labels = test.pop('Survived')

numerical_columns = ['Age', 'Fare', 'Pclass', 'SibSp', 'Parch']
categorical_columns = ['Sex', 'Embarked']

# Each categorical column takes its vocabulary from the distinct values found
# in the whole frame (train and test together).
feature_column = [
    tf.feature_column.categorical_column_with_vocabulary_list(name, df[name].unique())
    for name in categorical_columns
]
feature_column += [
    tf.feature_column.numeric_column(name, dtype=tf.float32)
    for name in numerical_columns
]
def make_input_fn(data_df, label_df, num_epochs=20, shuffle=True, batch_size=32):
    """Return a zero-argument function that produces the dataset for an estimator.

    Args:
        data_df: Feature table; turned into a column-name -> values mapping
            with ``dict(data_df)``.
        label_df: Labels, sliced in step with the rows of ``data_df``.
        num_epochs: How many times the batched dataset is repeated.
        shuffle: When true, shuffle with a buffer of 1000 before batching.
        batch_size: Number of examples per batch.

    Returns:
        The inner function; calling it builds and returns the
        ``tf.data.Dataset``.
    """
    def input_function():
        features = dict(data_df)
        dataset = tf.data.Dataset.from_tensor_slices((features, label_df))
        if shuffle:
            dataset = dataset.shuffle(1000)
        # Batch first, then repeat the batched dataset for the requested epochs.
        return dataset.batch(batch_size).repeat(num_epochs)

    return input_function
# Training uses make_input_fn's defaults; evaluation makes a single,
# unshuffled pass over the held-out rows.
train_input_fn = make_input_fn(data_df=train, label_df=y_train_labels)
eval_input_fn = make_input_fn(
    data_df=test,
    label_df=y_test_labels,
    num_epochs=1,
    shuffle=False,
)

# Fit a linear classifier on the feature columns, then evaluate it on the test split.
linear_est = tf.estimator.LinearClassifier(feature_columns=feature_column)
linear_est.train(input_fn=train_input_fn)
result = linear_est.evaluate(input_fn=eval_input_fn)