I am trying to call scikit learn fit functions on dataframes where the elements of each column are numpy arrays. However, I get the error "setting an array element with a sequence," presumably because I am trying to call fit on a dataframe of arrays rather than scalar values. How do I work around this? I'd really appreciate some help.
Here is my code. You can find the data I'm using here: https://competitions.codalab.org/competitions/21163
training_data = pd.read_csv('/train.tsv', sep='\t')
testing_data = pd.read_csv('/dev.tsv', sep='\t')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',do_lower_case=True,max_length=1024)
model = BertModel.from_pretrained('bert-base-uncased')
model = model.to(device)
# These are used to map the data to their appropriate column on each pass
pomt_train_x = pd.DataFrame(columns=["claim", "reason", "category", "speaker", "checker", "tags", "claim entities", "article title"])
feature_dict = {1: "claim", 4: "reason", 5: "category", 6: "speaker", 7: "checker", 8: "tags", 9: "claim entities", 10: "article title"}
# Sort the data appropriately.
for i, data in enumerate(training_data[training_data.columns].to_numpy()):
if 'pomt' in data[0]:
appended_data = {}
for j, sentence in enumerate(data):
if j in feature_dict:
inputs = tokenizer(str(sentence), return_tensors="pt", max_length=512, pad_to_max_length=True).to(device)
outputs = model(**inputs)
appended_data[feature_dict[j]] = outputs.last_hidden_state[:,0][0].cpu().detach().numpy()
pomt_train_x = pomt_train_x.append(appended_data, ignore_index=True)
print(f"{i 1} out of {training_data.index.stop} from training")
count = 0
# append testing data to training data
for i, data in enumerate(testing_data[testing_data.columns].to_numpy()):
if 'pomt' in data[0]:
appended_data = {}
for j, sentence in enumerate(data):
if j in feature_dict:
inputs = tokenizer(str(sentence), return_tensors="pt", max_length=512, pad_to_max_length=True).to(device)
outputs = model(**inputs)
appended_data[feature_dict[j]] = outputs.last_hidden_state[:,0][0].cpu().detach().numpy()
pomt_train_x = pomt_train_x.append(appended_data, ignore_index=True)
print(f"{i 1} out of {testing_data.index.stop} from testing")
count = 1
# Map the possible labels to an emotion
positive_set = set(['half-true', 'correct attribution!', 'correct', 'determination: barely true', 'factscan score: true',
'correct attribution', 'mostly true', 'mostly-correct', 'truth!', 'partially true', 'half true',
'mostly truth!', 'determination: true', 'true messages', 'authorship confirmed!', 'verdict: true',
'mostly_true', 'determination: mostly true', 'confirmed authorship!', 'conclusion: accurate', 'accurate',
'true', 'partly true', 'fact', 'full flop', 'in-the-green', 'verified'])
negative_set = set({'fake news', 'verdict: false', '3 pinnochios', 'fiction!', 'bogus warning', 'we rate this claim false',
'determination: false', 'disputed!', 'false', 'fiction', 'a lot of baloney', '2 pinnochios', 'some baloney',
'mostly_false', 'cherry picks', 'miscaptioned', 'misleading!', 'misleading recommendations', 'mostly fiction!',
'mostly false', 'a little baloney', 'fiction! & satire!', 'conclusion: false', 'rating: false',
'determination: misleading', 'promise broken', '4 pinnochios', 'misleading', 'promise kept',
'misattributed', 'fake', 'previously truth! now resolved!','incorrect attribution!', 'incorrect',
'spins the facts', 'determination: a stretch', 'factscan score: misleading', 'pants on fire!',
'factscan score: false', 'exaggerates', 'outdated', 'facebook scams', 'unsupported', 'opinion!',
'verdict: unsubstantiated', 'scam', 'virus!', 'no flip', 'scam!', 'unverified', 'distorts the facts', 'outdated!'
'understated', 'no evidence', 'unproven!', 'inaccurate attribution!', 'statirical reports', 'unproven', 'exaggerated',
'determination: huckster propaganda', 'grass roots movement!', 'commentary!', 'in-the-red', 'unsubstantiated messages',})
neutral_set = set({'truth! & fiction!', 'conclusion: unclear', '1', 'unobservable', 'needs context', 'truth! & disputed!', 'half flip',
'0', 'in-between', '4', 'None', '2', 'none', 'investigation pending!','not the whole story', '10','in the works',
'truth! & misleading!', '3', 'mixture', 'not yet rated', 'legend', 'stalled', 'truth! & unproven!', 'truth! & outdated!',
'compromise'})
# Read in the labels for the appropriate data
pomt_train_y = pd.DataFrame(columns=["label"])
sign_to_append = 0
for i, data in enumerate(training_data[training_data.columns].to_numpy()):
if 'pomt' in data[0]:
if data[2] in positive_set:
sign_to_append = 1
elif data[2] in negative_set:
sign_to_append = -1
else:
sign_to_append = 0
pomt_train_y = pomt_train_y.append({'label':sign_to_append}, ignore_index=True)
print(f"{i 1} out of {training_data.index.stop} from training")
# append testing data to training data
for i, data in enumerate(testing_data[testing_data.columns].to_numpy()):
if 'pomt' in data[0]:
if data[2] in positive_set:
sign_to_append = 1
elif data[2] in negative_set:
sign_to_append = -1
else:
sign_to_append = 0
pomt_train_y = pomt_train_y.append({'label':sign_to_append}, ignore_index=True)
print(f"{i 1} out of {testing_data.index.stop} from testing")
pomt_X_train, pomt_X_test, pomt_Y_train, pomt_Y_test = train_test_split(pomt_train_x, pomt_train_y, test_size= (count / pomt_train_x.shape[0]), stratify=pomt_train_y)
pomt_Y_train = pomt_Y_train.astype("int")
pomt_Y_test = pomt_Y_test.astype("int")
# One Vs. One Multiclass Classification
clf = OneVsOneClassifier(SVC(C = 1, verbose=True))
# Fit to Training Data
clf.fit(pomt_X_train, pomt_Y_train)
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
TypeError: only size-1 arrays can be converted to Python scalars
The above exception was the direct cause of the following exception:
ValueError Traceback (most recent call last)
<ipython-input-22-3314e23093e3> in <module>()
1 # Fit to Training Data
----> 2 clf.fit(pomt_X_train.squeeze(), pomt_Y_train)
3
4 # Training data accuracy
5 X_train_prediction = clf.predict(pomt_X_train)
4 frames
/usr/local/lib/python3.7/dist-packages/pandas/core/generic.py in __array__(self, dtype)
1991
1992 def __array__(self, dtype: NpDtype | None = None) -> np.ndarray:
-> 1993 return np.asarray(self._values, dtype=dtype)
1994
1995 def __array_wrap__(
ValueError: setting an array element with a sequence.
CodePudding user response:
I figured out what to do on my own end. I basically just created a column in the dataframe to reflect each element of the list, not each list itself. It's a bit unintuitive but it works.