Scikit Learn fit(): Setting an array element with a sequence fit-CodePudding

I am trying to call scikit learn fit functions on dataframes where the elements of each column are numpy arrays. However, I get the error "setting an array element with a sequence," presumably because I am trying to call fit on a dataframe of arrays rather than scalar values. How do I work around this? I'd really appreciate some help.

Here is my code. You can find the data I'm using here: https://competitions.codalab.org/competitions/21163

training_data = pd.read_csv('/train.tsv', sep='\t')
testing_data = pd.read_csv('/dev.tsv', sep='\t')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',do_lower_case=True,max_length=1024)
model = BertModel.from_pretrained('bert-base-uncased')
model = model.to(device)

# These are used to map the data to their appropriate column on each pass
pomt_train_x = pd.DataFrame(columns=["claim", "reason", "category", "speaker", "checker", "tags", "claim entities", "article title"])
feature_dict = {1: "claim", 4: "reason", 5: "category", 6: "speaker", 7: "checker", 8: "tags", 9: "claim entities", 10: "article title"}

# Sort the data appropriately.
for i, data in enumerate(training_data[training_data.columns].to_numpy()):
    if 'pomt' in data[0]:
        appended_data = {}
        for j, sentence in enumerate(data):
            if j in feature_dict:
                inputs = tokenizer(str(sentence), return_tensors="pt", max_length=512, pad_to_max_length=True).to(device)
                outputs = model(**inputs)
                appended_data[feature_dict[j]] = outputs.last_hidden_state[:,0][0].cpu().detach().numpy()
        pomt_train_x = pomt_train_x.append(appended_data, ignore_index=True)
        print(f"{i   1} out of {training_data.index.stop} from training")

count = 0
# append testing data to training data
for i, data in enumerate(testing_data[testing_data.columns].to_numpy()):
    if 'pomt' in data[0]:
        appended_data = {}
        for j, sentence in enumerate(data):
            if j in feature_dict:
                inputs = tokenizer(str(sentence), return_tensors="pt", max_length=512, pad_to_max_length=True).to(device)
                outputs = model(**inputs)
                appended_data[feature_dict[j]] = outputs.last_hidden_state[:,0][0].cpu().detach().numpy()
        pomt_train_x = pomt_train_x.append(appended_data, ignore_index=True)
        print(f"{i   1} out of {testing_data.index.stop} from testing")
        count  = 1

# Map the possible labels to an emotion
positive_set = set(['half-true', 'correct attribution!', 'correct', 'determination: barely true', 'factscan score: true',
                'correct attribution', 'mostly true', 'mostly-correct', 'truth!', 'partially true', 'half true',
                'mostly truth!', 'determination: true', 'true messages', 'authorship confirmed!', 'verdict: true',
                'mostly_true', 'determination: mostly true', 'confirmed authorship!', 'conclusion: accurate', 'accurate',
                'true', 'partly true', 'fact', 'full flop', 'in-the-green', 'verified'])
negative_set = set({'fake news', 'verdict: false', '3 pinnochios', 'fiction!', 'bogus warning', 'we rate this claim false',
                'determination: false', 'disputed!', 'false', 'fiction', 'a lot of baloney', '2 pinnochios', 'some baloney',
                'mostly_false', 'cherry picks', 'miscaptioned', 'misleading!', 'misleading recommendations', 'mostly fiction!',
                'mostly false', 'a little baloney', 'fiction! & satire!', 'conclusion: false', 'rating: false',
                'determination: misleading', 'promise broken', '4 pinnochios', 'misleading', 'promise kept',
                'misattributed', 'fake', 'previously truth! now resolved!','incorrect attribution!', 'incorrect',
                'spins the facts', 'determination: a stretch', 'factscan score: misleading', 'pants on fire!',
                'factscan score: false', 'exaggerates', 'outdated', 'facebook scams', 'unsupported', 'opinion!',
                'verdict: unsubstantiated', 'scam', 'virus!', 'no flip', 'scam!', 'unverified', 'distorts the facts', 'outdated!'
                'understated', 'no evidence', 'unproven!', 'inaccurate attribution!', 'statirical reports', 'unproven', 'exaggerated', 
                'determination: huckster propaganda', 'grass roots movement!', 'commentary!', 'in-the-red', 'unsubstantiated messages',})
neutral_set = set({'truth! & fiction!', 'conclusion: unclear', '1', 'unobservable', 'needs context', 'truth! & disputed!', 'half flip',
               '0',  'in-between', '4', 'None', '2', 'none',  'investigation pending!','not the whole story', '10','in the works',
               'truth! & misleading!', '3',  'mixture', 'not yet rated', 'legend', 'stalled', 'truth! & unproven!', 'truth! & outdated!',
               'compromise'})

# Read in the labels for the appropriate data
pomt_train_y = pd.DataFrame(columns=["label"])

sign_to_append = 0

for i, data in enumerate(training_data[training_data.columns].to_numpy()):
    if 'pomt' in data[0]:
        if data[2] in positive_set:
            sign_to_append = 1
        elif data[2] in negative_set:
            sign_to_append = -1
        else:
            sign_to_append = 0
        pomt_train_y = pomt_train_y.append({'label':sign_to_append}, ignore_index=True)
        print(f"{i   1} out of {training_data.index.stop} from training")

# append testing data to training data
for i, data in enumerate(testing_data[testing_data.columns].to_numpy()):
    if 'pomt' in data[0]:
        if data[2] in positive_set:
            sign_to_append = 1
        elif data[2] in negative_set:
            sign_to_append = -1
        else:
            sign_to_append = 0
        pomt_train_y = pomt_train_y.append({'label':sign_to_append}, ignore_index=True)
        print(f"{i   1} out of {testing_data.index.stop} from testing")

pomt_X_train, pomt_X_test, pomt_Y_train, pomt_Y_test = train_test_split(pomt_train_x, pomt_train_y, test_size= (count / pomt_train_x.shape[0]), stratify=pomt_train_y)
pomt_Y_train = pomt_Y_train.astype("int")
pomt_Y_test = pomt_Y_test.astype("int")

# One Vs. One Multiclass Classification
clf = OneVsOneClassifier(SVC(C = 1, verbose=True))

# Fit to Training Data
clf.fit(pomt_X_train, pomt_Y_train)

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
TypeError: only size-1 arrays can be converted to Python scalars

The above exception was the direct cause of the following exception:

ValueError                                Traceback (most recent call last)
<ipython-input-22-3314e23093e3> in <module>()
      1 # Fit to Training Data
----> 2 clf.fit(pomt_X_train.squeeze(), pomt_Y_train)
      3 
      4 # Training data accuracy
      5 X_train_prediction = clf.predict(pomt_X_train)

4 frames
/usr/local/lib/python3.7/dist-packages/pandas/core/generic.py in __array__(self, dtype)
   1991 
   1992     def __array__(self, dtype: NpDtype | None = None) -> np.ndarray:
-> 1993         return np.asarray(self._values, dtype=dtype)
   1994 
   1995     def __array_wrap__(

ValueError: setting an array element with a sequence.

CodePudding user response：

I figured out what to do on my own end. I basically just created a column in the dataframe to reflect each element of the list, not each list itself. It's a bit unintuitive but it works.