Home > Enterprise > I can't load my own dataset in scikit-learn; getting continuous errors
I can't load my own dataset in scikit-learn; getting continuous errors

Time:02-04

I have created a text file named "archivo.txt"; the file contains the following lines:

# Imprime un hola mundo
print("hola mundo")
# definimos variables 2 variables en un entero, luego hacemos suma de variables y luego imprime el resultado
a = 1
b = 3
c = a b
print(c)
# input para que el usuario de opcion de que quiere escribir
T = input("Ingresa tu Texto aqui >>:  ")
# checamos el texto ingresado en la variable T por el usuario mediante un if, elif, else
if T == "":
    print("No hay Texto")
elif T == "hola":
    print("hola")
else:
    print("Opcion no encontrada")

And I have written the following code to try to train my own AI:

# -*- coding: utf-8 -*-
# Script purpose: read archivo.txt line by line, encode each line as a number,
# scale it, train a small MLP classifier and compute an F1 score.
import os, sys
# Import the dependencies; if anything in the `try` fails, attempt to install
# the packages with pip and then import them again.
# NOTE(review): the bare `except:` catches every exception, not only ImportError.
# NOTE(review): `json` is part of Python's standard library, so
# `pip install json` is presumably unnecessary — confirm.
try:
    from sklearn.neural_network import MLPClassifier
    from sklearn.model_selection import train_test_split
    import json
    import PyPDF2
    import numpy as np
    import pandas as pd 
    from time import sleep
except:
    os.system("pip install scikit-learn")
    os.system("pip install PyPDF2")
    os.system("pip install numpy")
    os.system("pip install json")
    os.system("pip install pandas")
    from sklearn.neural_network import MLPClassifier
    import json
    from time import sleep
    from sklearn.model_selection import train_test_split
    import PyPDF2
    import numpy as np
    import pandas as pd 

# Load the .txt file
# NOTE(review): the path is a regular (non-raw) string containing backslashes,
# and os.path.join is called with a single argument.
data_set = os.path.join("X:\scripts de python por mi\Scripts por mi\inteligencia artificial\scripts IA por mi\programacion\dataset")
# Read every line of the file into a list of strings.
# NOTE(review): the file object returned by open() is never explicitly closed.
datos = open(rf'{data_set}\archivo.txt').readlines()
#datos = pd.read_csv(rf"{data_set}\archivo.txt")
#datos = np.loadtxt(rf"{data_set}\archivo.txt", delimiter=',')
#X = datos[:,0:-1]
#y = datos[:,-1] 
# Build the samples (X) and labels (y): each line, stripped of trailing
# whitespace, is one sample, and its label is simply its line index, so every
# sample ends up with its own distinct label.
X = []
y = []
for index, daxt in enumerate(datos):
    fg = daxt.rstrip()
    X.append(fg)
    y.append(index)


# Process the data subset into binary form for training
# (each text line is replaced by an integer code; X is a flat list here).
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
X = label_encoder.fit_transform(X)

# Normalize the data
# NOTE(review): this is the call that fails in the traceback quoted after this
# script — "Expected 2D array, got 1D array instead".
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

# Split the data set into training and test subsets as well (5% for testing)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05)

# Train the multilayer perceptron with a sigmoid (logistic) activation,
# 3 hidden neurons and the gradient-descent ('sgd') optimizer
clf = MLPClassifier(activation='logistic', hidden_layer_sizes=(3,), solver='sgd')
clf.fit(X_train,y_train)

# Run the prediction on the test subset
prediccion = clf.predict(X_test)

# Show the f1_score of the classification; it indicates how accurate the AI was
# NOTE(review): the score is computed from X_test and y_test, and `prediccion`
# is not used — presumably the intent was to compare y_test with prediccion;
# verify. The returned score is also neither printed nor stored.
from sklearn.metrics import f1_score
f1_score(X_test, y_test, average="binary")

The problem is that when I run the .py file, I get this output:

X:\scripts de python por mi\Scripts por mi\inteligencia artificial\scripts IA por mi\programacion>python test.py
Traceback (most recent call last):
  File "X:\scripts de python por mi\Scripts por mi\inteligencia artificial\scripts IA por mi\programacion\test.py", line 48, in <module>
    X = scaler.fit_transform(X)
        ^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\XGreen\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\utils\_set_output.py", line 142, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\XGreen\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\base.py", line 859, in fit_transform
    return self.fit(X, **fit_params).transform(X)
           ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\XGreen\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\preprocessing\_data.py", line 427, in fit
    return self.partial_fit(X, y)
           ^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\XGreen\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\preprocessing\_data.py", line 466, in partial_fit
    X = self._validate_data(
        ^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\XGreen\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\base.py", line 546, in _validate_data
    X = check_array(X, input_name="X", **check_params)
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\XGreen\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\utils\validation.py", line 902, in check_array
    raise ValueError(
ValueError: Expected 2D array, got 1D array instead:
array=[ 3. 14.  5.  8.  9. 10. 15.  6.  7.  4. 13.  0. 11.  2. 12.  1.].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample

I have tried changing my code to the following:

# -*- coding: utf-8 -*-
# Script purpose: second attempt — load archivo.txt as a numeric,
# comma-delimited table with NumPy, then encode, scale, train a small MLP
# classifier and compute an F1 score.
import os, sys
# Import the dependencies; if anything in the `try` fails, attempt to install
# the packages with pip and then import them again.
# NOTE(review): the bare `except:` catches every exception, not only ImportError.
# NOTE(review): `json` is part of Python's standard library, so
# `pip install json` is presumably unnecessary — confirm.
try:
    from sklearn.neural_network import MLPClassifier
    from sklearn.model_selection import train_test_split
    import json
    import PyPDF2
    import numpy as np
    import pandas as pd 
    from time import sleep
except:
    os.system("pip install scikit-learn")
    os.system("pip install PyPDF2")
    os.system("pip install numpy")
    os.system("pip install json")
    os.system("pip install pandas")
    from sklearn.neural_network import MLPClassifier
    import json
    from time import sleep
    from sklearn.model_selection import train_test_split
    import PyPDF2
    import numpy as np
    import pandas as pd 

# Load the .txt file
# NOTE(review): the path is a regular (non-raw) string containing backslashes,
# and os.path.join is called with a single argument.
data_set = os.path.join("X:\scripts de python por mi\Scripts por mi\inteligencia artificial\scripts IA por mi\programacion\dataset")
# Read every line of the file; this result is overwritten two lines below.
# NOTE(review): the file object returned by open() is never explicitly closed.
datos = open(rf'{data_set}\archivo.txt').readlines()
#datos = pd.read_csv(rf"{data_set}\archivo.txt")
# Parse the file as comma-delimited numbers.
# NOTE(review): this is the call that fails in the traceback quoted after this
# script — the file holds source-code text such as print("hola mundo"), which
# cannot be converted to float.
datos = np.loadtxt(rf"{data_set}\archivo.txt", delimiter=',')
# Every column except the last is taken as features, the last one as labels.
X = datos[:,0:-1]
y = datos[:,-1] 
#X = []
#y = []
# The line-by-line loop of the first attempt is disabled by wrapping it in a
# bare string literal.
"""
for index, daxt in enumerate(datos):
    fg = daxt.rstrip()
    X.append(fg)
    y.append(index)
"""

# Process the data subset into binary form for training
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
X = label_encoder.fit_transform(X)

# Normalize the data
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

# Split the data set into training and test subsets as well (5% for testing)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05)

# Train the multilayer perceptron with a sigmoid (logistic) activation,
# 3 hidden neurons and the gradient-descent ('sgd') optimizer
clf = MLPClassifier(activation='logistic', hidden_layer_sizes=(3,), solver='sgd')
clf.fit(X_train,y_train)

# Run the prediction on the test subset
prediccion = clf.predict(X_test)

# Show the f1_score of the classification; it indicates how accurate the AI was
# NOTE(review): the score is computed from X_test and y_test, and `prediccion`
# is not used — presumably the intent was to compare y_test with prediccion;
# verify. The returned score is also neither printed nor stored.
from sklearn.metrics import f1_score
f1_score(X_test, y_test, average="binary")

But I get the following output:

X:\scripts de python por mi\Scripts por mi\inteligencia artificial\scripts IA por mi\programacion>python test.py
ValueError: could not convert string to float: 'print("hola mundo")'

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "X:\scripts de python por mi\Scripts por mi\inteligencia artificial\scripts IA por mi\programacion\test.py", line 29, in <module>
    datos = np.loadtxt(rf"{data_set}\archivo.txt", delimiter=',')
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\XGreen\AppData\Local\Programs\Python\Python311\Lib\site-packages\numpy\lib\npyio.py", line 1348, in loadtxt
    arr = _read(fname, dtype=dtype, comment=comment, delimiter=delimiter,
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\XGreen\AppData\Local\Programs\Python\Python311\Lib\site-packages\numpy\lib\npyio.py", line 999, in _read
    arr = _load_from_filelike(
          ^^^^^^^^^^^^^^^^^^^^
ValueError: could not convert string 'print("hola mundo")' to float64 at row 0, column 1.

I have also tried opening my .txt file with pandas, but I get an error too. I don't know how to fix it; if someone can help me, please do. Thanks for reading :) NOTE: the code is documented in Spanish because I'm from Spain; I document my code to understand it better.

CodePudding user response:

You must reshape your input data (X) in order to have shape (N_samples, N_features). In your case you have just one feature, so:

# Fragment meant to replace the loop in the question's first script; it relies
# on `datos`, `X`, `y` and `np` already being defined there.
# Each stripped line becomes one sample and its line index becomes its label.
for index, daxt in enumerate(datos):
    fg = daxt.rstrip()
    X.append(fg)
    y.append(index)

# Add the following lines: turn X into a single-column 2-D array of shape
# (n_samples, 1) and y into an array.
# NOTE(review): in the question's script X is reassigned afterwards by
# label_encoder.fit_transform(X), and the quoted traceback shows the scaler
# receiving a 1-D array from that step, so the reshape may need to be applied
# after the encoding instead — confirm.
X = np.array(X).reshape(-1,1)
y = np.array(y)
  • Related