AttributeError: 'DataFrame' object has no attribute 'seek'-CodePudding

In the DataProcessor class, the raw_file_processing, dataset_csv, classes_csv, and idset_csv functions process the raw datafile and output csv files that can be read by the read_archive function. My code raised AttributeError: 'DataFrame' object has no attribute 'seek' error.

import pandas as pd
import warnings
import numpy as np
import os
import zipfile
import re
from sklearn.model_selection import train_test_split


class DataProcesser:
    def __init__(self, archive_path, col_id='ID', col_class='class', col_classname='class_name', col_set='set',
                 read_on_init=True, **kwargs):

        self.archive_path = archive_path
        self.archive = zipfile.ZipFile(self.archive_path, 'r')
        self.col_id = col_id
        self.col_class = col_class
        self.col_classname = col_classname
        self.col_set = col_set
        self.dataset = None
        self.dataset_cropped = None
        self.id_set = None
        self.classes = None
        self.train_set = None
        self.validation_set = None
        self.test_set = None
        self.logs = []
        self.stats = None
        self.flag_subset = False
        self.flag_process = False
        self.flag_split = False
        self.measurement_df = None
        if read_on_init:
            self.read_archive(**kwargs)

        def raw_file_processing(self):

            # If the path contains HTAN CODEX data, perform the following processing steps
            if os.path.isdir(archive_path):

                # 'Class' refers to the independent variable
                # The class info is the 3rd column tile_num in the current example
                # The rationale for looking at tile_num is that if we're examining tumor progression, we can observe the relative positions of the tumor growth
                # Tumor progression may be denoted by the corresponding values of tumor progression markers/antibodies such as CEA
                # In the future, we may append all the tumor patient files and normal patient files and then assign patient number as "class"
                self.col_classname = self.archive_path.iloc[2]

                # Dummy-code the classes
                self.col_class = pd.get_dummies(self.col_classname)

                # Create the ID series by concatenating columns 1-3
                self.col_id = self.archive_path.assign(
                    ID=self.archive_path[['cell_id:cell_id', 'region:region', 'tile_num:tile_num']].apply(
                        lambda row: '_'.join([str(each) for each in row]), axis=1))
                self.col_id = self.archive_path.drop(columns=['cell_id:cell_id', 'region:region', 'tile_num:tile_num'])

                # Obtain measurement info
                # Normalize data against blank/empty columns
                # log-transform the data
                for col in self.archive_path[9:]:
                    if re.findall(r"Blank|Empty", col):
                        background = col
                    else:
                        for index, row in col:
                            norm_data = row / background
                            self.measurement_df = np.log2(norm_data)

            return self.archive_path, self.col_id, self.col_class, self.measurement_df

    def dataset_csv(self):

        # If the path contains HTAN CODEX data, perform the following processing steps
        if os.path.isdir(self.archive_path):
            """Col 1: ID
            Col 2: class
            Col 3-n: measurements"""
            id_col = self.col_id

            self.col_class = self.col_class.to_frame()

            frames = [id_col, self.col_class, self.measurement_df]
            self.dataset = pd.concat(frames)
            data_csv = self.dataset.to_csv("../input_data/dataset.csv")

        return data_csv

    def classes_csv(self):

        # If the path contains HTAN CODEX data, perform the following processing steps
        if os.path.isdir(self.archive_path):
            # Remove any duplicate rows with the same col_class and cls_col info
            self.cls_df = pd.DataFrame({'class': [self.col_class], 'class_name': [self.col_classname]})
            self.cls_df.drop_duplicate(keep=False, inplace=True)

            # Save as csv file
            self.cls_df.to_csv('../input_data/classes.csv')

        return self.cls_df

    def idset_csv(self):

        # If the path contains HTAN CODEX data, perform the following processing steps
        if os.path.isdir(self.archive_path):
            # Get the ids
            ids = self.archive_path[0]

            # Train-test-validation split
            ids.sample(frac=1)
            train, test = train_test_split(ids, test_size=0.2, random_state=1)
            train, val = train_test_split(train, test_size=0.25, random_state=1)

            # Assuming train, val, test are dataframes
            # A string is assigned to the "set" column.
            train.loc[:, 'set'] = 'train'
            val.loc[:, 'set'] = 'val'
            test.loc[:, 'set'] = 'test'

            # Save as csv file
            id_set = pd.concat([train, val, test], axis=0)
            id_set_csv = id_set.to_csv('../input_data/id_set.csv', index=False)

        return id_set_csv

    def zip_files(self):

        # Create a ZipFile object for dataset.csv, classes.csv, and id_set.csv
        zip = ZipFile("data.zip", "w")
        zip.write("dataset.csv")
        zip.write("classes.csv")
        zip.write("id_set.csv")
        zip.close()
        return zip

    def read_archive(self, datatable=True, **kwargs):
        """
        Read a zip archive, without extraction, than contains:
        * data as .csv, observations in rows, measurements in columns. Names of columns must have the format:
         A_1, A_2, A_3,..., C_1, C_2,... where A and C are groups (sensors) and 1,2,3... measurement time
        * IDs of training/validation/test as .csv
        * Explicit name of classes as .csv
        :return: 2 pandas, one with raw data, one with IDs
        """
        if datatable:
            try:
                from datatable import fread
                self.dataset = fread(self.archive.open('dataset.csv'), **kwargs).to_pandas()
                self.id_set = fread(self.archive.open('id_set.csv'), **kwargs).to_pandas()
                self.classes = fread(self.archive.open('classes.csv'), **kwargs).to_pandas()
            except ModuleNotFoundError:
                warnings.warn('datatable module not found, using pandas instead. To prevent this message from appearing'
                              ' use "datatable = False" when reading the archive.')
                self.dataset = pd.read_csv(self.archive.open('dataset.csv'))
                self.id_set = pd.read_csv(self.archive.open('id_set.csv'))
                self.classes = pd.read_csv(self.archive.open('classes.csv'))
        else:
            self.dataset = pd.read_csv(self.archive.open('dataset.csv'))
            self.id_set = pd.read_csv(self.archive.open('id_set.csv'))
            self.classes = pd.read_csv(self.archive.open('classes.csv'))
        self.check_datasets()
        self.logs.append('Read archive: {0}'.format(self.archive_path))
        return None


input_path = "//wsl$/Ubuntu-20.04/home/melissachua/CODEX/input_data"
# Open all the subfolders within path
for root, dirs, files in os.walk(input_path):
    for file in files:
        with open(os.path.join(root, file), "r") as data:
            data_file = pd.read_csv(data)
            data = DataProcesser(data_file, datatable=False)

meas_var = None
start_time = None
end_time = None

# Open all the subfolders within path
for root, dirs, files in os.walk(input_path):
    for file in files:
        with open(os.path.join(root, file), "r") as data:
            data_file = pd.read_csv(data)

            # The data object is used to automatically derive some parameters (e.g. number of classes)
            data = DataProcesser(data_file, datatable=False)

Traceback

> Traceback (most recent call last):   File
> "C:/Users/User/PycharmProjects/CODEX/main.py", line 171, in <module>
>     data = DataProcesser(data_file, datatable=False)   File "C:/Users/User/PycharmProjects/CODEX/main.py", line 16, in __init__
>     self.archive = zipfile.ZipFile(self.archive_path, 'r')   File "C:\Users\User\AppData\Local\Programs\Python\Python38\lib\zipfile.py",
> line 1269, in __init__
>     self._RealGetContents()   File "C:\Users\User\AppData\Local\Programs\Python\Python38\lib\zipfile.py",
> line 1332, in _RealGetContents
>     endrec = _EndRecData(fp)   File "C:\Users\User\AppData\Local\Programs\Python\Python38\lib\zipfile.py",
> line 264, in _EndRecData
>     fpin.seek(0, 2)   File "C:\Users\User\PycharmProjects\CODEX\venv\lib\site-packages\pandas\core\generic.py",
> line 5487, in __getattr__
>     return object.__getattribute__(self, name) AttributeError: 'DataFrame' object has no attribute 'seek'
> 
> Process finished with exit code 1

CodePudding user response：

The error you are getting is from the zipfile.ZipFile call. You should pass (the path to) a .zip file to your constructor, not a pandas DataFrame.

CodePudding user response：

In your code you have the following lines:


data_file = pd.read_csv(data)
data = DataProcesser(data_file, datatable=False)

With the first line you are reading a csv file into a DataFrame and storing this DataFrame in a variable data_file. The second line uses this DataFrame as input for your DataProcesser constructor. The constructor, however, is defined as follows:

def __init__(self, archive_path, col_id='ID', col_class='class', col_classname='class_name', col_set='set',
                 read_on_init=True, **kwargs):

You are passing your DataFrame as archive_path, which is not what your constructor is expecting. The constructor is expecting a str as file name (also a file would be appropriate) for example for the zipfile constructor or the os functions. See Zipfile documnetation for example.

Therefore, store your DataFrame in another variable and use the archive_path for the file path. Unfortunately, you have mixed up the DataFrame instance and the archive_path multiple times in your code. Here are a few examples.


# Constrcutor
self.archive = zipfile.ZipFile(self.archive_path, 'r')

...

# First method
if os.path.isdir(archive_path):

...

self.col_classname = self.archive_path.iloc[2]

...
for col in self.archive_path[9:]: