In the DataProcessor
class, the raw_file_processing
, dataset_csv
, classes_csv
, and idset_csv
functions process the raw datafile and output csv
files that can be read by the read_archive
function.
My code raised AttributeError: 'DataFrame' object has no attribute 'seek'
error.
import pandas as pd
import warnings
import numpy as np
import os
import zipfile
import re
from sklearn.model_selection import train_test_split
class DataProcesser:
def __init__(self, archive_path, col_id='ID', col_class='class', col_classname='class_name', col_set='set',
read_on_init=True, **kwargs):
self.archive_path = archive_path
self.archive = zipfile.ZipFile(self.archive_path, 'r')
self.col_id = col_id
self.col_class = col_class
self.col_classname = col_classname
self.col_set = col_set
self.dataset = None
self.dataset_cropped = None
self.id_set = None
self.classes = None
self.train_set = None
self.validation_set = None
self.test_set = None
self.logs = []
self.stats = None
self.flag_subset = False
self.flag_process = False
self.flag_split = False
self.measurement_df = None
if read_on_init:
self.read_archive(**kwargs)
def raw_file_processing(self):
# If the path contains HTAN CODEX data, perform the following processing steps
if os.path.isdir(archive_path):
# 'Class' refers to the independent variable
# The class info is the 3rd column tile_num in the current example
# The rationale for looking at tile_num is that if we're examining tumor progression, we can observe the relative positions of the tumor growth
# Tumor progression may be denoted by the corresponding values of tumor progression markers/antibodies such as CEA
# In the future, we may append all the tumor patient files and normal patient files and then assign patient number as "class"
self.col_classname = self.archive_path.iloc[2]
# Dummy-code the classes
self.col_class = pd.get_dummies(self.col_classname)
# Create the ID series by concatenating columns 1-3
self.col_id = self.archive_path.assign(
ID=self.archive_path[['cell_id:cell_id', 'region:region', 'tile_num:tile_num']].apply(
lambda row: '_'.join([str(each) for each in row]), axis=1))
self.col_id = self.archive_path.drop(columns=['cell_id:cell_id', 'region:region', 'tile_num:tile_num'])
# Obtain measurement info
# Normalize data against blank/empty columns
# log-transform the data
for col in self.archive_path[9:]:
if re.findall(r"Blank|Empty", col):
background = col
else:
for index, row in col:
norm_data = row / background
self.measurement_df = np.log2(norm_data)
return self.archive_path, self.col_id, self.col_class, self.measurement_df
def dataset_csv(self):
# If the path contains HTAN CODEX data, perform the following processing steps
if os.path.isdir(self.archive_path):
"""Col 1: ID
Col 2: class
Col 3-n: measurements"""
id_col = self.col_id
self.col_class = self.col_class.to_frame()
frames = [id_col, self.col_class, self.measurement_df]
self.dataset = pd.concat(frames)
data_csv = self.dataset.to_csv("../input_data/dataset.csv")
return data_csv
def classes_csv(self):
# If the path contains HTAN CODEX data, perform the following processing steps
if os.path.isdir(self.archive_path):
# Remove any duplicate rows with the same col_class and cls_col info
self.cls_df = pd.DataFrame({'class': [self.col_class], 'class_name': [self.col_classname]})
self.cls_df.drop_duplicate(keep=False, inplace=True)
# Save as csv file
self.cls_df.to_csv('../input_data/classes.csv')
return self.cls_df
def idset_csv(self):
# If the path contains HTAN CODEX data, perform the following processing steps
if os.path.isdir(self.archive_path):
# Get the ids
ids = self.archive_path[0]
# Train-test-validation split
ids.sample(frac=1)
train, test = train_test_split(ids, test_size=0.2, random_state=1)
train, val = train_test_split(train, test_size=0.25, random_state=1)
# Assuming train, val, test are dataframes
# A string is assigned to the "set" column.
train.loc[:, 'set'] = 'train'
val.loc[:, 'set'] = 'val'
test.loc[:, 'set'] = 'test'
# Save as csv file
id_set = pd.concat([train, val, test], axis=0)
id_set_csv = id_set.to_csv('../input_data/id_set.csv', index=False)
return id_set_csv
def zip_files(self):
# Create a ZipFile object for dataset.csv, classes.csv, and id_set.csv
zip = ZipFile("data.zip", "w")
zip.write("dataset.csv")
zip.write("classes.csv")
zip.write("id_set.csv")
zip.close()
return zip
def read_archive(self, datatable=True, **kwargs):
"""
Read a zip archive, without extraction, than contains:
* data as .csv, observations in rows, measurements in columns. Names of columns must have the format:
A_1, A_2, A_3,..., C_1, C_2,... where A and C are groups (sensors) and 1,2,3... measurement time
* IDs of training/validation/test as .csv
* Explicit name of classes as .csv
:return: 2 pandas, one with raw data, one with IDs
"""
if datatable:
try:
from datatable import fread
self.dataset = fread(self.archive.open('dataset.csv'), **kwargs).to_pandas()
self.id_set = fread(self.archive.open('id_set.csv'), **kwargs).to_pandas()
self.classes = fread(self.archive.open('classes.csv'), **kwargs).to_pandas()
except ModuleNotFoundError:
warnings.warn('datatable module not found, using pandas instead. To prevent this message from appearing'
' use "datatable = False" when reading the archive.')
self.dataset = pd.read_csv(self.archive.open('dataset.csv'))
self.id_set = pd.read_csv(self.archive.open('id_set.csv'))
self.classes = pd.read_csv(self.archive.open('classes.csv'))
else:
self.dataset = pd.read_csv(self.archive.open('dataset.csv'))
self.id_set = pd.read_csv(self.archive.open('id_set.csv'))
self.classes = pd.read_csv(self.archive.open('classes.csv'))
self.check_datasets()
self.logs.append('Read archive: {0}'.format(self.archive_path))
return None
input_path = "//wsl$/Ubuntu-20.04/home/melissachua/CODEX/input_data"
# Open all the subfolders within path
for root, dirs, files in os.walk(input_path):
for file in files:
with open(os.path.join(root, file), "r") as data:
data_file = pd.read_csv(data)
data = DataProcesser(data_file, datatable=False)
meas_var = None
start_time = None
end_time = None
# Open all the subfolders within path
for root, dirs, files in os.walk(input_path):
for file in files:
with open(os.path.join(root, file), "r") as data:
data_file = pd.read_csv(data)
# The data object is used to automatically derive some parameters (e.g. number of classes)
data = DataProcesser(data_file, datatable=False)
Traceback
> Traceback (most recent call last): File
> "C:/Users/User/PycharmProjects/CODEX/main.py", line 171, in <module>
> data = DataProcesser(data_file, datatable=False) File "C:/Users/User/PycharmProjects/CODEX/main.py", line 16, in __init__
> self.archive = zipfile.ZipFile(self.archive_path, 'r') File "C:\Users\User\AppData\Local\Programs\Python\Python38\lib\zipfile.py",
> line 1269, in __init__
> self._RealGetContents() File "C:\Users\User\AppData\Local\Programs\Python\Python38\lib\zipfile.py",
> line 1332, in _RealGetContents
> endrec = _EndRecData(fp) File "C:\Users\User\AppData\Local\Programs\Python\Python38\lib\zipfile.py",
> line 264, in _EndRecData
> fpin.seek(0, 2) File "C:\Users\User\PycharmProjects\CODEX\venv\lib\site-packages\pandas\core\generic.py",
> line 5487, in __getattr__
> return object.__getattribute__(self, name) AttributeError: 'DataFrame' object has no attribute 'seek'
>
> Process finished with exit code 1
CodePudding user response:
The error you are getting is from the zipfile.ZipFile call. You should pass (the path to) a .zip file to your constructor, not a pandas DataFrame.
CodePudding user response:
In your code you have the following lines:
data_file = pd.read_csv(data)
data = DataProcesser(data_file, datatable=False)
With the first line you are reading a csv file into a DataFrame
and storing this DataFrame
in a variable data_file
.
The second line uses this DataFrame
as input for your DataProcesser
constructor. The constructor, however, is defined as follows:
def __init__(self, archive_path, col_id='ID', col_class='class', col_classname='class_name', col_set='set',
read_on_init=True, **kwargs):
You are passing your DataFrame
as archive_path
, which is not what your constructor is expecting. The constructor is expecting a str
as file name (also a file
would be appropriate) for example for the zipfile
constructor or the os
functions. See Zipfile documnetation for example.
Therefore, store your DataFrame
in another variable and use the archive_path
for the file path. Unfortunately, you have mixed up the DataFrame
instance and the archive_path
multiple times in your code. Here are a few examples.
# Constrcutor
self.archive = zipfile.ZipFile(self.archive_path, 'r')
...
# First method
if os.path.isdir(archive_path):
...
self.col_classname = self.archive_path.iloc[2]
...
for col in self.archive_path[9:]: