I am trying to read a list of 10,000 tensors into a variable, and then create a ragged tensor from them. Of course, this exhausts my RAM:
def load_batch(path_list):
    """Memory-map every .npy file in *path_list* and return the arrays as a list.

    mmap_mode='r' keeps each array's data on disk; only the pages that are
    actually accessed get read into memory.
    """
    return [np.load(p, mmap_mode='r') for p in path_list]
# Collect all .npy paths from Drive, sorted by filename for a stable order.
train_tensors_paths = sorted(glob.glob('/content/drive/MyDrive/dataset/*.npy'), key=lambda x: x.split('/')[-1])
# NOTE(review): loading every array and then building one ragged tensor
# materializes the whole dataset in memory at once — this is the RAM
# bottleneck the question describes.
train_tensors = load_batch(train_tensors_paths)
train_tensors = tf.ragged.constant(train_tensors, ragged_rank=1)
I wonder whether there is a method to stream the tensors from a dataframe, just like the flow_from_dataframe method for images.
CodePudding user response:
One way to retrieve data from files efficiently using TensorFlow is:
# Build a dataset of file paths; shuffle=False keeps the glob order deterministic.
data_list = tf.data.Dataset.list_files('arrays/*.npy' , shuffle=False)
# If your arrays hold integer values, cast to tf.int32 in load_data instead of float32.
batch_size = 8
def load_data(file):
    """Eagerly load one .npy file and return its contents as a float32 tensor.

    NOTE(review): *file* arrives as a numpy bytes scalar when invoked via
    tf.numpy_function; np.load accepts it as a path — confirm on your OS.
    """
    array = np.load(file)
    return tf.cast(array, dtype=tf.float32)
def process_path(file_path):
    """Adapt load_data so it can run inside a tf.data pipeline.

    tf.numpy_function executes the Python loader eagerly for each element.
    """
    return tf.numpy_function(load_data, [file_path], tf.float32)
# Map the loader over every path, reading files in parallel (AUTOTUNE workers).
data_set = data_list.map(process_path, num_parallel_calls=tf.data.AUTOTUNE)
# Furthermore, for better performance, apply the following input-pipeline optimizations:
def configure_for_performance(ds):
    """Cache, shuffle, batch, and prefetch *ds* for input-pipeline throughput.

    NOTE(review): relies on the module-level batch_size (8 above) — confirm
    that is the intended batch size at the call site.
    """
    return (
        ds.cache()
          .shuffle(buffer_size=1000)
          .batch(batch_size)
          .prefetch(buffer_size=tf.data.AUTOTUNE)
    )
# Apply the caching/shuffling/batching/prefetching pipeline to the mapped dataset.
dataset = configure_for_performance(data_set)