I am trying to read a list of 10,000 tensors into a variable, and then create a ragged tensor from them. Of course, this exhausts my RAM:
def load_batch(path_list):
    """Memory-map every .npy file in *path_list* and return the arrays as a list.

    mmap_mode='r' keeps each array's data on disk; only the pages that are
    actually accessed get read into memory.
    """
    return [np.load(p, mmap_mode='r') for p in path_list]
# Collect all .npy paths from Drive, sorted by filename for a stable order.
train_tensors_paths = sorted(glob.glob('/content/drive/MyDrive/dataset/*.npy'), key=lambda x: x.split('/')[-1])
# NOTE(review): loading every array and then building one ragged tensor
# materializes the whole dataset in memory at once — this is the RAM
# bottleneck the question describes.
train_tensors = load_batch(train_tensors_paths)
train_tensors = tf.ragged.constant(train_tensors, ragged_rank=1)
I wonder whether there is a method to stream the tensors from a dataframe, just like the flow_from_dataframe method for images.
CodePudding user response:
One way to retrieve data from files efficiently using TensorFlow is:
# Build a dataset of file paths; shuffle=False keeps the glob order deterministic.
data_list = tf.data.Dataset.list_files('arrays/*.npy' , shuffle=False)
# If your arrays hold integer values, cast to tf.int32 in load_data instead of float32.
batch_size = 8
def load_data(file):
    """Eagerly load one .npy file and return its contents as a float32 tensor.

    NOTE(review): *file* arrives as a numpy bytes scalar when invoked via
    tf.numpy_function; np.load accepts it as a path — confirm on your OS.
    """
    array = np.load(file)
    return tf.cast(array, dtype=tf.float32)
def process_path(file_path):
    """Adapt load_data so it can run inside a tf.data pipeline.

    tf.numpy_function executes the Python loader eagerly for each element.
    """
    return tf.numpy_function(load_data, [file_path], tf.float32)
# Map the loader over every path, reading files in parallel (AUTOTUNE workers).
data_set = data_list.map(process_path, num_parallel_calls=tf.data.AUTOTUNE)
# Furthermore, for better performance, apply the following input-pipeline optimizations:
def configure_for_performance(ds):
    """Cache, shuffle, batch, and prefetch *ds* for input-pipeline throughput.

    NOTE(review): relies on the module-level batch_size (8 above) — confirm
    that is the intended batch size at the call site.
    """
    return (
        ds.cache()
          .shuffle(buffer_size=1000)
          .batch(batch_size)
          .prefetch(buffer_size=tf.data.AUTOTUNE)
    )
# Apply the caching/shuffling/batching/prefetching pipeline to the mapped dataset.
dataset = configure_for_performance(data_set)