Numpy command to calculate sine (and cosine) consumes all RAM-CodePudding

I am trying to calculate sine and cosine of month number (e.g. Jan=1, Feb=2, ... Dec=12) for a series of observations that covers ~5 years:

def get_sin(value, max_value):
    """Return the sine of ``value`` placed on a cycle of ``max_value`` steps.

    A full period (2*pi radians) is split into ``max_value`` equal steps and
    ``value`` picks the position on that cycle, e.g. a month number with
    ``max_value=12``.
    """
    radians_per_step = 2. * np.pi / max_value
    return np.sin(value * radians_per_step)

def get_cosine(value, max_value):
    """Return the cosine of ``value`` placed on a cycle of ``max_value`` steps.

    A full period (2*pi radians) is split into ``max_value`` equal steps and
    ``value`` picks the position on that cycle, e.g. a month number with
    ``max_value=12``.
    """
    radians_per_step = 2. * np.pi / max_value
    return np.cos(value * radians_per_step)

I run the following command on the data:

# NOTE(review): the lambda never reads `row`; `month` is looked up in the
# enclosing scope instead -- presumably a whole column rather than a single
# row's value (confirm), so each call would yield a sequence, not a scalar.
df_ufvdate['month_sine'] = df_ufvdate.apply(lambda row: get_sin(month, 12), axis=1)

However, my desktop RAM is exhausted, and then I get the following MemoryError:

---------------------------------------------------------------------------
MemoryError                               Traceback (most recent call last)
Input In [466], in <cell line: 1>()
----> 1 df_ufvdate['month_sine'] = df_ufvdate.apply(lambda row: get_sin(month, 12), axis=1)

File ~\Anaconda3\lib\site-packages\pandas\core\frame.py:8839, in DataFrame.apply(self, func, axis, raw, result_type, args, **kwargs)
   8828 from pandas.core.apply import frame_apply
   8830 op = frame_apply(
   8831     self,
   8832     func=func,
   (...)
   8837     kwargs=kwargs,
   8838 )
-> 8839 return op.apply().__finalize__(self, method="apply")

File ~\Anaconda3\lib\site-packages\pandas\core\apply.py:727, in FrameApply.apply(self)
    724 elif self.raw:
    725     return self.apply_raw()
--> 727 return self.apply_standard()

File ~\Anaconda3\lib\site-packages\pandas\core\apply.py:854, in FrameApply.apply_standard(self)
    851 results, res_index = self.apply_series_generator()
    853 # wrap results
--> 854 return self.wrap_results(results, res_index)

File ~\Anaconda3\lib\site-packages\pandas\core\apply.py:880, in FrameApply.wrap_results(self, results, res_index)
    878 # see if we can infer the results
    879 if len(results) > 0 and 0 in results and is_sequence(results[0]):
--> 880     return self.wrap_results_for_axis(results, res_index)
    882 # dict of scalars
    883 
    884 # the default dtype of an empty Series will be `object`, but this
    885 # code can be hit by df.mean() where the result should have dtype
    886 # float64 even if it's an empty Series.
    887 constructor_sliced = self.obj._constructor_sliced

File ~\Anaconda3\lib\site-packages\pandas\core\apply.py:1027, in FrameColumnApply.wrap_results_for_axis(self, results, res_index)
   1023     result.index = res_index
   1025 # we may want to infer results
   1026 else:
-> 1027     result = self.infer_to_same_shape(results, res_index)
   1029 return result

File ~\Anaconda3\lib\site-packages\pandas\core\apply.py:1033, in FrameColumnApply.infer_to_same_shape(self, results, res_index)
   1031 def infer_to_same_shape(self, results: ResType, res_index: Index) -> DataFrame:
   1032     """infer the results to the same shape as the input object"""
-> 1033     result = self.obj._constructor(data=results)
   1034     result = result.T
   1036     # set the index

File ~\Anaconda3\lib\site-packages\pandas\core\frame.py:636, in DataFrame.__init__(self, data, index, columns, dtype, copy)
    630     mgr = self._init_mgr(
    631         data, axes={"index": index, "columns": columns}, dtype=dtype, copy=copy
    632     )
    634 elif isinstance(data, dict):
    635     # GH#38939 de facto copy defaults to False only in non-dict cases
--> 636     mgr = dict_to_mgr(data, index, columns, dtype=dtype, copy=copy, typ=manager)
    637 elif isinstance(data, ma.MaskedArray):
    638     import numpy.ma.mrecords as mrecords

File ~\Anaconda3\lib\site-packages\pandas\core\internals\construction.py:494, in dict_to_mgr(data, index, columns, dtype, typ, copy)
    487     arrays = [
    488         arr if not is_datetime64tz_dtype(arr) else arr.copy() for arr in arrays
    489     ]
    491 if copy:
    492     # arrays_to_mgr (via form_blocks) won't make copies for EAs
    493     # dtype attr check to exclude EADtype-castable strs
--> 494     arrays = [
    495         x
    496         if not hasattr(x, "dtype") or not isinstance(x.dtype, ExtensionDtype)
    497         else x.copy()
    498         for x in arrays
    499     ]
    500     # TODO: can we get rid of the dt64tz special case above?
    502 return arrays_to_mgr(arrays, columns, index, dtype=dtype, typ=typ, consolidate=copy)

File ~\Anaconda3\lib\site-packages\pandas\core\internals\construction.py:497, in <listcomp>(.0)
    487     arrays = [
    488         arr if not is_datetime64tz_dtype(arr) else arr.copy() for arr in arrays
    489     ]
    491 if copy:
    492     # arrays_to_mgr (via form_blocks) won't make copies for EAs
    493     # dtype attr check to exclude EADtype-castable strs
    494     arrays = [
    495         x
    496         if not hasattr(x, "dtype") or not isinstance(x.dtype, ExtensionDtype)
--> 497         else x.copy()
    498         for x in arrays
    499     ]
    500     # TODO: can we get rid of the dt64tz special case above?
    502 return arrays_to_mgr(arrays, columns, index, dtype=dtype, typ=typ, consolidate=copy)

File ~\Anaconda3\lib\site-packages\pandas\core\generic.py:6032, in NDFrame.copy(self, deep)
   5926 @final
   5927 def copy(self: NDFrameT, deep: bool_t = True) -> NDFrameT:
   5928     """
   5929     Make a copy of this object's indices and data.
   5930 
   (...)
   6030     dtype: object
   6031     """
-> 6032     data = self._mgr.copy(deep=deep)
   6033     self._clear_item_cache()
   6034     return self._constructor(data).__finalize__(self, method="copy")

File ~\Anaconda3\lib\site-packages\pandas\core\internals\managers.py:603, in BaseBlockManager.copy(self, deep)
    600 else:
    601     new_axes = list(self.axes)
--> 603 res = self.apply("copy", deep=deep)
    605 res.axes = new_axes
    607 if self.ndim > 1:
    608     # Avoid needing to re-compute these

File ~\Anaconda3\lib\site-packages\pandas\core\internals\managers.py:304, in BaseBlockManager.apply(self, f, align_keys, ignore_failures, **kwargs)
    302         applied = b.apply(f, **kwargs)
    303     else:
--> 304         applied = getattr(b, f)(**kwargs)
    305 except (TypeError, NotImplementedError):
    306     if not ignore_failures:

File ~\Anaconda3\lib\site-packages\pandas\core\internals\blocks.py:643, in Block.copy(self, deep)
    641 values = self.values
    642 if deep:
--> 643     values = values.copy()
    644 return type(self)(values, placement=self._mgr_locs, ndim=self.ndim)

File ~\Anaconda3\lib\site-packages\pandas\core\arrays\masked.py:680, in BaseMaskedArray.copy(self)
    678 def copy(self: BaseMaskedArrayT) -> BaseMaskedArrayT:
    679     data, mask = self._data, self._mask
--> 680     data = data.copy()
    681     mask = mask.copy()
    682     return type(self)(data, mask, copy=False)

MemoryError: Unable to allocate 404. KiB for an array with shape (51724,) and data type float64

I suppose there is something very inefficient with my coding. Can anybody suggest what I am doing wrong?

UPDATE:

I noticed something very weird about variable 'month'. I used

# Replace the 'month' column with a copy of itself cast to int64.
df_ufvdate['month'] = df_ufvdate['month'].astype('int64')

to convert 'month' into an integer and when I run df_ufvdate.info(max_cols=250, show_counts='True') I see that 'month' is type 'int64':

month                    51724 non-null  int64

However, when I run

# Summary statistics (count, mean, std, quantiles) of the 'month' column.
# NOTE: the dtype printed with the result describes the summary values
# themselves, not the dtype of the 'month' column.
df_ufvdate['month'].describe()

I get that 'month' is type 'float64':

count    51724.000000
mean         8.030895
std          3.693370
min          1.000000
25%          5.000000
50%          9.000000
75%         11.000000
max         12.000000
Name: month, dtype: float64

Here is more info on df_ufvdate:

<class 'pandas.core.frame.DataFrame'>
Int64Index: 51724 entries, 1 to 62618
Data columns (total 211 columns)
dtypes: Int64(34), float64(105), int64(1), object(71)
memory usage: 85.3  MB

Here are my desktop specs:

Windows 64, RAM: 24GB, Jupyter: 6.4.8, Python 3.9.12 (main, Apr 4 2022, 05:22:27) [MSC v.1916 64 bit (AMD64)]

CodePudding user response：

I got it fixed:

def get_sin(row, column, max_value):
    """Return the sine of ``row[column]`` placed on a cycle of ``max_value`` steps.

    ``row`` can be any object that supports ``row[column]`` lookup. A full
    period (2*pi radians) is split into ``max_value`` equal steps and the
    looked-up value picks the position on that cycle.
    """
    radians_per_step = 2. * np.pi / max_value
    return np.sin(row[column] * radians_per_step)

def get_cosine(row, column, max_value):
    """Return the cosine of ``row[column]`` placed on a cycle of ``max_value`` steps.

    ``row`` can be any object that supports ``row[column]`` lookup. A full
    period (2*pi radians) is split into ``max_value`` equal steps and the
    looked-up value picks the position on that cycle.
    """
    radians_per_step = 2. * np.pi / max_value
    return np.cos(row[column] * radians_per_step)

and then these lambdas will do the trick:

# Each lambda receives one row at a time (axis=1) and passes it on, so the
# helper reads that row's own 'month' value and returns a single number.
df_ufvdate['month_sine'] = df_ufvdate.apply(lambda row: get_sin(row, 'month', 12), axis=1)

# Same pattern for the cosine component of the cyclic month encoding.
df_ufvdate['month_cosine'] = df_ufvdate.apply(lambda row: get_cosine(row, 'month', 12), axis=1)

Thanks to all who commented on this question!