I am trying to calculate the sine and cosine of the month number (e.g. Jan=1, Feb=2, ... Dec=12) for a series of observations that covers ~5 years:
def get_sin(value, max_value):
    """Return the sine of ``value`` mapped onto a cycle of length ``max_value``.

    The angle is ``value * (2 * pi / max_value)``, so ``value == max_value``
    corresponds to one full turn.

    Args:
        value: Position within the cycle (e.g. a month number). Anything that
            can be multiplied by a float and passed to ``np.sin`` is accepted.
        max_value: Length of the cycle (e.g. 12 for months).

    Returns:
        ``np.sin`` of the computed angle.
    """
    return np.sin(value * (2. * np.pi / max_value))
def get_cosine(value, max_value):
    """Return the cosine of ``value`` mapped onto a cycle of length ``max_value``.

    The angle is ``value * (2 * pi / max_value)``, so ``value == max_value``
    corresponds to one full turn.

    Args:
        value: Position within the cycle (e.g. a month number). Anything that
            can be multiplied by a float and passed to ``np.cos`` is accepted.
        max_value: Length of the cycle (e.g. 12 for months).

    Returns:
        ``np.cos`` of the computed angle.
    """
    return np.cos(value * (2. * np.pi / max_value))
I run the following command on the data:
# NOTE(review): the lambda never reads `row`; it passes the free name `month`
# to get_sin on every call. Presumably `month` is a full-length Series defined
# elsewhere in the session (the traceback's copy of a (51724,) array is
# consistent with that) -- confirm. If so, each row yields a whole Series and
# pandas tries to assemble a 51724 x 51724 result, which would explain the
# MemoryError reported for this statement.
df_ufvdate['month_sine'] = df_ufvdate.apply(lambda row: get_sin(month, 12), axis=1)
However, my desktop RAM is exhausted, and then I get the following MemoryError:
---------------------------------------------------------------------------
MemoryError Traceback (most recent call last)
Input In [466], in <cell line: 1>()
----> 1 df_ufvdate['month_sine'] = df_ufvdate.apply(lambda row: get_sin(month, 12), axis=1)
File ~\Anaconda3\lib\site-packages\pandas\core\frame.py:8839, in DataFrame.apply(self, func, axis, raw, result_type, args, **kwargs)
8828 from pandas.core.apply import frame_apply
8830 op = frame_apply(
8831 self,
8832 func=func,
(...)
8837 kwargs=kwargs,
8838 )
-> 8839 return op.apply().__finalize__(self, method="apply")
File ~\Anaconda3\lib\site-packages\pandas\core\apply.py:727, in FrameApply.apply(self)
724 elif self.raw:
725 return self.apply_raw()
--> 727 return self.apply_standard()
File ~\Anaconda3\lib\site-packages\pandas\core\apply.py:854, in FrameApply.apply_standard(self)
851 results, res_index = self.apply_series_generator()
853 # wrap results
--> 854 return self.wrap_results(results, res_index)
File ~\Anaconda3\lib\site-packages\pandas\core\apply.py:880, in FrameApply.wrap_results(self, results, res_index)
878 # see if we can infer the results
879 if len(results) > 0 and 0 in results and is_sequence(results[0]):
--> 880 return self.wrap_results_for_axis(results, res_index)
882 # dict of scalars
883
884 # the default dtype of an empty Series will be `object`, but this
885 # code can be hit by df.mean() where the result should have dtype
886 # float64 even if it's an empty Series.
887 constructor_sliced = self.obj._constructor_sliced
File ~\Anaconda3\lib\site-packages\pandas\core\apply.py:1027, in FrameColumnApply.wrap_results_for_axis(self, results, res_index)
1023 result.index = res_index
1025 # we may want to infer results
1026 else:
-> 1027 result = self.infer_to_same_shape(results, res_index)
1029 return result
File ~\Anaconda3\lib\site-packages\pandas\core\apply.py:1033, in FrameColumnApply.infer_to_same_shape(self, results, res_index)
1031 def infer_to_same_shape(self, results: ResType, res_index: Index) -> DataFrame:
1032 """infer the results to the same shape as the input object"""
-> 1033 result = self.obj._constructor(data=results)
1034 result = result.T
1036 # set the index
File ~\Anaconda3\lib\site-packages\pandas\core\frame.py:636, in DataFrame.__init__(self, data, index, columns, dtype, copy)
630 mgr = self._init_mgr(
631 data, axes={"index": index, "columns": columns}, dtype=dtype, copy=copy
632 )
634 elif isinstance(data, dict):
635 # GH#38939 de facto copy defaults to False only in non-dict cases
--> 636 mgr = dict_to_mgr(data, index, columns, dtype=dtype, copy=copy, typ=manager)
637 elif isinstance(data, ma.MaskedArray):
638 import numpy.ma.mrecords as mrecords
File ~\Anaconda3\lib\site-packages\pandas\core\internals\construction.py:494, in dict_to_mgr(data, index, columns, dtype, typ, copy)
487 arrays = [
488 arr if not is_datetime64tz_dtype(arr) else arr.copy() for arr in arrays
489 ]
491 if copy:
492 # arrays_to_mgr (via form_blocks) won't make copies for EAs
493 # dtype attr check to exclude EADtype-castable strs
--> 494 arrays = [
495 x
496 if not hasattr(x, "dtype") or not isinstance(x.dtype, ExtensionDtype)
497 else x.copy()
498 for x in arrays
499 ]
500 # TODO: can we get rid of the dt64tz special case above?
502 return arrays_to_mgr(arrays, columns, index, dtype=dtype, typ=typ, consolidate=copy)
File ~\Anaconda3\lib\site-packages\pandas\core\internals\construction.py:497, in <listcomp>(.0)
487 arrays = [
488 arr if not is_datetime64tz_dtype(arr) else arr.copy() for arr in arrays
489 ]
491 if copy:
492 # arrays_to_mgr (via form_blocks) won't make copies for EAs
493 # dtype attr check to exclude EADtype-castable strs
494 arrays = [
495 x
496 if not hasattr(x, "dtype") or not isinstance(x.dtype, ExtensionDtype)
--> 497 else x.copy()
498 for x in arrays
499 ]
500 # TODO: can we get rid of the dt64tz special case above?
502 return arrays_to_mgr(arrays, columns, index, dtype=dtype, typ=typ, consolidate=copy)
File ~\Anaconda3\lib\site-packages\pandas\core\generic.py:6032, in NDFrame.copy(self, deep)
5926 @final
5927 def copy(self: NDFrameT, deep: bool_t = True) -> NDFrameT:
5928 """
5929 Make a copy of this object's indices and data.
5930
(...)
6030 dtype: object
6031 """
-> 6032 data = self._mgr.copy(deep=deep)
6033 self._clear_item_cache()
6034 return self._constructor(data).__finalize__(self, method="copy")
File ~\Anaconda3\lib\site-packages\pandas\core\internals\managers.py:603, in BaseBlockManager.copy(self, deep)
600 else:
601 new_axes = list(self.axes)
--> 603 res = self.apply("copy", deep=deep)
605 res.axes = new_axes
607 if self.ndim > 1:
608 # Avoid needing to re-compute these
File ~\Anaconda3\lib\site-packages\pandas\core\internals\managers.py:304, in BaseBlockManager.apply(self, f, align_keys, ignore_failures, **kwargs)
302 applied = b.apply(f, **kwargs)
303 else:
--> 304 applied = getattr(b, f)(**kwargs)
305 except (TypeError, NotImplementedError):
306 if not ignore_failures:
File ~\Anaconda3\lib\site-packages\pandas\core\internals\blocks.py:643, in Block.copy(self, deep)
641 values = self.values
642 if deep:
--> 643 values = values.copy()
644 return type(self)(values, placement=self._mgr_locs, ndim=self.ndim)
File ~\Anaconda3\lib\site-packages\pandas\core\arrays\masked.py:680, in BaseMaskedArray.copy(self)
678 def copy(self: BaseMaskedArrayT) -> BaseMaskedArrayT:
679 data, mask = self._data, self._mask
--> 680 data = data.copy()
681 mask = mask.copy()
682 return type(self)(data, mask, copy=False)
MemoryError: Unable to allocate 404. KiB for an array with shape (51724,) and data type float64
I suppose there is something very inefficient in my code. Can anybody suggest what I am doing wrong?
UPDATE:
I noticed something very weird about variable 'month'. I used
df_ufvdate['month'] = df_ufvdate['month'].astype('int64')
to convert 'month' into an integer and when I run df_ufvdate.info(max_cols=250, show_counts='True') I see that 'month' is type 'int64':
month 51724 non-null int64
However, when I run
df_ufvdate['month'].describe()
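the output reports dtype 'float64' (note that this is the dtype of the summary statistics returned by describe(), such as mean and std, not the dtype of the 'month' column itself):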
count 51724.000000
mean 8.030895
std 3.693370
min 1.000000
25% 5.000000
50% 9.000000
75% 11.000000
max 12.000000
Name: month, dtype: float64
Here is more info on df_ufvdate:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 51724 entries, 1 to 62618
Data columns (total 211 columns)
dtypes: Int64(34), float64(105), int64(1), object(71)
memory usage: 85.3 MB
Here are my desktop specs:
Windows 64, RAM: 24GB, Jupyter: 6.4.8, Python 3.9.12 (main, Apr 4 2022, 05:22:27) [MSC v.1916 64 bit (AMD64)]
CodePudding user response:
I got it fixed:
def get_sin(row, column, max_value):
    """Return the sine of ``row[column]`` mapped onto a cycle of length ``max_value``.

    The angle is ``row[column] * (2 * pi / max_value)``, so a value equal to
    ``max_value`` corresponds to one full turn.

    Args:
        row: Any object indexable by ``column`` (e.g. a DataFrame row).
        column: Key used to look the cyclic value up in ``row``.
        max_value: Length of the cycle (e.g. 12 for months).

    Returns:
        ``np.sin`` of the computed angle.
    """
    return np.sin(row[column] * (2. * np.pi / max_value))
def get_cosine(row, column, max_value):
    """Return the cosine of ``row[column]`` mapped onto a cycle of length ``max_value``.

    The angle is ``row[column] * (2 * pi / max_value)``, so a value equal to
    ``max_value`` corresponds to one full turn.

    Args:
        row: Any object indexable by ``column`` (e.g. a DataFrame row).
        column: Key used to look the cyclic value up in ``row``.
        max_value: Length of the cycle (e.g. 12 for months).

    Returns:
        ``np.cos`` of the computed angle.
    """
    return np.cos(row[column] * (2. * np.pi / max_value))
and then these two assignments will do the trick:
# Compute both cyclic features in one vectorized pass per column instead of
# calling the helpers once per row through DataFrame.apply(axis=1). The
# helpers only index their first argument by `column` and hand the result to
# a NumPy ufunc, so passing the whole frame makes them work on the entire
# 'month' column at once.
df_ufvdate['month_sine'] = get_sin(df_ufvdate, 'month', 12)
df_ufvdate['month_cosine'] = get_cosine(df_ufvdate, 'month', 12)
Thanks to all who commented on this question!