df
:
avg | count | date | val | prop | unit | distance | d-atmp | d-clouds | d-dewpoint |
---|---|---|---|---|---|---|---|---|---|
0.0786107 | 12 | 2014-10-03 00:00:00 | 22 | atmp | (Deg C) | 24829.6 | 24829.6 | nan | nan |
0.0786107 | 12 | 2014-10-03 00:00:00 | 0 | clouds | (oktas) | 22000.6 | nan | 22000.6 | nan |
0.0786107 | 12 | 2014-10-03 00:00:00 | 32 | dewpoint | (Deg C) | 21344.1 | nan | nan | 21344.1 |
0.0684246 | 6 | 2014-10-04 00:00:00 | 21.5 | atmp | (Deg C) | 26345.1 | 26345.1 | nan | nan |
cols = ['avg', 'date', 'count', 'd-atmp', 'd-cloud', 'd-dewpoint']
d = pd.pivot_table(x, index=cols, columns=['prop', 'unit'], values='val', aggfunc=max)
Ideal result:
date | countObs | avg | d-atmp | atmp (Deg C) | d-clouds | clouds (oktas) | d-dewpoint | dewpoint (Deg C) |
---|---|---|---|---|---|---|---|---|
2014-10-03 00:00:00 | 12 | 0.0786107 | 24829.6 | 22 | 22000.6 | 0 | 21344.1 | 32 |
2014-10-04 00:00:00 | 6 | 0.0684246 | 26345.1 | 21.5 | nan | nan | nan | nan |
Error
--------------------------------------------------------------------------- NotImplementedError Traceback (most recent call last) ~/.local/lib/python3.9/site-packages/pandas/core/groupby/generic.py in array_func(values) 1067 try:
-> 1068 result = self.grouper._cython_operation( 1069 "aggregate", values, how, axis=data.ndim - 1, min_count=min_count
~/.local/lib/python3.9/site-packages/pandas/core/groupby/ops.py in
_cython_operation(self, kind, values, how, axis, min_count, **kwargs)
998 ngroups = self.ngroups
--> 999 return cy_op.cython_operation( 1000 values=values,
~/.local/lib/python3.9/site-packages/pandas/core/groupby/ops.py in cython_operation(self, values, axis, min_count, comp_ids, ngroups,
**kwargs)
659
--> 660 return self._cython_op_ndim_compat(
661 values,
~/.local/lib/python3.9/site-packages/pandas/core/groupby/ops.py in
_cython_op_ndim_compat(self, values, min_count, ngroups, comp_ids, mask, **kwargs)
515
--> 516 return self._call_cython_op(
517 values,
~/.local/lib/python3.9/site-packages/pandas/core/groupby/ops.py in
_call_cython_op(self, values, min_count, ngroups, comp_ids, mask, **kwargs)
561 out_shape = self._get_output_shape(ngroups, values)
--> 562 func, values = self.get_cython_func_and_vals(values, is_numeric)
563 out_dtype = self.get_out_dtype(values.dtype)
~/.local/lib/python3.9/site-packages/pandas/core/groupby/ops.py in get_cython_func_and_vals(self, values, is_numeric)
204
--> 205 func = self._get_cython_function(kind, how, values.dtype, is_numeric)
206
~/.local/lib/python3.9/site-packages/pandas/core/groupby/ops.py in
_get_cython_function(cls, kind, how, dtype, is_numeric)
169 # raise NotImplementedError here rather than TypeError later
--> 170 raise NotImplementedError(
171 f"function is not implemented for this dtype: "
NotImplementedError: function is not implemented for this dtype: [how->mean,dtype->object]
During handling of the above exception, another exception occurred:
AssertionError Traceback (most recent call last) <ipython-input-119-b64b487d2810> in <module>
5 # o
6 # cols = []
----> 7 d = pd.pivot_table(x, index=cols, columns=['osmcObsProperty', 'unit'], values='val') #, aggfunc=max #np.mean or max appear similar , dropna=False
8
9 d.reset_index(inplace=True)
~/.local/lib/python3.9/site-packages/pandas/core/reshape/pivot.py in pivot_table(data, values, index, columns, aggfunc, fill_value, margins, dropna, margins_name, observed, sort)
93 return table.__finalize__(data, method="pivot_table")
94
---> 95 table = __internal_pivot_table(
96 data,
97 values,
~/.local/lib/python3.9/site-packages/pandas/core/reshape/pivot.py in
__internal_pivot_table(data, values, index, columns, aggfunc, fill_value, margins, dropna, margins_name, observed, sort)
163
164 grouped = data.groupby(keys, observed=observed, sort=sort)
--> 165 agged = grouped.agg(aggfunc)
166 if dropna and isinstance(agged, ABCDataFrame) and len(agged.columns):
167 agged = agged.dropna(how="all")
~/.local/lib/python3.9/site-packages/pandas/core/groupby/generic.py in aggregate(self, func, engine, engine_kwargs, *args, **kwargs)
977
978 op = GroupByApply(self, func, args, kwargs)
--> 979 result = op.agg()
980 if not is_dict_like(func) and result is not None:
981 return result
~/.local/lib/python3.9/site-packages/pandas/core/apply.py in agg(self)
156
157 if isinstance(arg, str):
--> 158 return self.apply_str()
159
160 if is_dict_like(arg):
~/.local/lib/python3.9/site-packages/pandas/core/apply.py in apply_str(self)
505 elif self.axis != 0:
506 raise ValueError(f"Operation {f} does not support axis=1")
--> 507 return self._try_aggregate_string_function(obj, f, *self.args, **self.kwargs)
508
509 def apply_multiple(self) -> FrameOrSeriesUnion:
~/.local/lib/python3.9/site-packages/pandas/core/apply.py in
_try_aggregate_string_function(self, obj, arg, *args, **kwargs)
575 if f is not None:
576 if callable(f):
--> 577 return f(*args, **kwargs)
578
579 # people may try to aggregate on a non-callable attribute
~/.local/lib/python3.9/site-packages/pandas/core/groupby/groupby.py in mean(self, numeric_only) 1685 numeric_only = self._resolve_numeric_only(numeric_only) 1686
-> 1687 result = self._cython_agg_general( 1688 "mean", 1689 alt=lambda x: Series(x).mean(numeric_only=numeric_only),
~/.local/lib/python3.9/site-packages/pandas/core/groupby/generic.py in
_cython_agg_general(self, how, alt, numeric_only, min_count) 1080 # TypeError -> we may have an exception in trying to aggregate 1081 # continue and exclude the block
-> 1082 new_mgr = data.grouped_reduce(array_func, ignore_failures=True) 1083 1084 if len(new_mgr) < len(data):
~/.local/lib/python3.9/site-packages/pandas/core/internals/managers.py in grouped_reduce(self, func, ignore_failures) 1233 for sb in blk._split(): 1234 try:
-> 1235 applied = sb.apply(func) 1236 except (TypeError, NotImplementedError): 1237 if not ignore_failures:
~/.local/lib/python3.9/site-packages/pandas/core/internals/blocks.py in apply(self, func, **kwargs)
379 """
380 with np.errstate(all="ignore"):
--> 381 result = func(self.values, **kwargs)
382
383 return self._split_op_result(result)
~/.local/lib/python3.9/site-packages/pandas/core/groupby/generic.py in array_func(values) 1074 # try to python agg 1075
# TODO: shouldn't min_count matter?
-> 1076 result = self._agg_py_fallback(values, ndim=data.ndim, alt=alt) 1077 1078 return result
~/.local/lib/python3.9/site-packages/pandas/core/groupby/groupby.py in
_agg_py_fallback(self, values, ndim, alt) 1396 # should always be preserved by the implemented aggregations 1397 # TODO: Is this exactly right; see WrappedCythonOp get_result_dtype?
-> 1398 res_values = self.grouper.agg_series(ser, alt, preserve_dtype=True) 1399 1400 if isinstance(values, Categorical):
~/.local/lib/python3.9/site-packages/pandas/core/groupby/ops.py in agg_series(self, obj, func, preserve_dtype) 1047 1048 else:
-> 1049 result = self._aggregate_series_fast(obj, func) 1050 1051 npvalues = lib.maybe_convert_objects(result, try_float=False)
~/.local/lib/python3.9/site-packages/pandas/core/groupby/ops.py in
_aggregate_series_fast(self, obj, func) 1072 ids = ids.take(indexer) 1073 sgrouper = libreduction.SeriesGrouper(obj, func, ids, ngroups)
-> 1074 result, _ = sgrouper.get_result() 1075 return result 1076
~/.local/lib/python3.9/site-packages/pandas/_libs/reduction.pyx in pandas._libs.reduction.SeriesGrouper.get_result()
AssertionError: `result` has not been initialized.
CodePudding user response:
IIUC, you can use groupby_agg
:
out = df.groupby('date', as_index=False).agg(max)
Output:
date | avg | count | val | prop | unit | distance | d-atmp | d-clouds | d-dewpoint |
---|---|---|---|---|---|---|---|---|---|
2014-10-03 00:00:00 | 0.0786107 | 12 | 32 | dewpoint | (oktas) | 24829.6 | 24829.6 | 22000.6 | 21344.1 |
2014-10-04 00:00:00 | 0.0684246 | 6 | 21.5 | atmp | (Deg C) | 26345.1 | 26345.1 | nan | nan |
CodePudding user response:
You could pivot
; then use groupby
max
:
cols = ['avg', 'date', 'count', 'd-atmp', 'd-clouds', 'd-dewpoint']
tmp = df.pivot(index=cols, columns=['prop', 'unit'], values='val')
tmp.columns = tmp.columns.map(' '.join)
out = tmp.reset_index().groupby('date', as_index=False).max()\
[['date', 'count', 'avg', 'd-atmp', 'atmp (Deg C)', 'd-clouds',
'clouds (oktas)', 'd-dewpoint', 'dewpoint (Deg C)']]
Output:
date count avg d-atmp atmp (Deg C) d-clouds clouds (oktas) d-dewpoint dewpoint (Deg C)
0 2014-10-03 00:00:00 12 0.078611 24829.6 22.0 22000.6 0.0 21344.1 32.0
1 2014-10-04 00:00:00 6 0.068425 26345.1 21.5 NaN NaN NaN NaN