How to correctly read csv file generated by groupby results?-CodePudding

I have calculated the mean value of DataFrame by two groups and saved the results to CSV file.

Then, I tried to read it again by read_csv(), but the .loc() function doesn't work for the loaded DataFrame.

Here's the code example:

import pandas as pd
import numpy as np

np.random.seed(100)
df = pd.DataFrame(np.random.randn(100, 3), columns=['a', 'b', 'value'])

a_bins = np.arange(-3, 4, 1)
b_bins = np.arange(-2, 4, 2)

# calculate the mean value
df['a_bins'] = pd.cut(df['a'], bins=a_bins)
df['b_bins'] = pd.cut(df['b'], bins=b_bins)
df_value_bin = df.groupby(['a_bins','b_bins']).agg({'value':'mean'})

# save to csv file
df_value_bin.to_csv('test.csv')

# read the exported file
df_test = pd.read_csv('test.csv')

When I type:

df_value_bin.loc[(1.5, -1)]

I got this output

value    0.254337
Name: ((1, 2], (-2, 0]), dtype: float64

But, if I use the same method to locate the value from the loaded CSV file:

df_test.loc[(1.5, -1)]

I got this Keyerror:

---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
/tmp/ipykernel_33836/4042082162.py in <module>
----> 1 df_test.loc[(1.5, -1)]

~/miniconda3/lib/python3.9/site-packages/pandas/core/indexing.py in __getitem__(self, key)
    923                 with suppress(KeyError, IndexError):
    924                     return self.obj._get_value(*key, takeable=self._takeable)
--> 925             return self._getitem_tuple(key)
    926         else:
    927             # we by definition only have the 0th axis

~/miniconda3/lib/python3.9/site-packages/pandas/core/indexing.py in _getitem_tuple(self, tup)
   1098     def _getitem_tuple(self, tup: tuple):
   1099         with suppress(IndexingError):
-> 1100             return self._getitem_lowerdim(tup)
   1101 
   1102         # no multi-index, so validate all of the indexers

~/miniconda3/lib/python3.9/site-packages/pandas/core/indexing.py in _getitem_lowerdim(self, tup)
    836                 # We don't need to check for tuples here because those are
    837                 #  caught by the _is_nested_tuple_indexer check above.
--> 838                 section = self._getitem_axis(key, axis=i)
    839 
    840                 # We should never have a scalar section here, because

~/miniconda3/lib/python3.9/site-packages/pandas/core/indexing.py in _getitem_axis(self, key, axis)
   1162         # fall thru to straight lookup
   1163         self._validate_key(key, axis)
-> 1164         return self._get_label(key, axis=axis)
   1165 
   1166     def _get_slice_axis(self, slice_obj: slice, axis: int):

~/miniconda3/lib/python3.9/site-packages/pandas/core/indexing.py in _get_label(self, label, axis)
   1111     def _get_label(self, label, axis: int):
   1112         # GH#5667 this will fail if the label is not present in the axis.
-> 1113         return self.obj.xs(label, axis=axis)
   1114 
   1115     def _handle_lowerdim_multi_index_axis0(self, tup: tuple):

~/miniconda3/lib/python3.9/site-packages/pandas/core/generic.py in xs(self, key, axis, level, drop_level)
   3774                 raise TypeError(f"Expected label or tuple of labels, got {key}") from e
   3775         else:
-> 3776             loc = index.get_loc(key)
   3777 
   3778             if isinstance(loc, np.ndarray):

~/miniconda3/lib/python3.9/site-packages/pandas/core/indexes/range.py in get_loc(self, key, method, tolerance)
    386                 except ValueError as err:
    387                     raise KeyError(key) from err
--> 388             raise KeyError(key)
    389         return super().get_loc(key, method=method, tolerance=tolerance)
    390 

KeyError: 1.5

CodePudding user response：

You should read the index as a MultiIndex, but you need to convert the strings to interval. You can use to_interval (all credits to korakot):

def to_interval(istr):
    c_left = istr[0]=='['
    c_right = istr[-1]==']'
    closed = {(True, False): 'left',
              (False, True): 'right',
              (True, True): 'both',
              (False, False): 'neither'
              }[c_left, c_right]
    left, right = map(int, istr[1:-1].split(','))
    return pd.Interval(left, right, closed)

df_test = pd.read_csv('test.csv',  index_col=[0,1], converters={0: to_interval,1: to_interval})

Test:

df_test.loc[(1.5, -1)]
#value    0.254337
#Name: ((1, 2], (-2, 0]), dtype: float64