I have calculated the mean value of DataFrame by two groups and saved the results to CSV file.
Then, I tried to read it again by read_csv()
, but the .loc()
function doesn't work for the loaded DataFrame.
Here's the code example:
import pandas as pd
import numpy as np
np.random.seed(100)
df = pd.DataFrame(np.random.randn(100, 3), columns=['a', 'b', 'value'])
a_bins = np.arange(-3, 4, 1)
b_bins = np.arange(-2, 4, 2)
# calculate the mean value
df['a_bins'] = pd.cut(df['a'], bins=a_bins)
df['b_bins'] = pd.cut(df['b'], bins=b_bins)
df_value_bin = df.groupby(['a_bins','b_bins']).agg({'value':'mean'})
# save to csv file
df_value_bin.to_csv('test.csv')
# read the exported file
df_test = pd.read_csv('test.csv')
When I type:
df_value_bin.loc[(1.5, -1)]
I got this output
value 0.254337
Name: ((1, 2], (-2, 0]), dtype: float64
But, if I use the same method to locate the value from the loaded CSV file:
df_test.loc[(1.5, -1)]
I got this Keyerror:
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
/tmp/ipykernel_33836/4042082162.py in <module>
----> 1 df_test.loc[(1.5, -1)]
~/miniconda3/lib/python3.9/site-packages/pandas/core/indexing.py in __getitem__(self, key)
923 with suppress(KeyError, IndexError):
924 return self.obj._get_value(*key, takeable=self._takeable)
--> 925 return self._getitem_tuple(key)
926 else:
927 # we by definition only have the 0th axis
~/miniconda3/lib/python3.9/site-packages/pandas/core/indexing.py in _getitem_tuple(self, tup)
1098 def _getitem_tuple(self, tup: tuple):
1099 with suppress(IndexingError):
-> 1100 return self._getitem_lowerdim(tup)
1101
1102 # no multi-index, so validate all of the indexers
~/miniconda3/lib/python3.9/site-packages/pandas/core/indexing.py in _getitem_lowerdim(self, tup)
836 # We don't need to check for tuples here because those are
837 # caught by the _is_nested_tuple_indexer check above.
--> 838 section = self._getitem_axis(key, axis=i)
839
840 # We should never have a scalar section here, because
~/miniconda3/lib/python3.9/site-packages/pandas/core/indexing.py in _getitem_axis(self, key, axis)
1162 # fall thru to straight lookup
1163 self._validate_key(key, axis)
-> 1164 return self._get_label(key, axis=axis)
1165
1166 def _get_slice_axis(self, slice_obj: slice, axis: int):
~/miniconda3/lib/python3.9/site-packages/pandas/core/indexing.py in _get_label(self, label, axis)
1111 def _get_label(self, label, axis: int):
1112 # GH#5667 this will fail if the label is not present in the axis.
-> 1113 return self.obj.xs(label, axis=axis)
1114
1115 def _handle_lowerdim_multi_index_axis0(self, tup: tuple):
~/miniconda3/lib/python3.9/site-packages/pandas/core/generic.py in xs(self, key, axis, level, drop_level)
3774 raise TypeError(f"Expected label or tuple of labels, got {key}") from e
3775 else:
-> 3776 loc = index.get_loc(key)
3777
3778 if isinstance(loc, np.ndarray):
~/miniconda3/lib/python3.9/site-packages/pandas/core/indexes/range.py in get_loc(self, key, method, tolerance)
386 except ValueError as err:
387 raise KeyError(key) from err
--> 388 raise KeyError(key)
389 return super().get_loc(key, method=method, tolerance=tolerance)
390
KeyError: 1.5
CodePudding user response:
You should read the index as a MultiIndex
, but you need to convert the strings to interval. You can use to_interval
(all credits to korakot):
def to_interval(istr):
c_left = istr[0]=='['
c_right = istr[-1]==']'
closed = {(True, False): 'left',
(False, True): 'right',
(True, True): 'both',
(False, False): 'neither'
}[c_left, c_right]
left, right = map(int, istr[1:-1].split(','))
return pd.Interval(left, right, closed)
df_test = pd.read_csv('test.csv', index_col=[0,1], converters={0: to_interval,1: to_interval})
Test:
df_test.loc[(1.5, -1)]
#value 0.254337
#Name: ((1, 2], (-2, 0]), dtype: float64