I need to remap a column of strings to other strings, but some strings map to more than one other string, so I need to fit multiple values into some elements of the resulting column. I assumed I would do that with a list, so the column of strings would be converted into a column of lists of strings of length 1 or more, like this:
embark_town mapped_column
0 Southampton [A, B]
1 Cherbourg [C]
2 Southampton [A, B]
3 Southampton [A, B]
4 Southampton [A, B]
I tried to do this the typical way, with a dictionary and pandas.Series.replace, but got a ValueError. I assume this is because pandas does not treat each two-element list as a single value to place in the resulting Series.
# Minimal reproduction of the failing Series.replace call.
import seaborn as sns
df = sns.load_dataset('titanic').iloc[:5]
# Each town maps to a list holding one or more replacement strings.
mapping = {
'Southampton': ['A', 'B'],
'Cherbourg': ['C']
}
# This call raises the ValueError shown in the traceback below.
df.embark_town.replace(mapping)
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Input In [42], in <cell line: 10>()
3 df = sns.load_dataset('titanic').iloc[:5]
5 mapping = {
6 'Southampton': ['A', 'B'],
7 'Cherbourg': ['C']
8 }
---> 10 df.embark_town.replace(mapping)
File /rproject/ist-as-ir/envs/conda_environment/lib/python3.8/site-packages/pandas/core/series.py:4960, in Series.replace(self, to_replace, value, inplace, limit, regex, method)
4945 @doc(
4946 NDFrame.replace, # type: ignore[has-type]
4947 klass=_shared_doc_kwargs["klass"],
(...)
4958 method: str | lib.NoDefault = lib.no_default,
4959 ):
-> 4960 return super().replace(
4961 to_replace=to_replace,
4962 value=value,
4963 inplace=inplace,
4964 limit=limit,
4965 regex=regex,
4966 method=method,
4967 )
File /rproject/ist-as-ir/envs/conda_environment/lib/python3.8/site-packages/pandas/core/generic.py:6660, in NDFrame.replace(self, to_replace, value, inplace, limit, regex, method)
6657 else:
6658 to_replace, value = keys, values
-> 6660 return self.replace(
6661 to_replace, value, inplace=inplace, limit=limit, regex=regex
6662 )
6663 else:
6664
6665 # need a non-zero len on all axes
6666 if not self.size:
File /rproject/ist-as-ir/envs/conda_environment/lib/python3.8/site-packages/pandas/core/series.py:4960, in Series.replace(self, to_replace, value, inplace, limit, regex, method)
4945 @doc(
4946 NDFrame.replace, # type: ignore[has-type]
4947 klass=_shared_doc_kwargs["klass"],
(...)
4958 method: str | lib.NoDefault = lib.no_default,
4959 ):
-> 4960 return super().replace(
4961 to_replace=to_replace,
4962 value=value,
4963 inplace=inplace,
4964 limit=limit,
4965 regex=regex,
4966 method=method,
4967 )
File /rproject/ist-as-ir/envs/conda_environment/lib/python3.8/site-packages/pandas/core/generic.py:6709, in NDFrame.replace(self, to_replace, value, inplace, limit, regex, method)
6704 if len(to_replace) != len(value):
6705 raise ValueError(
6706 f"Replacement lists must match in length. "
6707 f"Expecting {len(to_replace)} got {len(value)} "
6708 )
-> 6709 new_data = self._mgr.replace_list(
6710 src_list=to_replace,
6711 dest_list=value,
6712 inplace=inplace,
6713 regex=regex,
6714 )
6716 elif to_replace is None:
6717 if not (
6718 is_re_compilable(regex)
6719 or is_list_like(regex)
6720 or is_dict_like(regex)
6721 ):
File /rproject/ist-as-ir/envs/conda_environment/lib/python3.8/site-packages/pandas/core/internals/managers.py:458, in BaseBlockManager.replace_list(self, src_list, dest_list, inplace, regex)
455 """do a list replace"""
456 inplace = validate_bool_kwarg(inplace, "inplace")
--> 458 bm = self.apply(
459 "replace_list",
460 src_list=src_list,
461 dest_list=dest_list,
462 inplace=inplace,
463 regex=regex,
464 )
465 bm._consolidate_inplace()
466 return bm
File /rproject/ist-as-ir/envs/conda_environment/lib/python3.8/site-packages/pandas/core/internals/managers.py:304, in BaseBlockManager.apply(self, f, align_keys, ignore_failures, **kwargs)
302 applied = b.apply(f, **kwargs)
303 else:
--> 304 applied = getattr(b, f)(**kwargs)
305 except (TypeError, NotImplementedError):
306 if not ignore_failures:
File /rproject/ist-as-ir/envs/conda_environment/lib/python3.8/site-packages/pandas/core/internals/blocks.py:822, in Block.replace_list(self, src_list, dest_list, inplace, regex)
819 assert not isinstance(mib, bool)
820 m = mib[blk_num : blk_num + 1]
--> 822 result = blk._replace_coerce(
823 to_replace=src,
824 value=dest,
825 mask=m,
826 inplace=inplace,
827 regex=regex,
828 )
829 if convert and blk.is_object and not all(x is None for x in dest_list):
830 # GH#44498 avoid unwanted cast-back
831 result = extend_blocks(
832 [b.convert(numeric=False, copy=True) for b in result]
833 )
File /rproject/ist-as-ir/envs/conda_environment/lib/python3.8/site-packages/pandas/core/internals/blocks.py:886, in Block._replace_coerce(self, to_replace, value, mask, inplace, regex)
884 return [nb]
885 return [self] if inplace else [self.copy()]
--> 886 return self.replace(
887 to_replace=to_replace, value=value, inplace=inplace, mask=mask
888 )
File /rproject/ist-as-ir/envs/conda_environment/lib/python3.8/site-packages/pandas/core/internals/blocks.py:691, in Block.replace(self, to_replace, value, inplace, mask)
689 elif self._can_hold_element(value):
690 blk = self if inplace else self.copy()
--> 691 putmask_inplace(blk.values, mask, value)
692 if not (self.is_object and value is None):
693 # if the user *explicitly* gave None, we keep None, otherwise
694 # may downcast to NaN
695 blocks = blk.convert(numeric=False, copy=False)
File /rproject/ist-as-ir/envs/conda_environment/lib/python3.8/site-packages/pandas/core/array_algos/putmask.py:57, in putmask_inplace(values, mask, value)
55 values[mask] = value[mask]
56 else:
---> 57 values[mask] = value
58 else:
59 # GH#37833 np.putmask is more performant than __setitem__
60 np.putmask(values, mask, value)
ValueError: NumPy boolean array indexing assignment cannot assign 2 input values to the 4 output values where the mask is true
Edit: When I do the same thing with pandas.Series.apply, there's no error. The desired result is achieved in this case, but if there were a value in the Series that's not in the dictionary, there would be an error, whereas I want the same behavior as .replace, where strings that aren't in the dictionary are just left as is.
# mapping[x] is a plain dict lookup, so any value that is not a key of `mapping` raises KeyError.
df.embark_town.apply(lambda x: mapping[x])
Why does .apply work when .replace doesn't? Is there a solution that works, in effect, the same way as .replace? What's the most efficient way of doing this transformation?
CodePudding user response:
Let's try Series.map
# Series.map looks up each value individually, so a list is stored as a single
# element. Falling back to the original value mirrors Series.replace: values
# missing from `mapping` are left unchanged instead of becoming NaN.
df['mapped_column'] = df['embark_town'].map(lambda town: mapping.get(town, town))
CodePudding user response:
Use the Series.map method instead: df['embark_town'].map(mapping). A complete example:
import seaborn as sns

# Copy the slice so that adding a column below modifies an independent frame
# rather than a slice of the full dataset (avoids SettingWithCopyWarning on
# pandas versions without copy-on-write).
df = sns.load_dataset('titanic').iloc[:5].copy()

# Each town maps to a list of one or more replacement strings.
mapping = {
    'Southampton': ['A', 'B'],
    'Cherbourg': ['C'],
}

# Series.map looks up each value individually, so a list is stored as a single
# element. Falling back to the original value mirrors Series.replace: towns
# missing from `mapping` are left unchanged instead of becoming NaN. The result
# goes into a new column so the original `embark_town` values are preserved.
df['mapped_column'] = df['embark_town'].map(lambda town: mapping.get(town, town))