Replace values in a string type pandas Series with lists-CodePudding

I need to remap a column of strings to other strings, but some strings correspond to more than one target string, so some elements of the resulting column need to hold multiple values. I assumed I would do that with lists, so the column of strings would be converted into a column of lists of strings, each of length 1 or more, like this:

    embark_town     mapped_column
0   Southampton     [A, B]
1   Cherbourg       [C]
2   Southampton     [A, B]
3   Southampton     [A, B]
4   Southampton     [A, B]

I tried to do this the usual way, with a dictionary and pandas.Series.replace, but got a ValueError. I assume this is because pandas does not treat each two-element list as a single value to be placed in the resulting Series.

import seaborn as sns

df = sns.load_dataset('titanic').iloc[:5]

mapping = {
    'Southampton': ['A', 'B'],
    'Cherbourg': ['C']
}

df.embark_town.replace(mapping)

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Input In [42], in <cell line: 10>()
      3 df = sns.load_dataset('titanic').iloc[:5]
      5 mapping = {
      6     'Southampton': ['A', 'B'],
      7     'Cherbourg': ['C']
      8 }
---> 10 df.embark_town.replace(mapping)

File /rproject/ist-as-ir/envs/conda_environment/lib/python3.8/site-packages/pandas/core/series.py:4960, in Series.replace(self, to_replace, value, inplace, limit, regex, method)
   4945 @doc(
   4946     NDFrame.replace,  # type: ignore[has-type]
   4947     klass=_shared_doc_kwargs["klass"],
   (...)
   4958     method: str | lib.NoDefault = lib.no_default,
   4959 ):
-> 4960     return super().replace(
   4961         to_replace=to_replace,
   4962         value=value,
   4963         inplace=inplace,
   4964         limit=limit,
   4965         regex=regex,
   4966         method=method,
   4967     )

File /rproject/ist-as-ir/envs/conda_environment/lib/python3.8/site-packages/pandas/core/generic.py:6660, in NDFrame.replace(self, to_replace, value, inplace, limit, regex, method)
   6657     else:
   6658         to_replace, value = keys, values
-> 6660     return self.replace(
   6661         to_replace, value, inplace=inplace, limit=limit, regex=regex
   6662     )
   6663 else:
   6664 
   6665     # need a non-zero len on all axes
   6666     if not self.size:

File /rproject/ist-as-ir/envs/conda_environment/lib/python3.8/site-packages/pandas/core/series.py:4960, in Series.replace(self, to_replace, value, inplace, limit, regex, method)
   4945 @doc(
   4946     NDFrame.replace,  # type: ignore[has-type]
   4947     klass=_shared_doc_kwargs["klass"],
   (...)
   4958     method: str | lib.NoDefault = lib.no_default,
   4959 ):
-> 4960     return super().replace(
   4961         to_replace=to_replace,
   4962         value=value,
   4963         inplace=inplace,
   4964         limit=limit,
   4965         regex=regex,
   4966         method=method,
   4967     )

File /rproject/ist-as-ir/envs/conda_environment/lib/python3.8/site-packages/pandas/core/generic.py:6709, in NDFrame.replace(self, to_replace, value, inplace, limit, regex, method)
   6704     if len(to_replace) != len(value):
   6705         raise ValueError(
   6706             f"Replacement lists must match in length. "
   6707             f"Expecting {len(to_replace)} got {len(value)} "
   6708         )
-> 6709     new_data = self._mgr.replace_list(
   6710         src_list=to_replace,
   6711         dest_list=value,
   6712         inplace=inplace,
   6713         regex=regex,
   6714     )
   6716 elif to_replace is None:
   6717     if not (
   6718         is_re_compilable(regex)
   6719         or is_list_like(regex)
   6720         or is_dict_like(regex)
   6721     ):

File /rproject/ist-as-ir/envs/conda_environment/lib/python3.8/site-packages/pandas/core/internals/managers.py:458, in BaseBlockManager.replace_list(self, src_list, dest_list, inplace, regex)
    455 """do a list replace"""
    456 inplace = validate_bool_kwarg(inplace, "inplace")
--> 458 bm = self.apply(
    459     "replace_list",
    460     src_list=src_list,
    461     dest_list=dest_list,
    462     inplace=inplace,
    463     regex=regex,
    464 )
    465 bm._consolidate_inplace()
    466 return bm

File /rproject/ist-as-ir/envs/conda_environment/lib/python3.8/site-packages/pandas/core/internals/managers.py:304, in BaseBlockManager.apply(self, f, align_keys, ignore_failures, **kwargs)
    302         applied = b.apply(f, **kwargs)
    303     else:
--> 304         applied = getattr(b, f)(**kwargs)
    305 except (TypeError, NotImplementedError):
    306     if not ignore_failures:

File /rproject/ist-as-ir/envs/conda_environment/lib/python3.8/site-packages/pandas/core/internals/blocks.py:822, in Block.replace_list(self, src_list, dest_list, inplace, regex)
    819     assert not isinstance(mib, bool)
    820     m = mib[blk_num : blk_num + 1]
--> 822 result = blk._replace_coerce(
    823     to_replace=src,
    824     value=dest,
    825     mask=m,
    826     inplace=inplace,
    827     regex=regex,
    828 )
    829 if convert and blk.is_object and not all(x is None for x in dest_list):
    830     # GH#44498 avoid unwanted cast-back
    831     result = extend_blocks(
    832         [b.convert(numeric=False, copy=True) for b in result]
    833     )

File /rproject/ist-as-ir/envs/conda_environment/lib/python3.8/site-packages/pandas/core/internals/blocks.py:886, in Block._replace_coerce(self, to_replace, value, mask, inplace, regex)
    884         return [nb]
    885     return [self] if inplace else [self.copy()]
--> 886 return self.replace(
    887     to_replace=to_replace, value=value, inplace=inplace, mask=mask
    888 )

File /rproject/ist-as-ir/envs/conda_environment/lib/python3.8/site-packages/pandas/core/internals/blocks.py:691, in Block.replace(self, to_replace, value, inplace, mask)
    689 elif self._can_hold_element(value):
    690     blk = self if inplace else self.copy()
--> 691     putmask_inplace(blk.values, mask, value)
    692     if not (self.is_object and value is None):
    693         # if the user *explicitly* gave None, we keep None, otherwise
    694         #  may downcast to NaN
    695         blocks = blk.convert(numeric=False, copy=False)

File /rproject/ist-as-ir/envs/conda_environment/lib/python3.8/site-packages/pandas/core/array_algos/putmask.py:57, in putmask_inplace(values, mask, value)
     55         values[mask] = value[mask]
     56     else:
---> 57         values[mask] = value
     58 else:
     59     # GH#37833 np.putmask is more performant than __setitem__
     60     np.putmask(values, mask, value)

ValueError: NumPy boolean array indexing assignment cannot assign 2 input values to the 4 output values where the mask is true

Edit: When I do the same thing with pandas.Series.apply, there's no error. The desired result is achieved in this case, but if there were a value in the Series that's not in the dictionary, mapping[x] would raise a KeyError, whereas I want the same behavior as .replace, where strings that aren't in the dictionary are simply left as is.

df.embark_town.apply(lambda x: mapping[x])

Why does .apply work when .replace doesn't? Is there a solution that works in effect the same way as .replace? What's the most efficient way of doing this transformation?

CodePudding user response：

Let's try Series.map

df['mapped_column'] = df['embark_town'].map(mapping)

CodePudding user response：

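Use the map function instead: df['embark_town'].map(mapping). As the traceback shows, .replace ends up executing values[mask] = ['A', 'B'], which NumPy interprets as assigning two separate values to the four masked positions rather than one list to each position; .map and .apply look up each element individually, so each list is stored as a single object. Note that .map with a dictionary returns NaN for values that are not keys of the dictionary; to leave such values unchanged, as .replace would, use df['embark_town'].map(lambda x: mapping.get(x, x)).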

import seaborn as sns

df = sns.load_dataset('titanic').iloc[:5]

mapping = {
    'Southampton': ['A', 'B'],
    'Cherbourg': ['C']
}

df["embark_town"] = df["embark_town"].map(mapping)