How do I replace/update columns in Pandas dataframe with new data?-CodePudding

I want to create subplots of grouped bar charts, one per gene, where the Y-axis shows the gene expression values and the X-axis shows the time points for each patient ID.

Column ID explanation:

0h_P1_T1_TimeC1_PIDC4_Non-Survivor

Time point: the substring before the first underscore (e.g., 0h). Patient ID: the substring between the first and second underscores (e.g., P1).

Code:

import pandas as pd
import plotly.graph_objects as go
import numpy as np

for exp in treatment_df:  # NOTE: iterating a DataFrame yields its column labels (strings), not the column values
  for h in treatment_df.columns.str.split('_')[0][0]: # Get the "hours", which is the substring before the first underscore
    fig = go.Figure(data=[go.Bar(name=h, x=treatment_df.index, y=exp)])  # y=exp is a label string here, which is what the ValueError below rejects
    fig.show()

Traceback:

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-33-82ce32460b0d> in <module>()
      2 for exp in treatment_df:
      3   for h in treatment_df.columns.str.split('_')[0][0]: # Get the "hours", which is the substring before the first underscore
----> 4     fig = go.Figure(data=[go.Bar(name=h, x=treatment_df.index, y=exp)])
      5     fig.show()

5 frames
/usr/local/lib/python3.7/dist-packages/plotly/graph_objs/_bar.py in __init__(self, arg, alignmentgroup, base, basesrc, cliponaxis, constraintext, customdata, customdatasrc, dx, dy, error_x, error_y, hoverinfo, hoverinfosrc, hoverlabel, hovertemplate, hovertemplatesrc, hovertext, hovertextsrc, ids, idssrc, insidetextanchor, insidetextfont, legendgroup, legendgrouptitle, legendrank, marker, meta, metasrc, name, offset, offsetgroup, offsetsrc, opacity, orientation, outsidetextfont, selected, selectedpoints, showlegend, stream, text, textangle, textfont, textposition, textpositionsrc, textsrc, texttemplate, texttemplatesrc, uid, uirevision, unselected, visible, width, widthsrc, x, x0, xaxis, xcalendar, xhoverformat, xperiod, xperiod0, xperiodalignment, xsrc, y, y0, yaxis, ycalendar, yhoverformat, yperiod, yperiod0, yperiodalignment, ysrc, **kwargs)
   3208         _v = y if y is not None else _v
   3209         if _v is not None:
-> 3210             self["y"] = _v
   3211         _v = arg.pop("y0", None)
   3212         _v = y0 if y0 is not None else _v

/usr/local/lib/python3.7/dist-packages/plotly/basedatatypes.py in __setitem__(self, prop, value)
   4825                 # ### Handle simple property ###
   4826                 else:
-> 4827                     self._set_prop(prop, value)
   4828             else:
   4829                 # Make sure properties dict is initialized

/usr/local/lib/python3.7/dist-packages/plotly/basedatatypes.py in _set_prop(self, prop, val)
   5169                 return
   5170             else:
-> 5171                 raise err
   5172 
   5173         # val is None

/usr/local/lib/python3.7/dist-packages/plotly/basedatatypes.py in _set_prop(self, prop, val)
   5164 
   5165         try:
-> 5166             val = validator.validate_coerce(val)
   5167         except ValueError as err:
   5168             if self._skip_invalid:

/usr/local/lib/python3.7/dist-packages/_plotly_utils/basevalidators.py in validate_coerce(self, v)
    403             v = to_scalar_or_list(v)
    404         else:
--> 405             self.raise_invalid_val(v)
    406         return v
    407 

/usr/local/lib/python3.7/dist-packages/_plotly_utils/basevalidators.py in raise_invalid_val(self, v, inds)
    297                 typ=type_str(v),
    298                 v=repr(v),
--> 299                 valid_clr_desc=self.description(),
    300             )
    301         )

ValueError: 
    Invalid value of type 'builtins.str' received for the 'y' property of bar
        Received value: '0h_P1_T1_TimeC1_PIDC4_Non-Survivor'

    The 'y' property is an array that may be specified as a tuple,
    list, numpy array, or pandas Series

Data:

treatment_df.iloc[0:3:,0:12].head().to_dict()  # first 3 rows (genes) x first 12 columns (samples), as a nested dict

{'0h_P1_T1_TimeC1_PIDC4_Non-Survivor': {'DNAJC14': 0.23768844221105523,
  'DNAJC30': 0.12713567839195977,
  'DNAJC9': 0.15527638190954773},
 '0h_P2_T1_TimeC2_PIDC2_Survivor': {'DNAJC14': 0.2128966223132037,
  'DNAJC30': 0.11873080859774823,
  'DNAJC9': 0.09518935516888441},
 '12h_P1_T4_TimeC2_PIDC4_Non-Survivor': {'DNAJC14': 0.26175869120654405,
  'DNAJC30': 0.114519427402863,
  'DNAJC9': 0.11758691206543971},
 '12h_P2_T4_TimeC3_PIDC2_Survivor': {'DNAJC14': 0.2473118279569893,
  'DNAJC30': 0.13333333333333336,
  'DNAJC9': 0.12688172043010748},
 '24h_P1_T5_TimeC4_PIDC4_Non-Survivor': {'DNAJC14': 0.2416666666666666,
  'DNAJC30': 0.13541666666666666,
  'DNAJC9': 0.05937499999999994},
 '24h_P2_T5_TimeC3_PIDC2_Survivor': {'DNAJC14': 0.23474663908996893,
  'DNAJC30': 0.10237849017580147,
  'DNAJC9': 0.12616339193381598},
 '48h_P1_T6_TimeC3_PIDC1_Non-Survivor': {'DNAJC14': 0.22303664921465965,
  'DNAJC30': 0.13821989528795814,
  'DNAJC9': 0.13403141361256546},
 '48h_P2_T6_TimeC3_PIDC3_Survivor': {'DNAJC14': 0.19415983606557383,
  'DNAJC30': 0.11424180327868855,
  'DNAJC9': 0.1316598360655738},
 '4h_P1_T2_TimeC1_PIDC4_Non-Survivor': {'DNAJC14': 0.2543323139653414,
  'DNAJC30': 0.13608562691131498,
  'DNAJC9': 0.11162079510703361},
 '4h_P2_T2_TimeC2_PIDC1_Survivor': {'DNAJC14': 0.22369765066394287,
  'DNAJC30': 0.11542390194075587,
  'DNAJC9': 0.09703779366700718},
 '8h_P1_T3_TimeC4_PIDC4_Non-Survivor': {'DNAJC14': 0.2451282051282051,
  'DNAJC30': 0.11282051282051278,
  'DNAJC9': 0.09641025641025636},
 '8h_P2_T3_TimeC2_PIDC2_Survivor': {'DNAJC14': 0.22760800842992635,
  'DNAJC30': 0.16965226554267654,
  'DNAJC9': 0.12750263435194942}}

Expected output (does not have to be exactly like this):

CodePudding user response：

Just for the better view, transpose the dataframe

df = df.T  # after transposing, each sample ID is a row and each gene is a column

Derive Patient id and timeframe

# Sample IDs look like "0h_P1_T1_...": token 0 is the time point, token 1 the patient.
df.loc[:, "patient_id"] = df.index.str.split('_').str[1]
# The plot below needs a "timeframe" column; store the hours as integers so the
# x-axis is ordered numerically (0, 4, 8, 12, ...) instead of alphabetically.
df.loc[:, "timeframe"] = df.index.str.split('_').str[0].str.replace('h', '').astype(int)

Plot the graph

import plotly.express as px

# One facet per patient; bars are the DNAJC9 expression at each time point.
px.bar(df, x="timeframe", y="DNAJC9", color="patient_id", barmode="group", facet_col="patient_id")

Sample output:

To get separate color bars based on timeframe, keep the timeframe column as object

# No int conversion here: the hours (with the 'h' removed) stay strings, i.e. object dtype.
df.loc[:, "timeframe"] = df.index.str.split('_').str[0].str.replace('h', '')

Sample output:

CodePudding user response：

Add hour and patient:

# Transpose so each sample ID becomes a row, then split the ID into its parts.
df = df.T
df['hour'] = df.index.str.split('_').str[0]
df['hour'] = df['hour'].str.replace('h', '')
df['patient'] = df.index.str.split('_').str[1]
# Sort on integer hours: as strings, '12' would sort before '4'.
df = df.astype({'hour': int})
df.sort_values(by=['hour', 'patient'], inplace=True)
# Put the 'h' suffix back for display (0 -> '0h'); the '+' was missing here.
df['hour'] = df['hour'].astype(str) + 'h'
df

Plot

# One grouped bar chart per gene: patients on the x-axis, one bar series per hour.
for dnajc in df.columns[:3]:  # the first three columns are the gene columns of the sample data
    ob = df[[dnajc, 'hour', 'patient']].rename(columns={dnajc: 'value'})
    # Take x and y from the same per-hour rows so every bar is paired with its own
    # patient; sort=False keeps the hours in the order they appear in the frame.
    traces = [
        go.Bar(x=rows['patient'], y=rows['value'], name=hr)
        for hr, rows in ob.groupby('hour', sort=False)
    ]
    fig = go.Figure(data=traces)
    fig.update_layout(title=dnajc, xaxis_title='Patient', yaxis_title='Value', width=450, height=320)
    fig.show()

Subplot

# Reshape to long format: one row per (hour, patient, gene), with the gene name in
# 'dnajc' and its expression in the default 'value' column. melt() returns a new
# frame, so df itself is left untouched.
df2 = df.melt(
    id_vars=['hour', 'patient'],
    value_vars=df.columns[:3],
    var_name='dnajc',
)
df2
###
   hour patient    dnajc     value
0    0h      P1  DNAJC14  0.237688
1    0h      P2  DNAJC14  0.212897
2    4h      P1  DNAJC14  0.254332
3    4h      P2  DNAJC14  0.223698
4    8h      P1  DNAJC14  0.245128
5    8h      P2  DNAJC14  0.227608
6   12h      P1  DNAJC14  0.261759
⋮     ⋮       ⋮      ⋮         ⋮
29   8h      P2   DNAJC9  0.127503
30  12h      P1   DNAJC9  0.117587
31  12h      P2   DNAJC9  0.126882
32  24h      P1   DNAJC9  0.059375
33  24h      P2   DNAJC9  0.126163
34  48h      P1   DNAJC9  0.134031
35  48h      P2   DNAJC9  0.131660

Plot

import plotly.express as px

# One facet (subplot) per gene; within each, bars are grouped by patient and coloured by hour.
fig_px = px.bar(df2, x='patient', y='value', color='hour', facet_col='dnajc', barmode='group')
fig_px.show()