This program is a DQN reinforcement learning implementation based on the Keras library. It performs well on the CartPole problem, but it never converges on MountainCar. I have also compared my program with other people's code, but I still can't find where the problem is. I hope someone can answer this, or point me to some websites that deal with this problem. This is my first time asking a question, so I'm not really sure how to do it properly. Thank you very much.
import numpy as np
from tensorflow.keras import models, layers, optimizers
import gym
import random
from collections import deque

BATCH_SIZE = 64
TRAINING_EPISODE = 1000
SAMPLE_EPISODE = 3
LEARNING_EPISODE = 3
class Model(object):
    def __init__(self, obs_num, act_num):
        self.obs_num = obs_num
        self.dense1_size = 100
        self.act_num = act_num

    def model_construct(self):
        # Simple Q-network: observation -> 100 relu units -> one output per action
        inputs = layers.Input(shape=(self.obs_num,), batch_size=BATCH_SIZE)
        x = layers.Dense(self.dense1_size, activation='relu')(inputs)
        outputs = layers.Dense(self.act_num)(x)
        model = models.Model(inputs=inputs, outputs=outputs)
        return model
class RL_algorithm(Model):
    def __init__(self, obs_num, act_num, learning_rate=0.001, r_delay=0.95,
                 e_greedy=[0.1, 0.99, 0.01], memory_size=2000):
        self.obs_num = obs_num
        self.act_num = act_num
        self.step_num = 0
        super(RL_algorithm, self).__init__(obs_num=self.obs_num, act_num=self.act_num)
        # Online network and target network, initialized with identical weights
        self.model = self.model_construct()
        self.model.compile(loss='mse', optimizer=optimizers.Adam(learning_rate))
        self.model_target = self.model_construct()
        self.model_target.compile(loss='mse', optimizer=optimizers.Adam(learning_rate))
        self.model_target.set_weights(self.model.get_weights())
        self.memory = deque(maxlen=memory_size)
        self.r_delay = r_delay
        # e_greedy: initial epsilon, decay ratio, minimum epsilon
        self.e_greedy, self.e_greedy_decay, self.e_greedy_min = e_greedy
    def predict(self, obs):
        act = np.argmax(self.model.predict(obs))
        return act

    def esample(self, obs):
        if np.random.uniform(0, 1) > self.e_greedy:
            act = self.predict(obs)
        else:
            act = np.random.randint(self.act_num)
        return act

    def sync_target(self):
        self.model_target.set_weights(self.model.get_weights())

    def egreedy_update(self):
        if self.e_greedy > self.e_greedy_min:
            self.e_greedy *= self.e_greedy_decay

    def remember(self, data):
        self.memory.append(data)
    def learn(self, obs, act, reward, obs_, done):
        # Bellman targets: r + gamma * max_a' Q_target(s', a'), zeroed out for terminal steps
        Q_predict = self.model.predict(obs)
        Q_target = self.model_target.predict(obs_)
        for i in range(BATCH_SIZE):
            Q_predict[i, act[i]] = reward[i] + (1 - done[i]) * self.r_delay * np.max(Q_target[i, :])
        loss = self.model.train_on_batch(obs, Q_predict)
        return loss
def run_episode():
    # Collect one episode with epsilon-greedy exploration and store it in replay memory
    obs = env.reset()
    done = False
    reward_total = 0
    while not done:
        act = DQN.esample(obs.reshape([1, -1]))
        obs_, reward, done, _ = env.step(act)
        reward_total += reward
        if done and reward_total > -200:
            reward = 100
        DQN.remember([obs, act, reward, obs_, done])
        obs = obs_
    return reward_total
def learn_episode():
    DQN.step_num += 1
    samples = random.sample(DQN.memory, BATCH_SIZE)
    S, A, R, S_, D = [], [], [], [], []
    for experiment in samples:
        S.append(experiment[0])
        A.append(experiment[1])
        R.append(experiment[2])
        S_.append(experiment[3])
        D.append(experiment[4])
    S = np.array(S).astype(np.float32)
    A = np.array(A)
    R = np.array(R).astype(np.float32)
    S_ = np.array(S_).astype(np.float32)
    D = np.array(D).astype(np.float32)
    loss = DQN.learn(S, A, R, S_, D)
    return loss
def test_episode():
    obs = env.reset()
    done = False
    reward_total = 0
    step = 0
    while not done:
        act = DQN.predict(obs.reshape([1, -1]))
        obs_, reward, done, _ = env.step(act)
        reward_total += reward
        obs = obs_
        step += 1
    return reward_total, step
def train():
    reward_max = -200
    for j in range(TRAINING_EPISODE):
        for i in range(SAMPLE_EPISODE):
            reward = run_episode()
            if reward > reward_max:
                reward_max = reward
        # Start learning once the replay memory is 20% full
        if len(DQN.memory) > 0.2 * DQN.memory.maxlen:
            for i in range(LEARNING_EPISODE):
                loss = learn_episode()
            if j % 50 == 0:
                DQN.sync_target()
            DQN.egreedy_update()
            reward, step = test_episode()
            print('training_step:', j, 'reward:', reward, 'reward_max:', reward_max,
                  'complete_step:', step, 'loss:', loss)
def play():
    obs = env.reset()
    env.render()
    done = False
    reward_total = 0
    while not done:
        act = DQN.predict(obs.reshape([1, -1]))
        obs_, reward, done, _ = env.step(act)
        reward_total += reward
        obs = obs_
        env.render()
    print('play', 'reward:', int(reward_total))
env = gym.make('MountainCar-v0')
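The end of the script (creating the agent and starting training) was lost above; it is roughly the following sketch, assuming obs_num and act_num come from the MountainCar-v0 observation and action spaces and the other hyperparameters stay at the defaults shown in the class:

DQN = RL_algorithm(obs_num=env.observation_space.shape[0],  # 2 state variables (position, velocity)
                   act_num=env.action_space.n)              # 3 discrete actions
train()
play()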