# Reinforcement Learning: A PyTorch Implementation of DDPG

For the DQN version, see this post:

DDPG (Deep Deterministic Policy Gradient) is a reinforcement learning method built on the actor-critic (AC) architecture for problems that call for a deterministic policy. Without further ado, here is the code.
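
In a nutshell, DDPG keeps a deterministic actor μ(s) and a critic Q(s, a), plus a slowly tracking target copy of each. The critic is regressed onto the TD target r + γ·Q′(s′, μ′(s′)), the actor is updated to push Q(s, μ(s)) up, and the target copies follow via soft (Polyak) updates. Below is a minimal, self-contained sketch of one such learning step; the tiny linear networks, the random batch, and the hyperparameters are placeholders for illustration (terminal-state masking is omitted for brevity):

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

gamma, tau = 0.99, 0.005
actor, actor_t = nn.Linear(4, 2), nn.Linear(4, 2)     # mu(s) and its target copy
critic, critic_t = nn.Linear(6, 1), nn.Linear(6, 1)   # Q(s, a) on concat(s, a)
actor_t.load_state_dict(actor.state_dict())
critic_t.load_state_dict(critic.state_dict())
actor_opt = torch.optim.Adam(actor.parameters(), lr=1e-4)
critic_opt = torch.optim.Adam(critic.parameters(), lr=1e-3)

# A random placeholder batch of transitions (s0, a0, r1, s1).
s0, s1 = torch.randn(64, 4), torch.randn(64, 4)
a0, r1 = torch.randn(64, 2), torch.randn(64, 1)

# 1. Critic: regress Q(s0, a0) onto the target r1 + gamma * Q'(s1, mu'(s1)).
with torch.no_grad():
    y = r1 + gamma * critic_t(torch.cat([s1, actor_t(s1)], dim=1))
critic_loss = F.mse_loss(critic(torch.cat([s0, a0], dim=1)), y)
critic_opt.zero_grad()
critic_loss.backward()
critic_opt.step()

# 2. Actor: deterministic policy gradient, i.e. ascend Q(s0, mu(s0)).
actor_loss = -critic(torch.cat([s0, actor(s0)], dim=1)).mean()
actor_opt.zero_grad()
actor_loss.backward()
actor_opt.step()

# 3. Soft-update the targets: theta' <- tau * theta + (1 - tau) * theta'.
for net, net_t in ((actor, actor_t), (critic, critic_t)):
    for p, p_t in zip(net.parameters(), net_t.parameters()):
        p_t.data.mul_(1 - tau).add_(tau * p.data)
```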

## Algorithm logic

```python
import gym
import math
import random
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Actor network: maps an image observation to an action vector
class Actor(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 6, kernel_size=5, padding=2)
        self.conv2 = nn.Conv2d(6, 16, kernel_size=5, padding=2)
        self.linear1 = nn.Linear(16*24*24, 300)
        self.linear2 = nn.Linear(300, 10)
        self.linear3 = nn.Linear(10, 3)

    def forward(self, x):
        x = F.relu(F.max_pool2d(self.conv1(x), 2))  # max_pool2d needs an explicit kernel size
        x = F.relu(F.max_pool2d(self.conv2(x), 2))
        x = F.relu(self.linear1(x.view(-1, 16*24*24)))
        x = F.relu(self.linear2(x))
        x = self.linear3(x)
        return x
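
# Shape check (illustrative): a 3x96x96 input passes through two conv + 2x2
# max-pool stages to 16x24x24 = 9216 features, matching linear1's input size.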

# Critic network: maps a (state, action) pair to a scalar Q-value
class Critic(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 6, kernel_size=5, padding=2)
        self.conv2 = nn.Conv2d(6, 16, kernel_size=5, padding=2)
        self.linear1 = nn.Linear(16*24*24, 300)
        self.linear2 = nn.Linear(300, 10)
        # 10 state features concatenated with a 3-dim action give 13 inputs;
        # the critic outputs a single Q-value
        self.linear3 = nn.Linear(13, 1)

    def forward(self, x):
        s, a = x
        s = F.relu(F.max_pool2d(self.conv1(s), 2))
        s = F.relu(F.max_pool2d(self.conv2(s), 2))
        s = F.relu(self.linear1(s.view(-1, 16*24*24)))
        s = F.relu(self.linear2(s))
        x = self.linear3(torch.cat([s, a], dim=1))
        return x
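
# The Agent below calls Net(state_dim, 256, action_dim), but Net is not
# defined in this listing; this small two-layer MLP is an assumed stand-in
# that matches that call signature.
class Net(nn.Module):
    def __init__(self, in_dim, hidden_dim, out_dim):
        super().__init__()
        self.fc1 = nn.Linear(in_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, out_dim)

    def forward(self, x):
        # per-action Q-values for a state (or batch of states)
        return self.fc2(F.relu(self.fc1(x)))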

# Agent: epsilon-greedy agent with an experience-replay buffer
class Agent(object):
    def __init__(self, **kwargs):
        self.gamma = kwargs['gamma']
        self.epsi_high = kwargs['epsi_high']
        self.epsi_low = kwargs['epsi_low']
        self.decay = kwargs['decay']
        self.lr = kwargs['lr']
        self.buffer = []
        self.capacity = kwargs['capacity']
        self.batch_size = kwargs['batch_size']
        self.state_space_dim = kwargs['state_space_dim']
        self.action_space_dim = kwargs['action_space_dim']
        self.eval_net = Net(self.state_space_dim, 256, self.action_space_dim)
        self.optimizer = optim.Adam(self.eval_net.parameters(), lr=self.lr)  # learn() needs an optimizer
        self.steps = 0

    def act(self, s0):
        self.steps += 1
        # epsilon decays exponentially from epsi_high toward epsi_low
        epsi = self.epsi_low + (self.epsi_high - self.epsi_low) * math.exp(-1.0 * self.steps / self.decay)
        if random.random() < epsi:
            a0 = random.randrange(self.action_space_dim)  # explore: random action
        else:
            a0 = torch.argmax(self.eval_net(torch.tensor(s0, dtype=torch.float))).item()  # exploit: greedy action
        return a0

    def put(self, *transition):
        # drop the oldest transition once the buffer is full
        if len(self.buffer) == self.capacity:
            self.buffer.pop(0)
        self.buffer.append(transition)

    def learn(self):
        # wait until the buffer holds at least one full batch
        if len(self.buffer) < self.batch_size:
            return

        samples = random.sample(self.buffer, self.batch_size)

        s0, a0, r1, s1 = zip(*samples)
        s0 = torch.tensor(s0, dtype=torch.float)
        a0 = torch.tensor(a0, dtype=torch.long).view(self.batch_size, -1)
        r1 = torch.tensor(r1, dtype=torch.float).view(self.batch_size, -1)
        s1 = torch.tensor(s1, dtype=torch.float)

        # TD target r + gamma * max_a' Q(s1, a'); detach() keeps gradients out of the target
        y_true = r1 + self.gamma * torch.max(self.eval_net(s1).detach(), dim=1)[0].view(self.batch_size, -1)
        y_pred = self.eval_net(s0).gather(1, a0)

        loss_fn = nn.MSELoss()
        loss = loss_fn(y_pred, y_true)

        self.optimizer.zero_grad()  # clear stale gradients before backprop
        loss.backward()
        self.optimizer.step()

    def plot(self, score, mean):
        # live-updating training curves for a Jupyter notebook
        from IPython import display
        display.clear_output(wait=True)
        display.display(plt.gcf())
        plt.figure(figsize=(20, 10))
        plt.clf()

        plt.title('Training...')
        plt.xlabel('Episode')
        plt.ylabel('Duration')
        plt.plot(score)
        plt.plot(mean)
        plt.text(len(score) - 1, score[-1], str(score[-1]))
        plt.text(len(mean) - 1, mean[-1], str(mean[-1]))

# Run training on CartPole
env = gym.make('CartPole-v0')
params = {
    'gamma': 0.8,
    'epsi_high': 0.9,
    'epsi_low': 0.05,
    'decay': 200,
    'lr': 0.001,
    'capacity': 10000,
    'batch_size': 64,
    'state_space_dim': env.observation_space.shape[0],
    'action_space_dim': env.action_space.n
}
agent = Agent(**params)

score = []
mean = []

for episode in range(1000):
    s0 = env.reset()  # classic gym API (gym < 0.26): reset() returns only the observation
    total_reward = 1
    while True:
        env.render()
        a0 = agent.act(s0)
        s1, r1, done, _ = env.step(a0)

        # punish the terminal transition so the agent learns to avoid failing
        if done:
            r1 = -1

        agent.put(s0, a0, r1, s1)

        if done:
            break
        total_reward += r1
        s0 = s1
        agent.learn()

    score.append(total_reward)
    mean.append(sum(score[-100:]) / len(score[-100:]))  # running average over the last 100 episodes
    agent.plot(score, mean)
```
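
One wiring note: the `Actor` and `Critic` classes at the top are written for image observations and are not instantiated by the CartPole script above, which trains the replay-buffer `Agent` instead. To plug them into the DDPG updates sketched earlier, each online network needs a frozen target copy; a common way to create and sync those copies (assuming only the classes defined above):

```python
import copy

actor, critic = Actor(), Critic()

# Target networks start as exact copies of the online networks and are moved
# only by soft (Polyak) updates, never directly by an optimizer.
actor_target = copy.deepcopy(actor)
critic_target = copy.deepcopy(critic)
for p in list(actor_target.parameters()) + list(critic_target.parameters()):
    p.requires_grad_(False)
```

Freezing the target parameters rules out backpropagating into them by accident; the soft update then writes to their `.data` directly.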