
Reinforcement Learning: A DDPG Implementation in PyTorch


I wrote a DQN version earlier, but DDPG, from DeepMind, is even more appealing.

Reinforcement learning is fascinating and well worth looking into; it may become the foundation of future strong AI.

For the DQN version, see this article:

花半楼: Pytorch DQN CartPole-v0 (zhuanlan.zhihu.com)

DDPG is a reinforcement learning method for deterministic policies under the actor-critic (AC) architecture. Without further ado, here is the code.

Algorithm logic
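At its core, DDPG trains a deterministic actor and a Q-critic, each with a slowly tracking target network. A rough, self-contained sketch of one update step is shown below; the network sizes and the fake batch are purely illustrative, not the author's implementation, and terminal-state masking is omitted for brevity.

import copy
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

state_dim, action_dim, gamma, tau = 3, 1, 0.99, 0.005

# Small illustrative actor mu(s) and critic Q(s, a).
actor = nn.Sequential(nn.Linear(state_dim, 64), nn.ReLU(), nn.Linear(64, action_dim), nn.Tanh())
critic = nn.Sequential(nn.Linear(state_dim + action_dim, 64), nn.ReLU(), nn.Linear(64, 1))
actor_target, critic_target = copy.deepcopy(actor), copy.deepcopy(critic)
actor_opt = optim.Adam(actor.parameters(), lr=1e-4)
critic_opt = optim.Adam(critic.parameters(), lr=1e-3)

# A fake minibatch of transitions just to make the snippet runnable.
s0, a0 = torch.randn(8, state_dim), torch.randn(8, action_dim)
r1, s1 = torch.randn(8, 1), torch.randn(8, state_dim)

# Critic update: regress Q(s0, a0) towards r1 + gamma * Q'(s1, mu'(s1)).
with torch.no_grad():
    y = r1 + gamma * critic_target(torch.cat([s1, actor_target(s1)], dim=1))
critic_loss = F.mse_loss(critic(torch.cat([s0, a0], dim=1)), y)
critic_opt.zero_grad(); critic_loss.backward(); critic_opt.step()

# Actor update: maximise Q(s0, mu(s0)) by minimising its negative mean.
actor_loss = -critic(torch.cat([s0, actor(s0)], dim=1)).mean()
actor_opt.zero_grad(); actor_loss.backward(); actor_opt.step()

# Soft updates keep the target networks trailing the learned networks.
for net, target in ((actor, actor_target), (critic, critic_target)):
    for p, tp in zip(net.parameters(), target.parameters()):
        tp.data.mul_(1.0 - tau).add_(tau * p.data)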


My implementation

import gym
import math
import random
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Actor network: maps an image state to an action vector
class Actor(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 6, kernel_size=5, padding=2)
        self.conv2 = nn.Conv2d(6, 16, kernel_size=5, padding=2)
        self.linear1 = nn.Linear(16*24*24, 300)
        self.linear2 = nn.Linear(300, 10)
        self.linear3 = nn.Linear(10, 3)
        
    def forward(self, x):
        x = F.relu(F.max_pool2d(self.conv1(x), 2))   # conv + 2x2 max pooling
        x = F.relu(F.max_pool2d(self.conv2(x), 2))
        x = F.relu(self.linear1(x.view(-1, 16*24*24)))  # flatten conv features
        x = F.relu(self.linear2(x))
        x = self.linear3(x)
        return x

# Critic network: estimates Q(s, a) from an image state and an action
class Critic(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 6, kernel_size=5, padding=2)
        self.conv2 = nn.Conv2d(6, 16, kernel_size=5, padding=2)
        self.linear1 = nn.Linear(16*24*24, 300)
        self.linear2 = nn.Linear(300, 10)
        self.linear3 = nn.Linear(13, 1)   # 10 state features + 3 action dims -> a single Q(s, a) value
        
    def forward(self, x):
        s, a = x
        s = F.relu(F.max_pool2d(self.conv1(s), 2))   # conv + 2x2 max pooling
        s = F.relu(F.max_pool2d(self.conv2(s), 2))
        s = F.relu(self.linear1(s.view(-1, 16*24*24)))
        s = F.relu(self.linear2(s))
        x = self.linear3(torch.cat([s, a], dim=1))   # concatenate state features with the action
        return x

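# Note: the Agent below uses a `Net` class that is not defined in this post;
# it looks like the fully connected Q-network from the author's earlier DQN
# article. The definition here is an assumed minimal sketch (layer sizes are
# guesses) so the listing runs end to end.
class Net(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        return self.fc2(x)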
# Agent: replay buffer, epsilon-greedy action selection, and Q-network updates
class Agent(object):
    def __init__(self, **kwargs ):
        self.gamma = kwargs['gamma']
        self.epsi_high = kwargs['epsi_high']
        self.epsi_low = kwargs['epsi_low']
        self.decay = kwargs['decay']
        self.lr = kwargs['lr']
        self.buffer = []
        self.capacity = kwargs['capacity']
        self.batch_size = kwargs['batch_size']
        self.state_space_dim = kwargs['state_space_dim']
        self.action_space_dim = kwargs['action_space_dim']
        self.eval_net = Net(self.state_space_dim, 256, self.action_space_dim)
        self.optimizer = optim.Adam(self.eval_net.parameters(), lr=self.lr)
        self.steps = 0
        
    def act(self, s0):
        self.steps += 1
        # Exploration rate decays exponentially from epsi_high towards epsi_low.
        epsi = self.epsi_low + (self.epsi_high - self.epsi_low) * math.exp(-1.0 * self.steps / self.decay)
        if random.random() < epsi:
            a0 = random.randrange(self.action_space_dim)   # explore: random action
        else:
            a0 = torch.argmax(self.eval_net(torch.tensor(s0, dtype=torch.float))).item()   # exploit: greedy action
        return a0
    
    def put(self, *transition):
        # Store a (s0, a0, r1, s1) transition; drop the oldest when the buffer is full.
        if len(self.buffer) == self.capacity:
            self.buffer.pop(0)
        self.buffer.append(transition)
        
    def learn(self):
        # Wait until the replay buffer holds at least one minibatch.
        if len(self.buffer) < self.batch_size:
            return
        
        # Sample a random minibatch of transitions from the buffer.
        samples = random.sample(self.buffer, self.batch_size)
        
        s0, a0, r1, s1 = zip(*samples)
        s0 = torch.tensor(s0, dtype=torch.float)
        a0 = torch.tensor(a0, dtype=torch.long).view(self.batch_size, -1)
        r1 = torch.tensor(r1, dtype=torch.float).view(self.batch_size, -1)
        s1 = torch.tensor(s1, dtype=torch.float)
        
        # TD target: r1 + gamma * max_a Q(s1, a); prediction: Q(s0, a0).
        y_true = r1 + self.gamma * torch.max(self.eval_net(s1).detach(), dim=1)[0].view(self.batch_size, -1)
        y_pred = self.eval_net(s0).gather(1, a0)
        
#         print( y_pred)
#         print(y_true)
        
        loss_fn = nn.MSELoss()
        loss = loss_fn(y_pred, y_true)
        
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        
    def plot(self, score, mean):
        # Live-update the training curves inside a Jupyter notebook.
        from IPython import display
        display.clear_output(wait=True)
        display.display(plt.gcf())
        plt.figure(figsize=(20,10))
        plt.clf()
    
        plt.title('Training...')
        plt.xlabel('Episode')
        plt.ylabel('Duration')
        plt.plot(score)
        plt.plot(mean)
        plt.text(len(score)-1, score[-1], str(score[-1]))
        plt.text(len(mean)-1, mean[-1], str(mean[-1]))
        


# Run training on CartPole-v0
env = gym.make('CartPole-v0')
params = {
    'gamma': 0.8,          # discount factor
    'epsi_high': 0.9,      # initial exploration rate
    'epsi_low': 0.05,      # final exploration rate
    'decay': 200,          # epsilon decay speed (in steps)
    'lr': 0.001,           # learning rate
    'capacity': 10000,     # replay buffer size
    'batch_size': 64,
    'state_space_dim': env.observation_space.shape[0],
    'action_space_dim': env.action_space.n
}
agent = Agent(**params)

score = []
mean = []

for episode in range(1000):
    s0 = env.reset()
    total_reward = 1
    while True:
        env.render()
        a0 = agent.act(s0)
        s1, r1, done, _ = env.step(a0)
        
        if done:
            r1 = -1     # penalise the terminal (pole fallen) state
            
        agent.put(s0, a0, r1, s1)
        
        if done:
            break
        total_reward += r1
        s0 = s1
        agent.learn()
        
    score.append(total_reward)
    mean.append(sum(score[-100:]) / len(score[-100:]))   # moving average over the last 100 episodes
    agent.plot(score, mean)

Once the score curve plateaus like this, training is basically stable, and you can move on to tuning the hyperparameters.
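The knobs worth touching first are all in the `params` dict from the listing above; for example (these particular values are just an illustration, not tuned results):

params.update({
    'gamma': 0.9,       # weigh future rewards more heavily
    'lr': 5e-4,         # smaller learning rate for more stable updates
    'decay': 500,       # slower epsilon decay -> longer exploration phase
    'batch_size': 128,  # larger minibatches smooth the gradient estimates
})
agent = Agent(**params)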


