Custom Autograd Functions in Deep Learning

2023-05-01 08:07:33

References: link 1, link 2

Official example:

import torch
from torch.autograd import Variable


class MyReLU(torch.autograd.Function):
    """
    We can implement our own custom autograd Functions by subclassing
    torch.autograd.Function and implementing the forward and backward passes
    which operate on Tensors.
    """

    @staticmethod
    def forward(ctx, input):
        """
        In the forward pass we receive a Tensor containing the input and return
        a Tensor containing the output. ctx is a context object that can be used
        to stash information for backward computation. You can cache arbitrary
        objects for use in the backward pass using the ctx.save_for_backward method.

        """
        ctx.save_for_backward(input)  # ctx stashes the data the backward pass needs, i.e. the variables that backward() can read later.
        return input.clamp(min=0)

    @staticmethod
    def backward(ctx, grad_output):
        """
        In the backward pass we receive a Tensor containing the gradient of the loss
        with respect to the output, and we need to compute the gradient of the loss
        with respect to the input.
        """
        input, = ctx.saved_tensors
        grad_input = grad_output.clone()
        grad_input[input < 0] = 0
        return grad_input  # The backward pass computes the gradient; for parameters the network updates, this gradient is kept for the later update/optimizer step.


dtype = torch.FloatTensor
# dtype = torch.cuda.FloatTensor # Uncomment this to run on GPU

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold input and outputs, and wrap them in Variables.
x = Variable(torch.randn(N, D_in).type(dtype), requires_grad=False)
y = Variable(torch.randn(N, D_out).type(dtype), requires_grad=False)

# Create random Tensors for weights, and wrap them in Variables.
w1 = Variable(torch.randn(D_in, H).type(dtype), requires_grad=True)
w2 = Variable(torch.randn(H, D_out).type(dtype), requires_grad=True)

learning_rate = 1e-6
for t in range(500):
    # To apply our Function, we use Function.apply method. We alias this as 'relu'.
    relu = MyReLU.apply

    # Forward pass: compute predicted y using operations on Variables; we compute
    # ReLU using our custom autograd operation.
    y_pred = relu(x.mm(w1)).mm(w2)

    # Compute and print loss
    loss = (y_pred - y).pow(2).sum()
    print(t, loss.item())

    # Use autograd to compute the backward pass.
    loss.backward() # The backward pass computes and stores the gradients of the trainable parameters.

    # Update weights using gradient descent
    w1.data -= learning_rate * w1.grad.data
    w2.data -= learning_rate * w2.grad.data

    # Manually zero the gradients after updating weights
    w1.grad.data.zero_()  # zero the gradients after each update
    w2.grad.data.zero_()

Class inheritance:
Subclass torch.autograd.Function and define only the forward and backward functions. The forward pass is simply how your custom function computes its result; the backward pass computes the gradients. Treat ctx much like self: it is there to store whatever the backward pass needs, such as the forward output or the inputs.
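
For example (a minimal sketch; the MulConstant function and its constant argument are illustrative and not from the original post), non-tensor arguments can be stashed directly as attributes on ctx, and inputs that cannot have a gradient simply get None returned for them:

import torch


class MulConstant(torch.autograd.Function):
    """Multiply the input by a plain Python scalar; the scalar itself has no gradient."""

    @staticmethod
    def forward(ctx, tensor, constant):
        # ctx works much like `self`: plain Python values can be stored as attributes,
        # while tensors should go through ctx.save_for_backward.
        ctx.constant = constant
        return tensor * constant

    @staticmethod
    def backward(ctx, grad_output):
        # One gradient must be returned per forward input; `constant` is not a tensor,
        # so its gradient slot is simply None.
        return grad_output * ctx.constant, None


x = torch.randn(3, requires_grad=True)
MulConstant.apply(x, 2.0).sum().backward()
print(x.grad)  # tensor([2., 2., 2.])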

Custom Linear operation:

import torch
from torch.autograd import Function
import warnings
warnings.filterwarnings("ignore")



class LinearFunction1(Function):
    """ 描述:在pytorch中自定义一个操作,并定义它的梯度求法"""
    @staticmethod
    def forward(ctx, input, weight, bias=None):
        ctx.save_for_backward(input, weight, bias)  # shapes: input (n, m), weight (m, c_out)
        output = torch.mm(input, weight)            # (n, m) @ (m, c_out) -> (n, c_out)
        if bias is not None:
            # broadcasting would also work here: output += bias
            output += bias.unsqueeze(0).expand_as(output)
        return output

    @staticmethod
    def backward(ctx, grad_outputs):
        input, weight, bias = ctx.saved_tensors
        grad_input = grad_weight = grad_bias = None
        if ctx.needs_input_grad[0]:
            grad_input = grad_outputs @ weight.t()   # (n, c_out) @ (c_out, m) -> (n, m)
        if ctx.needs_input_grad[1]:
            grad_weight = input.t() @ grad_outputs   # (m, n) @ (n, c_out) -> (m, c_out)
        if bias is not None and ctx.needs_input_grad[2]:
            grad_bias = grad_outputs.sum(0)
        return grad_input, grad_weight, grad_bias

# Inherit from Function
class LinearFunction(Function):

    # Note that both forward and backward are @staticmethods
    @staticmethod
    # bias is an optional argument
    def forward(ctx, input, weight, bias=None):
        ctx.save_for_backward(input, weight, bias)
        output = input.mm(weight.t())  # 20,20; 30,20 -> 20,30
        if bias is not None:
            output += bias.unsqueeze(0).expand_as(output)
        return output

    # This function has only a single output, so it gets only one gradient
    @staticmethod
    def backward(ctx, grad_output):
        # This is a pattern that is very convenient - at the top of backward
        # unpack saved_tensors and initialize all gradients w.r.t. inputs to
        # None. Thanks to the fact that additional trailing Nones are
        # ignored, the return statement is simple even when the function has
        # optional inputs.
        input, weight, bias = ctx.saved_tensors
        grad_input = grad_weight = grad_bias = None

        # These needs_input_grad checks are optional and there only to
        # improve efficiency. If you want to make your code simpler, you can
        # skip them. Returning gradients for inputs that don't require it is
        # not an error.
        if ctx.needs_input_grad[0]:
            grad_input = grad_output.mm(weight)      # (20, 30) @ (30, 20) -> (20, 20)
        if ctx.needs_input_grad[1]:
            grad_weight = grad_output.t().mm(input)  # (30, 20) @ (20, 20) -> (30, 20)
        if bias is not None and ctx.needs_input_grad[2]:
            grad_bias = grad_output.sum(0)

        return grad_input, grad_weight, grad_bias

In other words, you just define the forward and backward passes and save whatever data you need into the context. The math inside the function does not have to be operations PyTorch supports; it only has to be something Python can compute (that is my personal understanding, at least).
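
As a small sketch of that point (this NumpyExp function is illustrative, not from the original post), the actual math in forward and backward can be done entirely in NumPy, as long as the input is pulled out of the graph and the result is wrapped back into a tensor:

import numpy as np
import torch
from torch.autograd import Function


class NumpyExp(Function):
    """exp(x), with the arithmetic done in NumPy rather than PyTorch."""

    @staticmethod
    def forward(ctx, input):
        # Step out of autograd: compute with NumPy, then convert back to a tensor.
        result = torch.from_numpy(np.exp(input.detach().numpy()))
        ctx.save_for_backward(result)  # d/dx exp(x) = exp(x), so save the output
        return result

    @staticmethod
    def backward(ctx, grad_output):
        result, = ctx.saved_tensors
        return grad_output * result


x = torch.randn(5, dtype=torch.double, requires_grad=True)
NumpyExp.apply(x).sum().backward()
print(torch.allclose(x.grad, torch.exp(x)))  # True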

Test whether the operation is correct:

from torch.autograd import gradcheck

linear = LinearFunction.apply   # give Function.apply an alias and run gradcheck on the alias
input = (torch.randn(20, 20, dtype=torch.double, requires_grad=True),
         torch.randn(30, 20, dtype=torch.double, requires_grad=True))
test = gradcheck(linear, input, eps=1e-6, atol=1e-4)
print(test)


linear = LinearFunction1.apply   # same check for the second implementation
input = (torch.randn(20, 20, dtype=torch.double, requires_grad=True),
         torch.randn(20, 30, dtype=torch.double, requires_grad=True))
test = gradcheck(linear, input, eps=1e-6, atol=1e-4)
print(test)

Both implementations return True; the only thing to pay attention to is the shape of the inputs your custom operation expects.
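
If you want the bias gradient checked as well (a hypothetical extension of the snippet above, reusing LinearFunction1), just append a third double-precision tensor to the input tuple; gradcheck numerically verifies every input that has requires_grad=True:

linear = LinearFunction1.apply
input = (torch.randn(20, 20, dtype=torch.double, requires_grad=True),   # input
         torch.randn(20, 30, dtype=torch.double, requires_grad=True),   # weight
         torch.randn(30, dtype=torch.double, requires_grad=True))       # bias
test = gradcheck(linear, input, eps=1e-6, atol=1e-4)
print(test)  # True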

Building a model with the custom operation:

import torch.nn as nn
class Linear(nn.Module):
    def __init__(self, input_features, output_features, bias=True):
        super(Linear,self).__init__()

        self.input_features = input_features
        self.output_features = output_features

        self.weight = nn.Parameter(torch.randn(input_features, output_features))
        if bias:
            self.bias = nn.Parameter(torch.randn(output_features))
        else:
            self.register_parameter("bias", None)

        nn.init.kaiming_uniform_(self.weight)
        if bias:
            # Kaiming init requires a 2-D tensor, so the 1-D bias gets a plain uniform init.
            nn.init.uniform_(self.bias, -0.1, 0.1)

    def forward(self, x):
        return LinearFunction1.apply(x, self.weight, self.bias)  # call the custom operation


A layer built on top of the custom operation can be differentiated automatically and have its parameters optimized just like any other layer.
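
For instance (a minimal, hypothetical training loop, assuming the corrected Linear module above), the layer drops straight into a model and is optimized like any built-in layer:

model = nn.Sequential(
    Linear(1000, 100),
    nn.ReLU(),
    Linear(100, 10),
)
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4)
criterion = nn.MSELoss()

x = torch.randn(64, 1000)
y = torch.randn(64, 10)
for step in range(100):
    optimizer.zero_grad()          # clear the previous gradients
    loss = criterion(model(x), y)  # forward pass runs through LinearFunction1
    loss.backward()                # backward pass runs our custom backward()
    optimizer.step()               # update the nn.Parameter weights and biases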

Non-differentiable case:

The linear transform above is a differentiable computation: the output can be expressed step by step through operations whose derivatives are known. What if you hit a case that is not differentiable, i.e. the derivative cannot be written out explicitly? Then you specify the gradient rule yourself, for example an approximate derivative, or simply another black-box function that stands in for the real gradient. Since it is a black box, the gradients of the intermediate values inside it can no longer be tracked, so the backward function needs a small change:

"""当某个操作是不可导的,但是你却用了近似的方法来代替。"""
from torch.autograd.function import once_differentiable

def un_differentibale_function(grad_output):
     "一些列不可导的神奇操作"
     grad_output_changed = None
     return  grad_output_changed


@staticmethod
@once_differentiable
def backward(ctx, grad_output):
    print(type(grad_output))
    grad_output_changed = un_differentibale_function(grad_output)
    grad_input = grad_output_changed
    return grad_input

What exactly does @once_differentiable mean? Roughly, it tells autograd that this backward itself is not differentiable: the incoming grad_output arrives as a detached tensor and no second (double) backward can be taken through the result, which is what you want for implicit or approximate gradients like the one above.
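
A classic concrete use of this pattern (the RoundSTE function below is hypothetical, not from the original post) is the straight-through estimator: the forward pass rounds its input, whose true derivative is zero almost everywhere, and the backward pass just passes the incoming gradient through unchanged as an approximation:

import torch
from torch.autograd import Function
from torch.autograd.function import once_differentiable


class RoundSTE(Function):
    """Round in forward; pretend the op was the identity when computing gradients."""

    @staticmethod
    def forward(ctx, input):
        return torch.round(input)

    @staticmethod
    @once_differentiable  # grad_output arrives detached; double backward is disallowed
    def backward(ctx, grad_output):
        # Straight-through estimator: approximate d(round)/dx by 1.
        return grad_output


x = torch.randn(4, requires_grad=True)
RoundSTE.apply(x).sum().backward()
print(x.grad)  # tensor([1., 1., 1., 1.])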

  • Author: 梅津太郎
  • Original post: https://blog.csdn.net/gaocui883/article/details/116101809