Importing utilities
import torch,math,functools
import matplotlib.pyplot as plt
from functools import partial
import pdb
from tinyai.datasets import *
from tinyai.conv import *
from tinyai.learner import *
from tinyai.activations import *
from tinyai.init import *
from tinyai.sgd import *
from datasets import load_dataset
import torchvision.transforms.functional as TF,torch.nn.functional as F
from torch import tensor,nn,optim
import fastcore.all as fc
from torch.optim import lr_scheduler
from torcheval.metrics import MulticlassAccuracy
x = torch.linspace(0, 10, 10)
lr = 5
print(x, math.pi)
tensor([ 0.0000, 1.1111, 2.2222, 3.3333, 4.4444, 5.5556, 6.6667, 7.7778,
8.8889, 10.0000]) 3.141592653589793
How we want our learning rate to look:
def plot_thing(f, lr, steps):
    x = torch.linspace(0, math.pi, steps)
    plt.plot(x, (f(x) + 1) / 2 * lr)

plot_thing(partial(torch.cos), lr, steps=100)
Let's try it in the learner.
Importing and transforming the dataset
xl, yl = 'image', 'label'  # x label, y label
name = "fashion_mnist"
bs = 1024
xmean, xstd = 0.28, 0.35

@inplace
def transformi(b): b[xl] = [(TF.to_tensor(o) - xmean) / xstd for o in b[xl]]

dsd = load_dataset(name)
tds = dsd.with_transform(transformi)
dls = DataLoaders.from_dd(tds, bs, num_workers=4)
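As a quick sanity check (a sketch, assuming the tinyai DataLoaders yields (x, y) batches the way the course's miniai does), we can pull one batch and confirm the shapes and normalization:
# Sketch: pull one batch and check shape/normalization (assumes dls.train yields (x, y) tuples)
xb, yb = next(iter(dls.train))
print(xb.shape, yb.shape)   # expect roughly [1024, 1, 28, 28] and [1024]
print(xb.mean(), xb.std())  # should be close to 0 and 1 after normalization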
CosineAnnealingLR
First Version.
A Cosine Annealing LR implementation from scratch, which later had to be updated for OneCycleLR.
This version might be a little faster, but it takes more memory (not tested).
First Version.
class CosAnnLR():
    def __init__(self, tmax, optim):
        self.optim = optim
        self.tmax = tmax
        self.lr = optim.param_groups[0]['lr']
        self.values = self._init_values()  # precompute the whole schedule up front
        self.cur_step = 0
    def _init_values(self):
        return (torch.cos(torch.linspace(0, math.pi, self.tmax)) + 1) / 2 * self.lr
    def step(self):
        self.optim.param_groups[0]['lr'] = self.values[self.cur_step]
        self.cur_step += 1
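To put a rough number on the memory claim above (a sketch with a throwaway optimizer): the precomputed schedule is just a 1-D float tensor of length tmax, so it stays tiny for typical values of tmax.
# Sketch: size of the precomputed schedule, using a dummy optimizer
opt = torch.optim.SGD([torch.zeros(1, requires_grad=True)], lr=5e-3)
sched = CosAnnLR(tmax=500, optim=opt)
print(sched.values.nelement() * sched.values.element_size(), "bytes")  # 500 float32s, about 2 KB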
Second Version
CosineAnnealingLR implementation from scratch.
Second Version.
class CosAnnLR():
    def __init__(self, tmax, optim):
        self.optim = optim
        self.lr = optim.param_groups[0]['lr']
        self.tmax = tmax
        self.cur_step = 0
    def step(self):
        # compute the LR on the fly instead of precomputing the schedule
        self.optim.param_groups[0]['lr'] = (math.cos(self.cur_step / self.tmax * math.pi) + 1) / 2 * self.lr
        self.cur_step += 1
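A minimal sketch of the scheduler in isolation (using a dummy optimizer): the LR should start at the base LR and approach 0 as cur_step approaches tmax.
# Sketch: step the scheduler against a dummy optimizer and inspect the resulting LRs
opt = torch.optim.SGD([torch.zeros(1, requires_grad=True)], lr=1e-2)
sched = CosAnnLR(tmax=100, optim=opt)
lrs = []
for _ in range(100):
    sched.step()
    lrs.append(opt.param_groups[0]['lr'])
print(lrs[0], lrs[-1])  # ~1e-2 at the start, ~0 at the end
plt.plot(lrs)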
def _lr(cb): return cb.pg['lr']  # used by RecorderCB to record the LR during training
Preparing the learner for training.
Code for learner.
act_gr = partial(GeneralRelu, leak=0.1, sub=0.4)
metrics = MetricsCB(accuracy=MulticlassAccuracy())
astats = ActivationStats(fc.risinstance(GeneralRelu))
cbs = [DeviceCB(), metrics, ProgressCB(plot=True), astats]
iw = partial(init_weights, leaky=0.1)
set_seed(42)

lr, epochs = 1e-2, 5
model = get_model(act_gr, norm=nn.BatchNorm2d).apply(iw)
tmax = epochs * len(dls.train)
sched = partial(CosAnnLR, tmax)
# sched = partial(lr_scheduler.CosineAnnealingLR, T_max=tmax)  # testing if it works with PyTorch's CosineAnnealingLR
record = RecorderCB(lr=_lr)
xtra = [BatchSchedCB(sched), record]
learn = TrainLearner(model, dls, F.cross_entropy, lr=lr, cbs=cbs+xtra, opt_func=optim.AdamW)
learn.fit(epochs)
accuracy  loss   epoch  train
0.806     0.529  0      train
0.853     0.404  0      eval
0.876     0.338  1      train
0.872     0.349  1      eval
0.892     0.295  2      train
0.882     0.326  2      eval
0.904     0.264  3      train
0.887     0.316  3      eval
0.910     0.248  4      train
0.887     0.310  4      eval
Plot of learning rate throughout the learning process
CosineAnnealing Summary.
After creating my own CosineAnnealing scheduler,
I decided to look for the paper where it was introduced, and I found this paper,
where we can find this equation: \[ \eta_{t} = \eta_{min}^{i} + \frac{1}{2}\left(\eta_{max}^{i}-\eta_{min}^{i}\right)\left(1+\cos\left(\frac{T_{cur}}{T_{i}}\pi\right)\right) \]
If we compare it to our code, it looks completely different at first.
(math.cos(cur_step / tmax * math.pi) + 1) / 2 * lr
But if we read the paper further, the η and T can be translated into our code, where: \[ \eta \text{ (eta) is the learning rate} \] \[ T_{cur} \text{ is the current step} \] \[ T_{i} \text{ is our } tmax \]
\[ lr_{t} = lr_{min} + \frac{1}{2}\left(lr_{max}-lr_{min}\right)\left(1+\cos\left(\frac{\text{curstep}}{tmax}\pi\right)\right) \]
The paper's equation introduces a minimum and a maximum learning rate, hence the difference, but the rest is the same.
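We can also confirm the reduction numerically (a quick sketch): with lr_min = 0 and lr_max = lr, the paper's formula collapses to the one-liner above.
# Quick numeric check: the paper's formula with lr_min=0, lr_max=lr matches our one-liner
lr_min, lr_max, tmax = 0.0, 1e-2, 100
for cur_step in range(tmax):
    paper = lr_min + 0.5 * (lr_max - lr_min) * (1 + math.cos(cur_step / tmax * math.pi))
    ours  = (math.cos(cur_step / tmax * math.pi) + 1) / 2 * lr_max
    assert abs(paper - ours) < 1e-12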
OneCycleLR
CLR should specify minimum and maximum learning rate boundaries and a step_size, but this implementation doesn't do that.
Adding a minimum and maximum should be pretty straightforward, though.
You also might want to add a 3rd phase where the learning rate is held at its maximum for 5-10% of the training.
class OneCycleLR:
    '''
    This version of OneCycle was created before looking up the CosineAnnealing paper.
    '''
    def __init__(self, tmax, optim, warm_up: float = 0.30):
        self.optim = optim
        self.initial_lr = self.optim.param_groups[0]['lr']
        self.beta, self.beta_2 = self.optim.param_groups[0]['betas']
        self.max_beta, self.min_beta = self.beta + 0.05, self.beta - 0.05
        self.warm_up = warm_up
        self.warm_up_steps = int(tmax * self.warm_up)
        self.annealing_steps = tmax - self.warm_up_steps
        self.cur_step = 0
    def get_beta(self, phase: float, warming_up: bool):
        # beta1 moves opposite to the LR: max -> min while warming up, min -> max while annealing
        if warming_up:
            return self.min_beta + (self.max_beta - self.min_beta) * ((math.cos(math.pi * phase) + 1) / 2)
        else:
            return self.max_beta + (self.min_beta - self.max_beta) * ((math.cos(math.pi * phase) + 1) / 2)
    def step(self):
        if self.cur_step <= self.warm_up_steps:
            # warm-up phase: increasing learning rate
            phase = self.cur_step / self.warm_up_steps
            adjusted_lr = (math.cos(phase * math.pi + math.pi) + 1) / 2 * self.initial_lr
            adjusted_beta = self.get_beta(phase, warming_up=True)
        else:
            # annealing phase: decreasing learning rate
            phase = (self.cur_step - self.warm_up_steps) / self.annealing_steps
            adjusted_lr = (math.cos(phase * math.pi) + 1) / 2 * self.initial_lr
            adjusted_beta = self.get_beta(phase, warming_up=False)
        self.optim.param_groups[0]['lr'] = adjusted_lr
        self.optim.param_groups[0]['betas'] = (adjusted_beta, self.beta_2)
        self.cur_step += 1
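To see the one-cycle shape outside of training (a sketch with a throwaway AdamW optimizer), we can step the scheduler for tmax steps and plot the LR and beta1 it writes into the param group:
# Sketch: visualize the schedule with a dummy AdamW optimizer (300 steps, 30% warm-up)
opt = torch.optim.AdamW([torch.zeros(1, requires_grad=True)], lr=1e-2)
sched = OneCycleLR(tmax=300, optim=opt, warm_up=0.30)
lrs, betas = [], []
for _ in range(300):
    sched.step()
    lrs.append(opt.param_groups[0]['lr'])
    betas.append(opt.param_groups[0]['betas'][0])
fig, axs = plt.subplots(1, 2, figsize=(10, 3))
axs[0].plot(lrs);   axs[0].set_title('lr')
axs[1].plot(betas); axs[1].set_title('beta1')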
def _beta1(cb): return cb.pg['betas'][0]  # used by RecorderCB to record beta1 (momentum) during training
rec = RecorderCB(lr=_lr, mom=_beta1)
Preparing the learner for training.
Code for learner.
act_gr = partial(GeneralRelu, leak=0.1, sub=0.4)
metrics = MetricsCB(accuracy=MulticlassAccuracy())
astats = ActivationStats(fc.risinstance(GeneralRelu))
cbs = [DeviceCB(), metrics, ProgressCB(plot=True), astats]
iw = partial(init_weights, leaky=0.1)
set_seed(42)

lr, epochs = 1e-2, 5
model = get_model(act_gr, norm=nn.BatchNorm2d).apply(iw)
tmax = epochs * len(dls.train)
sched = partial(OneCycleLR, tmax)
# sched = partial(lr_scheduler.OneCycleLR, max_lr=lr, total_steps=tmax)  # testing if it works with PyTorch's OneCycleLR
record = RecorderCB(lr=_lr, mom=_beta1)
xtra = [BatchSchedCB(sched), record]
learn = TrainLearner(model, dls, F.cross_entropy, lr=lr, cbs=cbs+xtra, opt_func=optim.AdamW)
learn.fit(epochs)
accuracy  loss   epoch  train
0.723     0.827  0      train
0.822     0.485  0      eval
0.860     0.386  1      train
0.864     0.368  1      eval
0.887     0.310  2      train
0.877     0.338  2      eval
0.902     0.268  3      train
0.882     0.316  3      eval
0.912     0.242  4      train
0.888     0.303  4      eval
Note: If you happen to know why the learning doesn't go smoothly at the beginning, you can DM me on Discord @afterhoursbilly.
Plot of Learning Rate and Momentum throughout the learning process
OneCycle Summary
Inspired by this paper & the fast.ai 2022 part 2 course.
This CLR implements minimum and maximum learning rate boundaries.
We could also add a phase where the learning rate is held at its maximum for 5-10% of the training, roughly like the sketch below.
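A rough sketch of what that extra phase could look like (the hold parameter here is hypothetical and is not part of the implementation that follows): hold the LR at its peak for a fraction of the steps between warm-up and annealing.
# Hypothetical sketch: LR schedule with an extra "hold at max LR" phase (hold is an assumed parameter)
def three_phase_lr(cur_step, tmax, lr_max, lr_min, warm_up=0.3, hold=0.1):
    warm_steps, hold_steps = int(tmax * warm_up), int(tmax * hold)
    if cur_step < warm_steps:                   # phase 1: warm up from lr_min to lr_max
        phase = cur_step / warm_steps
        return lr_min + (lr_max - lr_min) * (1 - math.cos(math.pi * phase)) / 2
    if cur_step < warm_steps + hold_steps:      # phase 2: hold at lr_max
        return lr_max
    phase = (cur_step - warm_steps - hold_steps) / (tmax - warm_steps - hold_steps)
    return lr_min + (lr_max - lr_min) * (1 + math.cos(math.pi * phase)) / 2  # phase 3: anneal back down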
class OneCycleLR:
    '''
    Modified version after looking up the papers.
    '''
    def __init__(self, tmax, optim, warm_up: float = 0.30):
        self.optim = optim
        self.initial_lr = self.optim.param_groups[0]['lr']
        self.min_lr = self.initial_lr / 20  # fixed: '//' (floor division) would make this 0
        self.beta, self.beta_2 = self.optim.param_groups[0]['betas']
        self.max_beta, self.min_beta = self.beta + 0.05, self.beta - 0.05
        self.warm_up = warm_up
        self.warm_up_steps = int(tmax * self.warm_up)
        self.annealing_steps = tmax - self.warm_up_steps
        self.cur_step = 0
    def cosine_annealing(self, phase, min, max):
        # interpolates from `max` at phase=0 to `min` at phase=1
        return min + (max - min) * ((math.cos(math.pi * phase) + 1) / 2)
    def step(self):
        if self.cur_step <= self.warm_up_steps:
            # warm-up phase: LR goes min_lr -> initial_lr, beta1 goes max_beta -> min_beta
            phase = self.cur_step / self.warm_up_steps
            adjusted_lr = self.cosine_annealing(phase, self.initial_lr, self.min_lr)
            adjusted_beta = self.cosine_annealing(phase, self.min_beta, self.max_beta)
        else:
            # annealing phase: LR goes initial_lr -> min_lr, beta1 goes min_beta -> max_beta
            phase = (self.cur_step - self.warm_up_steps) / self.annealing_steps
            adjusted_lr = self.cosine_annealing(phase, self.min_lr, self.initial_lr)
            adjusted_beta = self.cosine_annealing(phase, self.max_beta, self.min_beta)
        self.optim.param_groups[0]['lr'] = adjusted_lr
        self.optim.param_groups[0]['betas'] = (adjusted_beta, self.beta_2)
        self.cur_step += 1
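A quick check of the new boundaries (again a sketch with a dummy optimizer): the LR should now bottom out around initial_lr / 20 and peak at initial_lr at the end of the warm-up.
# Sketch: confirm the min/max LR boundaries of the modified scheduler
opt = torch.optim.AdamW([torch.zeros(1, requires_grad=True)], lr=1e-2)
sched = OneCycleLR(tmax=200, optim=opt, warm_up=0.30)
lrs = []
for _ in range(200):
    sched.step()
    lrs.append(opt.param_groups[0]['lr'])
print(min(lrs), max(lrs))  # roughly 1e-2 / 20 and 1e-2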
lr, epochs = 1e-2, 5
model = get_model(act_gr, norm=nn.BatchNorm2d).apply(iw)
tmax = epochs * len(dls.train)
sched = partial(OneCycleLR, tmax)
record = RecorderCB(lr=_lr, mom=_beta1)
xtra = [BatchSchedCB(sched), record]
learn = TrainLearner(model, dls, F.cross_entropy, lr=lr, cbs=cbs+xtra, opt_func=optim.AdamW)
learn.fit(epochs)
accuracy  loss   epoch  train
0.696     0.921  0      train
0.825     0.476  0      eval
0.857     0.391  1      train
0.861     0.385  1      eval
0.884     0.317  2      train
0.875     0.348  2      eval
0.900     0.272  3      train
0.882     0.322  3      eval
0.913     0.241  4      train
0.886     0.315  4      eval