Importing utilities
import torch,math,functools
import matplotlib.pyplot as plt
from functools import partial
import pdb
from tinyai.datasets import *
from tinyai.conv import *
from tinyai.learner import *
from tinyai.activations import *
from tinyai.init import *
from tinyai.sgd import *
from datasets import load_dataset
import torchvision.transforms.functional as TF,torch.nn.functional as F
from torch import tensor,nn,optim
import fastcore.all as fc
from torch.optim import lr_scheduler
from torcheval.metrics import MulticlassAccuracy
x = torch.linspace(0, 10, 10)
lr = 5
print(x, math.pi)
tensor([ 0.0000, 1.1111, 2.2222, 3.3333, 4.4444, 5.5556, 6.6667, 7.7778,
8.8889, 10.0000]) 3.141592653589793
How we want our learning rate to look:
def plot_thing(f, lr, steps):
    x = torch.linspace(0, math.pi, steps)
    plt.plot(x, (f(x) + 1) / 2 * lr)

plot_thing(partial(torch.cos), lr, steps=100)
Let's try it in the learner.
Importing and transforming the dataset
xl, yl = 'image', 'label'  # x label, y label
name = "fashion_mnist"
bs = 1024
xmean, xstd = 0.28, 0.35

@inplace
def transformi(b): b[xl] = [(TF.to_tensor(o) - xmean) / xstd for o in b[xl]]

dsd = load_dataset(name)
tds = dsd.with_transform(transformi)
dls = DataLoaders.from_dd(tds, bs, num_workers=4)
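As a quick sanity check (a sketch, assuming the tinyai DataLoaders yields (x, y) batches the way the course's miniai does), we can pull one batch and confirm the shapes and normalization:
# Sketch: pull one batch and check shape/normalization (assumes dls.train yields (x, y) tuples)
xb, yb = next(iter(dls.train))
print(xb.shape, yb.shape)   # expect roughly [1024, 1, 28, 28] and [1024]
print(xb.mean(), xb.std())  # should be close to 0 and 1 after normalization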
CosineAnnealingLR
First Version.
A Cosine Annealing LR implementation from scratch, which later had to be updated for OneCycleLR.
This version might be a little faster, but it takes more memory (not tested).
First Version.
class CosAnnLR():
    def __init__(self, tmax, optim):
        self.optim = optim
        self.tmax = tmax
        self.lr = optim.param_groups[0]['lr']
        self.values = self._init_values()  # precompute the whole schedule up front
        self.cur_step = 0
    def _init_values(self):
        return (torch.cos(torch.linspace(0, math.pi, self.tmax)) + 1) / 2 * self.lr
    def step(self):
        self.optim.param_groups[0]['lr'] = self.values[self.cur_step]
        self.cur_step += 1
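To put a rough number on the memory claim above (a sketch with a throwaway optimizer): the precomputed schedule is just a 1-D float tensor of length tmax, so it stays tiny for typical values of tmax.
# Sketch: size of the precomputed schedule, using a dummy optimizer
opt = torch.optim.SGD([torch.zeros(1, requires_grad=True)], lr=5e-3)
sched = CosAnnLR(tmax=500, optim=opt)
print(sched.values.nelement() * sched.values.element_size(), "bytes")  # 500 float32s, about 2 KB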
Second Version
CosineAnnealingLR implementation from scratch.
Second Version.
class CosAnnLR():
    def __init__(self, tmax, optim):
        self.optim = optim
        self.lr = optim.param_groups[0]['lr']
        self.tmax = tmax
        self.cur_step = 0
    def step(self):
        # compute the LR on the fly instead of precomputing the schedule
        self.optim.param_groups[0]['lr'] = (math.cos(self.cur_step / self.tmax * math.pi) + 1) / 2 * self.lr
        self.cur_step += 1
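A minimal sketch of the scheduler in isolation (using a dummy optimizer): the LR should start at the base LR and approach 0 as cur_step approaches tmax.
# Sketch: step the scheduler against a dummy optimizer and inspect the resulting LRs
opt = torch.optim.SGD([torch.zeros(1, requires_grad=True)], lr=1e-2)
sched = CosAnnLR(tmax=100, optim=opt)
lrs = []
for _ in range(100):
    sched.step()
    lrs.append(opt.param_groups[0]['lr'])
print(lrs[0], lrs[-1])  # ~1e-2 at the start, ~0 at the end
plt.plot(lrs)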
def _lr(cb): return cb.pg['lr']  # used by RecorderCB to record the LR during training
Preparing the learner for training.
Code for learner.
act_gr = partial(GeneralRelu, leak=0.1, sub=0.4)
metrics = MetricsCB(accuracy=MulticlassAccuracy())
astats = ActivationStats(fc.risinstance(GeneralRelu))
cbs = [DeviceCB(), metrics, ProgressCB(plot=True), astats]
iw = partial(init_weights, leaky=0.1)
set_seed(42)

lr, epochs = 1e-2, 5
model = get_model(act_gr, norm=nn.BatchNorm2d).apply(iw)
tmax = epochs * len(dls.train)
sched = partial(CosAnnLR, tmax)
# sched = partial(lr_scheduler.CosineAnnealingLR, T_max=tmax)  # testing if it works with PyTorch's CosineAnnealingLR
record = RecorderCB(lr=_lr)
xtra = [BatchSchedCB(sched), record]
learn = TrainLearner(model, dls, F.cross_entropy, lr=lr, cbs=cbs+xtra, opt_func=optim.AdamW)
learn.fit(epochs)
accuracy  loss   epoch  train
0.806     0.529  0      train
0.853     0.404  0      eval
0.876     0.338  1      train
0.872     0.349  1      eval
0.892     0.295  2      train
0.882     0.326  2      eval
0.904     0.264  3      train
0.887     0.316  3      eval
0.910     0.248  4      train
0.887     0.310  4      eval
Plot of learning rate throughout the learning process
CosineAnnealing Summary.
After creating my own CosineAnnealing scheduler,
I decided to look for the paper where it was introduced, and I found this paper,
where we can find this equation: \[ \eta_{t} = \eta_{min}^{i} + \frac{1}{2}\left(\eta_{max}^{i}-\eta_{min}^{i}\right)\left(1+\cos\left(\frac{T_{cur}}{T_{i}}\pi\right)\right) \]
If we compare it to our code, it looks completely different at first.
(math.cos(cur_step / tmax * math.pi) + 1) / 2 * lr
But if we read the paper further, the η and T can be translated into our code, where: \[ \eta \text{ (eta) is the learning rate} \] \[ T_{cur} \text{ is the current step} \] \[ T_{i} \text{ is our } tmax \]
\[ lr_{t} = lr_{min} + \frac{1}{2}\left(lr_{max}-lr_{min}\right)\left(1+\cos\left(\frac{\text{curstep}}{tmax}\pi\right)\right) \]
The paper's equation introduces a minimum and a maximum learning rate, hence the difference, but the rest is the same.
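We can also confirm the reduction numerically (a quick sketch): with lr_min = 0 and lr_max = lr, the paper's formula collapses to the one-liner above.
# Quick numeric check: the paper's formula with lr_min=0, lr_max=lr matches our one-liner
lr_min, lr_max, tmax = 0.0, 1e-2, 100
for cur_step in range(tmax):
    paper = lr_min + 0.5 * (lr_max - lr_min) * (1 + math.cos(cur_step / tmax * math.pi))
    ours  = (math.cos(cur_step / tmax * math.pi) + 1) / 2 * lr_max
    assert abs(paper - ours) < 1e-12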
OneCycleLR
CLR should specify minimum and maximum learning rate boundaries and a step_size, but this implementation doesn't do that.
Adding a minimum and maximum should be pretty straightforward, though.
You also might want to add a 3rd phase where the learning rate is held at its maximum for 5-10% of the training.
class OneCycleLR:
    '''
    This version of OneCycle was created before looking up the CosineAnnealing paper.
    '''
    def __init__(self, tmax, optim, warm_up: float = 0.30):
        self.optim = optim
        self.initial_lr = self.optim.param_groups[0]['lr']
        self.beta, self.beta_2 = self.optim.param_groups[0]['betas']
        self.max_beta, self.min_beta = self.beta + 0.05, self.beta - 0.05
        self.warm_up = warm_up
        self.warm_up_steps = int(tmax * self.warm_up)
        self.annealing_steps = tmax - self.warm_up_steps
        self.cur_step = 0
    def get_beta(self, phase: float, warming_up: bool):
        # beta1 moves opposite to the LR: max -> min while warming up, min -> max while annealing
        if warming_up:
            return self.min_beta + (self.max_beta - self.min_beta) * ((math.cos(math.pi * phase) + 1) / 2)
        else:
            return self.max_beta + (self.min_beta - self.max_beta) * ((math.cos(math.pi * phase) + 1) / 2)
    def step(self):
        if self.cur_step <= self.warm_up_steps:
            # warm-up phase: increasing learning rate
            phase = self.cur_step / self.warm_up_steps
            adjusted_lr = (math.cos(phase * math.pi + math.pi) + 1) / 2 * self.initial_lr
            adjusted_beta = self.get_beta(phase, warming_up=True)
        else:
            # annealing phase: decreasing learning rate
            phase = (self.cur_step - self.warm_up_steps) / self.annealing_steps
            adjusted_lr = (math.cos(phase * math.pi) + 1) / 2 * self.initial_lr
            adjusted_beta = self.get_beta(phase, warming_up=False)
        self.optim.param_groups[0]['lr'] = adjusted_lr
        self.optim.param_groups[0]['betas'] = (adjusted_beta, self.beta_2)
        self.cur_step += 1
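To see the one-cycle shape outside of training (a sketch with a throwaway AdamW optimizer), we can step the scheduler for tmax steps and plot the LR and beta1 it writes into the param group:
# Sketch: visualize the schedule with a dummy AdamW optimizer (300 steps, 30% warm-up)
opt = torch.optim.AdamW([torch.zeros(1, requires_grad=True)], lr=1e-2)
sched = OneCycleLR(tmax=300, optim=opt, warm_up=0.30)
lrs, betas = [], []
for _ in range(300):
    sched.step()
    lrs.append(opt.param_groups[0]['lr'])
    betas.append(opt.param_groups[0]['betas'][0])
fig, axs = plt.subplots(1, 2, figsize=(10, 3))
axs[0].plot(lrs);   axs[0].set_title('lr')
axs[1].plot(betas); axs[1].set_title('beta1')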
def _beta1(cb): return cb.pg['betas'][0]  # used by RecorderCB to record beta1 (momentum) during training
rec = RecorderCB(lr=_lr, mom=_beta1)
Preparing the learner for training.
Code for learner.
act_gr = partial(GeneralRelu, leak=0.1, sub=0.4)
metrics = MetricsCB(accuracy=MulticlassAccuracy())
astats = ActivationStats(fc.risinstance(GeneralRelu))
cbs = [DeviceCB(), metrics, ProgressCB(plot=True), astats]
iw = partial(init_weights, leaky=0.1)
set_seed(42)

lr, epochs = 1e-2, 5
model = get_model(act_gr, norm=nn.BatchNorm2d).apply(iw)
tmax = epochs * len(dls.train)
sched = partial(OneCycleLR, tmax)
# sched = partial(lr_scheduler.OneCycleLR, max_lr=lr, total_steps=tmax)  # testing if it works with PyTorch's OneCycleLR
record = RecorderCB(lr=_lr, mom=_beta1)
xtra = [BatchSchedCB(sched), record]
learn = TrainLearner(model, dls, F.cross_entropy, lr=lr, cbs=cbs+xtra, opt_func=optim.AdamW)
learn.fit(epochs)
accuracy  loss   epoch  train
0.723     0.827  0      train
0.822     0.485  0      eval
0.860     0.386  1      train
0.864     0.368  1      eval
0.887     0.310  2      train
0.877     0.338  2      eval
0.902     0.268  3      train
0.882     0.316  3      eval
0.912     0.242  4      train
0.888     0.303  4      eval
Note: If you happen to know why the learning doesn't go smoothly at the beginning, you can DM me on Discord @afterhoursbilly.
Plot of Learning Rate and Momentum throughout the learning process
OneCycle Summary
Inspired by this paper & the fast.ai 2022 part 2 course.
This CLR implements minimum and maximum learning rate boundaries.
We could also add a phase where the learning rate is held at its maximum for 5-10% of the training, roughly like the sketch below.
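A rough sketch of what that extra phase could look like (the hold parameter here is hypothetical and is not part of the implementation that follows): hold the LR at its peak for a fraction of the steps between warm-up and annealing.
# Hypothetical sketch: LR schedule with an extra "hold at max LR" phase (hold is an assumed parameter)
def three_phase_lr(cur_step, tmax, lr_max, lr_min, warm_up=0.3, hold=0.1):
    warm_steps, hold_steps = int(tmax * warm_up), int(tmax * hold)
    if cur_step < warm_steps:                   # phase 1: warm up from lr_min to lr_max
        phase = cur_step / warm_steps
        return lr_min + (lr_max - lr_min) * (1 - math.cos(math.pi * phase)) / 2
    if cur_step < warm_steps + hold_steps:      # phase 2: hold at lr_max
        return lr_max
    phase = (cur_step - warm_steps - hold_steps) / (tmax - warm_steps - hold_steps)
    return lr_min + (lr_max - lr_min) * (1 + math.cos(math.pi * phase)) / 2  # phase 3: anneal back down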
class OneCycleLR:
    '''
    Modified version after looking up the papers.
    '''
    def __init__(self, tmax, optim, warm_up: float = 0.30):
        self.optim = optim
        self.initial_lr = self.optim.param_groups[0]['lr']
        self.min_lr = self.initial_lr / 20  # fixed: '//' (floor division) would make this 0
        self.beta, self.beta_2 = self.optim.param_groups[0]['betas']
        self.max_beta, self.min_beta = self.beta + 0.05, self.beta - 0.05
        self.warm_up = warm_up
        self.warm_up_steps = int(tmax * self.warm_up)
        self.annealing_steps = tmax - self.warm_up_steps
        self.cur_step = 0
    def cosine_annealing(self, phase, min, max):
        # interpolates from `max` at phase=0 to `min` at phase=1
        return min + (max - min) * ((math.cos(math.pi * phase) + 1) / 2)
    def step(self):
        if self.cur_step <= self.warm_up_steps:
            # warm-up phase: LR goes min_lr -> initial_lr, beta1 goes max_beta -> min_beta
            phase = self.cur_step / self.warm_up_steps
            adjusted_lr = self.cosine_annealing(phase, self.initial_lr, self.min_lr)
            adjusted_beta = self.cosine_annealing(phase, self.min_beta, self.max_beta)
        else:
            # annealing phase: LR goes initial_lr -> min_lr, beta1 goes min_beta -> max_beta
            phase = (self.cur_step - self.warm_up_steps) / self.annealing_steps
            adjusted_lr = self.cosine_annealing(phase, self.min_lr, self.initial_lr)
            adjusted_beta = self.cosine_annealing(phase, self.max_beta, self.min_beta)
        self.optim.param_groups[0]['lr'] = adjusted_lr
        self.optim.param_groups[0]['betas'] = (adjusted_beta, self.beta_2)
        self.cur_step += 1
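A quick check of the new boundaries (again a sketch with a dummy optimizer): the LR should now bottom out around initial_lr / 20 and peak at initial_lr at the end of the warm-up.
# Sketch: confirm the min/max LR boundaries of the modified scheduler
opt = torch.optim.AdamW([torch.zeros(1, requires_grad=True)], lr=1e-2)
sched = OneCycleLR(tmax=200, optim=opt, warm_up=0.30)
lrs = []
for _ in range(200):
    sched.step()
    lrs.append(opt.param_groups[0]['lr'])
print(min(lrs), max(lrs))  # roughly 1e-2 / 20 and 1e-2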
lr, epochs = 1e-2, 5
model = get_model(act_gr, norm=nn.BatchNorm2d).apply(iw)
tmax = epochs * len(dls.train)
sched = partial(OneCycleLR, tmax)
record = RecorderCB(lr=_lr, mom=_beta1)
xtra = [BatchSchedCB(sched), record]
learn = TrainLearner(model, dls, F.cross_entropy, lr=lr, cbs=cbs+xtra, opt_func=optim.AdamW)
learn.fit(epochs)
accuracy  loss   epoch  train
0.696     0.921  0      train
0.825     0.476  0      eval
0.857     0.391  1      train
0.861     0.385  1      eval
0.884     0.317  2      train
0.875     0.348  2      eval
0.900     0.272  3      train
0.882     0.322  3      eval
0.913     0.241  4      train
0.886     0.315  4      eval