This notebook demonstrates how to finetune the CLIP ViT-B/32 model on a single GPU.
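Assumed setup (not shown in the original notebook): it depends on fastai, the self_supervised library, and OpenAI's CLIP package. Something like the following should install them; verify package names and versions against your environment.
# Assumed install commands:
# !pip install fastai self-supervised
# !pip install git+https://github.com/openai/CLIP.git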
from fastai.vision.all import *
from fastai.distributed import *
from self_supervised.multimodal.clip import *
import clip
# from zero_optimizer import ZeroRedundancyOptimizer # only works with multi-gpu / distributed training
Download COCO 2014 Train, Valid and Annotations
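If the data is not already on disk, the official COCO 2014 archives can be fetched as sketched below. The ../../../coco/ destination is an assumption chosen to match the paths used in the next cell.
# Hypothetical download cell; adjust the destination to your setup.
# !mkdir -p ../../../coco
# !wget -P ../../../coco http://images.cocodataset.org/zips/train2014.zip
# !wget -P ../../../coco http://images.cocodataset.org/zips/val2014.zip
# !wget -P ../../../coco http://images.cocodataset.org/annotations/annotations_trainval2014.zip
# !cd ../../../coco && unzip -q train2014.zip && unzip -q val2014.zip && unzip -q annotations_trainval2014.zip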
trainpath = Path("../../../coco/train2014/")
validpath = Path("../../../coco/val2014/")
annospath = Path("../../../coco/annotations/")
train_images = get_image_files(trainpath)
valid_images = get_image_files(validpath)
len(train_images), len(valid_images)
caption_paths = annospath.ls().filter(lambda o: 'captions' in o.name)
fn2captions = {}
for p in caption_paths:
    caps = json.loads(open(p).read())
    # Map image id -> filename, then filename -> caption (keeps the last
    # caption when an image has multiple annotations).
    id2fn = {o['id']: o['file_name'] for o in caps['images']}
    fn2cap = {id2fn[o['image_id']]: o['caption'] for o in caps['annotations']}
    fn2captions.update(fn2cap)
assert len(fn2captions) == (len(train_images) + len(valid_images))
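A quick, optional sanity check that the mapping looks right:
fn2captions[train_images[0].name]  # should return a single caption string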
# Use a subset of the images to keep this demo fast
all_images = train_images[:10000] + valid_images[:5000]; len(all_images), len(fn2captions)
def read_image(fn): return PILImage.create(fn)
def read_text(fn): return fn2captions[fn.name]
def dummy_targ(o): return 0  # placeholder target; the CLIPTrainer callback computes the real loss
clip_stats = ([0.48145466, 0.4578275, 0.40821073], [0.26862954, 0.26130258, 0.27577711])
clip_tokenizer = ClipTokenizer()
size,bs = 224,128
split_func = lambda fn: "val2014" in str(fn)
dsets = Datasets(all_images, tfms=[read_image, read_text, dummy_targ],
                 n_inp=2,
                 splits=FuncSplitter(split_func)(all_images))
item_tfms = [RandomResizedCrop(size, min_scale=0.9), clip_tokenizer, ToTensor()]
batch_tfms = [IntToFloatTensor, Normalize.from_stats(*clip_stats)]
train_dl = TfmdDL(dsets.train, shuffle=True, bs=bs, after_item=item_tfms, after_batch=batch_tfms, drop_last=True)
valid_dl = TfmdDL(dsets.valid, shuffle=False, bs=bs*2, after_item=item_tfms, after_batch=batch_tfms)
dls = DataLoaders(train_dl, valid_dl, device=default_device())
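Optionally, grab one batch to confirm the shapes of the image tensor and the tokenized text:
xb_im, xb_txt, yb = dls.one_batch()
xb_im.shape, xb_txt.shape  # should be (bs, 3, 224, 224) and (bs, context_length)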
# Preview a few image/caption pairs
for i in range(5): dls.train_ds[i][0].show(title=dls.train_ds[i][1])
clip_trainer_cb = CLIPTrainer()  # callback that computes the CLIP contrastive loss
cbs = [clip_trainer_cb]
opt_func = ranger
arch = 'vitb32'
do_finetune = True
use_grad_check = True   # gradient checkpointing: trades recompute for lower memory
grad_check_nchunks = 2  # number of checkpoint chunks
finetune_modelname = 'ViT-B/32'
vitb32_config_dict = vitb32_config(size, clip_tokenizer.context_length, clip_tokenizer.vocab_size)
clip_model = CLIP(**vitb32_config_dict, checkpoint=use_grad_check, checkpoint_nchunks=grad_check_nchunks)
if do_finetune:
    print("Loading pretrained model..")
    clip_pretrained_model, _ = clip.load(finetune_modelname, jit=False)
    clip_model.load_state_dict(clip_pretrained_model.state_dict())
learner = Learner(dls, clip_model, loss_func=noop, cbs=cbs, opt_func=opt_func,
                  metrics=[RetrievalAtK(k=5),
                           RetrievalAtK(k=20),
                           RetrievalAtK(k="mean"),
                           RetrievalAtK(k="median")])
learner.to_fp16();  # mixed precision to reduce memory use on a single GPU
learner.fit_flat_cos(10, 1e-5, pct_start=0.25)
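After training, you can optionally persist the fine-tuned weights (the filename below is arbitrary):
learner.save('clip-vitb32-coco-finetuned')  # saved under learner.path/models/ by default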