I expect readers to be familiar with the following concepts:

1. Non Maximum Suppression
2. Convolutional Neural Networks
3. PyTorch
4. OpenCV
5. Anchor Box
6. Bounding Box

YOLO v3 paper: Joseph Redmon and Ali Farhadi, "YOLOv3: An Incremental Improvement" (arXiv:1804.02767).

I have followed the awesome explanation by Ayoosh Kathuria on the paperspace blog with some modifications in the code.

So let's dive into the code. We start by importing the necessary libraries.

```python
from __future__ import division

import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import cv2
import time
import os
import os.path as osp
import pickle as pkl
import random
import pandas as pd
```

Here we have written some utility functions.

```def unique(tensor):
tensor_np = tensor.cpu().numpy()
unique_np = np.unique(tensor_np)
unique_tensor = torch.from_numpy(unique_np)

tensor_res = tensor.new(unique_tensor.shape)
tensor_res.copy_(unique_tensor)
return tensor_res

def bbox_iou(box1, box2):
    """Return the IoU between two sets of corner-format bounding boxes.

    Boxes are rows of ``(x1, y1, x2, y2, ...)``; only the first four
    columns are read. Coordinates are treated as inclusive pixel indices,
    hence the ``+ 1`` when measuring widths and heights. The two inputs
    are combined element-wise with broadcasting, so ``box1`` may be a
    single row compared against many rows in ``box2``.

    Args:
        box1: Tensor of shape (N, >=4) or (1, >=4).
        box2: Tensor of shape (N, >=4).

    Returns:
        1-D tensor of IoU values, one per broadcast row pair.
    """
    # Corner coordinates of both box sets.
    b1_x1, b1_y1, b1_x2, b1_y2 = box1[:, 0], box1[:, 1], box1[:, 2], box1[:, 3]
    b2_x1, b2_y1, b2_x2, b2_y2 = box2[:, 0], box2[:, 1], box2[:, 2], box2[:, 3]

    # Corners of the intersection rectangle.
    inter_x1 = torch.max(b1_x1, b2_x1)
    inter_y1 = torch.max(b1_y1, b2_y1)
    inter_x2 = torch.min(b1_x2, b2_x2)
    inter_y2 = torch.min(b1_y2, b2_y2)

    # Clamp at zero so disjoint boxes contribute no (negative) area.
    inter_w = torch.clamp(inter_x2 - inter_x1 + 1, min=0)
    inter_h = torch.clamp(inter_y2 - inter_y1 + 1, min=0)
    inter_area = inter_w * inter_h

    b1_area = (b1_x2 - b1_x1 + 1) * (b1_y2 - b1_y1 + 1)
    b2_area = (b2_x2 - b2_x1 + 1) * (b2_y2 - b2_y1 + 1)

    # Union = sum of both areas minus the part counted twice.
    return inter_area / (b1_area + b2_area - inter_area)

def predict_transform(prediction, inp_dim, anchors, num_classes, CUDA = True):
    """Turn a raw YOLO detection feature map into a table of box predictions.

    The feature map of shape ``(batch, num_anchors * (5 + num_classes),
    grid, grid)`` is flattened to ``(batch, grid * grid * num_anchors,
    5 + num_classes)``. Each row is then decoded in place:

    * columns 0-1: sigmoid + cell offset, scaled to input-image pixels
      (box centre x, y),
    * columns 2-3: ``exp`` times the matching anchor (box width, height
      in input-image pixels),
    * column 4: sigmoid objectness,
    * columns 5+: sigmoid per-class scores.

    Args:
        prediction: 4-D feature map tensor as described above.
        inp_dim: Side length (pixels) of the square network input.
        anchors: Sequence of ``(width, height)`` anchor pairs, in input
            pixels, for this detection scale.
        num_classes: Number of object classes.
        CUDA: Kept for backward compatibility. Helper tensors are created
            on ``prediction``'s own device, so the flag is no longer
            needed to keep devices consistent.

    Returns:
        Tensor of shape ``(batch, grid * grid * num_anchors, 5 + num_classes)``.
    """
    batch_size = prediction.size(0)
    stride = inp_dim // prediction.size(2)
    grid_size = inp_dim // stride
    bbox_attrs = 5 + num_classes
    num_anchors = len(anchors)

    # (B, A*attrs, G, G) -> (B, G*G*A, attrs): one row per (cell, anchor).
    prediction = prediction.view(batch_size, bbox_attrs * num_anchors, grid_size * grid_size)
    prediction = prediction.transpose(1, 2).contiguous()
    prediction = prediction.view(batch_size, grid_size * grid_size * num_anchors, bbox_attrs)

    # Sigmoid the centre x, centre y and the objectness score.
    prediction[:, :, 0] = torch.sigmoid(prediction[:, :, 0])
    prediction[:, :, 1] = torch.sigmoid(prediction[:, :, 1])
    prediction[:, :, 4] = torch.sigmoid(prediction[:, :, 4])

    # Per-cell offsets: x runs fastest (0,1,..,G-1,0,1,..), y is constant per row.
    grid = torch.arange(grid_size, dtype=prediction.dtype, device=prediction.device)
    x_offset = grid.repeat(grid_size).view(-1, 1)
    y_offset = grid.view(-1, 1).repeat(1, grid_size).view(-1, 1)

    # Repeat each cell's offset once per anchor so it lines up with the rows.
    x_y_offset = torch.cat((x_offset, y_offset), 1).repeat(1, num_anchors).view(-1, 2).unsqueeze(0)
    prediction[:, :, :2] += x_y_offset

    # Anchors are given in input pixels; express them in grid-cell units.
    scaled_anchors = prediction.new_tensor([(a[0] / stride, a[1] / stride) for a in anchors])
    scaled_anchors = scaled_anchors.repeat(grid_size * grid_size, 1).unsqueeze(0)

    # Log-space transform of width and height.
    prediction[:, :, 2:4] = torch.exp(prediction[:, :, 2:4]) * scaled_anchors

    prediction[:, :, 5:5 + num_classes] = torch.sigmoid(prediction[:, :, 5:5 + num_classes])

    # Back from grid-cell units to input-image pixels.
    prediction[:, :, :4] *= stride

    return prediction

def write_results(prediction, confidence, num_classes, nms_conf):
    """Filter raw predictions by objectness and apply per-class NMS.

    Args:
        prediction: Tensor of shape ``(batch, boxes, 5 + num_classes)``
            with rows ``(cx, cy, w, h, objectness, class scores...)``.
        confidence: Objectness threshold; rows at or below it are dropped.
        num_classes: Number of class-score columns.
        nms_conf: IoU threshold; within one class, a box whose IoU with a
            higher-objectness box is not below this value is suppressed.

    Returns:
        Tensor of shape ``(detections, 8)`` with rows
        ``(batch index, x1, y1, x2, y2, objectness, class score,
        class index)``, or the integer ``0`` when nothing survives.
    """
    # Zero every row whose objectness does not clear the threshold. The
    # multiplication makes a new tensor, so the caller's input is untouched.
    conf_mask = (prediction[:, :, 4] > confidence).float().unsqueeze(2)
    prediction = prediction * conf_mask

    # Centre/size -> corner coordinates.
    half_w = prediction[:, :, 2] / 2
    half_h = prediction[:, :, 3] / 2
    corners = torch.stack((
        prediction[:, :, 0] - half_w,
        prediction[:, :, 1] - half_h,
        prediction[:, :, 0] + half_w,
        prediction[:, :, 1] + half_h,
    ), 2)
    prediction[:, :, :4] = corners

    collected = []
    for ind in range(prediction.size(0)):
        image_pred = prediction[ind]

        # Keep only the best class per box: its score and its index.
        max_conf, max_conf_idx = torch.max(image_pred[:, 5:5 + num_classes], 1)
        image_pred = torch.cat((
            image_pred[:, :5],
            max_conf.float().unsqueeze(1),
            max_conf_idx.float().unsqueeze(1),
        ), 1)

        # Rows zeroed by the confidence mask have objectness exactly 0.
        image_pred_ = image_pred[image_pred[:, 4] != 0]
        if image_pred_.size(0) == 0:
            continue

        # -1 index holds the class index.
        for cls in unique(image_pred_[:, -1]):
            image_pred_class = image_pred_[image_pred_[:, -1] == cls]

            # Highest objectness first.
            order = torch.sort(image_pred_class[:, 4], descending=True)[1]
            image_pred_class = image_pred_class[order]

            # Greedy NMS: each kept box removes later boxes overlapping it.
            i = 0
            while i + 1 < image_pred_class.size(0):
                ious = bbox_iou(image_pred_class[i].unsqueeze(0), image_pred_class[i + 1:])
                survivors = image_pred_class[i + 1:][ious < nms_conf]
                image_pred_class = torch.cat((image_pred_class[:i + 1], survivors))
                i += 1

            # Prefix every detection with the index of its image in the batch.
            batch_ind = image_pred_class.new_full((image_pred_class.size(0), 1), ind)
            collected.append(torch.cat((batch_ind, image_pred_class), 1))

    if not collected:
        # Callers test for an int to detect "no detections".
        return 0
    return torch.cat(collected)

def letterbox_image(img, inp_dim):
    """Resize an image to fit ``inp_dim`` without distorting it.

    The image is scaled by the largest factor that keeps it inside the
    target size, then centred on a canvas filled with the value 128.

    Args:
        img: Image array of shape ``(height, width, 3)``.
        inp_dim: Target size as a ``(width, height)`` pair.

    Returns:
        Array of shape ``(height, width, 3)`` with the same dtype as the
        resized image.
    """
    # Array shape is (rows, cols, channels) = (height, width, channels).
    img_h, img_w = img.shape[0], img.shape[1]
    w, h = inp_dim

    scale = min(w / img_w, h / img_h)
    new_w = int(img_w * scale)
    new_h = int(img_h * scale)
    resized_image = cv2.resize(img, (new_w, new_h), interpolation = cv2.INTER_CUBIC)

    canvas = np.full((h, w, 3), 128, dtype=resized_image.dtype)

    # Centre the resized image; the remaining border keeps the fill value.
    top = (h - new_h) // 2
    left = (w - new_w) // 2
    canvas[top:top + new_h, left:left + new_w, :] = resized_image

    return canvas

def prep_image(img, inp_dim):
    """Prepare an image array as input for the network.

    The image is letterboxed to ``inp_dim x inp_dim``, its channel order
    is reversed (BGR -> RGB), the layout is changed from H x W x C to
    C x H x W, values are scaled by 1/255 and a leading batch dimension
    is added.

    Args:
        img: Image array of shape ``(height, width, 3)``.
        inp_dim: Side length of the square network input.

    Returns:
        Float tensor of shape ``(1, 3, inp_dim, inp_dim)``.
    """
    img = letterbox_image(img, (inp_dim, inp_dim))
    # The reversed slice yields negative strides, which torch.from_numpy
    # cannot wrap; copy() materialises a contiguous array.
    img = img[:, :, ::-1].transpose((2, 0, 1)).copy()
    img = torch.from_numpy(img).float().div(255.0).unsqueeze(0)
    return img

def load_classes(namesfile):
    """Read class names from a text file, one name per line.

    Args:
        namesfile: Path to the names file (e.g. ``coco.names``).

    Returns:
        List of class names in file order, so that a class index can be
        used directly as an index into the list.
    """
    # splitlines() does not produce a trailing empty entry for the final
    # newline, and the context manager closes the file.
    with open(namesfile, "r") as fp:
        names = fp.read().splitlines()
    return names
```

Now, we construct the network. We use a `cfg` file that describes the layers, and build the network from that description.

```    #img = cv2.imread("dog-cycle-car.png")
#img = cv2.resize(img, (416,416))          #Resize to the input dimension
#img_ =  img[:,:,::-1].transpose((2,0,1))  # BGR -> RGB | H X W C -> C X H X W
#img_ = img_[np.newaxis,:,:,:]/255.0       #Add a channel at 0 (for batch) | Normalise
#img_ = torch.from_numpy(img_).float()     #Convert to float
#img_ = Variable(img_)                     # Convert to Variable
#return img_

def parse_cfg(cfgfile):
    """Parse a Darknet configuration file into a list of blocks.

    Every ``[section]`` header starts a new block. A block is a dict
    whose ``"type"`` entry is the section name and whose other entries
    are the ``key = value`` lines that follow, with both key and value
    kept as strings. Blank lines and lines starting with ``#`` are
    ignored.

    Args:
        cfgfile: Path to the configuration file.

    Returns:
        List of block dicts in file order.
    """
    with open(cfgfile, 'r') as file:
        lines = file.read().split('\n')

    # Strip first, so whitespace-only lines and indented comments are
    # recognised by the filter below.
    lines = [x.strip() for x in lines]
    lines = [x for x in lines if x and not x.startswith('#')]

    block = {}
    blocks = []

    for line in lines:
        if line.startswith("["):          # start of a new block
            if block:                     # flush the block collected so far
                blocks.append(block)
                block = {}
            block["type"] = line[1:-1].strip()
        else:
            # Split on the first '=' only, so values may contain '='.
            key, value = line.split("=", 1)
            block[key.rstrip()] = value.lstrip()

    if block:
        blocks.append(block)

    return blocks

class EmptyLayer(nn.Module):
    """Placeholder module with no parameters and no computation.

    It occupies the module-list slot of ``route`` and ``shortcut``
    blocks so that positions in the module list match positions in the
    parsed configuration.
    """

    def __init__(self):
        super(EmptyLayer, self).__init__()

class DetectionLayer(nn.Module):
    """Module that only stores the anchors of one ``yolo`` block.

    Args:
        anchors: List of ``(width, height)`` anchor pairs used by this
            detection scale; exposed unchanged as ``self.anchors``.
    """

    def __init__(self, anchors):
        super(DetectionLayer, self).__init__()
        self.anchors = anchors

def create_modules(blocks):
    """Build the PyTorch modules described by a parsed configuration.

    Args:
        blocks: List of block dicts as produced by ``parse_cfg``. The
            first block holds network-level settings; each remaining
            block describes one layer.

    Returns:
        Tuple ``(net_info, module_list)`` where ``net_info`` is the first
        block and ``module_list`` is an ``nn.ModuleList`` holding one
        ``nn.Sequential`` per remaining block, in order.

    Side effects:
        For ``route`` blocks, ``block["layers"]`` is replaced in place by
        the list obtained from splitting it on commas.
    """
    net_info = blocks[0]     # information about the input and pre-processing
    module_list = nn.ModuleList()
    prev_filters = 3         # the network input has 3 channels
    output_filters = []      # output channel count of every layer so far

    for index, x in enumerate(blocks[1:]):
        module = nn.Sequential()
        # Layers that do not change the channel count inherit it.
        filters = prev_filters

        if x["type"] == "convolutional":
            activation = x["activation"]
            batch_normalize = int(x.get("batch_normalize", 0))
            # Batch norm has its own shift term, so the conv bias is dropped.
            bias = not batch_normalize

            filters = int(x["filters"])
            kernel_size = int(x["size"])
            stride = int(x["stride"])
            pad = (kernel_size - 1) // 2 if int(x.get("pad", 0)) else 0

            conv = nn.Conv2d(prev_filters, filters, kernel_size, stride, pad, bias = bias)
            module.add_module("conv_{0}".format(index), conv)

            if batch_normalize:
                module.add_module("batch_norm_{0}".format(index), nn.BatchNorm2d(filters))

            # The activation is either linear (nothing to add) or leaky ReLU.
            if activation == "leaky":
                module.add_module("leaky_{0}".format(index), nn.LeakyReLU(0.1, inplace = True))

        elif x["type"] == "upsample":
            stride = int(x["stride"])
            upsample = nn.Upsample(scale_factor = stride, mode = "nearest")
            module.add_module("upsample_{0}".format(index), upsample)

        elif x["type"] == "route":
            if isinstance(x["layers"], str):
                x["layers"] = x["layers"].split(',')
            # Positive entries are absolute layer indices; make all relative.
            relative = []
            for entry in x["layers"]:
                layer_ref = int(entry)
                relative.append(layer_ref - index if layer_ref > 0 else layer_ref)

            module.add_module("route_{0}".format(index), EmptyLayer())
            # Routed feature maps are concatenated along channels.
            filters = sum(output_filters[index + rel] for rel in relative)

        elif x["type"] == "shortcut":
            # Skip connection; the addition itself happens in forward().
            module.add_module("shortcut_{0}".format(index), EmptyLayer())

        elif x["type"] == "yolo":
            mask = [int(m) for m in x["mask"].split(",")]

            anchors = [int(a) for a in x["anchors"].split(",")]
            anchors = [(anchors[i], anchors[i + 1]) for i in range(0, len(anchors), 2)]
            # Keep only the anchors this detection scale is responsible for.
            anchors = [anchors[i] for i in mask]

            module.add_module("Detection_{0}".format(index), DetectionLayer(anchors))

        module_list.append(module)
        prev_filters = filters
        output_filters.append(filters)

    return (net_info, module_list)

class Darknet(nn.Module):
    """YOLO network assembled from a Darknet configuration file.

    Attributes:
        blocks: Parsed configuration, as returned by ``parse_cfg``.
        net_info: Network-level settings returned by ``create_modules``.
        module_list: One ``nn.Sequential`` per layer block.
    """

    def __init__(self, cfgfile):
        super(Darknet, self).__init__()
        self.blocks = parse_cfg(cfgfile)
        self.net_info, self.module_list = create_modules(self.blocks)

    def forward(self, x, CUDA):
        """Run the network and collect the predictions of every yolo layer.

        Args:
            x: Input image batch.
            CUDA: Forwarded to ``predict_transform``.

        Returns:
            The transformed outputs of all ``yolo`` layers concatenated
            along dimension 1, or ``None`` if the configuration has no
            ``yolo`` layer.
        """
        modules = self.blocks[1:]
        outputs = {}   # cache of every layer's output, for route/shortcut

        detections = None
        for i, module in enumerate(modules):
            module_type = module["type"]

            if module_type == "convolutional" or module_type == "upsample":
                x = self.module_list[i](x)

            elif module_type == "route":
                layers = module["layers"]
                if isinstance(layers, str):
                    layers = layers.split(",")
                layers = [int(a) for a in layers]
                # Positive entries are absolute indices; make them relative.
                layers = [ref - i if ref > 0 else ref for ref in layers]

                maps = [outputs[i + ref] for ref in layers]
                x = maps[0] if len(maps) == 1 else torch.cat(maps, 1)

            elif module_type == "shortcut":
                from_ = int(module["from"])
                x = outputs[i - 1] + outputs[i + from_]

            elif module_type == 'yolo':
                # The DetectionLayer is expected to be the first (only)
                # submodule of this layer's Sequential.
                anchors = self.module_list[i][0].anchors
                print("Detection Layer => " ,i)
                print("Anchors selected => " ,anchors)
                # Get the input dimensions
                inp_dim = int(self.net_info["height"])

                # Get the number of classes
                num_classes = int(module["classes"])

                # Transform
                x = x.data
                print("Size before transform => " ,x.size())
                x = predict_transform(x, inp_dim, anchors, num_classes, CUDA)
                print("Size after transform => " ,x.size())
                if detections is None:
                    detections = x
                else:
                    detections = torch.cat((detections, x), 1)

            outputs[i] = x

        return detections

    def load_weights(self, weightfile):
        """Copy weights from a binary weights file into the conv layers.

        The file starts with five int32 header values (major, minor and
        sub-version numbers, then the images-seen counter); everything
        after that is read as one flat float32 array. For every
        convolutional block the values are consumed in this order: batch
        norm bias, weight, running mean and running variance when the
        block is batch-normalised, otherwise the convolution bias; then
        the convolution weights.

        Args:
            weightfile: Path to the weights file.
        """
        with open(weightfile, "rb") as fp:
            # Read (and discard) the header so the next read starts at the weights.
            np.fromfile(fp, dtype = np.int32, count = 5)
            weights = np.fromfile(fp, dtype = np.float32)

        ptr = 0
        for i in range(len(self.module_list)):
            module_type = self.blocks[i + 1]["type"]

            # Only convolutional blocks carry weights.
            if module_type != "convolutional":
                continue

            model = self.module_list[i]
            batch_normalize = int(self.blocks[i + 1].get("batch_normalize", 0))

            conv = model[0]

            if batch_normalize:
                bn = model[1]

                # All four batch-norm vectors have one value per channel.
                num_bn_biases = bn.bias.numel()

                bn_biases = torch.from_numpy(weights[ptr:ptr + num_bn_biases])
                ptr += num_bn_biases

                bn_weights = torch.from_numpy(weights[ptr:ptr + num_bn_biases])
                ptr += num_bn_biases

                bn_running_mean = torch.from_numpy(weights[ptr:ptr + num_bn_biases])
                ptr += num_bn_biases

                bn_running_var = torch.from_numpy(weights[ptr:ptr + num_bn_biases])
                ptr += num_bn_biases

                # Cast the loaded values into the shapes of the model tensors.
                bn_biases = bn_biases.view_as(bn.bias.data)
                bn_weights = bn_weights.view_as(bn.weight.data)
                bn_running_mean = bn_running_mean.view_as(bn.running_mean)
                bn_running_var = bn_running_var.view_as(bn.running_var)

                # Copy the data to the model.
                bn.bias.data.copy_(bn_biases)
                bn.weight.data.copy_(bn_weights)
                bn.running_mean.copy_(bn_running_mean)
                bn.running_var.copy_(bn_running_var)

            else:
                num_biases = conv.bias.numel()

                conv_biases = torch.from_numpy(weights[ptr:ptr + num_biases])
                ptr += num_biases

                conv_biases = conv_biases.view_as(conv.bias.data)
                conv.bias.data.copy_(conv_biases)

            # Convolution weights follow in both cases.
            num_weights = conv.weight.numel()

            conv_weights = torch.from_numpy(weights[ptr:ptr + num_weights])
            ptr += num_weights

            conv_weights = conv_weights.view_as(conv.weight.data)
            conv.weight.data.copy_(conv_weights)

We will load the pretrained weights and the class names of the COCO dataset. After the files have been downloaded once, comment out the two lines of code below.

```
!wget https://pjreddie.com/media/files/yolov3.weights
```
```
!wget https://raw.githubusercontent.com/pjreddie/darknet/master/data/coco.names
```

Now we load a single image or a batch of images and set the batch size accordingly. We also set the confidence threshold and the NMS threshold here. Next, we load the class names using the `load_classes` function.

We read the input test images from `/content/drive/My Drive/test-img` in Google Drive and write the annotated results to `/content/drive/My Drive/results`. When running the code yourself, create these paths or substitute your own.

```images = '/content/drive/My Drive/test-img/'
# Run-time settings read by the detection script further down.
# `images` (line above) is the path handed to os.listdir() when collecting inputs.
batch_size = 1  # number of images concatenated into one forward pass
confidence = 0.5  # passed to write_results() as its `confidence` argument
nms_thesh = 0.4  # passed to write_results() as its `nms_conf` argument
start = 0  # placeholder; reassigned to time.time() inside the detection loop
CUDA = torch.cuda.is_available()  # True when PyTorch reports a usable CUDA device

```

Here we will load the model with the cfg file and the weights file.

```print("Loading network.....")
# Build the network from its Darknet configuration file.
# NOTE(review): no call that loads yolov3.weights is visible in this cell, although
# the surrounding text says the weights file is loaded here -- confirm it was not dropped.
model = Darknet('yolov3.cfg')

#model.net_info["height"] = 416
# The input resolution is taken from the "height" entry of the parsed net_info.
inp_dim = int(model.net_info["height"])
# The asserts below require a resolution that is a multiple of 32 and larger than 32.
assert inp_dim % 32 == 0
assert inp_dim > 32
```
```Loading network.....
```

We will now evaluate the model.

```model.eval()
# Script: run the model over the input images, collect the detections, rescale
# them to the original image sizes and print a timing summary.
# NOTE(review): indentation and several statements appear to have been lost from
# this cell; the notes below flag names that are not defined in the visible source.

#Detection phase
# NOTE(review): as written, the directory listing is immediately discarded
# (imlist is reset to []) and replaced by the single path `images`; presumably an
# intermediate `except` clause for the single-file case was lost -- confirm.
try:
imlist = [osp.join(osp.realpath('.'), images, img) for img in os.listdir(images)]
imlist = []
imlist.append(osp.join(osp.realpath('.'), images))
except FileNotFoundError:
print ("No file or directory with the name {}".format(images))
exit()

# Make sure the output directory exists.
if not os.path.exists('/content/drive/My Drive/results'):
os.makedirs('/content/drive/My Drive/results')

# NOTE(review): `loaded_ims` is not defined in the visible source; presumably the
# list of images read from `imlist` -- confirm.
im_batches = list(map(prep_image, loaded_ims, [inp_dim for x in range(len(imlist))]))
# NOTE(review): `(x.shape, x.shape)` pairs two full shape tuples; presumably two
# individual dimensions were intended (subscripts lost) -- confirm.
im_dim_list = [(x.shape, x.shape) for x in loaded_ims]
im_dim_list = torch.FloatTensor(im_dim_list).repeat(1,2)

# One extra batch is needed when the image count is not a multiple of batch_size.
leftover = 0
if (len(im_dim_list) % batch_size):
leftover = 1

# Group the per-image tensors into batches of at most batch_size images.
if batch_size != 1:
num_batches = len(imlist) // batch_size + leftover
im_batches = [torch.cat((im_batches[i*batch_size : min((i +  1)*batch_size,
len(im_batches))]))  for i in range(num_batches)]

# Flag: becomes 1 once `output` has received its first batch of detections.
write = 0

if CUDA:
im_dim_list = im_dim_list.cuda()

start_det_loop = time.time()
for i, batch in enumerate(im_batches):
start = time.time()
if CUDA:
batch = batch.cuda()
# NOTE(review): `Variable` is not imported in the visible import block -- confirm.
prediction = model(Variable(batch), CUDA)

prediction = write_results(prediction, confidence, num_classes = 80, nms_conf = nms_thesh)

end = time.time()

# write_results returns the int 0 when it has no detections to report.
if type(prediction) == int:

for im_num, image in enumerate(imlist[i*batch_size: min((i +  1)*batch_size, len(imlist))]):
im_id = i*batch_size + im_num
print("{0:20s} predicted in {1:6.3f} seconds".format(image.split("/")[-1], (end - start)/batch_size))
print("{0:20s} {1:s}".format("Objects Detected:", ""))
print("----------------------------------------------------------")
continue

prediction[:,0] += i*batch_size    #transform the atribute from index in batch to index in imlist

if not write:                      #If we have't initialised output
output = prediction
write = 1
else:
output = torch.cat((output,prediction))

for im_num, image in enumerate(imlist[i*batch_size: min((i +  1)*batch_size, len(imlist))]):
im_id = i*batch_size + im_num
# NOTE(review): `classes` is not defined in the visible source, and `int(x)` is
# applied to a whole detection row; presumably the row's first column (image
# index) was meant -- confirm.
objs = [classes[int(x[-1])] for x in output if int(x) == im_id]
print("{0:20s} predicted in {1:6.3f} seconds".format(image.split("/")[-1], (end - start)/batch_size))
print("{0:20s} {1:s}".format("Objects Detected:", " ".join(objs)))
print("----------------------------------------------------------")

if CUDA:
torch.cuda.synchronize()
# Exit if `output` was never assigned, i.e. no batch produced any detection.
try:
output
except NameError:
exit()

# Pick, for every detection, the dimensions of the image it belongs to
# (column 0 of `output` is used as the row index into im_dim_list).
im_dim_list = torch.index_select(im_dim_list, 0, output[:,0].long())

# NOTE(review): torch.min with a dim argument returns (values, indices), so
# `.view` cannot be called on the result as written; presumably a `[0]` was lost.
# The literal 416 is used here while `inp_dim` is used below -- confirm.
scaling_factor = torch.min(416/im_dim_list,1).view(-1,1)

# Shift the box coordinates by half of (inp_dim - scaled image size) per axis.
output[:,[1,3]] -= (inp_dim - scaling_factor*im_dim_list[:,0].view(-1,1))/2
output[:,[2,4]] -= (inp_dim - scaling_factor*im_dim_list[:,1].view(-1,1))/2

# Divide the coordinates by the scaling factor.
output[:,1:5] /= scaling_factor

# Clamp every box to the bounds stored in im_dim_list.
# NOTE(review): `range(output.shape)` passes a torch.Size, not an int; presumably
# the first dimension was meant -- confirm.
for i in range(output.shape):
output[i, [1,3]] = torch.clamp(output[i, [1,3]], 0.0, im_dim_list[i,0])
output[i, [2,4]] = torch.clamp(output[i, [2,4]], 0.0, im_dim_list[i,1])

output_recast = time.time()

draw = time.time()

# Draws one detection (box plus class label) onto its image.
# NOTE(review): this definition rebinds the name `write`, which is used as a flag
# above. `results`, `colors` and `classes` are not defined in the visible source,
# and `int(x)` / `c1 + t_size` look like they lost subscripts -- confirm.
def write(x, results):
c1 = tuple(x[1:3].int())
c2 = tuple(x[3:5].int())
img = results[int(x)]
cls = int(x[-1])
color = random.choice(colors)
label = "{0}".format(classes[cls])
cv2.rectangle(img, c1, c2, color, 6)
t_size = cv2.getTextSize(label, cv2.FONT_HERSHEY_PLAIN, 1 , 1)
c2 = c1 + t_size + 3, c1 + t_size + 4
cv2.rectangle(img, c1, c2, color, 6)
cv2.putText(img, label, (c1, c1 + t_size + 4), cv2.FONT_HERSHEY_PLAIN, 1, [225,255,255], 1);
return img

# Output path per input image: results directory + "det_" + original file name.
# NOTE(review): no statement that draws on or saves the images is visible here.
det_names = pd.Series(imlist).apply(lambda x: "{}/det_{}".format('/content/drive/My Drive/results',x.split("/")[-1]))

end = time.time()

# NOTE(review): `class_load` and `load_batch` are not defined in the visible
# source; the lines that recorded those timestamps appear to be missing.
print("SUMMARY")
print("----------------------------------------------------------")
print("{:25s}: {}".format("Task", "Time Taken (in seconds)"))
print()
print("{:25s}: {:2.3f}".format("Detection (" + str(len(imlist)) +  " images)", output_recast - start_det_loop))
print("{:25s}: {:2.3f}".format("Output Processing", class_load - output_recast))
print("{:25s}: {:2.3f}".format("Drawing Boxes", end - draw))
print("{:25s}: {:2.3f}".format("Average time_per_img", (end - load_batch)/len(imlist)))
print("----------------------------------------------------------")

torch.cuda.empty_cache()

```
```Detection Layer =>  82
Anchors selected =>  [(116, 90), (156, 198), (373, 326)]
Size before transform =>  torch.Size([1, 255, 13, 13])
<built-in method size of Tensor object at 0x7f900f935630>
Size after transform =>  torch.Size([1, 507, 85])
Detection Layer =>  94
Anchors selected =>  [(30, 61), (62, 45), (59, 119)]
Size before transform =>  torch.Size([1, 255, 26, 26])
<built-in method size of Tensor object at 0x7f900f9424c8>
Size after transform =>  torch.Size([1, 2028, 85])
Detection Layer =>  106
Anchors selected =>  [(10, 13), (16, 30), (33, 23)]
Size before transform =>  torch.Size([1, 255, 52, 52])
<built-in method size of Tensor object at 0x7f900f942828>
Size after transform =>  torch.Size([1, 8112, 85])
cricket.jpg          predicted in  1.275 seconds
Objects Detected:                        <script crossorigin="anonymous" defer="defer" integrity="sha512-JDDF8W8Wl5vopo9t4K4NtIEUMCYov3ZjVpv9lC1SDUxhejU+ILu8V3l6BhkaIRMYJioQWj9am9tJSTvND+8wJg==" type="application/javascript" data-module-id="./chunk-drag-drop.js" data-src="https://github.githubassets.com/assets/chunk-drag-drop-2430c5f1.js"></script>
----------------------------------------------------------
SUMMARY
----------------------------------------------------------
Task                     : Time Taken (in seconds)