From e5f508b9d483ea64633cb3ab778624d8d0133b8b Mon Sep 17 00:00:00 2001
From: atroyn
Date: Tue, 13 Dec 2022 16:44:18 -0800
Subject: [PATCH] More comments

---
 detect.py            |  3 +++
 models/yolo.py       |  4 ++--
 utils/dataloaders.py |  3 ++-
 utils/general.py     | 18 +++++++++---------
 4 files changed, 16 insertions(+), 12 deletions(-)

diff --git a/detect.py b/detect.py
index f275b82c..8e42fbe1 100644
--- a/detect.py
+++ b/detect.py
@@ -128,6 +128,9 @@ def run(
         with dt[2]:
             pred = non_max_suppression(pred, conf_thres, iou_thres, classes, agnostic_nms, max_det=max_det)
 
+        # Second-stage classifier (optional)
+        # pred = utils.general.apply_classifier(pred, classifier_model, im, im0s)
+
         # Process predictions
         for i, det in enumerate(pred):  # per image
             seen += 1
diff --git a/models/yolo.py b/models/yolo.py
index db792720..5a3a41df 100644
--- a/models/yolo.py
+++ b/models/yolo.py
@@ -34,7 +34,7 @@
 except ImportError:
     thop = None
 
-
+# We modify the original Detect class to output the embeddings along with the predictions
 class Detect(nn.Module):
     # YOLOv5 Detect head for detection models
     stride = None  # strides computed during build
@@ -77,7 +77,7 @@ def forward(self, x):
                 wh = (wh * 2) ** 2 * self.anchor_grid[i]  # wh
                 y = torch.cat((xy, wh, conf), 4)
             z.append(y.view(bs, self.na * nx * ny, self.no))
-            embeddings.append(x[i].view(bs, self.na * nx * ny, self.no))
+            embeddings.append(x[i].view(bs, self.na * nx * ny, self.no))  # The embeddings are the raw output of the last conv layer, in the same shape as the predictions
 
         return x if self.training else (torch.cat(z, 1),) if self.export else (torch.cat(z, 1), x, torch.cat(embeddings, 1)) if self.with_embeddings else (torch.cat(z, 1), x)
 
diff --git a/utils/dataloaders.py b/utils/dataloaders.py
index 8672a81f..734778e8 100644
--- a/utils/dataloaders.py
+++ b/utils/dataloaders.py
@@ -151,6 +151,7 @@ def create_dataloader(path,
                   worker_init_fn=seed_worker,
                   generator=generator), dataset
 
+# A new dataloader for cases where we have input images but no labels.
 def create_imageloader(path, imgsz, batch_size, stride, workers):
     dataset = LoadImages(path, imgsz, stride=int(stride), auto=False, n_workers=workers)
     batch_size = min(batch_size, len(dataset))
@@ -241,7 +242,7 @@ def __next__(self):
         self.frame += 1
         return str(self.screen), im, im0, None, s  # screen, img, original img, im0s, s
 
-
+# An iterable dataset that loads images, compatible with the IterableDataset interface.
 class LoadImages(IterableDataset):
     # YOLOv5 image/video dataloader, i.e. `python detect.py --source image.jpg/vid.mp4`
     def __init__(self, path, img_size=640, stride=32, auto=True, transforms=None, vid_stride=1, n_workers=0):
diff --git a/utils/general.py b/utils/general.py
index 830465af..2bc8b917 100644
--- a/utils/general.py
+++ b/utils/general.py
@@ -841,7 +841,8 @@ def clip_segments(boxes, shape):
         boxes[:, 0] = boxes[:, 0].clip(0, shape[1])  # x
         boxes[:, 1] = boxes[:, 1].clip(0, shape[0])  # y
 
-
+# We modify the original non_max_suppression function to return the embeddings as well
+# In practice, this means making sure that they are extracted and filtered alongside the predictions
 def non_max_suppression(
         prediction,
         conf_thres=0.25,
@@ -866,7 +867,6 @@
         embedding = prediction[2]  # Last part of the tuple has raw conv. output
         prediction = prediction[0]  # select only inference output
 
-
     device = prediction.device
     mps = 'mps' in device.type  # Apple MPS
     if mps:  # MPS not fully supported yet, convert tensors to CPU before NMS
@@ -901,7 +901,7 @@
         # x[((x[..., 2:4] < min_wh) | (x[..., 2:4] > max_wh)).any(1), 4] = 0  # width-height
         x = x[xc[xi]]  # confidence
         if with_embeddings:
-            e = embedding[xi][xc[xi]]
+            e = embedding[xi][xc[xi]]  # Filter to the same indices as the predictions
 
         # Cat apriori labels if autolabelling
         if labels and len(labels[xi]):
@@ -928,18 +928,18 @@
             i, j = (x[:, 5:mi] > conf_thres).nonzero(as_tuple=False).T
             x = torch.cat((box[i], x[i, 5 + j, None], j[:, None].float(), mask[i]), 1)
             if with_embeddings:
-                e = e[i]
+                e = e[i]  # Filter to the same indices as the predictions
         else:  # best class only
             conf, j = x[:, 5:mi].max(1, keepdim=True)
             x = torch.cat((box, conf, j.float(), mask), 1)[conf.view(-1) > conf_thres]
             if with_embeddings:
-                e = e[conf.view(-1) > conf_thres]
+                e = e[conf.view(-1) > conf_thres]  # Filter to the same indices as the predictions. Note that no concatenation is needed here.
 
         # Filter by class
         if classes is not None:
             x = x[(x[:, 5:6] == torch.tensor(classes, device=x.device)).any(1)]
             if with_embeddings:
-                e = e[(x[:, 5:6] == torch.tensor(classes, device=x.device)).any(1)]
+                e = e[(x[:, 5:6] == torch.tensor(classes, device=x.device)).any(1)]  # Filter to the same indices as the predictions
 
         # Apply finite constraint
         # if not torch.isfinite(x).all():
@@ -952,11 +952,11 @@
         elif n > max_nms:  # excess boxes
             x = x[x[:, 4].argsort(descending=True)[:max_nms]]  # sort by confidence
             if with_embeddings:
-                e = e[x[:, 4].argsort(descending=True)[:max_nms]]
+                e = e[x[:, 4].argsort(descending=True)[:max_nms]]  # Filter to the same indices as the predictions
         else:
             x = x[x[:, 4].argsort(descending=True)]  # sort by confidence
             if with_embeddings:
-                e = e[x[:, 4].argsort(descending=True)]
+                e = e[x[:, 4].argsort(descending=True)]  # Filter to the same indices as the predictions
 
         # Batched NMS
         c = x[:, 5:6] * (0 if agnostic else max_wh)  # classes
@@ -974,7 +974,7 @@
         output[xi] = x[i]
         if with_embeddings:
-            embedding_output[xi] = e[i]
+            embedding_output[xi] = e[i]  # Assign the embeddings to the output
         if mps:
            output[xi] = output[xi].to(device)
            if with_embeddings:
                embedding_output[xi] = embedding_output[xi].to(device)
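
Taken together, the hunks above maintain one invariant: every mask or index tensor applied to the predictions x is also applied to the embeddings e, so row k of e always describes row k of x. A minimal sketch of that invariant on dummy tensors (shapes and thresholds here are illustrative only, not taken from the patch):

import torch

x = torch.rand(100, 85)  # candidate predictions for one image: cx, cy, w, h, obj, 80 class scores
e = torch.rand(100, 85)  # raw head output, one row per candidate (same leading dim as x)

keep = x[:, 4] > 0.25    # confidence mask, computed once
x, e = x[keep], e[keep]  # same mask, same order, applied to both tensors

order = x[:, 4].argsort(descending=True)[:30]  # top-k indices, computed once
x, e = x[order], e[order]  # same indices, same order, applied to both tensors

assert x.shape[0] == e.shape[0]  # rows stay paired

Note that each mask or index tensor here is computed once, before x changes. Recomputing a mask or argsort from an already-filtered or already-sorted x, as the class-filter and argsort hunks above do, would index e against stale row numbers, which is worth double-checking in review.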