From e5f508b9d483ea64633cb3ab778624d8d0133b8b Mon Sep 17 00:00:00 2001
From: atroyn
Date: Tue, 13 Dec 2022 16:44:18 -0800
Subject: [PATCH] More comments

---
 detect.py            |  3 +++
 models/yolo.py       |  4 ++--
 utils/dataloaders.py |  3 ++-
 utils/general.py     | 18 +++++++++---------
 4 files changed, 16 insertions(+), 12 deletions(-)

diff --git a/detect.py b/detect.py
index f275b82c..8e42fbe1 100644
--- a/detect.py
+++ b/detect.py
@@ -128,6 +128,9 @@ def run(
         with dt[2]:
             pred = non_max_suppression(pred, conf_thres, iou_thres, classes, agnostic_nms, max_det=max_det)
 
+        # Second-stage classifier (optional)
+        # pred = utils.general.apply_classifier(pred, classifier_model, im, im0s)
+
         # Process predictions
         for i, det in enumerate(pred):  # per image
             seen += 1
diff --git a/models/yolo.py b/models/yolo.py
index db792720..5a3a41df 100644
--- a/models/yolo.py
+++ b/models/yolo.py
@@ -34,7 +34,7 @@
 except ImportError:
     thop = None
 
-
+# We modify the original Detect class to output the embeddings along with the predictions
 class Detect(nn.Module):
     # YOLOv5 Detect head for detection models
     stride = None  # strides computed during build
@@ -77,7 +77,7 @@ def forward(self, x):
                 wh = (wh * 2) ** 2 * self.anchor_grid[i]  # wh
                 y = torch.cat((xy, wh, conf), 4)
             z.append(y.view(bs, self.na * nx * ny, self.no))
-            embeddings.append(x[i].view(bs, self.na * nx * ny, self.no))
+            embeddings.append(x[i].view(bs, self.na * nx * ny, self.no))  # The embeddings are the raw output of the last conv layer, in the same shape as the predictions
 
         return x if self.training else (torch.cat(z, 1),) if self.export else (torch.cat(z, 1), x, torch.cat(embeddings, 1)) if self.with_embeddings else (torch.cat(z, 1), x)
 
diff --git a/utils/dataloaders.py b/utils/dataloaders.py
index 8672a81f..734778e8 100644
--- a/utils/dataloaders.py
+++ b/utils/dataloaders.py
@@ -151,6 +151,7 @@ def create_dataloader(path,
                   worker_init_fn=seed_worker,
                   generator=generator), dataset
 
+# A new dataloader for cases where we have input images but no labels.
 def create_imageloader(path, imgsz, batch_size, stride, workers):
     dataset = LoadImages(path, imgsz, stride=int(stride), auto=False, n_workers=workers)
     batch_size = min(batch_size, len(dataset))
@@ -241,7 +242,7 @@ def __next__(self):
         self.frame += 1
         return str(self.screen), im, im0, None, s  # screen, img, original img, im0s, s
 
-
+# An iterable dataset that loads images, compatible with the IterableDataset interface.
 class LoadImages(IterableDataset):
     # YOLOv5 image/video dataloader, i.e. `python detect.py --source image.jpg/vid.mp4`
     def __init__(self, path, img_size=640, stride=32, auto=True, transforms=None, vid_stride=1, n_workers=0):
diff --git a/utils/general.py b/utils/general.py
index 830465af..2bc8b917 100644
--- a/utils/general.py
+++ b/utils/general.py
@@ -841,7 +841,8 @@ def clip_segments(boxes, shape):
         boxes[:, 0] = boxes[:, 0].clip(0, shape[1])  # x
         boxes[:, 1] = boxes[:, 1].clip(0, shape[0])  # y
 
-
+# We modify the original non_max_suppression function to return the embeddings as well
+# In practice, this means making sure that they are extracted and filtered alongside the predictions
 def non_max_suppression(
         prediction,
         conf_thres=0.25,
@@ -866,7 +867,6 @@
         embedding = prediction[2]  # Last part of the tuple has raw conv. output
         prediction = prediction[0]  # select only inference output
 
-
     device = prediction.device
     mps = 'mps' in device.type  # Apple MPS
     if mps:  # MPS not fully supported yet, convert tensors to CPU before NMS
@@ -901,7 +901,7 @@
         # x[((x[..., 2:4] < min_wh) | (x[..., 2:4] > max_wh)).any(1), 4] = 0  # width-height
         x = x[xc[xi]]  # confidence
         if with_embeddings:
-            e = embedding[xi][xc[xi]]
+            e = embedding[xi][xc[xi]]  # Filter to the same indices as the predictions
 
         # Cat apriori labels if autolabelling
         if labels and len(labels[xi]):
@@ -928,18 +928,18 @@
             i, j = (x[:, 5:mi] > conf_thres).nonzero(as_tuple=False).T
             x = torch.cat((box[i], x[i, 5 + j, None], j[:, None].float(), mask[i]), 1)
             if with_embeddings:
-                e = e[i]
+                e = e[i]  # Filter to the same indices as the predictions
         else:  # best class only
             conf, j = x[:, 5:mi].max(1, keepdim=True)
             x = torch.cat((box, conf, j.float(), mask), 1)[conf.view(-1) > conf_thres]
             if with_embeddings:
-                e = e[conf.view(-1) > conf_thres]
+                e = e[conf.view(-1) > conf_thres]  # Filter to the same indices as the predictions. Note that no concatenation is needed here.
 
         # Filter by class
         if classes is not None:
             x = x[(x[:, 5:6] == torch.tensor(classes, device=x.device)).any(1)]
             if with_embeddings:
-                e = e[(x[:, 5:6] == torch.tensor(classes, device=x.device)).any(1)]
+                e = e[(x[:, 5:6] == torch.tensor(classes, device=x.device)).any(1)]  # Filter to the same indices as the predictions
 
         # Apply finite constraint
         # if not torch.isfinite(x).all():
@@ -952,11 +952,11 @@
         elif n > max_nms:  # excess boxes
             x = x[x[:, 4].argsort(descending=True)[:max_nms]]  # sort by confidence
             if with_embeddings:
-                e = e[x[:, 4].argsort(descending=True)[:max_nms]]
+                e = e[x[:, 4].argsort(descending=True)[:max_nms]]  # Filter to the same indices as the predictions
         else:
             x = x[x[:, 4].argsort(descending=True)]  # sort by confidence
             if with_embeddings:
-                e = e[x[:, 4].argsort(descending=True)]
+                e = e[x[:, 4].argsort(descending=True)]  # Filter to the same indices as the predictions
 
         # Batched NMS
         c = x[:, 5:6] * (0 if agnostic else max_wh)  # classes
@@ -974,7 +974,7 @@
         output[xi] = x[i]
         if with_embeddings:
-            embedding_output[xi] = e[i]
+            embedding_output[xi] = e[i]  # Assign the embeddings to the output
         if mps:
            output[xi] = output[xi].to(device)
            if with_embeddings:
                embedding_output[xi] = embedding_output[xi].to(device)
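
Taken together, the hunks above maintain one invariant: every mask or index tensor applied to the predictions x is also applied to the embeddings e, so row k of e always describes row k of x. A minimal sketch of that invariant on dummy tensors (shapes and thresholds here are illustrative only, not taken from the patch):

import torch

x = torch.rand(100, 85)  # candidate predictions for one image: cx, cy, w, h, obj, 80 class scores
e = torch.rand(100, 85)  # raw head output, one row per candidate (same leading dim as x)

keep = x[:, 4] > 0.25    # confidence mask, computed once
x, e = x[keep], e[keep]  # same mask, same order, applied to both tensors

order = x[:, 4].argsort(descending=True)[:30]  # top-k indices, computed once
x, e = x[order], e[order]  # same indices, same order, applied to both tensors

assert x.shape[0] == e.shape[0]  # rows stay paired

Note that each mask or index tensor here is computed once, before x changes. Recomputing a mask or argsort from an already-filtered or already-sorted x, as the class-filter and argsort hunks above do, would index e against stale row numbers, which is worth double-checking in review.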