diff --git a/feat/FastDetector.py b/feat/FastDetector.py deleted file mode 100644 index 95be7069..00000000 --- a/feat/FastDetector.py +++ /dev/null @@ -1,669 +0,0 @@ -import json -from tqdm import tqdm -import numpy as np -import pandas as pd -from huggingface_hub import hf_hub_download, PyTorchModelHubMixin -from collections import OrderedDict - -from feat.emo_detectors.ResMaskNet.resmasknet_test import ( - ResMasking, -) -from feat.identity_detectors.facenet.facenet_model import InceptionResnetV1 -from feat.facepose_detectors.img2pose.deps.models import ( - FasterDoFRCNN, - postprocess_img2pose, -) -from feat.au_detectors.StatLearning.SL_test import XGBClassifier, SVMClassifier -from feat.emo_detectors.StatLearning.EmoSL_test import EmoSVMClassifier -from feat.landmark_detectors.mobilefacenet_test import MobileFaceNet -from feat.landmark_detectors.basenet_test import MobileNet_GDConv -from feat.landmark_detectors.pfld_compressed_test import PFLDInference -from feat.pretrained import load_model_weights, AU_LANDMARK_MAP -from feat.utils import ( - set_torch_device, - openface_2d_landmark_columns, - FEAT_EMOTION_COLUMNS, - FEAT_FACEBOX_COLUMNS, - FEAT_FACEPOSE_COLUMNS_6D, - FEAT_IDENTITY_COLUMNS, -) -from feat.utils.io import get_resource_path -from feat.utils.image_operations import ( - convert_image_to_tensor, - extract_face_from_bbox_torch, - inverse_transform_landmarks_torch, - extract_hog_features, - convert_bbox_output, - compute_original_image_size, -) -from feat.data import Fex, ImageDataset, TensorDataset, VideoDataset -from skops.io import load, get_untrusted_types -from safetensors.torch import load_file -import torch -import torch.nn as nn -from torch.utils.data import DataLoader -from torchvision.models.detection.backbone_utils import resnet_fpn_backbone -from torchvision.transforms import Compose, Normalize -import sys - -sys.modules["__main__"].__dict__["XGBClassifier"] = XGBClassifier -sys.modules["__main__"].__dict__["SVMClassifier"] = SVMClassifier -sys.modules["__main__"].__dict__["EmoSVMClassifier"] = EmoSVMClassifier - - -class FastDetector(nn.Module, PyTorchModelHubMixin): - def __init__( - self, - landmark_model="mobilefacenet", - au_model="xgb", - emotion_model="resmasknet", - identity_model="facenet", - device="cpu", - ): - super(FastDetector, self).__init__() - - self.info = dict( - face_model="img2pose", - landmark_model=None, - emotion_model=None, - facepose_model="img2pose", - au_model=None, - identity_model=None, - ) - self.device = set_torch_device(device) - - # Load Model Configurations - facepose_config_file = hf_hub_download( - repo_id="py-feat/img2pose", - filename="config.json", - cache_dir=get_resource_path(), - ) - with open(facepose_config_file, "r") as f: - facepose_config = json.load(f) - - # Initialize img2pose - backbone = resnet_fpn_backbone(backbone_name="resnet18", weights=None) - backbone.eval() - backbone.to(self.device) - self.facepose_detector = FasterDoFRCNN( - backbone=backbone, - num_classes=2, - min_size=facepose_config["min_size"], - max_size=facepose_config["max_size"], - pose_mean=torch.tensor(facepose_config["pose_mean"]), - pose_stddev=torch.tensor(facepose_config["pose_stddev"]), - threed_68_points=torch.tensor(facepose_config["threed_points"]), - rpn_pre_nms_top_n_test=facepose_config["rpn_pre_nms_top_n_test"], - rpn_post_nms_top_n_test=facepose_config["rpn_post_nms_top_n_test"], - bbox_x_factor=facepose_config["bbox_x_factor"], - bbox_y_factor=facepose_config["bbox_y_factor"], - expand_forehead=facepose_config["expand_forehead"], 
- ) - facepose_model_file = hf_hub_download( - repo_id="py-feat/img2pose", - filename="model.safetensors", - cache_dir=get_resource_path(), - ) - facepose_checkpoint = load_file(facepose_model_file) - self.facepose_detector.load_state_dict(facepose_checkpoint, load_model_weights) - self.facepose_detector.eval() - self.facepose_detector.to(self.device) - # self.facepose_detector = torch.compile(self.facepose_detector) - - # Initialize Landmark Detector - self.info["landmark_model"] = landmark_model - if landmark_model is not None: - if landmark_model == "mobilefacenet": - self.face_size = 112 - self.landmark_detector = MobileFaceNet( - [self.face_size, self.face_size], 136, device=self.device - ) - landmark_model_file = hf_hub_download( - repo_id="py-feat/mobilefacenet", - filename="mobilefacenet_model_best.pth.tar", - cache_dir=get_resource_path(), - ) - landmark_state_dict = torch.load( - landmark_model_file, map_location=self.device, weights_only=True - )["state_dict"] # Ensure Model weights are Float32 for MPS - elif landmark_model == "mobilenet": - self.face_size = 224 - self.landmark_detector = MobileNet_GDConv(136) - landmark_model_file = hf_hub_download( - repo_id="py-feat/mobilenet", - filename="mobilenet_224_model_best_gdconv_external.pth.tar", - cache_dir=get_resource_path(), - ) - mobilenet_state_dict = torch.load( - landmark_model_file, map_location=self.device, weights_only=True - )["state_dict"] # Ensure Model weights are Float32 for MPS - landmark_state_dict = OrderedDict() - for k, v in mobilenet_state_dict.items(): - if "module." in k: - k = k.replace("module.", "") - landmark_state_dict[k] = v - elif landmark_model == "pfld": - self.face_size = 112 - self.landmark_detector = PFLDInference() - landmark_model_file = hf_hub_download( - repo_id="py-feat/pfld", - filename="pfld_model_best.pth.tar", - cache_dir=get_resource_path(), - ) - landmark_state_dict = torch.load( - landmark_model_file, map_location=self.device, weights_only=True - )["state_dict"] # Ensure Model weights are Float32 for MPS - else: - raise ValueError("{landmark_model} is not currently supported.") - self.landmark_detector.load_state_dict(landmark_state_dict) - self.landmark_detector.eval() - self.landmark_detector.to(self.device) - # self.landmark_detector = torch.compile(self.landmark_detector) - else: - self.landmark_detector = None - - # Initialize AU Detector - self.info["au_model"] = au_model - if au_model is not None: - if self.landmark_detector is not None: - if au_model == "xgb": - self.au_detector = XGBClassifier() - au_model_path = hf_hub_download( - repo_id="py-feat/xgb_au", - filename="xgb_au_classifier.skops", - cache_dir=get_resource_path(), - ) - - elif au_model == "svm": - self.au_detector = SVMClassifier() - au_model_path = hf_hub_download( - repo_id="py-feat/svm_au", - filename="svm_au_classifier.skops", - cache_dir=get_resource_path(), - ) - else: - raise ValueError("{au_model} is not currently supported.") - - au_unknown_types = get_untrusted_types(file=au_model_path) - loaded_au_model = load(au_model_path, trusted=au_unknown_types) - self.au_detector.load_weights( - scaler_upper=loaded_au_model.scaler_upper, - pca_model_upper=loaded_au_model.pca_model_upper, - scaler_lower=loaded_au_model.scaler_lower, - pca_model_lower=loaded_au_model.pca_model_lower, - scaler_full=loaded_au_model.scaler_full, - pca_model_full=loaded_au_model.pca_model_full, - classifiers=loaded_au_model.classifiers, - ) - else: - raise ValueError( - "Landmark Detector is required for AU Detection with {au_model}." 
- ) - else: - self.au_detector = None - - # Initialize Emotion Detector - self.info["emotion_model"] = emotion_model - if emotion_model is not None: - if emotion_model == "resmasknet": - emotion_config_file = hf_hub_download( - repo_id="py-feat/resmasknet", - filename="config.json", - cache_dir=get_resource_path(), - ) - with open(emotion_config_file, "r") as f: - emotion_config = json.load(f) - - self.emotion_detector = ResMasking( - "", in_channels=emotion_config["in_channels"] - ) - self.emotion_detector.fc = nn.Sequential( - nn.Dropout(0.4), nn.Linear(512, emotion_config["num_classes"]) - ) - emotion_model_file = hf_hub_download( - repo_id="py-feat/resmasknet", - filename="ResMaskNet_Z_resmasking_dropout1_rot30.pth", - cache_dir=get_resource_path(), - ) - emotion_checkpoint = torch.load( - emotion_model_file, map_location=device, weights_only=True - )["net"] - self.emotion_detector.load_state_dict(emotion_checkpoint) - self.emotion_detector.eval() - self.emotion_detector.to(self.device) - # self.emotion_detector = torch.compile(self.emotion_detector) - elif emotion_model == "svm": - if self.landmark_detector is not None: - self.emotion_detector = EmoSVMClassifier() - emotion_model_path = hf_hub_download( - repo_id="py-feat/svm_emo", - filename="svm_emo_classifier.skops", - cache_dir=get_resource_path(), - ) - emotion_unknown_types = get_untrusted_types(file=emotion_model_path) - loaded_emotion_model = load( - emotion_model_path, trusted=emotion_unknown_types - ) - self.emotion_detector.load_weights( - scaler_full=loaded_emotion_model.scaler_full, - pca_model_full=loaded_emotion_model.pca_model_full, - classifiers=loaded_emotion_model.classifiers, - ) - else: - raise ValueError( - "Landmark Detector is required for Emotion Detection with {emotion_model}." - ) - - else: - raise ValueError("{emotion_model} is not currently supported.") - else: - self.emotion_detector = None - - # Initialize Identity Detecctor - facenet - self.info["identity_model"] = identity_model - if identity_model is not None: - if identity_model == "facenet": - self.identity_detector = InceptionResnetV1( - pretrained=None, - classify=False, - num_classes=None, - dropout_prob=0.6, - device=self.device, - ) - self.identity_detector.logits = nn.Linear(512, 8631) - identity_model_file = hf_hub_download( - repo_id="py-feat/facenet", - filename="facenet_20180402_114759_vggface2.pth", - cache_dir=get_resource_path(), - ) - self.identity_detector.load_state_dict( - torch.load( - identity_model_file, map_location=device, weights_only=True - ) - ) - self.identity_detector.eval() - self.identity_detector.to(self.device) - # self.identity_detector = torch.compile(self.identity_detector) - else: - raise ValueError("{identity_model} is not currently supported.") - else: - self.identity_detector = None - - @torch.inference_mode() - def detect_faces(self, images, face_size=112, face_detection_threshold=0.5): - """ - detect faces and poses in a batch of images using img2pose - - Args: - img (torch.Tensor): Tensor of shape (B, C, H, W) representing the images - face_size (int): Output size to resize face after cropping. 
- - Returns: - Fex: Prediction results dataframe - """ - - # img2pose - frames = convert_image_to_tensor(images, img_type="float32") / 255.0 - frames.to(self.device) - - batch_results = [] - for i in range(frames.size(0)): - single_frame = frames[i, ...].unsqueeze(0) # Extract single image from batch - img2pose_output = self.facepose_detector(single_frame.to(self.device)) - img2pose_output = postprocess_img2pose( - img2pose_output[0], detection_threshold=face_detection_threshold - ) - bbox = img2pose_output["boxes"] - poses = img2pose_output["dofs"] - facescores = img2pose_output["scores"] - - # Extract faces from bbox - if bbox.numel() != 0: - extracted_faces, new_bbox = extract_face_from_bbox_torch( - single_frame, bbox, face_size=face_size - ) - else: # No Face Detected - let's test of nans will work - extracted_faces = torch.zeros((1, 3, face_size, face_size)) - # bbox = torch.zeros((1,4)) - # new_bbox = torch.zeros((1,4)) - # facescores = torch.zeros((1)) - # poses = torch.zeros((1,6)) - # extracted_faces = torch.full((1, 3, face_size, face_size), float('nan')) - bbox = torch.full((1, 4), float("nan")) - new_bbox = torch.full((1, 4), float("nan")) - facescores = torch.zeros((1)) - poses = torch.full((1, 6), float("nan")) - - frame_results = { - "face_id": i, - "faces": extracted_faces, - "boxes": bbox, - "new_boxes": new_bbox, - "poses": poses, - "scores": facescores, - } - - # Extract Faces separately for Resmasknet - if self.info["emotion_model"] == "resmasknet": - if torch.all(torch.isnan(bbox)): # No Face Detected - frame_results["resmasknet_faces"] = torch.full( - (1, 3, 224, 224), float("nan") - ) - # frame_results["resmasknet_faces"] = torch.zeros((1, 3, 224, 224)) - else: - resmasknet_faces, _ = extract_face_from_bbox_torch( - single_frame, bbox, expand_bbox=1.1, face_size=224 - ) - frame_results["resmasknet_faces"] = resmasknet_faces - - batch_results.append(frame_results) - - return batch_results - - @torch.inference_mode() - def forward(self, faces_data): - """ - Run Model Inference on detected faces. - - Args: - faces_data (list of dict): Detected faces and associated data from `detect_faces`. 
- - Returns: - Fex: Prediction results dataframe - """ - - extracted_faces = torch.cat([face["faces"] for face in faces_data], dim=0) - new_bboxes = torch.cat([face["new_boxes"] for face in faces_data], dim=0) - n_faces = extracted_faces.shape[0] - - if self.landmark_detector is not None: - if self.info["landmark_model"].lower() == "mobilenet": - extracted_faces = Compose( - [Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])] - )(extracted_faces) - landmarks = self.landmark_detector.forward( - extracted_faces.to(self.device) - ) - if self.info["landmark_model"].lower() == "mobilefacenet": - landmarks = self.landmark_detector.forward( - extracted_faces.to(self.device) - )[0] - else: - landmarks = self.landmark_detector.forward( - extracted_faces.to(self.device) - ) - new_landmarks = inverse_transform_landmarks_torch(landmarks, new_bboxes) - else: - new_landmarks = torch.full((n_faces, 136), float("nan")) - - if self.emotion_detector is not None: - if self.info["emotion_model"] == "resmasknet": - resmasknet_faces = torch.cat( - [face["resmasknet_faces"] for face in faces_data], dim=0 - ) - emotions = self.emotion_detector.forward(resmasknet_faces.to(self.device)) - emotions = torch.softmax(emotions, 1) - elif self.info["emotion_model"] == "svm": - hog_features, emo_new_landmarks = extract_hog_features( - extracted_faces, landmarks - ) - emotions = self.emotion_detector.detect_emo( - frame=hog_features, landmarks=[emo_new_landmarks] - ) - emotions = torch.tensor(emotions) - else: - emotions = torch.full((n_faces, 7), float("nan")) - - if self.identity_detector is not None: - identity_embeddings = self.identity_detector.forward( - extracted_faces.to(self.device) - ) - else: - identity_embeddings = torch.full((n_faces, 512), float("nan")) - - if self.au_detector is not None: - hog_features, au_new_landmarks = extract_hog_features( - extracted_faces, landmarks - ) - aus = self.au_detector.detect_au( - frame=hog_features, landmarks=[au_new_landmarks] - ) - else: - aus = torch.full((n_faces, 20), float("nan")) - - # Create Fex Output Representation - bboxes = torch.cat( - [ - convert_bbox_output( - face_output["new_boxes"].to(self.device), - face_output["scores"].to(self.device), - ) - for face_output in faces_data - ], - dim=0, - ) - feat_faceboxes = pd.DataFrame( - bboxes.cpu().detach().numpy(), - columns=FEAT_FACEBOX_COLUMNS, - ) - - poses = torch.cat( - [face_output["poses"].to(self.device) for face_output in faces_data], dim=0 - ) - feat_poses = pd.DataFrame( - poses.cpu().detach().numpy(), columns=FEAT_FACEPOSE_COLUMNS_6D - ) - - reshape_landmarks = new_landmarks.reshape(new_landmarks.shape[0], 68, 2) - reordered_landmarks = torch.cat( - [reshape_landmarks[:, :, 0], reshape_landmarks[:, :, 1]], dim=1 - ) - feat_landmarks = pd.DataFrame( - reordered_landmarks.cpu().detach().numpy(), - columns=openface_2d_landmark_columns, - ) - - feat_aus = pd.DataFrame(aus, columns=AU_LANDMARK_MAP["Feat"]) - - feat_emotions = pd.DataFrame( - emotions.cpu().detach().numpy(), columns=FEAT_EMOTION_COLUMNS - ) - - feat_identities = pd.DataFrame( - identity_embeddings.cpu().detach().numpy(), columns=FEAT_IDENTITY_COLUMNS[1:] - ) - - return Fex( - pd.concat( - [ - feat_faceboxes, - feat_landmarks, - feat_poses, - feat_aus, - feat_emotions, - feat_identities, - ], - axis=1, - ), - au_columns=AU_LANDMARK_MAP["Feat"], - emotion_columns=FEAT_EMOTION_COLUMNS, - facebox_columns=FEAT_FACEBOX_COLUMNS, - landmark_columns=openface_2d_landmark_columns, - facepose_columns=FEAT_FACEPOSE_COLUMNS_6D, - 
identity_columns=FEAT_IDENTITY_COLUMNS[1:], - detector="Feat", - face_model=self.info["face_model"], - landmark_model=self.info["landmark_model"], - au_model=self.info["au_model"], - emotion_model=self.info["emotion_model"], - facepose_model=self.info["facepose_model"], - identity_model=self.info["identity_model"], - ) - - def detect( - self, - inputs, - data_type="image", - output_size=None, - batch_size=1, - num_workers=0, - pin_memory=False, - face_identity_threshold=0.8, - face_detection_threshold=0.5, - skip_frames=None, - progress_bar=True, - **kwargs, - ): - """ - Detects FEX from one or more image files. - - Args: - inputs (list of str, torch.Tensor): Path to a list of paths to image files or torch.Tensor of images (B, C, H, W) - data_type (str): type of data to be processed; Default 'image' ['image', 'tensor', 'video'] - output_size (int): image size to rescale all image preserving aspect ratio. - batch_size (int): how many batches of images you want to run at one shot. - num_workers (int): how many subprocesses to use for data loading. - pin_memory (bool): If ``True``, the data loader will copy Tensors into CUDA pinned memory before returning them. - face_identity_threshold (float): value between 0-1 to determine similarity of person using face identity embeddings; Default >= 0.8 - face_detection_threshold (float): value between 0-1 to determine if a face was detected; Default >= 0.5 - skip_frames (int or None): number of frames to skip to speed up inference (video only); Default None - progress_bar (bool): Whether to show the tqdm progress bar. Default is True. - **kwargs: additional detector-specific kwargs - - Returns: - pd.DataFrame: Concatenated results for all images in the batch - """ - - if data_type.lower() == "image": - data_loader = DataLoader( - ImageDataset( - inputs, - output_size=output_size, - preserve_aspect_ratio=True, - padding=True, - ), - num_workers=num_workers, - batch_size=batch_size, - pin_memory=pin_memory, - shuffle=False, - ) - elif data_type.lower() == "tensor": - data_loader = DataLoader( - TensorDataset(inputs), - batch_size=batch_size, - shuffle=False, - num_workers=num_workers, - pin_memory=pin_memory, - ) - elif data_type.lower() == "video": - dataset = VideoDataset( - inputs, skip_frames=skip_frames, output_size=output_size - ) - data_loader = DataLoader( - dataset, - num_workers=num_workers, - batch_size=batch_size, - pin_memory=pin_memory, - shuffle=False, - ) - - data_iterator = tqdm(data_loader) if progress_bar else data_loader - - batch_output = [] - frame_counter = 0 - - try: - _ = next(enumerate(tqdm(data_loader))) - except RuntimeError as e: - raise ValueError( - f"When using `batch_size > 1`, all images must either have the same dimension or `output_size` should be something other than `None` to pad images prior to processing\n{e}" - ) - - for batch_id, batch_data in enumerate(data_iterator): - faces_data = self.detect_faces( - batch_data["Image"], - face_size=self.face_size if hasattr(self, "face_size") else 112, - face_detection_threshold=face_detection_threshold, - ) - batch_results = self.forward(faces_data) - - # Create metadata for each frame - file_names = [] - frame_ids = [] - for i, face in enumerate(faces_data): - n_faces = len(face["scores"]) - if data_type.lower() == "video": - current_frame_id = batch_data["Frame"].detach().numpy()[i] - else: - current_frame_id = frame_counter + i - frame_ids.append(np.repeat(current_frame_id, n_faces)) - file_names.append(np.repeat(batch_data["FileName"][i], n_faces)) - 
batch_results["input"] = np.concatenate(file_names) - batch_results["frame"] = np.concatenate(frame_ids) - - # Invert the face boxes and landmarks based on the padded output size - for j, frame_idx in enumerate(batch_results["frame"].unique()): - batch_results.loc[ - batch_results["frame"] == frame_idx, ["FrameHeight", "FrameWidth"] - ] = ( - compute_original_image_size(batch_data)[j, :] - .repeat( - len( - batch_results.loc[ - batch_results["frame"] == frame_idx, "frame" - ] - ), - 1, - ) - .numpy() - ) - batch_results.loc[batch_results["frame"] == frame_idx, "FaceRectX"] = ( - batch_results.loc[batch_results["frame"] == frame_idx, "FaceRectX"] - - batch_data["Padding"]["Left"].detach().numpy()[j] - ) / batch_data["Scale"].detach().numpy()[j] - batch_results.loc[batch_results["frame"] == frame_idx, "FaceRectY"] = ( - batch_results.loc[batch_results["frame"] == frame_idx, "FaceRectY"] - - batch_data["Padding"]["Top"].detach().numpy()[j] - ) / batch_data["Scale"].detach().numpy()[j] - batch_results.loc[ - batch_results["frame"] == frame_idx, "FaceRectWidth" - ] = ( - ( - batch_results.loc[ - batch_results["frame"] == frame_idx, "FaceRectWidth" - ] - ) - / batch_data["Scale"].detach().numpy()[j] - ) - batch_results.loc[ - batch_results["frame"] == frame_idx, "FaceRectHeight" - ] = ( - ( - batch_results.loc[ - batch_results["frame"] == frame_idx, "FaceRectHeight" - ] - ) - / batch_data["Scale"].detach().numpy()[j] - ) - - for i in range(68): - batch_results.loc[batch_results["frame"] == frame_idx, f"x_{i}"] = ( - batch_results.loc[batch_results["frame"] == frame_idx, f"x_{i}"] - - batch_data["Padding"]["Left"].detach().numpy()[j] - ) / batch_data["Scale"].detach().numpy()[j] - batch_results.loc[batch_results["frame"] == frame_idx, f"y_{i}"] = ( - batch_results.loc[batch_results["frame"] == frame_idx, f"y_{i}"] - - batch_data["Padding"]["Top"].detach().numpy()[j] - ) / batch_data["Scale"].detach().numpy()[j] - - batch_output.append(batch_results) - frame_counter += 1 * batch_size - batch_output = pd.concat(batch_output) - batch_output.reset_index(drop=True, inplace=True) - if data_type.lower() == "video": - batch_output["approx_time"] = [ - dataset.calc_approx_frame_time(x) - for x in batch_output["frame"].to_numpy() - ] - batch_output.compute_identities(threshold=face_identity_threshold, inplace=True) - return batch_output diff --git a/feat/MPDetector.py b/feat/MPDetector.py index c73d7c3e..aac1d06f 100644 --- a/feat/MPDetector.py +++ b/feat/MPDetector.py @@ -50,10 +50,8 @@ def get_camera_intrinsics(batch_hw_tensor, focal_length=None): Computes the camera intrinsic matrix for a batch of images. Args: - batch_hw_tensor (torch.Tensor): A tensor of shape [B, 2] where B is the batch size, - and each entry contains [H, W] for the height and width of the images. - focal_length (torch.Tensor, optional): A tensor of shape [B] representing the focal length for each image in the batch. - If None, the focal length will default to the image width for each image. + batch_hw_tensor (torch.Tensor): A tensor of shape [B, 2] where B is the batch size, and each entry contains [H, W] for the height and width of the images. + focal_length (torch.Tensor, optional): A tensor of shape [B] representing the focal length for each image in the batch. If None, the focal length will default to the image width for each image. Returns: K (torch.Tensor): A tensor of shape [B, 3, 3] containing the camera intrinsic matrices for each image in the batch. 
@@ -310,6 +308,157 @@ def estimate_face_pose(pts_3d, K, max_iter=100, lr=1e-3, return_euler_angles=Tru return R, t +def plot_face_landmarks( + fex, + frame_idx, + ax=None, + oval_color="white", + oval_linestyle="-", + oval_linewidth=3, + tesselation_color="gray", + tesselation_linestyle="-", + tesselation_linewidth=1, + mouth_color="white", + mouth_linestyle="-", + mouth_linewidth=3, + eye_color="navy", + eye_linestyle="-", + eye_linewidth=2, + iris_color="skyblue", + iris_linestyle="-", + iris_linewidth=2, +): + """Plots face landmarks on the given frame using specified styles for each part. + + Args: + fex: DataFrame containing face landmarks (x, y coordinates). + frame_idx: Index of the frame to plot. + ax: Matplotlib axis to draw on. If None, a new axis is created. + oval_color, tesselation_color, mouth_color, eye_color, iris_color: Colors for each face part. + oval_linestyle, tesselation_linestyle, mouth_linestyle, eye_linestyle, iris_linestyle: Linestyle for each face part. + oval_linewidth, tesselation_linewidth, mouth_linewidth, eye_linewidth, iris_linewidth: Linewidth for each face part. + n_faces: Number of faces in the frame. If None, will be determined from fex. + """ + if ax is None: + fig, ax = plt.subplots(figsize=(10, 10)) + + # Get frame data + fex_frame = fex.query("frame == @frame_idx") + n_faces_frame = fex_frame.shape[0] + + # Add the frame image + ax.imshow(Image.open(fex_frame["input"].unique()[0])) + + # Helper function to draw lines for a set of connections + def draw_connections(face_idx, connections, color, linestyle, linewidth): + for connection in connections: + start = connection.start + end = connection.end + line = plt.Line2D( + [fex.loc[face_idx, f"x_{start}"], fex.loc[face_idx, f"x_{end}"]], + [fex.loc[face_idx, f"y_{start}"], fex.loc[face_idx, f"y_{end}"]], + color=color, + linestyle=linestyle, + linewidth=linewidth, + ) + ax.add_line(line) + + # Face tessellation + for face in range(n_faces_frame): + draw_connections( + face, + FaceLandmarksConnections.FACE_LANDMARKS_TESSELATION, + tesselation_color, + tesselation_linestyle, + tesselation_linewidth, + ) + + # Mouth + for face in range(n_faces_frame): + draw_connections( + face, + FaceLandmarksConnections.FACE_LANDMARKS_LIPS, + mouth_color, + mouth_linestyle, + mouth_linewidth, + ) + + # Left iris + for face in range(n_faces_frame): + draw_connections( + face, + FaceLandmarksConnections.FACE_LANDMARKS_LEFT_IRIS, + iris_color, + iris_linestyle, + iris_linewidth, + ) + + # Left eye + for face in range(n_faces_frame): + draw_connections( + face, + FaceLandmarksConnections.FACE_LANDMARKS_LEFT_EYE, + eye_color, + eye_linestyle, + eye_linewidth, + ) + + # Left eyebrow + for face in range(n_faces_frame): + draw_connections( + face, + FaceLandmarksConnections.FACE_LANDMARKS_LEFT_EYEBROW, + eye_color, + eye_linestyle, + eye_linewidth, + ) + + # Right iris + for face in range(n_faces_frame): + draw_connections( + face, + FaceLandmarksConnections.FACE_LANDMARKS_RIGHT_IRIS, + iris_color, + iris_linestyle, + iris_linewidth, + ) + + # Right eye + for face in range(n_faces_frame): + draw_connections( + face, + FaceLandmarksConnections.FACE_LANDMARKS_RIGHT_EYE, + eye_color, + eye_linestyle, + eye_linewidth, + ) + + # Right eyebrow + for face in range(n_faces_frame): + draw_connections( + face, + FaceLandmarksConnections.FACE_LANDMARKS_RIGHT_EYEBROW, + eye_color, + eye_linestyle, + eye_linewidth, + ) + + # Face oval + for face in range(n_faces_frame): + draw_connections( + face, + 
FaceLandmarksConnections.FACE_LANDMARKS_FACE_OVAL, + oval_color, + oval_linestyle, + oval_linewidth, + ) + + # Optionally turn off axis for a clean plot + ax.axis("off") + + return ax + + class MPDetector(nn.Module, PyTorchModelHubMixin): def __init__( self, diff --git a/feat/detector.py b/feat/detector.py index 4a450183..24d7f697 100644 --- a/feat/detector.py +++ b/feat/detector.py @@ -1,1404 +1,673 @@ -""" -Main Detector class. The Detector class wraps other pre-trained models -(e.g. face detector, au detector) and provides a high-level API to make it easier to -perform detection -""" - -import os +import json +from tqdm import tqdm import numpy as np import pandas as pd -from skimage.feature import hog +from huggingface_hub import hf_hub_download, PyTorchModelHubMixin +from collections import OrderedDict + +from feat.emo_detectors.ResMaskNet.resmasknet_test import ( + ResMasking, +) +from feat.identity_detectors.facenet.facenet_model import InceptionResnetV1 +from feat.facepose_detectors.img2pose.deps.models import ( + FasterDoFRCNN, + postprocess_img2pose, +) +from feat.au_detectors.StatLearning.SL_test import XGBClassifier, SVMClassifier +from feat.emo_detectors.StatLearning.EmoSL_test import EmoSVMClassifier +from feat.landmark_detectors.mobilefacenet_test import MobileFaceNet +from feat.landmark_detectors.basenet_test import MobileNet_GDConv +from feat.landmark_detectors.pfld_compressed_test import PFLDInference +from feat.pretrained import load_model_weights, AU_LANDMARK_MAP from feat.utils import ( + set_torch_device, openface_2d_landmark_columns, FEAT_EMOTION_COLUMNS, FEAT_FACEBOX_COLUMNS, - FEAT_FACEPOSE_COLUMNS_3D, FEAT_FACEPOSE_COLUMNS_6D, - FEAT_TIME_COLUMNS, FEAT_IDENTITY_COLUMNS, - set_torch_device, - is_list_of_lists_empty, ) from feat.utils.io import get_resource_path from feat.utils.image_operations import ( - extract_face_from_landmarks, - extract_face_from_bbox, convert_image_to_tensor, - BBox, -) -from feat.pretrained import ( - get_pretrained_models, - fetch_model, - AU_LANDMARK_MAP, - load_model_weights, -) -from feat.data import ( - Fex, - ImageDataset, - VideoDataset, - _inverse_face_transform, - _inverse_landmark_transform, + extract_face_from_bbox_torch, + inverse_transform_landmarks_torch, + extract_hog_features, + convert_bbox_output, + compute_original_image_size, ) +from feat.data import Fex, ImageDataset, TensorDataset, VideoDataset +from skops.io import load, get_untrusted_types +from safetensors.torch import load_file import torch +import torch.nn as nn from torch.utils.data import DataLoader -from torchvision.transforms import Compose, Normalize -import torchvision.transforms as transforms from torchvision.models.detection.backbone_utils import resnet_fpn_backbone -from feat.facepose_detectors.img2pose.deps.models import postprocess_img2pose -import logging +from torchvision.transforms import Compose, Normalize +import sys import warnings -from tqdm import tqdm -from huggingface_hub import hf_hub_download -from safetensors.torch import load_file -import json + +sys.modules["__main__"].__dict__["XGBClassifier"] = XGBClassifier +sys.modules["__main__"].__dict__["SVMClassifier"] = SVMClassifier +sys.modules["__main__"].__dict__["EmoSVMClassifier"] = EmoSVMClassifier # Supress sklearn warning about pickled estimators and diff sklearn versions warnings.filterwarnings("ignore", category=UserWarning, module="sklearn") -class Detector(object): +class Detector(nn.Module, PyTorchModelHubMixin): def __init__( self, - face_model="retinaface", 
landmark_model="mobilefacenet", au_model="xgb", emotion_model="resmasknet", - facepose_model="img2pose", identity_model="facenet", device="cpu", - n_jobs=1, - verbose=False, - **kwargs, ): - """Detector class to detect FEX from images or videos. - - Detector is a class used to detect faces, facial landmarks, emotions, and action units from images and videos. - - Args: - n_jobs (int, default=1): Number of processes to use for extraction. - device (str): specify device to process data (default='cpu'), can be - ['auto', 'cpu', 'cuda', 'mps'] - verbose (bool): print logging and debug messages during operation - **kwargs: you can pass each detector specific kwargs using a dictionary - like: `face_model_kwargs = {...}, au_model_kwargs={...}, ...` - - Attributes: - info (dict): - n_jobs (int): Number of jobs to be used in parallel. - face_model (str, default=retinaface): Name of face detection model - landmark_model (str, default=mobilenet): Nam eof landmark model - au_model (str, default=svm): Name of Action Unit detection model - emotion_model (str, default=resmasknet): Path to emotion detection model. - facepose_model (str, default=img2pose): Name of headpose detection model. - identity_model (str, default=facenet): Name of identity detection model. - face_detection_columns (list): Column names for face detection ouput (x, y, w, h) - face_landmark_columns (list): Column names for face landmark output (x0, y0, x1, y1, ...) - emotion_model_columns (list): Column names for emotion model output - emotion_model_columns (list): Column names for emotion model output - mapper (dict): Class names for emotion model output by index. - input_shape (dict) - - face_detector: face detector object - face_landmark: face_landmark object - emotion_model: emotion_model object + super(Detector, self).__init__() - Examples: - >> detector = Detector(n_jobs=1) - >> detector.detect_image(["input.jpg"]) - >> detector.detect_video("input.mp4") - """ - - # Initial info dict with model names only self.info = dict( - face_model=None, + face_model="img2pose", landmark_model=None, emotion_model=None, - facepose_model=None, + facepose_model="img2pose", au_model=None, identity_model=None, - n_jobs=n_jobs, ) - self.verbose = verbose - # Setup verbosity - if self.verbose: - logging.basicConfig(level=logging.INFO) - logging.info("Verbose logging enabled") - - # Setup device self.device = set_torch_device(device) - # Load Model Configs - with open(os.path.join(get_resource_path(), "model_config.json"), "r") as file: - self.model_configs = json.load(file) - # Verify model names and download if necessary - face, landmark, au, emotion, facepose, identity = get_pretrained_models( - face_model, - landmark_model, - au_model, - emotion_model, - facepose_model, - identity_model, - verbose, + # Load Model Configurations + facepose_config_file = hf_hub_download( + repo_id="py-feat/img2pose", + filename="config.json", + cache_dir=get_resource_path(), ) - - self._init_detectors( - face, - landmark, - au, - emotion, - facepose, - identity, - openface_2d_landmark_columns, - **kwargs, + with open(facepose_config_file, "r") as f: + facepose_config = json.load(f) + + # Initialize img2pose + backbone = resnet_fpn_backbone(backbone_name="resnet18", weights=None) + backbone.eval() + backbone.to(self.device) + self.facepose_detector = FasterDoFRCNN( + backbone=backbone, + num_classes=2, + min_size=facepose_config["min_size"], + max_size=facepose_config["max_size"], + pose_mean=torch.tensor(facepose_config["pose_mean"]), + 
pose_stddev=torch.tensor(facepose_config["pose_stddev"]), + threed_68_points=torch.tensor(facepose_config["threed_points"]), + rpn_pre_nms_top_n_test=facepose_config["rpn_pre_nms_top_n_test"], + rpn_post_nms_top_n_test=facepose_config["rpn_post_nms_top_n_test"], + bbox_x_factor=facepose_config["bbox_x_factor"], + bbox_y_factor=facepose_config["bbox_y_factor"], + expand_forehead=facepose_config["expand_forehead"], ) - - def __repr__(self): - return f"{self.__class__.__module__}.{self.__class__.__name__}(device={self.device}; face_model={self.info['face_model']}, landmark_model={self.info['landmark_model']}, au_model={self.info['au_model']}, emotion_model={self.info['emotion_model']}, facepose_model={self.info['facepose_model']}, identity_model={self.info['identity_model']})" - - def __getitem__(self, i): - return self.info[i] - - def _init_detectors( - self, - face, - landmark, - au, - emotion, - facepose, - identity, - openface_2d_landmark_columns, - **kwargs, - ): - """Helper function called by __init__ and change_model to (re)initialize one of - the supported detectors""" - - # Keyword arguments than can be passed to the underlying models - face_model_kwargs = kwargs.pop("face_model_kwargs", dict()) - landmark_model_kwargs = kwargs.pop("landmark_model_kwargs", dict()) - au_model_kwargs = kwargs.pop("au_model_kwargs", dict()) - emotion_model_kwargs = kwargs.pop("emotion_model_kwargs", dict()) - facepose_model_kwargs = kwargs.pop("facepose_model_kwargs", dict()) - identity_model_kwargs = kwargs.pop("identity_model_kwargs", dict()) - - # Initialize model instances and any additional post init setup - # Only initialize a model if the currently initialized model is diff than the - # requested one. Lets us re-use this with .change_model - - # FACE MODEL - if self.info["face_model"] != face: - logging.info(f"Loading Face model: {face}") - self.face_detector = fetch_model("face_model", face) - self.info["face_model"] = face - self.info["face_detection_columns"] = FEAT_FACEBOX_COLUMNS - predictions = np.full_like(np.atleast_2d(FEAT_FACEBOX_COLUMNS), np.nan) - empty_facebox = pd.DataFrame(predictions, columns=FEAT_FACEBOX_COLUMNS) - self._empty_facebox = empty_facebox - if self.face_detector is not None: - if "img2pose" in face: - self.face_detector = self.face_detector( - constrained="img2pose-c" == face, - device=self.device, - **face_model_kwargs, - ) - else: - self.face_detector = self.face_detector( - device=self.device, **face_model_kwargs - ) - - # LANDMARK MODEL - if self.info["landmark_model"] != landmark: - logging.info(f"Loading Facial Landmark model: {landmark}") - self.landmark_detector = fetch_model("landmark_model", landmark) - if self.landmark_detector is not None: - if landmark == "mobilenet": - self.landmark_detector = self.landmark_detector( - 136, **landmark_model_kwargs - ) - self.landmark_detector.from_pretrained( - f"py-feat/{landmark}", cache_dir=get_resource_path() - ) - - # checkpoint = torch.load( - # os.path.join( - # get_resource_path(), - # "mobilenet_224_model_best_gdconv_external.pth.tar", - # ), - # map_location=self.device, - # ) - # ################################## - # state_dict = checkpoint["state_dict"] - # from collections import OrderedDict - - # new_state_dict = OrderedDict() - # for k, v in state_dict.items(): - # if "module." 
in k: - # k = k.replace("module.", "") - # new_state_dict[k] = v - # self.landmark_detector.load_state_dict(new_state_dict) - # ##################################### - - elif landmark == "pfld": - self.landmark_detector = self.landmark_detector( - **landmark_model_kwargs - ) - self.landmark_detector.from_pretrained( - f"py-feat/{landmark}", cache_dir=get_resource_path() - ) - - # checkpoint = torch.load( - # os.path.join(get_resource_path(), "pfld_model_best.pth.tar"), - # map_location=self.device, - # ) - # self.landmark_detector.load_state_dict(checkpoint["state_dict"]) - elif landmark == "mobilefacenet": - self.landmark_detector = self.landmark_detector( - [112, 112], 136, **landmark_model_kwargs - ) - self.landmark_detector.from_pretrained( - f"py-feat/{landmark}", cache_dir=get_resource_path() - ) - - # checkpoint = torch.load( - # os.path.join( - # get_resource_path(), "mobilefacenet_model_best.pth.tar" - # ), - # map_location=self.device, - # ) - # self.landmark_detector.load_state_dict(checkpoint["state_dict"]) - self.landmark_detector.eval() - self.landmark_detector.to(self.device) - - self.info["landmark_model"] = landmark - self.info["mapper"] = openface_2d_landmark_columns - self.info["face_landmark_columns"] = openface_2d_landmark_columns - predictions = np.full_like( - np.atleast_2d(openface_2d_landmark_columns), np.nan - ) - empty_landmarks = pd.DataFrame( - predictions, columns=openface_2d_landmark_columns - ) - self._empty_landmark = empty_landmarks - - # FACEPOSE MODEL - if self.info["facepose_model"] != facepose: - logging.info(f"Loading facepose model: {facepose}") - self.facepose_detector = fetch_model("facepose_model", facepose) - if "img2pose" in facepose: - backbone = resnet_fpn_backbone( - backbone_name=f"resnet{self.model_configs['img2pose']['depth']}", - weights=None, + facepose_model_file = hf_hub_download( + repo_id="py-feat/img2pose", + filename="model.safetensors", + cache_dir=get_resource_path(), + ) + facepose_checkpoint = load_file(facepose_model_file) + self.facepose_detector.load_state_dict(facepose_checkpoint, load_model_weights) + self.facepose_detector.eval() + self.facepose_detector.to(self.device) + # self.facepose_detector = torch.compile(self.facepose_detector) + + # Initialize Landmark Detector + self.info["landmark_model"] = landmark_model + if landmark_model is not None: + if landmark_model == "mobilefacenet": + self.face_size = 112 + self.landmark_detector = MobileFaceNet( + [self.face_size, self.face_size], 136, device=self.device + ) + landmark_model_file = hf_hub_download( + repo_id="py-feat/mobilefacenet", + filename="mobilefacenet_model_best.pth.tar", + cache_dir=get_resource_path(), ) - self.facepose_detector = self.facepose_detector( - backbone=backbone, - num_classes=2, - min_size=self.model_configs["img2pose"]["min_size"], - max_size=self.model_configs["img2pose"]["max_size"], - pose_mean=torch.tensor(self.model_configs["img2pose"]["pose_mean"]), - pose_stddev=torch.tensor( - self.model_configs["img2pose"]["pose_stddev"] - ), - threed_68_points=torch.tensor( - self.model_configs["img2pose"]["threed_points"] - ), - rpn_pre_nms_top_n_test=self.model_configs["img2pose"][ - "rpn_pre_nms_top_n_test" - ], - rpn_post_nms_top_n_test=self.model_configs["img2pose"][ - "rpn_post_nms_top_n_test" - ], - bbox_x_factor=self.model_configs["img2pose"]["bbox_x_factor"], - bbox_y_factor=self.model_configs["img2pose"]["bbox_y_factor"], - expand_forehead=self.model_configs["img2pose"]["expand_forehead"], - **facepose_model_kwargs, + landmark_state_dict = 
torch.load( + landmark_model_file, map_location=self.device, weights_only=True + )["state_dict"] # Ensure Model weights are Float32 for MPS + elif landmark_model == "mobilenet": + self.face_size = 224 + self.landmark_detector = MobileNet_GDConv(136) + landmark_model_file = hf_hub_download( + repo_id="py-feat/mobilenet", + filename="mobilenet_224_model_best_gdconv_external.pth.tar", + cache_dir=get_resource_path(), ) - - # self.facepose_detector = self.facepose_detector( - # constrained="img2pose-c" == face, - # device=self.device, - # **facepose_model_kwargs, - # ) - facepose_model_file = hf_hub_download( - repo_id="py-feat/img2pose", - filename="model.safetensors", + mobilenet_state_dict = torch.load( + landmark_model_file, map_location=self.device, weights_only=True + )["state_dict"] # Ensure Model weights are Float32 for MPS + landmark_state_dict = OrderedDict() + for k, v in mobilenet_state_dict.items(): + if "module." in k: + k = k.replace("module.", "") + landmark_state_dict[k] = v + elif landmark_model == "pfld": + self.face_size = 112 + self.landmark_detector = PFLDInference() + landmark_model_file = hf_hub_download( + repo_id="py-feat/pfld", + filename="pfld_model_best.pth.tar", cache_dir=get_resource_path(), ) - facepose_checkpoint = load_file(facepose_model_file) - self.facepose_detector.load_state_dict(facepose_checkpoint) - self.facepose_detector.eval() - self.facepose_detector.to(self.device) + landmark_state_dict = torch.load( + landmark_model_file, map_location=self.device, weights_only=True + )["state_dict"] # Ensure Model weights are Float32 for MPS else: - self.facepose_detector = self.facepose_detector(**facepose_model_kwargs) - self.info["facepose_model"] = facepose + raise ValueError("{landmark_model} is not currently supported.") + self.landmark_detector.load_state_dict(landmark_state_dict) + self.landmark_detector.eval() + self.landmark_detector.to(self.device) + # self.landmark_detector = torch.compile(self.landmark_detector) + else: + self.landmark_detector = None - pose_dof = facepose_model_kwargs.get("RETURN_DIM", 3) - self.info["facepose_model_columns"] = ( - FEAT_FACEPOSE_COLUMNS_3D if pose_dof == 3 else FEAT_FACEPOSE_COLUMNS_6D - ) - predictions = np.full_like( - np.atleast_2d(self.info["facepose_model_columns"]), np.nan - ) - empty_facepose = pd.DataFrame( - predictions, columns=self.info["facepose_model_columns"] - ) - self._empty_facepose = empty_facepose + # Initialize AU Detector + self.info["au_model"] = au_model + if au_model is not None: + if self.landmark_detector is not None: + if au_model == "xgb": + self.au_detector = XGBClassifier() + au_model_path = hf_hub_download( + repo_id="py-feat/xgb_au", + filename="xgb_au_classifier.skops", + cache_dir=get_resource_path(), + ) - # AU MODEL - if self.info["au_model"] != au: - logging.info(f"Loading AU model: {au}") - self.au_model = fetch_model("au_model", au) - self.info["au_model"] = au - if self.info["au_model"] in ["svm", "xgb"]: - self.info["au_presence_columns"] = AU_LANDMARK_MAP["Feat"] + elif au_model == "svm": + self.au_detector = SVMClassifier() + au_model_path = hf_hub_download( + repo_id="py-feat/svm_au", + filename="svm_au_classifier.skops", + cache_dir=get_resource_path(), + ) + else: + raise ValueError("{au_model} is not currently supported.") + + au_unknown_types = get_untrusted_types(file=au_model_path) + loaded_au_model = load(au_model_path, trusted=au_unknown_types) + self.au_detector.load_weights( + scaler_upper=loaded_au_model.scaler_upper, + 
pca_model_upper=loaded_au_model.pca_model_upper, + scaler_lower=loaded_au_model.scaler_lower, + pca_model_lower=loaded_au_model.pca_model_lower, + scaler_full=loaded_au_model.scaler_full, + pca_model_full=loaded_au_model.pca_model_full, + classifiers=loaded_au_model.classifiers, + ) else: - self.info["au_presence_columns"] = AU_LANDMARK_MAP[self.info["au_model"]] - if self.au_model is not None: - self.au_model = self.au_model(**au_model_kwargs) - au_weights = load_model_weights( - model_type="au", model=au, location="huggingface" + raise ValueError( + "Landmark Detector is required for AU Detection with {au_model}." ) - self.au_model.load_weights( - au_weights["scaler_upper"], - au_weights["pca_model_upper"], - au_weights["scaler_lower"], - au_weights["pca_model_lower"], - au_weights["scaler_full"], - au_weights["pca_model_full"], - au_weights["au_classifiers"], + else: + self.au_detector = None + + # Initialize Emotion Detector + self.info["emotion_model"] = emotion_model + if emotion_model is not None: + if emotion_model == "resmasknet": + emotion_config_file = hf_hub_download( + repo_id="py-feat/resmasknet", + filename="config.json", + cache_dir=get_resource_path(), ) + with open(emotion_config_file, "r") as f: + emotion_config = json.load(f) - predictions = np.full_like( - np.atleast_2d(self.info["au_presence_columns"]), np.nan + self.emotion_detector = ResMasking( + "", in_channels=emotion_config["in_channels"] ) - empty_au_occurs = pd.DataFrame( - predictions, columns=self.info["au_presence_columns"] + self.emotion_detector.fc = nn.Sequential( + nn.Dropout(0.4), nn.Linear(512, emotion_config["num_classes"]) ) - self._empty_auoccurence = empty_au_occurs - - # EMOTION MODEL - if self.info["emotion_model"] != emotion: - logging.info(f"Loading emotion model: {emotion}") - self.emotion_model = fetch_model("emotion_model", emotion) - self.info["emotion_model"] = emotion - if self.emotion_model is not None: - if emotion == "resmasknet": - self.emotion_model = self.emotion_model( - device=self.device, **emotion_model_kwargs + emotion_model_file = hf_hub_download( + repo_id="py-feat/resmasknet", + filename="ResMaskNet_Z_resmasking_dropout1_rot30.pth", + cache_dir=get_resource_path(), + ) + emotion_checkpoint = torch.load( + emotion_model_file, map_location=device, weights_only=True + )["net"] + self.emotion_detector.load_state_dict(emotion_checkpoint) + self.emotion_detector.eval() + self.emotion_detector.to(self.device) + # self.emotion_detector = torch.compile(self.emotion_detector) + elif emotion_model == "svm": + if self.landmark_detector is not None: + self.emotion_detector = EmoSVMClassifier() + emotion_model_path = hf_hub_download( + repo_id="py-feat/svm_emo", + filename="svm_emo_classifier.skops", + cache_dir=get_resource_path(), ) - elif emotion == "svm": - self.emotion_model = self.emotion_model(**emotion_model_kwargs) - emo_weights = load_model_weights( - model_type="emotion", model=emotion, location="huggingface" + emotion_unknown_types = get_untrusted_types(file=emotion_model_path) + loaded_emotion_model = load( + emotion_model_path, trusted=emotion_unknown_types ) - self.emotion_model.load_weights( - emo_weights["scaler_full"], - emo_weights["pca_model_full"], - emo_weights["emo_classifiers"], + self.emotion_detector.load_weights( + scaler_full=loaded_emotion_model.scaler_full, + pca_model_full=loaded_emotion_model.pca_model_full, + classifiers=loaded_emotion_model.classifiers, + ) + else: + raise ValueError( + "Landmark Detector is required for Emotion Detection with 
{emotion_model}." ) - self.info["emotion_model_columns"] = FEAT_EMOTION_COLUMNS - predictions = np.full_like(np.atleast_2d(FEAT_EMOTION_COLUMNS), np.nan) - empty_emotion = pd.DataFrame(predictions, columns=FEAT_EMOTION_COLUMNS) - self._empty_emotion = empty_emotion - - # IDENTITY MODEL - if self.info["identity_model"] != identity: - logging.info(f"Loading Identity model: {identity}") - self.identity_model = fetch_model("identity_model", identity) - self.info["identity_model"] = identity - self.info["identity_model_columns"] = FEAT_IDENTITY_COLUMNS - predictions = np.full_like(np.atleast_2d(FEAT_IDENTITY_COLUMNS), np.nan) - empty_identity = pd.DataFrame(predictions, columns=FEAT_IDENTITY_COLUMNS) - self._empty_identity = empty_identity - if self.identity_model is not None: - self.identity_model = self.identity_model( - device=self.device, **identity_model_kwargs + else: + raise ValueError("{emotion_model} is not currently supported.") + else: + self.emotion_detector = None + + # Initialize Identity Detecctor - facenet + self.info["identity_model"] = identity_model + if identity_model is not None: + if identity_model == "facenet": + self.identity_detector = InceptionResnetV1( + pretrained=None, + classify=False, + num_classes=None, + dropout_prob=0.6, + device=self.device, + ) + self.identity_detector.logits = nn.Linear(512, 8631) + identity_model_file = hf_hub_download( + repo_id="py-feat/facenet", + filename="facenet_20180402_114759_vggface2.pth", + cache_dir=get_resource_path(), ) - - self.info["output_columns"] = ( - FEAT_TIME_COLUMNS - + self.info["face_detection_columns"] - + self.info["face_landmark_columns"] - + self.info["au_presence_columns"] - + self.info["facepose_model_columns"] - + self.info["emotion_model_columns"] - + self.info["identity_model_columns"] - + ["input"] - ) - - def change_model(self, **kwargs): - """Swap one or more pre-trained detector models for another one. Just pass in - the the new models to use as kwargs, e.g. 
emotion_model='svm'""" - - face_model = kwargs.get("face_model", self.info["face_model"]) - landmark_model = kwargs.get("landmark_model", self.info["landmark_model"]) - au_model = kwargs.get("au_model", self.info["au_model"]) - emotion_model = kwargs.get("emotion_model", self.info["emotion_model"]) - facepose_model = kwargs.get("facepose_model", self.info["facepose_model"]) - identity_model = kwargs.get("identity_model", self.info["identity_model"]) - - # Verify model names and download if necessary - face, landmark, au, emotion, facepose, identity = get_pretrained_models( - face_model, - landmark_model, - au_model, - emotion_model, - facepose_model, - identity_model, - self.verbose, - ) - for requested, current_name in zip( - [face, landmark, au, emotion, facepose, identity], - [ - "face_model", - "landmark_model", - "au_model", - "emotion_model", - "facepose_model", - "identity_model", - ], - ): - if requested != self.info[current_name]: - print( - f"Changing {current_name} from {self.info[current_name]} -> {requested}" + self.identity_detector.load_state_dict( + torch.load( + identity_model_file, map_location=device, weights_only=True + ) ) + self.identity_detector.eval() + self.identity_detector.to(self.device) + # self.identity_detector = torch.compile(self.identity_detector) + else: + raise ValueError("{identity_model} is not currently supported.") + else: + self.identity_detector = None - self._init_detectors( - face, - landmark, - au, - emotion, - facepose, - identity, - openface_2d_landmark_columns, - ) - - def detect_faces(self, frame, threshold=0.5, **face_model_kwargs): - """Detect faces from image or video frame + @torch.inference_mode() + def detect_faces(self, images, face_size=112, face_detection_threshold=0.5): + """ + detect faces and poses in a batch of images using img2pose Args: - frame (np.ndarray): 3d (single) or 4d (multiple) image array - threshold (float): threshold for detectiong faces (default=0.5) + img (torch.Tensor): Tensor of shape (B, C, H, W) representing the images + face_size (int): Output size to resize face after cropping. Returns: - list: list of lists with the same length as the number of frames. Each list - item is a list containing the (x1, y1, x2, y2) coordinates of each detected - face in that frame. 
- + Fex: Prediction results dataframe """ - logging.info("detecting faces...") + # img2pose + frames = convert_image_to_tensor(images, img_type="float32") / 255.0 + frames.to(self.device) - frame = convert_image_to_tensor(frame, img_type="float32") - - if "img2pose" in self.info["face_model"]: - frame = frame / 255 - # faces, poses = self.face_detector(frame, **face_model_kwargs) + batch_results = [] + for i in range(frames.size(0)): + single_frame = frames[i, ...].unsqueeze(0) # Extract single image from batch + img2pose_output = self.facepose_detector(single_frame.to(self.device)) img2pose_output = postprocess_img2pose( - self.facepose_detector(frame, **face_model_kwargs) + img2pose_output[0], detection_threshold=face_detection_threshold ) - faces = img2pose_output["boxes"] - _ = img2pose_output["dofs"][ - :, :3 - ] # Only returning xyz for now not translation - else: - faces = self.face_detector(frame, **face_model_kwargs) - - if is_list_of_lists_empty(faces): - logging.warning("Warning: NO FACE is detected") + bbox = img2pose_output["boxes"] + poses = img2pose_output["dofs"] + facescores = img2pose_output["scores"] + + # Extract faces from bbox + if bbox.numel() != 0: + extracted_faces, new_bbox = extract_face_from_bbox_torch( + single_frame, bbox, face_size=face_size + ) + else: # No Face Detected - let's test of nans will work + extracted_faces = torch.zeros((1, 3, face_size, face_size)) + # bbox = torch.zeros((1,4)) + # new_bbox = torch.zeros((1,4)) + # facescores = torch.zeros((1)) + # poses = torch.zeros((1,6)) + # extracted_faces = torch.full((1, 3, face_size, face_size), float('nan')) + bbox = torch.full((1, 4), float("nan")) + new_bbox = torch.full((1, 4), float("nan")) + facescores = torch.zeros((1)) + poses = torch.full((1, 6), float("nan")) + + frame_results = { + "face_id": i, + "faces": extracted_faces, + "boxes": bbox, + "new_boxes": new_bbox, + "poses": poses, + "scores": facescores, + } + + # Extract Faces separately for Resmasknet + if self.info["emotion_model"] == "resmasknet": + if torch.all(torch.isnan(bbox)): # No Face Detected + frame_results["resmasknet_faces"] = torch.full( + (1, 3, 224, 224), float("nan") + ) + # frame_results["resmasknet_faces"] = torch.zeros((1, 3, 224, 224)) + else: + resmasknet_faces, _ = extract_face_from_bbox_torch( + single_frame, bbox, expand_bbox=1.1, face_size=224 + ) + frame_results["resmasknet_faces"] = resmasknet_faces - thresholded_face = [] - for fframe in faces: # first level is each frame - fframe_x = [] - for fface in fframe: # second level is each face within a frame - if fface[4] >= threshold: # set thresholds - fframe_x.append(fface) - thresholded_face.append(fframe_x) + batch_results.append(frame_results) - return thresholded_face + return batch_results - def detect_landmarks(self, frame, detected_faces, **landmark_model_kwargs): - """Detect landmarks from image or video frame + @torch.inference_mode() + def forward(self, faces_data): + """ + Run Model Inference on detected faces. Args: - frame (np.ndarray): 3d (single) or 4d (multiple) image array - detected_faces (array): + faces_data (list of dict): Detected faces and associated data from `detect_faces`. 
Returns: - list: x and y landmark coordinates (1,68,2) - - Examples: - >>> from feat import Detector - >>> from feat.utils import read_pictures - >>> img_data = read_pictures(['my_image.jpg']) - >>> detector = Detector() - >>> detected_faces = detector.detect_faces(frame) - >>> detector.detect_landmarks(frame, detected_faces) + Fex: Prediction results dataframe """ - logging.info("detecting landmarks...") - frame = convert_image_to_tensor(frame) - - if is_list_of_lists_empty(detected_faces): - list_concat = detected_faces - else: - if self.info["landmark_model"]: - if self.info["landmark_model"].lower() == "mobilenet": - out_size = 224 - else: - out_size = 112 - - extracted_faces, new_bbox = extract_face_from_bbox( - frame, detected_faces, face_size=out_size - ) - - extracted_faces = extracted_faces / 255.0 + extracted_faces = torch.cat([face["faces"] for face in faces_data], dim=0) + new_bboxes = torch.cat([face["new_boxes"] for face in faces_data], dim=0) + n_faces = extracted_faces.shape[0] + if self.landmark_detector is not None: if self.info["landmark_model"].lower() == "mobilenet": extracted_faces = Compose( [Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])] )(extracted_faces) - - # Run Landmark Model - if self.info["landmark_model"].lower() == "mobilefacenet": - landmark = ( - self.landmark_detector(extracted_faces, **landmark_model_kwargs)[0] - .cpu() - .data.numpy() + landmarks = self.landmark_detector.forward( + extracted_faces.to(self.device) ) + if self.info["landmark_model"].lower() == "mobilefacenet": + landmarks = self.landmark_detector.forward( + extracted_faces.to(self.device) + )[0] else: - landmark = ( - self.landmark_detector(extracted_faces, **landmark_model_kwargs) - .cpu() - .data.numpy() + landmarks = self.landmark_detector.forward( + extracted_faces.to(self.device) ) - - landmark = landmark.reshape(landmark.shape[0], -1, 2) - - landmark_results = [] - for ik in range(landmark.shape[0]): - landmark_results.append( - new_bbox[ik].inverse_transform_landmark(landmark[ik, :, :]) - ) - - length_index = [len(x) for x in detected_faces] - new_lens = np.insert(np.cumsum(length_index), 0, 0) - list_concat = [] - for ij in range(len(length_index)): - list_concat.append(landmark_results[new_lens[ij] : new_lens[ij + 1]]) - - return list_concat - - def detect_facepose(self, frame, landmarks=None, **facepose_model_kwargs): - """Detect facepose from image or video frame. - - When used with img2pose, returns *all* detected poses, and facebox and landmarks - are ignored. Use `detect_face` method in order to obtain bounding boxes - corresponding to the detected poses returned by this method. 
- - Args: - frame (np.ndarray): list of images - landmarks (np.ndarray | None, optional): (num_images, num_faces, 68, 2) - landmarks for the faces contained in list of images; Default None and - ignored for img2pose and img2pose-c detectors - - Returns: - list: poses (num_images, num_faces, [pitch, roll, yaw]) - Euler angles (in - degrees) for each face within in each image} - - """ - - logging.info("detecting poses...") - # Normalize Data - frame = convert_image_to_tensor(frame, img_type="float32") / 255 - - output = {} - if "img2pose" in self.info["facepose_model"]: - img2pose_output = self.facepose_detector(frame, **facepose_model_kwargs) - img2pose_output = postprocess_img2pose(img2pose_output[0]) - output["faces"] = img2pose_output["boxes"] - output["poses"] = img2pose_output[ - "dofs" - ] # Only returning xyz for now not translation + new_landmarks = inverse_transform_landmarks_torch(landmarks, new_bboxes) else: - output["poses"] = self.facepose_detector( - frame, landmarks, **facepose_model_kwargs - ) - - return output - - def detect_aus(self, frame, landmarks, **au_model_kwargs): - """Detect Action Units from image or video frame + new_landmarks = torch.full((n_faces, 136), float("nan")) - Args: - frame (np.ndarray): image loaded in array format (n, m, 3) - landmarks (array): 68 landmarks used to localize face. - - Returns: - array: Action Unit predictions - - Examples: - >>> from feat import Detector - >>> from feat.utils import read_pictures - >>> frame = read_pictures(['my_image.jpg']) - >>> detector = Detector() - >>> detector.detect_aus(frame) - """ - - logging.info("detecting aus...") - frame = convert_image_to_tensor(frame, img_type="float32") - - if is_list_of_lists_empty(landmarks): - return landmarks - else: - if self["au_model"].lower() in ["svm", "xgb"]: - # transform = Grayscale(3) - # frame = transform(frame) - hog_features, new_landmarks = self._batch_hog( - frames=frame, landmarks=landmarks + if self.emotion_detector is not None: + if self.info["emotion_model"] == "resmasknet": + resmasknet_faces = torch.cat( + [face["resmasknet_faces"] for face in faces_data], dim=0 ) - au_predictions = self.au_model.detect_au( - frame=hog_features, landmarks=new_landmarks, **au_model_kwargs + emotions = self.emotion_detector.forward(resmasknet_faces.to(self.device)) + emotions = torch.softmax(emotions, 1) + elif self.info["emotion_model"] == "svm": + hog_features, emo_new_landmarks = extract_hog_features( + extracted_faces, landmarks ) - else: - au_predictions = self.au_model.detect_au( - frame, landmarks=landmarks, **au_model_kwargs + emotions = self.emotion_detector.detect_emo( + frame=hog_features, landmarks=[emo_new_landmarks] ) - - return self._convert_detector_output(landmarks, au_predictions) - - def _batch_hog(self, frames, landmarks): - """ - Helper function used in batch processing hog features - - Args: - frames: a batch of frames - landmarks: a list of list of detected landmarks - - Returns: - hog_features: a numpy array of hog features for each detected landmark - landmarks: updated landmarks - """ - - hog_features = [] - new_landmark_frames = [] - for i, frame_landmark in enumerate(landmarks): - if len(frame_landmark) != 0: - new_landmarks_faces = [] - for j in range(len(frame_landmark)): - convex_hull, new_landmark = extract_face_from_landmarks( - frame=frames[i], - landmarks=frame_landmark[j], - face_size=112, - ) - - hog_features.append( - hog( - transforms.ToPILImage()(convex_hull[0] / 255.0), - orientations=8, - pixels_per_cell=(8, 8), - cells_per_block=(2, 2), 
- visualize=False, - channel_axis=-1, - ).reshape(1, -1) - ) - - new_landmarks_faces.append(new_landmark) - new_landmark_frames.append(new_landmarks_faces) - else: - hog_features.append( - np.zeros((1, 5408)) - ) # LC: Need to confirm this size is fixed. - new_landmark_frames.append([np.zeros((68, 2))]) - - hog_features = np.concatenate(hog_features) - - return (hog_features, new_landmark_frames) - - def detect_emotions(self, frame, facebox, landmarks, **emotion_model_kwargs): - """Detect emotions from image or video frame - - Args: - frame ([type]): [description] - facebox ([type]): [description] - landmarks ([type]): [description] - - Returns: - array: Action Unit predictions - - Examples: - >>> from feat import Detector - >>> from feat.utils import read_pictures - >>> img_data = read_pictures(['my_image.jpg']) - >>> detector = Detector() - >>> detected_faces = detector.detect_faces(frame) - >>> detected_landmarks = detector.detect_landmarks(frame, detected_faces) - >>> detector.detect_emotions(frame, detected_faces, detected_landmarks) - """ - - logging.info("detecting emotions...") - frame = convert_image_to_tensor(frame, img_type="float32") - - if is_list_of_lists_empty(facebox): - return facebox + emotions = torch.tensor(emotions) else: - if self.info["emotion_model"].lower() == "resmasknet": - return self._convert_detector_output( - facebox, - self.emotion_model.detect_emo(frame, facebox, **emotion_model_kwargs), - ) - - elif self.info["emotion_model"].lower() == "svm": - hog_features, new_landmarks = self._batch_hog( - frames=frame, landmarks=landmarks - ) - return self._convert_detector_output( - landmarks, - self.emotion_model.detect_emo( - frame=hog_features, - landmarks=new_landmarks, - **emotion_model_kwargs, - ), - ) - - else: - raise ValueError( - "Cannot recognize input emo model! Please try to re-type emotion model" - ) - - def detect_identity(self, frame, facebox, **identity_model_kwargs): - """Detects identity of faces from image or video frame using face representation embeddings - - Args: - frame (np.ndarray): 3d (single) or 4d (multiple) image array - threshold (float): threshold for matching identity (default=0.8) - - Returns: - list: list of lists with the same length as the number of frames. Each list - item is a list containing the (x1, y1, x2, y2) coordinates of each detected - face in that frame. + emotions = torch.full((n_faces, 7), float("nan")) - """ - - logging.info("detecting identity...") - - frame = convert_image_to_tensor(frame, img_type="float32") / 255 - - if is_list_of_lists_empty(facebox): - return facebox - else: - extracted_faces, new_bbox = extract_face_from_bbox(frame, facebox) - face_embeddings = self.identity_model( - extracted_faces, **identity_model_kwargs + if self.identity_detector is not None: + identity_embeddings = self.identity_detector.forward( + extracted_faces.to(self.device) ) - return self._convert_detector_output(facebox, face_embeddings.numpy()) - - def _run_detection_waterfall( - self, - batch_data, - face_detection_threshold, - face_model_kwargs, - landmark_model_kwargs, - facepose_model_kwargs, - emotion_model_kwargs, - au_model_kwargs, - identity_model_kwargs, - suppress_torchvision_warnings=True, - ): - """ - Main detection "waterfall." Calls each individual detector in the sequence - required to support any interactions between detections. 
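Editor's note: both the AU detector and the SVM emotion path operate on HOG descriptors of aligned 112x112 face crops. The sketch below reproduces the skimage call from the removed `_batch_hog` helper and confirms why the empty-frame placeholder was sized 5408 (the new `extract_hog_features` helper is assumed to compute the equivalent descriptor):

    # Sketch verifying the 5408-dim HOG placeholder: a 112x112 crop with 8x8
    # cells and 2x2 blocks gives 13*13 blocks, each 2*2 cells * 8 orientations
    # = 32 values, so 13*13*32 = 5408 features.
    import numpy as np
    from skimage.feature import hog

    crop = np.random.rand(112, 112, 3)  # stand-in for an aligned face crop
    features = hog(
        crop,
        orientations=8,
        pixels_per_cell=(8, 8),
        cells_per_block=(2, 2),
        visualize=False,
        channel_axis=-1,
    )
    print(features.shape)  # (5408,)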
Called - behind-the-scenes by .detect_image() and .detect_video() - - Args: - batch_data (dict): singleton item from iterating over the output of a DataLoader - face_detection_threshold (float): value between 0-1 - face_model_kwargs (dict): face model kwargs - landmark_model_kwargs (dict): landmark model kwargs - facepose_model_kwargs (dict): facepose model kwargs - emotion_model_kwargs (dict): emotion model kwargs - au_model_kwargs (dict): au model kwargs - identity_model_kwargs (dict): identity model kwargs - - Returns: - tuple: faces, landmarks, poses, aus, emotions, identities - """ - - # Reset warnings - warnings.filterwarnings("default", category=UserWarning, module="torchvision") + else: + identity_embeddings = torch.full((n_faces, 512), float("nan")) - if suppress_torchvision_warnings: - warnings.filterwarnings("ignore", category=UserWarning, module="torchvision") + if self.au_detector is not None: + hog_features, au_new_landmarks = extract_hog_features( + extracted_faces, landmarks + ) + aus = self.au_detector.detect_au( + frame=hog_features, landmarks=[au_new_landmarks] + ) + else: + aus = torch.full((n_faces, 20), float("nan")) - faces = self.detect_faces( - batch_data["Image"], - threshold=face_detection_threshold, - **face_model_kwargs, + # Create Fex Output Representation + bboxes = torch.cat( + [ + convert_bbox_output( + face_output["new_boxes"].to(self.device), + face_output["scores"].to(self.device), + ) + for face_output in faces_data + ], + dim=0, + ) + feat_faceboxes = pd.DataFrame( + bboxes.cpu().detach().numpy(), + columns=FEAT_FACEBOX_COLUMNS, ) - landmarks = self.detect_landmarks( - batch_data["Image"], - detected_faces=faces, - **landmark_model_kwargs, + poses = torch.cat( + [face_output["poses"].to(self.device) for face_output in faces_data], dim=0 + ) + feat_poses = pd.DataFrame( + poses.cpu().detach().numpy(), columns=FEAT_FACEPOSE_COLUMNS_6D ) - poses_dict = self.detect_facepose( - batch_data["Image"], landmarks, **facepose_model_kwargs + reshape_landmarks = new_landmarks.reshape(new_landmarks.shape[0], 68, 2) + reordered_landmarks = torch.cat( + [reshape_landmarks[:, :, 0], reshape_landmarks[:, :, 1]], dim=1 + ) + feat_landmarks = pd.DataFrame( + reordered_landmarks.cpu().detach().numpy(), + columns=openface_2d_landmark_columns, ) - aus = self.detect_aus(batch_data["Image"], landmarks, **au_model_kwargs) + feat_aus = pd.DataFrame(aus, columns=AU_LANDMARK_MAP["Feat"]) - emotions = self.detect_emotions( - batch_data["Image"], faces, landmarks, **emotion_model_kwargs + feat_emotions = pd.DataFrame( + emotions.cpu().detach().numpy(), columns=FEAT_EMOTION_COLUMNS ) - identities = self.detect_identity( - batch_data["Image"], - faces, - **identity_model_kwargs, + feat_identities = pd.DataFrame( + identity_embeddings.cpu().detach().numpy(), columns=FEAT_IDENTITY_COLUMNS[1:] ) - faces = _inverse_face_transform(faces, batch_data) - landmarks = _inverse_landmark_transform(landmarks, batch_data) - - # match faces to poses - sometimes face detector finds different faces than pose detector. 
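Editor's note: the landmark reordering above converts each flattened (n_faces, 136) prediction into the x-first column layout expected by `openface_2d_landmark_columns` (x_0..x_67 followed by y_0..y_67, as used later when un-padding the coordinates). A small sketch of that layout, assuming `[..., 0]` holds x and `[..., 1]` holds y:

    # Sketch of the landmark column layout built above: reshape (n, 136) to
    # (n, 68, 2), then re-flatten as all x's followed by all y's.
    import torch
    import pandas as pd

    n_faces = 2
    landmarks = torch.arange(n_faces * 136, dtype=torch.float32).reshape(n_faces, 136)
    lm = landmarks.reshape(n_faces, 68, 2)                    # [..., 0] = x, [..., 1] = y
    reordered = torch.cat([lm[:, :, 0], lm[:, :, 1]], dim=1)  # (n_faces, 136)
    columns = [f"x_{i}" for i in range(68)] + [f"y_{i}" for i in range(68)]
    df = pd.DataFrame(reordered.numpy(), columns=columns)
    print(df.loc[0, ["x_0", "y_0"]])  # first landmark point of the first face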
- faces, poses = self._match_faces_to_poses( - faces, poses_dict["faces"], poses_dict["poses"] + return Fex( + pd.concat( + [ + feat_faceboxes, + feat_landmarks, + feat_poses, + feat_aus, + feat_emotions, + feat_identities, + ], + axis=1, + ), + au_columns=AU_LANDMARK_MAP["Feat"], + emotion_columns=FEAT_EMOTION_COLUMNS, + facebox_columns=FEAT_FACEBOX_COLUMNS, + landmark_columns=openface_2d_landmark_columns, + facepose_columns=FEAT_FACEPOSE_COLUMNS_6D, + identity_columns=FEAT_IDENTITY_COLUMNS[1:], + detector="Feat", + face_model=self.info["face_model"], + landmark_model=self.info["landmark_model"], + au_model=self.info["au_model"], + emotion_model=self.info["emotion_model"], + facepose_model=self.info["facepose_model"], + identity_model=self.info["identity_model"], ) - return faces, landmarks, poses, aus, emotions, identities - - def detect_image( + def detect( self, - input_file_list, + inputs, + data_type="image", output_size=None, batch_size=1, num_workers=0, pin_memory=False, - frame_counter=0, - face_detection_threshold=0.5, face_identity_threshold=0.8, + face_detection_threshold=0.5, + skip_frames=None, + progress_bar=True, **kwargs, ): """ - Detects FEX from one or more image files. If you want to speed up detection you - can process multiple images in batches by setting `batch_size > 1`. However, all - images must have **the same dimensions** to be processed in batches. Py-feat can - automatically adjust image sizes by using the `output_size=int`. Common - output-sizes include 256 and 512. - - **NOTE: Currently batch processing images gives slightly different AU detection results due to the way that py-feat integrates the underlying models. You can examine the degree of tolerance by checking out the results of `test_detection_and_batching_with_diff_img_sizes` in our test-suite** + Detects FEX from one or more image files. Args: - input_file_list (list of str): Path to a list of paths to image files. + inputs (list of str, torch.Tensor): Path to a list of paths to image files or torch.Tensor of images (B, C, H, W) + data_type (str): type of data to be processed; Default 'image' ['image', 'tensor', 'video'] output_size (int): image size to rescale all image preserving aspect ratio. - Will raise an error if not set and batch_size > 1 but images are not the same size batch_size (int): how many batches of images you want to run at one shot. - Larger gives faster speed but is more memory-consuming. Images must be the - same size to be run in batches! - num_workers (int): how many subprocesses to use for data loading. ``0`` means that the data will be loaded in the main process. - pin_memory (bool): If ``True``, the data loader will copy Tensors into CUDA pinned memory before returning them. If your data elements are a custom type, or your :attr:`collate_fn` returns a batch that is a custom type - frame_counter (int): starting value to count frames - face_detection_threshold (float): value between 0-1 to report a detection based on the - confidence of the face detector; Default >= 0.5 + num_workers (int): how many subprocesses to use for data loading. + pin_memory (bool): If ``True``, the data loader will copy Tensors into CUDA pinned memory before returning them. 
face_identity_threshold (float): value between 0-1 to determine similarity of person using face identity embeddings; Default >= 0.8 - **kwargs: you can pass each detector specific kwargs using a dictionary - like: `face_model_kwargs = {...}, au_model_kwargs={...}, ...` + face_detection_threshold (float): value between 0-1 to determine if a face was detected; Default >= 0.5 + skip_frames (int or None): number of frames to skip to speed up inference (video only); Default None + progress_bar (bool): Whether to show the tqdm progress bar. Default is True. + **kwargs: additional detector-specific kwargs Returns: - Fex: Prediction results dataframe + pd.DataFrame: Concatenated results for all images in the batch """ - # Keyword arguments than can be passed to the underlying models - face_model_kwargs = kwargs.pop("face_model_kwargs", dict()) - landmark_model_kwargs = kwargs.pop("landmark_model_kwargs", dict()) - au_model_kwargs = kwargs.pop("au_model_kwargs", dict()) - emotion_model_kwargs = kwargs.pop("emotion_model_kwargs", dict()) - facepose_model_kwargs = kwargs.pop("facepose_model_kwargs", dict()) - identity_model_kwargs = kwargs.pop("identity_model_kwargs", dict()) - - data_loader = DataLoader( - ImageDataset( - input_file_list, - output_size=output_size, - preserve_aspect_ratio=True, - padding=True, - ), - num_workers=num_workers, - batch_size=batch_size, - pin_memory=pin_memory, - shuffle=False, - ) - - if self.info["landmark_model"] == "mobilenet" and batch_size > 1: - warnings.warn( - "Currently using mobilenet for landmark detection with batch_size > 1 may lead to erroneous detections. We recommend either setting batch_size=1 or using mobilefacenet as the landmark detection model. You can follow this issue for more: https://github.com/cosanlab/py-feat/issues/151" + if data_type.lower() == "image": + data_loader = DataLoader( + ImageDataset( + inputs, + output_size=output_size, + preserve_aspect_ratio=True, + padding=True, + ), + num_workers=num_workers, + batch_size=batch_size, + pin_memory=pin_memory, + shuffle=False, ) - - try: - batch_output = [] - - for batch_id, batch_data in enumerate(tqdm(data_loader)): - ( - faces, - landmarks, - poses, - aus, - emotions, - identities, - ) = self._run_detection_waterfall( - batch_data, - face_detection_threshold, - face_model_kwargs, - landmark_model_kwargs, - facepose_model_kwargs, - emotion_model_kwargs, - au_model_kwargs, - identity_model_kwargs, - ) - - output = self._create_fex( - faces, - landmarks, - poses, - aus, - emotions, - identities, - batch_data["FileNames"], - frame_counter, - ) - batch_output.append(output) - frame_counter += 1 * batch_size - - batch_output = pd.concat(batch_output) - batch_output.reset_index(drop=True, inplace=True) - batch_output.compute_identities( - threshold=face_identity_threshold, inplace=True + elif data_type.lower() == "tensor": + data_loader = DataLoader( + TensorDataset(inputs), + batch_size=batch_size, + shuffle=False, + num_workers=num_workers, + pin_memory=pin_memory, ) - return batch_output - except RuntimeError as e: - raise ValueError( - f"when using a batch_size > 1 all images must have the same dimensions or output_size must not be None so py-feat can rescale images to output_size. 
See pytorch error: \n{e}" + elif data_type.lower() == "video": + dataset = VideoDataset( + inputs, skip_frames=skip_frames, output_size=output_size + ) + data_loader = DataLoader( + dataset, + num_workers=num_workers, + batch_size=batch_size, + pin_memory=pin_memory, + shuffle=False, ) - def detect_video( - self, - video_path, - skip_frames=None, - output_size=700, - batch_size=1, - num_workers=0, - pin_memory=False, - face_detection_threshold=0.5, - face_identity_threshold=0.8, - **kwargs, - ): - """Detects FEX from a video file. - - Args: - video_path (str): Path to a video file. - skip_frames (int or None): number of frames to skip (speeds up inference, - but less temporal information); Default None - output_size (int): image size to rescale all imagee preserving aspect ratio - batch_size (int): how many batches of images you want to run at one shot. Larger gives faster speed but is more memory-consuming - num_workers (int): how many subprocesses to use for data loading. ``0`` means that the data will be loaded in the main process. - pin_memory (bool): If ``True``, the data loader will copy Tensors - into CUDA pinned memory before returning them. If your data elements - are a custom type, or your :attr:`collate_fn` returns a batch that is a custom type - face_detection_threshold (float): value between 0-1 to report a detection based on the - confidence of the face detector; Default >= 0.5 - face_identity_threshold (float): value between 0-1 to determine similarity of person using face identity embeddings; Default >= 0.8 - - Returns: - Fex: Prediction results dataframe - """ - - # Keyword arguments than can be passed to the underlying models - face_model_kwargs = kwargs.pop("face_model_kwargs", dict()) - landmark_model_kwargs = kwargs.pop("landmark_model_kwargs", dict()) - au_model_kwargs = kwargs.pop("au_model_kwargs", dict()) - emotion_model_kwargs = kwargs.pop("emotion_model_kwargs", dict()) - facepose_model_kwargs = kwargs.pop("facepose_model_kwargs", dict()) - identity_model_kwargs = kwargs.pop("identity_model_kwargs", dict()) - - dataset = VideoDataset( - video_path, skip_frames=skip_frames, output_size=output_size - ) - - data_loader = DataLoader( - dataset, - num_workers=num_workers, - batch_size=batch_size, - pin_memory=pin_memory, - shuffle=False, - ) + data_iterator = tqdm(data_loader) if progress_bar else data_loader batch_output = [] + frame_counter = 0 - for batch_data in tqdm(data_loader): - ( - faces, - landmarks, - poses, - aus, - emotions, - identities, - ) = self._run_detection_waterfall( - batch_data, - face_detection_threshold, - face_model_kwargs, - landmark_model_kwargs, - facepose_model_kwargs, - emotion_model_kwargs, - au_model_kwargs, - identity_model_kwargs, + try: + _ = next(enumerate(tqdm(data_loader))) + except RuntimeError as e: + raise ValueError( + f"When using `batch_size > 1`, all images must either have the same dimension or `output_size` should be something other than `None` to pad images prior to processing\n{e}" ) - frames = list(batch_data["Frame"].numpy()) - - output = self._create_fex( - faces, - landmarks, - poses, - aus, - emotions, - identities, - batch_data["FileName"], - frames, + for batch_id, batch_data in enumerate(data_iterator): + faces_data = self.detect_faces( + batch_data["Image"], + face_size=self.face_size if hasattr(self, "face_size") else 112, + face_detection_threshold=face_detection_threshold, ) - - batch_output.append(output) - - batch_output = pd.concat(batch_output) - batch_output.reset_index(drop=True, inplace=True) - 
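Editor's note: the unified `detect()` entry point above replaces the separate `detect_image()`/`detect_video()` methods and dispatches on `data_type` to the matching dataset/DataLoader. A hypothetical usage sketch (file names are made up; keyword names are those defined in this diff):

    # Hypothetical usage of the unified detect() entry point.
    from feat.detector import Detector

    detector = Detector(device="cpu")

    # Images: same-sized files, or set output_size so they can be padded/rescaled.
    fex_imgs = detector.detect(
        ["face1.jpg", "face2.jpg"], data_type="image", batch_size=2, output_size=512
    )

    # Video: optionally skip frames to speed up inference.
    fex_vid = detector.detect("clip.mp4", data_type="video", skip_frames=24)
    print(fex_vid["approx_time"].head())  # approximate timestamp per processed frame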
batch_output["approx_time"] = [ - dataset.calc_approx_frame_time(x) for x in batch_output["frame"].to_numpy() - ] - batch_output.compute_identities(threshold=face_identity_threshold, inplace=True) - - return batch_output.set_index("frame", drop=False) - - def _create_fex( - self, - faces, - landmarks, - poses, - aus, - emotions, - identities, - file_names, - frame_counter, - ): - """Helper function to create a Fex instance using detector output - - Args: - faces: output of detect_faces() - landmarks: output of detect_landmarks() - poses: output of dectect_facepose() - aus: output of detect_aus() - emotions: output of detect_emotions() - identities: output of detect_identities() - file_names: file name of input image - frame_counter: starting value for frame counter, useful for integrating batches - - Returns: - Fex object - """ - - logging.info("creating fex output...") - - out = [] - for i, frame in enumerate(faces): - if not frame: - facebox_df = pd.DataFrame( - {x: np.nan for x in self.info["face_detection_columns"]}, - columns=self.info["face_detection_columns"], - index=[i], - ) - facepose_df = pd.DataFrame( - {x: np.nan for x in self.info["facepose_model_columns"]}, - columns=self.info["facepose_model_columns"], - index=[i], - ) - landmarks_df = pd.DataFrame( - {x: np.nan for x in self.info["face_landmark_columns"]}, - columns=self.info["face_landmark_columns"], - index=[i], - ) - aus_df = pd.DataFrame( - {x: np.nan for x in self.info["au_presence_columns"]}, - columns=self.info["au_presence_columns"], - index=[i], - ) - emotions_df = pd.DataFrame( - {x: np.nan for x in self.info["emotion_model_columns"]}, - columns=self.info["emotion_model_columns"], - index=[i], - ) - identity_df = pd.DataFrame( - {x: np.nan for x in self.info["identity_model_columns"]}, - columns=self.info["identity_model_columns"], - index=[i], - ) - input_df = pd.DataFrame(file_names[i], columns=["input"], index=[i]) - tmp_df = pd.concat( - [ - facebox_df, - landmarks_df, - facepose_df, - aus_df, - emotions_df, - identity_df, - input_df, - ], - axis=1, - ) - if isinstance(frame_counter, (list)): - tmp_df[FEAT_TIME_COLUMNS] = frame_counter[i] + batch_results = self.forward(faces_data) + + # Create metadata for each frame + file_names = [] + frame_ids = [] + for i, face in enumerate(faces_data): + n_faces = len(face["scores"]) + if data_type.lower() == "video": + current_frame_id = batch_data["Frame"].detach().numpy()[i] else: - tmp_df[FEAT_TIME_COLUMNS] = frame_counter + i - out.append(tmp_df) - - for j, face_in_frame in enumerate(frame): - facebox_df = pd.DataFrame( - [ - [ - face_in_frame[0], - face_in_frame[1], - face_in_frame[2] - face_in_frame[0], - face_in_frame[3] - face_in_frame[1], - face_in_frame[4], + current_frame_id = frame_counter + i + frame_ids.append(np.repeat(current_frame_id, n_faces)) + file_names.append(np.repeat(batch_data["FileName"][i], n_faces)) + batch_results["input"] = np.concatenate(file_names) + batch_results["frame"] = np.concatenate(frame_ids) + + # Invert the face boxes and landmarks based on the padded output size + for j, frame_idx in enumerate(batch_results["frame"].unique()): + batch_results.loc[ + batch_results["frame"] == frame_idx, ["FrameHeight", "FrameWidth"] + ] = ( + compute_original_image_size(batch_data)[j, :] + .repeat( + len( + batch_results.loc[ + batch_results["frame"] == frame_idx, "frame" + ] + ), + 1, + ) + .numpy() + ) + batch_results.loc[batch_results["frame"] == frame_idx, "FaceRectX"] = ( + batch_results.loc[batch_results["frame"] == frame_idx, "FaceRectX"] 
+ - batch_data["Padding"]["Left"].detach().numpy()[j] + ) / batch_data["Scale"].detach().numpy()[j] + batch_results.loc[batch_results["frame"] == frame_idx, "FaceRectY"] = ( + batch_results.loc[batch_results["frame"] == frame_idx, "FaceRectY"] + - batch_data["Padding"]["Top"].detach().numpy()[j] + ) / batch_data["Scale"].detach().numpy()[j] + batch_results.loc[ + batch_results["frame"] == frame_idx, "FaceRectWidth" + ] = ( + ( + batch_results.loc[ + batch_results["frame"] == frame_idx, "FaceRectWidth" ] - ], - columns=self.info["face_detection_columns"], - index=[j], - ) - - facepose_df = pd.DataFrame( - [poses[i][j]], - columns=self.info["facepose_model_columns"], - index=[j], - ) - - landmarks_df = pd.DataFrame( - [landmarks[i][j].flatten(order="F")], - columns=self.info["face_landmark_columns"], - index=[j], - ) - - aus_df = pd.DataFrame( - aus[i][j, :].reshape(1, len(self["au_presence_columns"])), - columns=self.info["au_presence_columns"], - index=[j], - ) - - emotions_df = pd.DataFrame( - emotions[i][j, :].reshape(1, len(self.info["emotion_model_columns"])), - columns=self.info["emotion_model_columns"], - index=[j], - ) - - identity_df = pd.DataFrame( - np.hstack([np.nan, identities[i][j]]).reshape(-1, 1).T, - columns=self.info["identity_model_columns"], - index=[j], - ) - - input_df = pd.DataFrame( - file_names[i], - columns=["input"], - index=[j], - ) - - tmp_df = pd.concat( - [ - facebox_df, - landmarks_df, - facepose_df, - aus_df, - emotions_df, - identity_df, - input_df, - ], - axis=1, + ) + / batch_data["Scale"].detach().numpy()[j] + ) + batch_results.loc[ + batch_results["frame"] == frame_idx, "FaceRectHeight" + ] = ( + ( + batch_results.loc[ + batch_results["frame"] == frame_idx, "FaceRectHeight" + ] + ) + / batch_data["Scale"].detach().numpy()[j] ) - if isinstance(frame_counter, (list)): - tmp_df[FEAT_TIME_COLUMNS] = frame_counter[i] - else: - tmp_df[FEAT_TIME_COLUMNS] = frame_counter + i - out.append(tmp_df) + for i in range(68): + batch_results.loc[batch_results["frame"] == frame_idx, f"x_{i}"] = ( + batch_results.loc[batch_results["frame"] == frame_idx, f"x_{i}"] + - batch_data["Padding"]["Left"].detach().numpy()[j] + ) / batch_data["Scale"].detach().numpy()[j] + batch_results.loc[batch_results["frame"] == frame_idx, f"y_{i}"] = ( + batch_results.loc[batch_results["frame"] == frame_idx, f"y_{i}"] + - batch_data["Padding"]["Top"].detach().numpy()[j] + ) / batch_data["Scale"].detach().numpy()[j] - out = pd.concat(out) - out.reset_index(drop=True, inplace=True) - - # TODO: Add in support for gaze_columns - return Fex( - out, - au_columns=self.info["au_presence_columns"], - emotion_columns=self.info["emotion_model_columns"], - facebox_columns=self.info["face_detection_columns"], - landmark_columns=self.info["face_landmark_columns"], - facepose_columns=self.info["facepose_model_columns"], - identity_columns=self.info["identity_model_columns"], - detector="Feat", - face_model=self.info["face_model"], - landmark_model=self.info["landmark_model"], - au_model=self.info["au_model"], - emotion_model=self.info["emotion_model"], - facepose_model=self.info["facepose_model"], - identity_model=self.info["identity_model"], - ) - - @staticmethod - def _convert_detector_output(detected_faces, detector_results): - """ - Helper function to convert AU/Emotion detector output into frame by face list of lists. - Either face or landmark detector list of list outputs can be used. 
- - Args: - detected_faces (list): list of lists output from face/landmark detector - au_results (np.array):, results from au/emotion detectors - - Returns: - list_concat: (list of list). The list which contains the number of faces. for example - if you process 2 frames and each frame contains 4 faces, it will return: - [[xxx,xxx,xxx,xxx],[xxx,xxx,xxx,xxx]] - """ - - length_index = [len(x) for x in detected_faces] - - list_concat = [] - new_lens = np.insert(np.cumsum(length_index), 0, 0) - for ij in range(len(length_index)): - list_concat.append(detector_results[new_lens[ij] : new_lens[ij + 1], :]) - return list_concat - - @staticmethod - def _match_faces_to_poses(faces, faces_pose, poses): - """Helper function to match list of lists of faces and poses based on overlap in bounding boxes. - - Sometimes the face detector finds different faces than the pose detector unless the user - is using the same detector (i.e., img2pose). - - This function will match the faces and poses and will return nans if more faces are detected then poses. - Will only return poses that match faces even if more faces are detected by pose detector. - - Args: - faces (list): list of lists of face bounding boxes from face detector - faces_pose (list): list of lists of face bounding boxes from pose detector - poses (list): list of lists of poses from pose detector - - Returns: - faces (list): list of list of faces that have been matched to poses - poses (list): list of list of poses that have been matched to faces - """ - - if len(faces) != len(faces_pose): - raise ValueError( - "Make sure the number of batches in faces and poses is the same." - ) - - if is_list_of_lists_empty(faces): - # Currently assuming no faces if no face is detected. Not running pose - return (faces, poses) - - else: - overlap_faces = [] - overlap_poses = [] - for frame_face, frame_face_pose, frame_pose in zip(faces, faces_pose, poses): - if not frame_face: - n_faces = 0 - elif isinstance(frame_face[0], list): - n_faces = len(frame_face) - else: - n_faces = 1 - - if not frame_face_pose: - n_poses = 0 - elif isinstance(frame_face_pose[0], list): - n_poses = len(frame_face_pose) - else: - n_poses = 1 - - frame_overlap = np.zeros([n_faces, n_poses]) - - if n_faces == 0: - overlap_faces.append([]) - overlap_poses.append([]) - - elif (n_faces == 1) & (n_poses > 1): - b1 = BBox(frame_face[0][:-1]) - - for pose_idx in range(n_poses): - b2 = BBox(frame_face_pose[pose_idx][:-1]) - frame_overlap[0, pose_idx] = b1.overlap(b2) - matched_pose_index = np.where( - frame_overlap[0, :] == frame_overlap[0, :].max() - )[0][0] - overlap_faces.append(frame_face) - overlap_poses.append([frame_pose[matched_pose_index]]) - - elif (n_faces > 1) & (n_poses == 1): - b2 = BBox(frame_face_pose[0][:-1]) - for face_idx in range(n_faces): - b1 = BBox(frame_face[face_idx][:-1]) - frame_overlap[face_idx, 0] = b1.overlap(b2) - matched_face_index = np.where( - frame_overlap[:, 0] == frame_overlap[:, 0].max() - )[0][0] - new_poses = [] - for f_idx in range(n_faces): - if f_idx == matched_face_index: - new_poses.append(frame_pose[0]) - else: - new_poses.append(np.ones(3) * np.nan) - overlap_faces.append(frame_face) - overlap_poses.append(new_poses) - - else: - for face_idx in range(n_faces): - b1 = BBox(frame_face[face_idx][:-1]) - for pose_idx in range(n_poses): - b2 = BBox(frame_face_pose[pose_idx][:-1]) - frame_overlap[face_idx, pose_idx] = b1.overlap(b2) - - overlap_faces_frame = [] - overlap_poses_frame = [] - if n_faces < n_poses: - for face_idx in range(n_faces): - pose_idx = 
np.where( - frame_overlap[face_idx, :] - == frame_overlap[face_idx, :].max() - )[0][0] - overlap_faces_frame.append(frame_face[face_idx]) - overlap_poses_frame.append(frame_pose[pose_idx]) - elif n_faces > n_poses: - matched_pose_index = [] - for pose_idx in range(n_poses): - matched_pose_index.append( - np.where( - frame_overlap[:, pose_idx] - == frame_overlap[:, pose_idx].max() - )[0][0] - ) - for face_idx in range(n_faces): - overlap_faces_frame.append(frame_face[face_idx]) - if face_idx in matched_pose_index: - overlap_poses_frame.append( - frame_pose[ - np.where( - frame_overlap[face_idx, :] - == frame_overlap[face_idx, :].max() - )[0][0] - ] - ) - else: - overlap_poses_frame.append(np.ones(3) * np.nan) - elif n_faces == n_poses: - overlap_faces_frame = frame_face - overlap_poses_frame = frame_pose - - overlap_faces.append(overlap_faces_frame) - overlap_poses.append(overlap_poses_frame) - - return (overlap_faces, overlap_poses) + batch_output.append(batch_results) + frame_counter += 1 * batch_size + batch_output = pd.concat(batch_output) + batch_output.reset_index(drop=True, inplace=True) + if data_type.lower() == "video": + batch_output["approx_time"] = [ + dataset.calc_approx_frame_time(x) + for x in batch_output["frame"].to_numpy() + ] + batch_output.compute_identities(threshold=face_identity_threshold, inplace=True) + return batch_output diff --git a/feat/plotting.py b/feat/plotting.py index cc8fba6c..5b873dd1 100644 --- a/feat/plotting.py +++ b/feat/plotting.py @@ -27,7 +27,6 @@ from scipy.spatial import ConvexHull import torchvision.transforms as transforms from torchvision.utils import draw_keypoints, draw_bounding_boxes, make_grid -from feat.utils.mp_plotting import FaceLandmarksConnections __all__ = [ "draw_lineface", @@ -1560,154 +1559,3 @@ def extract_face_from_landmarks(frame, landmarks, face_size=112): masked_image = mask_image(aligned_img, mask) return (masked_image, new_landmarks) - - -def plot_face_landmarks( - fex, - frame_idx, - ax=None, - oval_color="white", - oval_linestyle="-", - oval_linewidth=3, - tesselation_color="gray", - tesselation_linestyle="-", - tesselation_linewidth=1, - mouth_color="white", - mouth_linestyle="-", - mouth_linewidth=3, - eye_color="navy", - eye_linestyle="-", - eye_linewidth=2, - iris_color="skyblue", - iris_linestyle="-", - iris_linewidth=2, -): - """Plots face landmarks on the given frame using specified styles for each part. - - Args: - fex: DataFrame containing face landmarks (x, y coordinates). - frame_idx: Index of the frame to plot. - ax: Matplotlib axis to draw on. If None, a new axis is created. - oval_color, tesselation_color, mouth_color, eye_color, iris_color: Colors for each face part. - oval_linestyle, tesselation_linestyle, mouth_linestyle, eye_linestyle, iris_linestyle: Linestyle for each face part. - oval_linewidth, tesselation_linewidth, mouth_linewidth, eye_linewidth, iris_linewidth: Linewidth for each face part. - n_faces: Number of faces in the frame. If None, will be determined from fex. 
- """ - if ax is None: - fig, ax = plt.subplots(figsize=(10, 10)) - - # Get frame data - fex_frame = fex.query("frame == @frame_idx") - n_faces_frame = fex_frame.shape[0] - - # Add the frame image - ax.imshow(Image.open(fex_frame["input"].unique()[0])) - - # Helper function to draw lines for a set of connections - def draw_connections(face_idx, connections, color, linestyle, linewidth): - for connection in connections: - start = connection.start - end = connection.end - line = plt.Line2D( - [fex.loc[face_idx, f"x_{start}"], fex.loc[face_idx, f"x_{end}"]], - [fex.loc[face_idx, f"y_{start}"], fex.loc[face_idx, f"y_{end}"]], - color=color, - linestyle=linestyle, - linewidth=linewidth, - ) - ax.add_line(line) - - # Face tessellation - for face in range(n_faces_frame): - draw_connections( - face, - FaceLandmarksConnections.FACE_LANDMARKS_TESSELATION, - tesselation_color, - tesselation_linestyle, - tesselation_linewidth, - ) - - # Mouth - for face in range(n_faces_frame): - draw_connections( - face, - FaceLandmarksConnections.FACE_LANDMARKS_LIPS, - mouth_color, - mouth_linestyle, - mouth_linewidth, - ) - - # Left iris - for face in range(n_faces_frame): - draw_connections( - face, - FaceLandmarksConnections.FACE_LANDMARKS_LEFT_IRIS, - iris_color, - iris_linestyle, - iris_linewidth, - ) - - # Left eye - for face in range(n_faces_frame): - draw_connections( - face, - FaceLandmarksConnections.FACE_LANDMARKS_LEFT_EYE, - eye_color, - eye_linestyle, - eye_linewidth, - ) - - # Left eyebrow - for face in range(n_faces_frame): - draw_connections( - face, - FaceLandmarksConnections.FACE_LANDMARKS_LEFT_EYEBROW, - eye_color, - eye_linestyle, - eye_linewidth, - ) - - # Right iris - for face in range(n_faces_frame): - draw_connections( - face, - FaceLandmarksConnections.FACE_LANDMARKS_RIGHT_IRIS, - iris_color, - iris_linestyle, - iris_linewidth, - ) - - # Right eye - for face in range(n_faces_frame): - draw_connections( - face, - FaceLandmarksConnections.FACE_LANDMARKS_RIGHT_EYE, - eye_color, - eye_linestyle, - eye_linewidth, - ) - - # Right eyebrow - for face in range(n_faces_frame): - draw_connections( - face, - FaceLandmarksConnections.FACE_LANDMARKS_RIGHT_EYEBROW, - eye_color, - eye_linestyle, - eye_linewidth, - ) - - # Face oval - for face in range(n_faces_frame): - draw_connections( - face, - FaceLandmarksConnections.FACE_LANDMARKS_FACE_OVAL, - oval_color, - oval_linestyle, - oval_linewidth, - ) - - # Optionally turn off axis for a clean plot - ax.axis("off") - - return ax diff --git a/feat/tests/performance_testing.py b/feat/tests/performance_testing.py index ecc41e17..d5cde0c1 100644 --- a/feat/tests/performance_testing.py +++ b/feat/tests/performance_testing.py @@ -1,5 +1,5 @@ # %% -from feat.FastDetector import MPDetector +from feat.MPDetector import MPDetector import os from feat.utils.io import get_test_data_path import cProfile @@ -7,7 +7,6 @@ multi_face = os.path.join(get_test_data_path(), "multi_face.jpg") -# detector = FastDetector() detector = MPDetector(device="mps", emotion_model="resmasknet", identity_model="facenet") # detector.detect(multi_face, data_type='image') diff --git a/feat/tests/test_fast_detector.py b/feat/tests/test_detector.py similarity index 98% rename from feat/tests/test_fast_detector.py rename to feat/tests/test_detector.py index 0c4aece7..e79c41ae 100644 --- a/feat/tests/test_fast_detector.py +++ b/feat/tests/test_detector.py @@ -1,5 +1,5 @@ import pytest -from feat.FastDetector import FastDetector +from feat.detector import Detector from feat.data import Fex from 
huggingface_hub import PyTorchModelHubMixin import numpy as np @@ -21,10 +21,10 @@ "face_noface_mov", "noface_face_mov", ) -class Test_Fast_Detector: +class Test_Detector: """Test new single model detector""" - detector = FastDetector(device="cpu") + detector = Detector(device="cpu") def test_init(self): assert isinstance(self.detector, PyTorchModelHubMixin) @@ -122,7 +122,7 @@ def test_fast_detection_and_batching_with_diff_img_sizes( def test_fast_init_with_wrongmodelname(self): """Should fail with unsupported model name""" with pytest.raises(ValueError): - _ = FastDetector(emotion_model="badmodelname") + _ = Detector(emotion_model="badmodelname") def test_fast_nofile(self): """Should fail with missing data"""