Soyjack Detector

[soyjack image]

Why?

Because it’s funny

How?

The whole thing is actually quite simple with Google’s MediaPipe Face Landmarker, which is basically three models packaged together:

  • Face detection model: detects the presence of faces with a few key facial landmarks
  • Face mesh model: adds a complete mapping of the face. The model outputs an estimate of 478 3-dimensional face landmarks.
  • Blendshape prediction model: receives output from the face mesh model and predicts 52 “blendshape” scores, which are coefficients representing different facial expressions.

We just run this on a video with a secret soyjack algorithm on top and voilà, we can detect soyjacks in real time (from your webcam!!!)

For extra spice, we can also add a hand detector, which works exactly the same way.
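
For reference, here is roughly the shape of the results we will be poking at (just a sketch; the result variable names are placeholders, but the attributes match what the detectors return later in the code):

# Rough shape of the detection results used below (one face, up to two hands):
#
#   face_result.face_landmarks[0]      # 478 normalized landmarks, each with .x, .y, .z
#   face_result.face_blendshapes[0]    # 52 categories, each with .category_name and .score
#                                      # e.g. 'jawOpen', 'eyeWideLeft', 'mouthSmileRight'
#   hand_result.hand_landmarks         # one list of 21 landmarks per detected hand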

Code

We need to install mediapipe and download the packaged models:

pip install -q mediapipe


wget -O face_landmarker.task -q https://storage.googleapis.com/mediapipe-models/face_landmarker/face_landmarker/float16/1/face_landmarker.task


wget -O hand_landmarker.task -q https://storage.googleapis.com/mediapipe-models/hand_landmarker/hand_landmarker/float16/latest/hand_landmarker.task

Then we can define some helper functions to nicely draw the detected landmarks and face mesh on the original image:

import numpy as np

from mediapipe import solutions
from mediapipe.framework.formats import landmark_pb2

def landmarks_to_proto(landmarks):
    """Convert landmarks to protocol buffer format."""
    proto = landmark_pb2.NormalizedLandmarkList()
    proto.landmark.extend([
        landmark_pb2.NormalizedLandmark(x=landmark.x, y=landmark.y, z=landmark.z) 
        for landmark in landmarks
    ])
    return proto

def draw_landmarks_with_style(image, landmark_list, connections, landmark_style=None, connection_style=None):
    """Draw landmarks with specified styles."""
    solutions.drawing_utils.draw_landmarks(
        image=image,
        landmark_list=landmark_list,
        connections=connections,
        landmark_drawing_spec=landmark_style,
        connection_drawing_spec=connection_style
    )

def draw_face_landmarks_on_image(rgb_image, detection_result):
    """Draw face landmarks, mesh, contours, and irises on the image."""
    face_landmarks_list = detection_result.face_landmarks
    annotated_image = np.copy(rgb_image)
    
    for face_landmarks in face_landmarks_list:
        face_proto = landmarks_to_proto(face_landmarks)
        
        # Define face mesh drawing configurations
        mesh_styles = [
            (solutions.face_mesh.FACEMESH_TESSELATION,
             solutions.drawing_styles.get_default_face_mesh_tesselation_style()),
            (solutions.face_mesh.FACEMESH_CONTOURS,
             solutions.drawing_styles.get_default_face_mesh_contours_style()),
            (solutions.face_mesh.FACEMESH_IRISES,
             solutions.drawing_styles.get_default_face_mesh_iris_connections_style())
        ]
        
        # Draw all face mesh components
        for connections, style in mesh_styles:
            draw_landmarks_with_style(
                annotated_image, 
                face_proto, 
                connections, 
                connection_style=style
            )
    
    return annotated_image

def draw_hands_landmarks_on_image(rgb_image, detection_result):
    """Draw hand landmarks and connections on the image."""
    hand_landmarks_list = detection_result.hand_landmarks
    annotated_image = np.copy(rgb_image)
    
    for hand_landmarks in hand_landmarks_list:
        hand_proto = landmarks_to_proto(hand_landmarks)
        
        draw_landmarks_with_style(
            annotated_image,
            hand_proto,
            solutions.hands.HAND_CONNECTIONS,
            landmark_style=solutions.drawing_styles.get_default_hand_landmarks_style(),
            connection_style=solutions.drawing_styles.get_default_hand_connections_style()
        )
    
    return annotated_image
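
As a quick note on the design: the proto conversion in landmarks_to_proto is there because solutions.drawing_utils.draw_landmarks expects the legacy NormalizedLandmarkList format, while the new Tasks API returns plain landmark objects, so we copy the x/y/z values over before drawing.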

The actual inference models can simply be created from the downloaded model files like so:

from mediapipe.tasks import python
from mediapipe.tasks.python import vision

# FACE DETECTION
base_options = python.BaseOptions(model_asset_path='face_landmarker.task')
options = vision.FaceLandmarkerOptions(base_options=base_options,
                                       output_face_blendshapes=True,  # Needed to get facial expression scores
                                       output_facial_transformation_matrixes=False,
                                       num_faces=1)
detector_face = vision.FaceLandmarker.create_from_options(options)


# HANDS DETECTION
base_options = python.BaseOptions(model_asset_path='hand_landmarker.task')
options = vision.HandLandmarkerOptions(base_options=base_options,
                                       num_hands=2)
detector_hand = vision.HandLandmarker.create_from_options(options)
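
Before wiring up the webcam, it’s worth sanity-checking the face detector on a still image. A minimal sketch, assuming you have some photo handy ('test.jpg' and 'annotated.jpg' are just placeholder paths):

import cv2
import mediapipe as mp

# Load a photo straight into MediaPipe's image format
mp_image = mp.Image.create_from_file('test.jpg')

# Run the face detector and draw the mesh over the original pixels
result = detector_face.detect(mp_image)
annotated = draw_face_landmarks_on_image(mp_image.numpy_view(), result)

# numpy_view() gives RGB, so convert back to BGR before saving with OpenCV
cv2.imwrite('annotated.jpg', cv2.cvtColor(annotated, cv2.COLOR_RGB2BGR))

# Peek at the five strongest blendshape scores
for c in sorted(result.face_blendshapes[0], key=lambda c: c.score, reverse=True)[:5]:
    print(f"{c.category_name}: {c.score:.3f}")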

We need to define a few more helper functions for the webcam stuff and text overlay:

import cv2

def setup_webcam(camera_id=0, width=800, height=800):
    """
    Initialize the webcam with specified parameters
    """
    cap = cv2.VideoCapture(camera_id)
    cap.set(cv2.CAP_PROP_FRAME_WIDTH, width)
    cap.set(cv2.CAP_PROP_FRAME_HEIGHT, height)
    
    if not cap.isOpened():
        raise IOError("Cannot open webcam")
    return cap


def add_text_to_frame(frame, text, position=None):
    """Draw text over a black background box (defaults near the top-right corner)."""
    height, width = frame.shape[:2]
    if position is None:
        position = (width - 500, 30)
    font = cv2.FONT_HERSHEY_SIMPLEX
    font_scale = 0.7
    font_color = (255,192,203)
    thickness = 2
    (text_width, text_height), baseline = cv2.getTextSize(text, font, font_scale, thickness)
    cv2.rectangle(frame, 
                 (position[0] - 5, position[1] - text_height - 5),
                 (position[0] + text_width + 5, position[1] + 5),
                 (0, 0, 0),
                 -1)
    cv2.putText(frame,
                text,
                position,
                font,
                font_scale,
                font_color,
                thickness)
    return frame
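
To check both helpers in isolation, we can grab a single frame and stamp a label on it (a throwaway sketch; the window name and text are arbitrary):

cap = setup_webcam()
ret, frame = cap.read()
if ret:
    # Stamp a label in the default top-right position and show the result
    frame = add_text_to_frame(frame, "Hello soy world")
    cv2.imshow('Test frame', frame)
    cv2.waitKey(0)
cap.release()
cv2.destroyAllWindows()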

The main loop is quite simple (press Q to exit): we just ingest frames from the webcam feed and run both models on each one:

import mediapipe as mp

def main(privacy=False):
    cap = setup_webcam()
    try:
        while True:
            # Read frame from webcam
            ret, frame = cap.read()
            if not ret:
                print("Failed to grab frame")
                break
                
            # Convert the webcam frame into correct format and run inference
            rgb_image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            mp_image = mp.Image(image_format=mp.ImageFormat.SRGB, data=rgb_image)
            
            detection_result_face = detector_face.detect(mp_image)
            detection_result_hands = detector_hand.detect(mp_image)
            
            
            # If turned on, only display the detected face landmarks
            if privacy:
                frame = np.zeros(frame.shape, dtype=np.uint8)
            
            # Draw detected landmarks on the webcam frame
            annotated_image = draw_face_landmarks_on_image(frame, detection_result_face)
            annotated_image = draw_hands_landmarks_on_image(annotated_image, detection_result_hands)
            
            # Top secret algorithm to detect a soyface based on the model outputs
            text = "No soyjack detected ......"
            jaw_open = 0
            eyes_wide = 0
            brows_up = 0
            smile_wide = 0
            try:
                for d in detection_result_face.face_blendshapes[0]:
                    if d.category_name == 'jawOpen':
                        jaw_open = d.score
                    elif d.category_name in ['eyeWideLeft', 'eyeWideRight']:
                        eyes_wide = max(eyes_wide, d.score)
                    elif d.category_name in ['browOuterUpLeft', 'browOuterUpRight']:
                        brows_up = max(brows_up, d.score)
                    elif d.category_name in ['mouthSmileLeft', 'mouthSmileRight']:
                        smile_wide = max(smile_wide, d.score)
                # Top secret
                soyjack_score = (eyes_wide * 0.3 + brows_up * 0.4 + smile_wide * 0.3)
                if jaw_open >= 0.5 or soyjack_score >= 0.5:
                    soyjack_score = max(jaw_open, soyjack_score)
                    text = f"SOYJACK DETECTED !!! (soyprob : {soyjack_score:.2%})"
                
                # Display the soyscore if detected
                annotated_image = add_text_to_frame(annotated_image, text)
                annotated_image = add_text_to_frame(annotated_image, f"PRIVACY MODE : {privacy}", position=(annotated_image.shape[1] - 250,
                                                                                                            annotated_image.shape[0] - 50))
                cv2.imshow('Webcam Feed', annotated_image)
            
            # Might fail if no face is detected (extreme occlusion etc.), just ignore the frame
            except Exception as e:
                print(f"NO LANDMARKS DETECTED : {e}")
            
            # Break the loop when 'q' is pressed
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break          
    finally:
        cap.release()
        cv2.destroyAllWindows()

if __name__ == "__main__":
    main(privacy=False)
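
As a side note, calling main(privacy=True) blanks out the webcam image and only draws the detected landmarks on a black background, in case you want to demo this without broadcasting your actual face.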

That’s pretty much it. If I’m not too lazy, I’ll package it into an app, add a simple frontend, and maybe a few more options.

Happy soyjacking !!!

[soyjack gif]