Skip to content

Commit e9ef448

Browse files
committed
[Feature] Add mediapipe face detector
1 parent bff7822 commit e9ef448

File tree

5 files changed

+185
-2
lines changed

5 files changed

+185
-2
lines changed

README.md

+3-1
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ pip install controlnet-aux==0.0.3
1818
from PIL import Image
1919
import requests
2020
from io import BytesIO
21-
from controlnet_aux import HEDdetector, MidasDetector, MLSDdetector, OpenposeDetector, PidiNetDetector, NormalBaeDetector, LineartDetector, LineartAnimeDetector, CannyDetector, ContentShuffleDetector, ZoeDetector
21+
from controlnet_aux import HEDdetector, MidasDetector, MLSDdetector, OpenposeDetector, PidiNetDetector, NormalBaeDetector, LineartDetector, LineartAnimeDetector, CannyDetector, ContentShuffleDetector, ZoeDetector, MediapipeFaceDetector
2222

2323
# load image
2424
url = "https://huggingface.co/lllyasviel/sd-controlnet-openpose/resolve/main/images/pose.png"
@@ -40,6 +40,7 @@ zoe = ZoeDetector.from_pretrained("lllyasviel/Annotators")
4040
# instantiate
4141
canny = CannyDetector()
4242
content = ContentShuffleDetector()
43+
face_detector = MediapipeFaceDetector()
4344

4445

4546
# process
@@ -55,4 +56,5 @@ processed_image_zoe = zoe(img)
5556

5657
processed_image_canny = canny(img)
5758
processed_image_content = content(img)
59+
processed_image_mediapipe_face = face_detector(img)
5860
```

setup.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,8 @@
8989
"einops",
9090
"timm",
9191
"torchvision",
92-
"scikit-image"
92+
"scikit-image",
93+
"mediapipe",
9394
]
9495

9596
# this is a lookup table with items like:
@@ -177,6 +178,7 @@ def run(self):
177178
deps["torchvision"],
178179
deps["timm"],
179180
deps["scikit-image"],
181+
deps["mediapipe"],
180182
]
181183

182184
setup(

src/controlnet_aux/__init__.py

+1
Original file line numberDiff line numberDiff line change
@@ -12,3 +12,4 @@
1212

1313
from .canny import CannyDetector
1414
from .shuffle import ContentShuffleDetector
15+
from .mediapipe_face import MediapipeFaceDetector
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
from typing import Union
2+
3+
from .mediapipe_face_common import generate_annotation
4+
from PIL import Image
5+
import numpy as np
6+
7+
8+
class MediapipeFaceDetector:
    """Callable wrapper that draws MediaPipe face-mesh annotations for an image.

    The class keeps no state: calling an instance forwards the image to
    ``generate_annotation`` and optionally wraps the result in a PIL image.
    """

    def __call__(self,
                 image: Union[np.ndarray, Image.Image],
                 max_faces: int = 1,
                 min_confidence: float = 0.5,
                 return_pil: bool = True):
        """Detect faces in ``image`` and return the drawn annotation.

        Args:
            image: Input picture, either a PIL image or a numpy array. A numpy
                array is passed through unchanged and must already be
                H x W x 3, which is what ``generate_annotation`` requires.
            max_faces: Upper bound on the number of faces to annotate.
            min_confidence: Minimum detection confidence forwarded to the
                face-mesh model.
            return_pil: When truthy, return a ``PIL.Image.Image``; otherwise
                return the raw numpy array.

        Returns:
            The annotation image, as a PIL image or a numpy array depending on
            ``return_pil``.
        """
        if isinstance(image, Image.Image):
            # generate_annotation only accepts three-channel arrays, so PIL
            # images in other modes (RGBA, L, P, ...) are normalized to RGB
            # first instead of failing its channel check.
            image = np.array(image.convert("RGB"))

        face = generate_annotation(image, max_faces, min_confidence)

        if return_pil:
            face = Image.fromarray(face)

        return face
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,155 @@
1+
from typing import Mapping
2+
3+
import mediapipe as mp
4+
import numpy
5+
6+
7+
# Short aliases for the MediaPipe solution modules and connection sets.
_solutions = mp.solutions
mp_drawing = _solutions.drawing_utils
mp_drawing_styles = _solutions.drawing_styles
mp_face_detection = _solutions.face_detection  # Only for counting faces.
mp_face_mesh = _solutions.face_mesh
mp_face_connections = _solutions.face_mesh_connections.FACEMESH_TESSELATION
mp_hand_connections = _solutions.hands_connections.HAND_CONNECTIONS
mp_body_connections = _solutions.pose_connections.POSE_CONNECTIONS

DrawingSpec = _solutions.drawing_styles.DrawingSpec
PoseLandmark = _solutions.drawing_styles.PoseLandmark

# Faces whose bounding box is smaller than this (shorter side, in pixels) are
# dropped by generate_annotation; a value of 0 or less disables the filter.
min_face_size_pixels: int = 64

# Line thickness and circle radius shared by every face drawing spec below.
f_thick = 2
f_rad = 1


def _face_spec(color):
    """Build a DrawingSpec with the shared face thickness and circle radius."""
    return DrawingSpec(color=color, thickness=f_thick, circle_radius=f_rad)


right_iris_draw = _face_spec((10, 200, 250))
right_eye_draw = _face_spec((10, 200, 180))
right_eyebrow_draw = _face_spec((10, 220, 180))
left_iris_draw = _face_spec((250, 200, 10))
left_eye_draw = _face_spec((180, 200, 10))
left_eyebrow_draw = _face_spec((180, 220, 10))
mouth_draw = _face_spec((10, 180, 10))
head_draw = _face_spec((10, 200, 10))

# mp_face_mesh.FACEMESH_CONTOURS has all the items we care about.
# Map every contour edge to the spec it is drawn with. The iris edge sets
# (FACEMESH_LEFT_IRIS / FACEMESH_RIGHT_IRIS) are deliberately left out; the
# irises are represented by the two landmarks in iris_landmark_spec instead.
face_connection_spec = {}
for _edges, _spec in (
    (mp_face_mesh.FACEMESH_FACE_OVAL, head_draw),
    (mp_face_mesh.FACEMESH_LEFT_EYE, left_eye_draw),
    (mp_face_mesh.FACEMESH_LEFT_EYEBROW, left_eyebrow_draw),
    (mp_face_mesh.FACEMESH_RIGHT_EYE, right_eye_draw),
    (mp_face_mesh.FACEMESH_RIGHT_EYEBROW, right_eyebrow_draw),
    (mp_face_mesh.FACEMESH_LIPS, mouth_draw),
):
    for edge in _edges:
        face_connection_spec[edge] = _spec

# Landmark index -> spec for the two iris landmarks handled by draw_pupils.
iris_landmark_spec = {468: right_iris_draw, 473: left_iris_draw}
49+
50+
51+
def draw_pupils(image, landmark_list, drawing_spec, halfwidth: int = 2):
    """Paint a small filled square on ``image`` for each selected landmark.

    We have a custom function to draw the pupils because the
    mp.draw_landmarks method requires a parameter for all landmarks. Until our
    PR is merged into mediapipe, we need this separate method.

    Args:
        image: H x W x 3 array that is modified in place.
        landmark_list: Object exposing a ``landmark`` sequence whose items
            carry normalized ``x``/``y`` coordinates and support
            ``HasField('visibility')`` / ``HasField('presence')``.
        drawing_spec: Either a mapping from landmark index to a spec with a
            ``color`` attribute (landmarks without an entry are skipped), or a
            single ``DrawingSpec`` applied to every landmark.
        halfwidth: Half of the side length, in pixels, of the painted square.

    Raises:
        ValueError: If ``image`` is not a three-dimensional, three-channel
            array.
        TypeError: If ``drawing_spec`` is neither a mapping nor a
            ``DrawingSpec``.
    """
    if len(image.shape) != 3:
        raise ValueError("Input image must be H,W,C.")
    image_rows, image_cols, image_channels = image.shape
    if image_channels != 3:  # BGR channels
        raise ValueError('Input image must contain three channel bgr data.')

    # Resolve the kind of spec once instead of once per landmark, and reject
    # unsupported types with a clear error rather than assigning None later.
    spec_is_mapping = isinstance(drawing_spec, Mapping)
    if not spec_is_mapping and not isinstance(drawing_spec, DrawingSpec):
        raise TypeError("drawing_spec must be a Mapping or a DrawingSpec.")

    for idx, landmark in enumerate(landmark_list.landmark):
        if (
            (landmark.HasField('visibility') and landmark.visibility < 0.9) or
            (landmark.HasField('presence') and landmark.presence < 0.5)
        ):
            continue
        # Skip landmarks whose normalized coordinates fall outside the image.
        if landmark.x >= 1.0 or landmark.x < 0 or landmark.y >= 1.0 or landmark.y < 0:
            continue

        if spec_is_mapping:
            spec = drawing_spec.get(idx)
            if spec is None:
                continue
            draw_color = spec.color
        else:
            draw_color = drawing_spec.color

        image_x = int(image_cols * landmark.x)
        image_y = int(image_rows * landmark.y)
        # Clamp the slice start at 0: a negative start would wrap around to
        # the far edge and produce an empty slice, so landmarks close to the
        # top/left border would silently not be drawn. The slice end needs no
        # clamping because slicing already stops at the array bounds.
        top = max(image_y - halfwidth, 0)
        left = max(image_x - halfwidth, 0)
        image[top:image_y + halfwidth, left:image_x + halfwidth, :] = draw_color
78+
79+
80+
def reverse_channels(image):
    """Return ``image`` with the order of its third axis reversed.

    Turns an H x W x C array in RGB order into BGR order, and equally BGR
    back into RGB, since the operation is its own inverse.
    """
    # A step of -1 along the channel axis walks the channels back to front.
    # Indexing with an explicit list such as image[:, :, [2, 1, 0]] would give
    # the same values but makes a copy of the data.
    back_to_front = slice(None, None, -1)
    return image[:, :, back_to_front]
85+
86+
87+
def _face_extent_pixels(face_landmarks, img_width, img_height):
    """Return the shorter side, in pixels, of the landmarks' bounding box."""
    xs = [point.x for point in face_landmarks.landmark]
    ys = [point.y for point in face_landmarks.landmark]
    # Landmark coordinates are normalized, so scale by the image size.
    width_pixels = (max(xs) - min(xs)) * img_width
    height_pixels = (max(ys) - min(ys)) * img_height
    return min(width_pixels, height_pixels)


def generate_annotation(
        img_rgb,
        max_faces: int,
        min_confidence: float
):
    """
    Find up to 'max_faces' inside the provided input image and draw them.

    If the module-level 'min_face_size_pixels' is greater than zero, faces
    whose bounding box has a shorter side below that many pixels are dropped.

    Args:
        img_rgb: H x W x 3 numpy array holding the input image.
        max_faces: Maximum number of faces the face-mesh model may return.
        min_confidence: Minimum detection confidence passed to the model.

    Returns:
        An array with the same shape and dtype as 'img_rgb' containing the
        face contours and pupils drawn on a zero-filled background. The array
        is all zeros when no face is detected.

    Raises:
        ValueError: If 'img_rgb' is not a three-dimensional array with exactly
            three channels.
    """
    # Validate with an exception rather than an assert (asserts are stripped
    # under -O), and do it before the face-mesh model is constructed.
    if len(img_rgb.shape) != 3 or img_rgb.shape[2] != 3:
        raise ValueError("Input image must be H,W,C with exactly three channels.")
    img_height, img_width = img_rgb.shape[:2]

    with mp_face_mesh.FaceMesh(
            static_image_mode=True,
            max_num_faces=max_faces,
            refine_landmarks=True,
            min_detection_confidence=min_confidence,
    ) as facemesh:
        results = facemesh.process(img_rgb).multi_face_landmarks

        # None (or an empty list) means nothing was detected.
        if not results:
            print("No faces detected in controlnet image for Mediapipe face annotator.")
            return numpy.zeros_like(img_rgb)

        # Filter faces that are too small
        if min_face_size_pixels > 0:
            filtered_landmarks = [
                face for face in results
                if _face_extent_pixels(face, img_width, img_height) >= min_face_size_pixels
            ]
        else:
            filtered_landmarks = list(results)

        # Annotations are drawn in BGR for some reason, but we don't need to flip a zero-filled image at the start.
        empty = numpy.zeros_like(img_rgb)

        # Draw detected faces:
        for face_landmarks in filtered_landmarks:
            mp_drawing.draw_landmarks(
                empty,
                face_landmarks,
                connections=face_connection_spec.keys(),
                landmark_drawing_spec=None,
                connection_drawing_spec=face_connection_spec
            )
            draw_pupils(empty, face_landmarks, iris_landmark_spec, 2)

        # Flip BGR back to RGB. The copy makes the result a standalone array
        # rather than a reversed view of the drawing buffer.
        empty = reverse_channels(empty).copy()

        return empty

0 commit comments

Comments
 (0)