Commit

add dwpose

haofanwang committed Aug 4, 2023
1 parent 1e6bdc0 commit 98a88ca
Showing 7 changed files with 1,037 additions and 3 deletions.
14 changes: 12 additions & 2 deletions README.md
@@ -35,7 +35,7 @@ img = Image.open(BytesIO(response.content)).convert("RGB").resize((512, 512))
# "lineart_coarse", "lineart_realistic", "mediapipe_face", "mlsd", "normal_bae", "normal_midas",
# "openpose", "openpose_face", "openpose_faceonly", "openpose_full", "openpose_hand",
# "scribble_hed, "scribble_pidinet", "shuffle", "softedge_hed", "softedge_hedsafe",
# "softedge_pidinet", "softedge_pidsafe"]
# "softedge_pidinet", "softedge_pidsafe", "dwpose"]
processor_id = 'scribble_hed'
processor = Processor(processor_id)
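# the "dwpose" id added by this commit can be selected the same way,
# e.g. processor = Processor('dwpose')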

@@ -47,7 +47,7 @@ Each model can be loaded individually by importing and instantiating them as follows
from PIL import Image
import requests
from io import BytesIO
-from controlnet_aux import HEDdetector, MidasDetector, MLSDdetector, OpenposeDetector, PidiNetDetector, NormalBaeDetector, LineartDetector, LineartAnimeDetector, CannyDetector, ContentShuffleDetector, ZoeDetector, MediapipeFaceDetector, SamDetector, LeresDetector
+from controlnet_aux import HEDdetector, MidasDetector, MLSDdetector, OpenposeDetector, PidiNetDetector, NormalBaeDetector, LineartDetector, LineartAnimeDetector, CannyDetector, ContentShuffleDetector, ZoeDetector, MediapipeFaceDetector, SamDetector, LeresDetector, DWposeDetector

# load image
url = "https://huggingface.co/lllyasviel/sd-controlnet-openpose/resolve/main/images/pose.png"
@@ -69,6 +69,15 @@ sam = SamDetector.from_pretrained("ybelkada/segment-anything", subfolder="checkpoints")
mobile_sam = SamDetector.from_pretrained("dhkim2810/MobileSAM", model_type="vit_t", filename="mobile_sam.pt")
leres = LeresDetector.from_pretrained("lllyasviel/Annotators")

# specify the configs and ckpts; the two checkpoint values are download URLs
det_config = "./src/controlnet_aux/dwpose/yolox_config/yolox_l_8xb8-300e_coco.py"
det_ckpt = "https://download.openmmlab.com/mmdetection/v2.0/yolox/yolox_l_8x8_300e_coco/yolox_l_8x8_300e_coco_20211126_140236-d3bd2b23.pth"
pose_config = "./src/controlnet_aux/dwpose/dwpose_config/dwpose-l_384x288.py"
pose_ckpt = "https://huggingface.co/wanghaofan/dw-ll_ucoco_384/resolve/main/dw-ll_ucoco_384.pth"

import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
dwpose = DWposeDetector(det_config, det_ckpt, pose_config, pose_ckpt, device)

# instantiate
canny = CannyDetector()
content = ContentShuffleDetector()
@@ -91,4 +100,5 @@ processed_image_leres = leres(img)
processed_image_canny = canny(img)
processed_image_content = content(img)
processed_image_mediapipe_face = face_detector(img)
processed_image_dwpose = dwpose(img)
```
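Both checkpoint values above are direct download URLs. If you would rather cache local copies before constructing the detector, a minimal sketch (assuming `requests` and `huggingface_hub` are installed; the local filename is an arbitrary choice):

```python
import requests
from huggingface_hub import hf_hub_download

# YOLOX-L detector weights from the OpenMMLab mirror
det_ckpt_url = ("https://download.openmmlab.com/mmdetection/v2.0/yolox/"
                "yolox_l_8x8_300e_coco/yolox_l_8x8_300e_coco_20211126_140236-d3bd2b23.pth")
with open("yolox_l_8x8_300e_coco.pth", "wb") as f:
    f.write(requests.get(det_ckpt_url, timeout=60).content)

# DWPose pose-estimation weights from the Hugging Face Hub
pose_ckpt_path = hf_hub_download(repo_id="wanghaofan/dw-ll_ucoco_384",
                                 filename="dw-ll_ucoco_384.pth")
```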
87 changes: 87 additions & 0 deletions src/controlnet_aux/dwpose/__init__.py
@@ -0,0 +1,87 @@
# Openpose
# Original from CMU https://github.com/CMU-Perceptual-Computing-Lab/openpose
# 2nd Edited by https://github.com/Hzzone/pytorch-openpose
# 3rd Edited by ControlNet
# 4th Edited by ControlNet (added face and correct hands)

import os
# tolerate duplicate OpenMP runtimes, which otherwise abort the process
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

import cv2
import torch
import numpy as np
from PIL import Image

from . import util
from .wholebody import Wholebody


def draw_pose(pose, H, W):
    bodies = pose['bodies']
    faces = pose['faces']
    hands = pose['hands']
    candidate = bodies['candidate']
    subset = bodies['subset']

    canvas = np.zeros(shape=(H, W, 3), dtype=np.uint8)
    canvas = util.draw_bodypose(canvas, candidate, subset)
    canvas = util.draw_handpose(canvas, hands)
    canvas = util.draw_facepose(canvas, faces)

    return canvas

class DWposeDetector:
    def __init__(self, det_config, det_ckpt, pose_config, pose_ckpt, device):
        self.pose_estimation = Wholebody(det_config, det_ckpt, pose_config, pose_ckpt, device)

    def __call__(self, oriImg, output_type="pil", detect_resolution=512, image_resolution=512):
        oriImg = oriImg.copy()
        input_image = cv2.cvtColor(np.array(oriImg), cv2.COLOR_RGB2BGR)

        input_image = util.HWC3(input_image)
        input_image = util.resize_image(input_image, detect_resolution)
        H, W, C = input_image.shape

        with torch.no_grad():
            candidate, subset = self.pose_estimation(input_image)
            nums, keys, locs = candidate.shape
            # normalize keypoint coordinates to [0, 1]
            candidate[..., 0] /= float(W)
            candidate[..., 1] /= float(H)
            body = candidate[:, :18].copy()
            body = body.reshape(nums * 18, locs)
            score = subset[:, :18]

            # map confident body keypoints to flat indices; mark the rest -1
            for i in range(len(score)):
                for j in range(len(score[i])):
                    if score[i][j] > 0.3:
                        score[i][j] = int(18 * i + j)
                    else:
                        score[i][j] = -1

            un_visible = subset < 0.3
            candidate[un_visible] = -1

            foot = candidate[:, 18:24]  # extracted but not drawn separately

            faces = candidate[:, 24:92]

            hands = candidate[:, 92:113]
            hands = np.vstack([hands, candidate[:, 113:]])

            bodies = dict(candidate=body, subset=score)
            pose = dict(bodies=bodies, hands=hands, faces=faces)

            detected_map = draw_pose(pose, H, W)
            detected_map = util.HWC3(detected_map)

            img = util.resize_image(input_image, image_resolution)
            H, W, C = img.shape

            detected_map = cv2.resize(detected_map, (W, H), interpolation=cv2.INTER_LINEAR)

            if output_type == "pil":
                detected_map = Image.fromarray(cv2.cvtColor(detected_map, cv2.COLOR_BGR2RGB))

            return detected_map
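The slicing above implies a fixed whole-body layout for the `(num_people, 134, 2)` candidate array: 18 OpenPose-style body points, 6 foot points, 68 face points, and two 21-point hands. The names below (including which hand comes first) are an inferred reading of those indices, not something the commit states:

```python
# inferred layout of the 134 whole-body keypoints returned by Wholebody
KEYPOINT_SLICES = {
    "body": slice(0, 18),      # OpenPose-style body, incl. neck
    "feet": slice(18, 24),
    "face": slice(24, 92),     # 68 facial landmarks
    "hand_a": slice(92, 113),  # 21 points; left/right order is assumed
    "hand_b": slice(113, 134),
}

def split_keypoints(candidate):
    """Split a (num_people, 134, 2) candidate array into named parts."""
    return {name: candidate[:, s] for name, s in KEYPOINT_SLICES.items()}
```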
257 changes: 257 additions & 0 deletions src/controlnet_aux/dwpose/dwpose_config/dwpose-l_384x288.py
@@ -0,0 +1,257 @@
# runtime
max_epochs = 270
stage2_num_epochs = 30
base_lr = 4e-3

train_cfg = dict(max_epochs=max_epochs, val_interval=10)
randomness = dict(seed=21)

# optimizer
optim_wrapper = dict(
    type='OptimWrapper',
    optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05),
    paramwise_cfg=dict(
        norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True))

# learning rate
param_scheduler = [
    dict(
        type='LinearLR',
        start_factor=1.0e-5,
        by_epoch=False,
        begin=0,
        end=1000),
    dict(
        # cosine lr from epoch 135 (max_epochs // 2) to epoch 270 (max_epochs)
        type='CosineAnnealingLR',
        eta_min=base_lr * 0.05,
        begin=max_epochs // 2,
        end=max_epochs,
        T_max=max_epochs // 2,
        by_epoch=True,
        convert_to_iter_based=True),
]

# automatically scaling LR based on the actual training batch size
auto_scale_lr = dict(base_batch_size=512)

# codec settings
codec = dict(
    type='SimCCLabel',
    input_size=(288, 384),
    sigma=(6., 6.93),
    simcc_split_ratio=2.0,
    normalize=False,
    use_dark=False)
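# With input_size=(288, 384) as (w, h) and simcc_split_ratio=2.0, the SimCC
# codec discretizes each keypoint coordinate into 288 * 2 = 576 horizontal
# and 384 * 2 = 768 vertical bins, turning localization into classification.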

# model settings
model = dict(
    type='TopdownPoseEstimator',
    data_preprocessor=dict(
        type='PoseDataPreprocessor',
        mean=[123.675, 116.28, 103.53],
        std=[58.395, 57.12, 57.375],
        bgr_to_rgb=True),
    backbone=dict(
        _scope_='mmdet',
        type='CSPNeXt',
        arch='P5',
        expand_ratio=0.5,
        deepen_factor=1.,
        widen_factor=1.,
        out_indices=(4, ),
        channel_attention=True,
        norm_cfg=dict(type='SyncBN'),
        act_cfg=dict(type='SiLU'),
        init_cfg=dict(
            type='Pretrained',
            prefix='backbone.',
            checkpoint='https://download.openmmlab.com/mmpose/v1/projects/'
            'rtmpose/cspnext-l_udp-aic-coco_210e-256x192-273b7631_20230130.pth'  # noqa
        )),
    head=dict(
        type='RTMCCHead',
        in_channels=1024,
        out_channels=133,
        input_size=codec['input_size'],
        in_featuremap_size=(9, 12),
        simcc_split_ratio=codec['simcc_split_ratio'],
        final_layer_kernel_size=7,
        gau_cfg=dict(
            hidden_dims=256,
            s=128,
            expansion_factor=2,
            dropout_rate=0.,
            drop_path=0.,
            act_fn='SiLU',
            use_rel_bias=False,
            pos_enc=False),
        loss=dict(
            type='KLDiscretLoss',
            use_target_weight=True,
            beta=10.,
            label_softmax=True),
        decoder=codec),
    test_cfg=dict(flip_test=True, ))

# base dataset settings
dataset_type = 'CocoWholeBodyDataset'
data_mode = 'topdown'
data_root = '/data/'

backend_args = dict(backend='local')
# backend_args = dict(
#     backend='petrel',
#     path_mapping=dict({
#         f'{data_root}': 's3://openmmlab/datasets/detection/coco/',
#         f'{data_root}': 's3://openmmlab/datasets/detection/coco/'
#     }))

# pipelines
train_pipeline = [
    dict(type='LoadImage', backend_args=backend_args),
    dict(type='GetBBoxCenterScale'),
    dict(type='RandomFlip', direction='horizontal'),
    dict(type='RandomHalfBody'),
    dict(
        type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80),
    dict(type='TopdownAffine', input_size=codec['input_size']),
    dict(type='mmdet.YOLOXHSVRandomAug'),
    dict(
        type='Albumentation',
        transforms=[
            dict(type='Blur', p=0.1),
            dict(type='MedianBlur', p=0.1),
            dict(
                type='CoarseDropout',
                max_holes=1,
                max_height=0.4,
                max_width=0.4,
                min_holes=1,
                min_height=0.2,
                min_width=0.2,
                p=1.0),
        ]),
    dict(type='GenerateTarget', encoder=codec),
    dict(type='PackPoseInputs')
]
val_pipeline = [
    dict(type='LoadImage', backend_args=backend_args),
    dict(type='GetBBoxCenterScale'),
    dict(type='TopdownAffine', input_size=codec['input_size']),
    dict(type='PackPoseInputs')
]

train_pipeline_stage2 = [
    dict(type='LoadImage', backend_args=backend_args),
    dict(type='GetBBoxCenterScale'),
    dict(type='RandomFlip', direction='horizontal'),
    dict(type='RandomHalfBody'),
    dict(
        type='RandomBBoxTransform',
        shift_factor=0.,
        scale_factor=[0.75, 1.25],
        rotate_factor=60),
    dict(type='TopdownAffine', input_size=codec['input_size']),
    dict(type='mmdet.YOLOXHSVRandomAug'),
    dict(
        type='Albumentation',
        transforms=[
            dict(type='Blur', p=0.1),
            dict(type='MedianBlur', p=0.1),
            dict(
                type='CoarseDropout',
                max_holes=1,
                max_height=0.4,
                max_width=0.4,
                min_holes=1,
                min_height=0.2,
                min_width=0.2,
                p=0.5),
        ]),
    dict(type='GenerateTarget', encoder=codec),
    dict(type='PackPoseInputs')
]

datasets = []
dataset_coco = dict(
    type=dataset_type,
    data_root=data_root,
    data_mode=data_mode,
    ann_file='coco/annotations/coco_wholebody_train_v1.0.json',
    data_prefix=dict(img='coco/train2017/'),
    pipeline=[],
)
datasets.append(dataset_coco)

scene = ['Magic_show', 'Entertainment', 'ConductMusic', 'Online_class',
         'TalkShow', 'Speech', 'Fitness', 'Interview', 'Olympic', 'TVShow',
         'Singing', 'SignLanguage', 'Movie', 'LiveVlog', 'VideoConference']

for name in scene:
    datasets.append(
        dict(
            type=dataset_type,
            data_root=data_root,
            data_mode=data_mode,
            ann_file='UBody/annotations/' + name + '/keypoint_annotation.json',
            data_prefix=dict(img='UBody/images/' + name + '/'),
            pipeline=[],
        )
    )

# data loaders
train_dataloader = dict(
    batch_size=32,
    num_workers=10,
    persistent_workers=True,
    sampler=dict(type='DefaultSampler', shuffle=True),
    dataset=dict(
        type='CombinedDataset',
        metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'),
        datasets=datasets,
        pipeline=train_pipeline,
        test_mode=False,
    ))
val_dataloader = dict(
    batch_size=32,
    num_workers=10,
    persistent_workers=True,
    drop_last=False,
    sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
    dataset=dict(
        type=dataset_type,
        data_root=data_root,
        data_mode=data_mode,
        ann_file='coco/annotations/coco_wholebody_val_v1.0.json',
        bbox_file=f'{data_root}coco/person_detection_results/'
        'COCO_val2017_detections_AP_H_56_person.json',
        data_prefix=dict(img='coco/val2017/'),
        test_mode=True,
        pipeline=val_pipeline,
    ))
test_dataloader = val_dataloader

# hooks
default_hooks = dict(
    checkpoint=dict(
        save_best='coco-wholebody/AP', rule='greater', max_keep_ckpts=1))

custom_hooks = [
    dict(
        type='EMAHook',
        ema_type='ExpMomentumEMA',
        momentum=0.0002,
        update_buffers=True,
        priority=49),
    dict(
        type='mmdet.PipelineSwitchHook',
        switch_epoch=max_epochs - stage2_num_epochs,
        switch_pipeline=train_pipeline_stage2)
]

# evaluators
val_evaluator = dict(
    type='CocoWholeBodyMetric',
    ann_file=data_root + 'coco/annotations/coco_wholebody_val_v1.0.json')
test_evaluator = val_evaluator
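How this config pair is consumed is not shown in the commit, but a top-down DWPose pipeline presumably wraps the standard `mmdet`/`mmpose` 1.x loaders along these lines (a sketch, with `det_config`/`det_ckpt`/`pose_config`/`pose_ckpt` as in the README example; both init functions accept a local path or a URL for the checkpoint, and `person.jpg` is a placeholder input):

```python
from mmdet.apis import init_detector, inference_detector
from mmpose.apis import init_model, inference_topdown

detector = init_detector(det_config, det_ckpt, device="cuda:0")
pose_model = init_model(pose_config, pose_ckpt, device="cuda:0")

# detect people first, then run the pose model inside each kept box
det_result = inference_detector(detector, "person.jpg")
inst = det_result.pred_instances
keep = (inst.labels == 0) & (inst.scores > 0.5)  # class 0 = person in COCO
bboxes = inst.bboxes[keep].cpu().numpy()
pose_results = inference_topdown(pose_model, "person.jpg", bboxes)
```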