From 98c81c23eb4c21202c4bb4bf875cd10717995a12 Mon Sep 17 00:00:00 2001 From: LyqSpace Date: Wed, 8 Jun 2022 00:18:23 -0500 Subject: [PATCH] update README and data --- .gitignore | 14 +- README.md | 62 +++- est_waterlevel.py | 4 +- estimation/object_detection.py | 366 ++------------------- scripts/download_MeshTransformer_models.sh | 8 + scripts/inference_bodymesh.py | 332 +++++++++++++++++++ 6 files changed, 421 insertions(+), 365 deletions(-) create mode 100644 scripts/download_MeshTransformer_models.sh create mode 100644 scripts/inference_bodymesh.py diff --git a/.gitignore b/.gitignore index a0f5de6..999f12c 100644 --- a/.gitignore +++ b/.gitignore @@ -1,17 +1,9 @@ .idea/ - __pycache__/ - +vflood/ logs/ - -records/cp_WaterNet.pth.tar -output/ -output2/ -overlay/ records/ +MeshTransformer/ -video_module/logs/ -image_module/WaterSegModels/ - -env/ \ No newline at end of file +output/ \ No newline at end of file diff --git a/README.md b/README.md index 95888c3..3b011bf 100644 --- a/README.md +++ b/README.md @@ -2,17 +2,24 @@ This is an official PyTorch implementation for paper "V-FloodNet: A Video Segmentation System for Urban Flood Detection and Quantification". -## Environments +## 1 Environments + +### 1.1 Code and packages We developed and tested the source code under Ubuntu 18.04 and PyTorch framework. The following packages are required to run the code. -First, a python virtual environment is recommended. +First, git clone this repository +```bash +git clone https://github.com/xmlyqing00/V-FloodNet.git +``` + +Second, a python virtual environment is recommended. I use `pip` to create a virtual environment named `env` and activate it. Then, recursively pull the submodules code. ```shell -python3 -m venv env -source env/bin/activate +python3 -m venv vflood +source vflood/bin/activate git submodule update --init --recursive ``` @@ -23,7 +30,7 @@ In the virtual environment, install the following required packages from their o - [Detectron2](https://github.com/facebookresearch/detectron2) for reference objects segmentation. - [MeshTransformer](https://github.com/microsoft/MeshTransformer) for human detection and 3D mesh alignment. -We provide the corresponding installation command here +We provide the corresponding installation command here, you can replace the version number that fit your environment. ```shell pip install torch==1.8.2+cu111 torchvision==0.9.2+cu111 torchaudio==0.8.2 -f https://download.pytorch.org/whl/lts/1.8/torch_lts.html @@ -39,27 +46,56 @@ Then install the rest packages indicated in `requirements.txt` pip install -r requirements.txt ``` -## Usage +### 1.2 Pretrained Models -Download and extract the pretrained weights, and put them in the folder `./records/`. Weights and groundtruths are stored in [Google Drive](https://drive.google.com/file/d/1r0YmT24t4uMwi4xtSLXD5jyaMIuMzorS/view?usp=sharing). +First, run the following script to download the pretrained models of MeshTransformer +```bash +sh scripts/download_MeshTransformer_models.sh +``` -### Water Image Segmentation -Put the testing images in `image_folder`, then +Second, download SMPL model `mpips_smplify_public_v2.zip` from the official website [SMPLify](http://smplify.is.tue.mpg.de/). 
Extract it and place the model file `basicModel_neutral_lbs_10_207_0_v1.0.0.pkl` at `./MeshTransformer/metro/modeling/data`. + + + + +Third, download the archives from [Google Drive](https://drive.google.com/drive/folders/1DURwcb_qhBeWYznTrpJ-7yGJTHxm7pxC?usp=sharing). +Extract the pretrained models for water segmentation `records.zip` and put them in the folder `./records/`. +Extract the water dataset `WaterDataset` in any path, which includes the training images and testing videos. + + +## 2 Usage + +### 2.1 Water Image Segmentation +Put the testing images in a folder then ```shell python test_image_seg.py \ --test_path=/path/to/image_folder --test_name= ``` The default output folder is `output/segs/` -### Water Video Segmentation +### 2.2 Water Video Segmentation If your input is a video, we provide a script `scripts/cvt_video_to_imgs.py` to extract frames of the video. -Put the extracted frames in `frame_folder`, then +Put the extracted frames in a folder then ```shell python test_video_seg.py \ --test-path=/path/to/frame_folder --test-name= ``` -### Water Depth Estimation +### 2.3 Water Depth Estimation We provide three options `stopsign`, `people`, and `ref` for `--opt` to specify three types reference objects. ```shell @@ -71,5 +107,5 @@ For input video, to compare the estimated water level with the groundtruths in ` python cmp_hydrograph.py --test-name= ``` -## Copyright +## 3 Copyright This paper is submitted to Elsevier Journal Computers, Environment and Urban Systems under review. The corresponding author is Xin Li (Xin Li ). All rights are reserved. diff --git a/est_waterlevel.py b/est_waterlevel.py index bceef4d..a458611 100644 --- a/est_waterlevel.py +++ b/est_waterlevel.py @@ -20,7 +20,7 @@ def get_parser(): parser.add_argument('--out-dir', default='output/waterlevel', help='A file or directory to save output results.') parser.add_argument('--opt', type=str, - help='Estimation options.') + help='Estimation options. 
"people", "stopsign", or "ref"') return parser.parse_args() @@ -33,7 +33,7 @@ def main(args): out_dir = os.path.join(args.out_dir, f'{args.test_name}_{args.opt}') os.makedirs(out_dir, exist_ok=True) - if args.opt in ['skeleton', 'stopsign']: + if args.opt in ['people', 'stopsign']: est_by_obj_detection(img_list, water_mask_list, out_dir, args.opt) elif args.opt == 'ref': est_by_reference(img_list, water_mask_list, out_dir, args.test_name) diff --git a/estimation/object_detection.py b/estimation/object_detection.py index 9b81693..6a6de15 100644 --- a/estimation/object_detection.py +++ b/estimation/object_detection.py @@ -1,9 +1,7 @@ from tqdm import trange import cv2 import os -from detectron2.data import MetadataCatalog from detectron2.config import get_cfg -from detectron2.utils.visualizer import Visualizer from detectron2.engine.defaults import DefaultPredictor from detectron2.projects.point_rend import add_pointrend_config from detectron2.structures import Instances @@ -20,15 +18,15 @@ 'opts': ['MODEL.WEIGHTS', 'https://dl.fbaipublicfiles.com/detectron2/PointRend/InstanceSegmentation/pointrend_rcnn_X_101_32x8d_FPN_3x_coco/28119989/model_final_ba17b9.pkl'], 'conf_thres': 0.5, } -skeleton_config = { +people_config = { 'config_file': 'estimation/configs/COCO-Keypoints/keypoint_rcnn_R_101_FPN_3x.yaml', 'opts': ['MODEL.WEIGHTS', 'https://dl.fbaipublicfiles.com/detectron2/COCO-Keypoints/keypoint_rcnn_R_101_FPN_3x/138363331/model_final_997cc7.pkl'], 'conf_thres': 0.7, } stopsign_meta = { - 'size': 76.2, # 36 inch 91.44cm, 30inch, 76.2 - 'pole_height': 243.84, # 7 feet 213.36 cm, 8 feet 243.84cm + 'size': 79, # 75cm + 2 * 2cm (white border) = 79cm + 'pole_height': 215.9 # 85in = 215.9cm } people_meta = { @@ -39,7 +37,7 @@ object_colors = { 'background': [0, 0, 0], 'stopsign': [128, 128, 0], - 'skeleton': [0, 128, 128] + 'people': [0, 128, 128] } water_label_id = 1 @@ -56,7 +54,7 @@ def draw_instances(img: np.array, instances: Instances): return img -def waterdepth_by_stopsign2(img, instances, water_mask, result_dir, img_name): +def waterdepth_by_stopsign(img, instances, water_mask, result_dir, img_name): # Constants thickness = 6 @@ -69,10 +67,11 @@ def waterdepth_by_stopsign2(img, instances, water_mask, result_dir, img_name): degree_step = np.deg2rad(360 / pts_n) degree_pos = degree_step / 2 plate_radius = 50 - plate_center = (200, 100) - template_size = (400, 400) - template_plate_height = plate_radius * (2 ** 0.5) - template_pole_height = template_plate_height / stopsign_meta['size'] * stopsign_meta['pole_height'] + plate_center = (150, 75) + template_size = (400, 300) + template_plate_height = np.cos(degree_pos) * plate_radius + template_pole_height = 2 * template_plate_height / stopsign_meta['size'] * stopsign_meta['pole_height'] + # print(plate_radius, template_plate_height, template_pole_height) plate_pts = [] for i in range(pts_n): @@ -86,6 +85,8 @@ def waterdepth_by_stopsign2(img, instances, water_mask, result_dir, img_name): template_pole_bottom = template_pole_top.copy() template_pole_bottom[1] += template_pole_height template_pole_top, template_pole_bottom = template_pole_top.astype(int), template_pole_bottom.astype(int) + # print('pts', plate_pts) + # print('pole', template_pole_top, template_pole_bottom) template_canvas = np.ones((template_size) + (3,)) * 255 template_plate_pts = template_plate_pts.astype(int) @@ -182,8 +183,8 @@ def waterdepth_by_stopsign2(img, instances, 
water_mask, result_dir, img_name): template_pole_bottom_water[1] += (1 - submerged_ratio) * template_pole_height template_pole_top, template_pole_bottom_water = template_pole_top.astype(int), template_pole_bottom_water.astype(int) - template_pole_bottom_water_left = (template_size[0] // 4, template_pole_bottom_water[1]) - template_pole_bottom_water_right = (template_size[0] * 3 // 4, template_pole_bottom_water[1]) + template_pole_bottom_water_left = (template_size[1] // 4, template_pole_bottom_water[1]) + template_pole_bottom_water_right = (template_size[1] * 3 // 4, template_pole_bottom_water[1]) cv2.line(template_canvas, template_pole_bottom_water, template_pole_bottom, submerged_color, thickness) cv2.line(template_canvas, template_pole_bottom_water_left, template_pole_bottom_water_right, water_color, thickness) @@ -194,246 +195,13 @@ def waterdepth_by_stopsign2(img, instances, water_mask, result_dir, img_name): break - print(submerged_ratio, waterdepth) return submerged_ratio, waterdepth -def waterdepth_by_stopsign(img, instances, water_mask, viz_img): - - # Extract poles - img_gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) - img_grad = cv2.convertScaleAbs(cv2.Sobel(img_gray, cv2.CV_16S, 1, 0, ksize=3, scale=0.6)) - ret, img_edge = cv2.threshold(img_grad, 50, 255, cv2.THRESH_BINARY) - - min_line_len = 100 - max_line_gap = 20 - lines = cv2.HoughLinesP(img_edge, 1, np.pi / 180, 50, minLineLength=min_line_len, maxLineGap=max_line_gap) - if lines is None: - print('Cannot detect lines in the image. Estimation by stop sign fails.') - return [], None - - lines = lines.squeeze() - dir = (abs(lines[:, 0] - lines[:, 2]) + 1) / (abs(lines[:, 1] - lines[:, 3]) + 1) # dx/dy - lines_vert = lines[dir < 0.5] - lines_vec = myutils.normalize(lines_vert[:, 2:] - lines_vert[:, :2]) - - # viz - # for x1, y1, x2, y2 in lines_vert: - # cv2.line(img, (x1, y1), (x2, y2), (0, 255, 0), 2) - # cv2.imshow('img', img) - # cv2.imshow('grad', img_grad) - # cv2.imshow('edge', img_edge) - # cv2.waitKey() - - stopsign_d = [] - stopsign_pt = [] - stopsign_in_waters = [] - raw_data_list = [] - - for i in range(len(instances.pred_classes)): - if instances.pred_classes[i] != 11: # class index for stopsign - continue - - edge_map = cv2.Canny(instances.pred_masks[i].numpy().astype(np.uint8) * 255, 75, 200) - cnts, hierarchy = cv2.findContours(edge_map, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) - cnts = sorted(cnts, key=cv2.contourArea, reverse=True) - - peri = cv2.arcLength(cnts[0], True) - approx = cv2.approxPolyDP(cnts[0], 0.02 * peri, True) - if approx.shape[0] < 8: - continue - - # stopsign geo - pt_center = np.mean(approx, axis=0) - rank_y = np.argsort(approx[:, 0, 1], axis=0) - pt_top = np.mean(approx[rank_y[:2]], axis=0)[0] - pt_bottom = np.mean(approx[rank_y[-2:]], axis=0)[0] - rank_x = np.argsort(approx[:, 0, 0], axis=0) - pt_left = np.mean(approx[rank_x[:2]], axis=0)[0] - pt_right = np.mean(approx[rank_x[-2:]], axis=0)[0] - - stopsign_h = myutils.dist(pt_bottom, pt_top, axis=0) - stopsign_w = myutils.dist(pt_left, pt_right, axis=0) - - # stopsign_h = pt_bottom[1] - pt_top[1] - # stopsign_w = pt_right[0] - pt_left[0] - - # stopsign_vec0 = myutils.normalize(pt_center - lines_vert[:, :2]) - stopsign_vec1 = myutils.normalize(pt_center - pt_bottom).reshape(1, 2) - - # direction - # cos_sim0 = np.abs(np.multiply(lines_vec, stopsign_vec0).sum(axis=1)) - cos_sim1 = np.abs(np.multiply(lines_vec, stopsign_vec1).sum(axis=1)) - # lines_parallel = lines_vert[np.bitwise_and(cos_sim0 > 0.995, cos_sim1 > 0.995)] - xpd0 = 
np.bitwise_and(pt_left[0] <= lines_vert[:, 0], lines_vert[:, 0] <= pt_right[0]) - xpd1 = np.bitwise_and(pt_left[0] <= lines_vert[:, 2], lines_vert[:, 2] <= pt_right[0]) - lines_parallel = lines_vert[np.bitwise_and(cos_sim1 > 0.9, np.bitwise_or(xpd0, xpd1))] - - # position - lines_end_flag0 = lines_parallel[:, 1] >= pt_bottom[1] - lines_end_flag1 = lines_parallel[:, 3] >= pt_bottom[1] - lines_parallel = lines_parallel[np.bitwise_or(lines_end_flag0, lines_end_flag1)] - - # dist - dist0 = abs(lines_parallel[:, 1] - pt_bottom[1]) < stopsign_h * 3 - dist1 = abs(lines_parallel[:, 3] - pt_bottom[1]) < stopsign_h * 3 - poles = lines_parallel[np.bitwise_and(dist0, dist1)] - - # viz - # for x1, y1, x2, y2 in poles: - # cv2.line(img, (x1, y1), (x2, y2), (0, 255, 0), 2) - # cv2.imshow('img', img) - # tmp = edge_map.copy() - # cv2.drawContours(tmp, cnts, -1, 255, 3) - # cv2.imshow('tmp', tmp) - # cv2.drawContours(edge_map, approx, -1, 255, 3) - # cv2.imshow('edge', edge_map) - # cv2.waitKey() - - poles_bottom_arr = [] - thres_faraway = 5 * stopsign_h - for x1, y1, x2, y2 in poles: - - if y1 < y2: - if y2 - pt_bottom[1] < thres_faraway: - poles_bottom_arr.append([x2, y2]) - else: - if y1 - pt_bottom[1] < thres_faraway: - poles_bottom_arr.append([x1, y1]) - - if len(poles_bottom_arr) == 0: - continue - - poles_bottom_arr = np.array(poles_bottom_arr) - - # remove outliers - # poles_bottom_bias = abs(poles_bottom_arr - poles_bottom_arr.mean(axis=0)).sum(axis=1) - # poles_bottom_bias_std = poles_bottom_bias.min() * 2 - # poles_bottom_arr = poles_bottom_arr[poles_bottom_bias < poles_bottom_bias_std] - - # select topk - # d = myutils.dist(poles_bottom_arr, pt_bottom.reshape(1, 2), axis=1) - # rank_d = np.argsort(d) - # topk = len(poles_bottom_arr) // 2 - # poles_bottom_arr = poles_bottom_arr[rank_d[topk:]] - - poles_bottom_pt = poles_bottom_arr.mean(axis=0).astype(np.int32) - - dx = (poles_bottom_pt[0] - pt_bottom[0]) / (poles_bottom_pt[1] - pt_bottom[1]) - pole_x, pole_y = poles_bottom_pt[0], poles_bottom_pt[1] - for y in range(poles_bottom_pt[1], water_mask.shape[0]): - if water_mask[y][np.round(pole_x).astype(np.int32)] == water_label_id: - pole_y = y - break - else: - pole_x += dx - - poles_bottom_pt = np.array([pole_x, pole_y]) - poles_bottom_d = myutils.dist(poles_bottom_pt, pt_bottom, axis=0) - # cos_ratio = (poles_bottom_pt[1] - pt_bottom[1]) / poles_bottom_d - raw_data_list.append({ - 'pole_top': (*pt_bottom, 1), - 'pole_bottom': (*poles_bottom_pt, 1) - }) - - # print(poles_bottom_pt) - - px2cm = stopsign_meta['size'] / stopsign_h - pole_h_cm = px2cm * poles_bottom_d - # pole_h_cm = pole_d_cm * cos_ratio - - stopsign_in_water = max(0, stopsign_meta['height_urban'] - pole_h_cm) - stopsign_in_waters.append(stopsign_in_water) - print('Est stopsign in water', stopsign_in_water) - - stopsign_pt.append(poles_bottom_pt) - stopsign_d.append(stopsign_in_water) - - # viz - cv2.line(viz_img, tuple(pt_bottom.astype(np.int)), tuple(pt_top.astype(np.int)), (0, 200, 0), 2) - cv2.line(viz_img, tuple(pt_left.astype(np.int)), tuple(pt_right.astype(np.int)), (0, 200, 0), 2) - cv2.line(viz_img, tuple(poles_bottom_pt.astype(np.int)), tuple(pt_bottom.astype(np.int)), (0, 0, 200), 2) - - text_pos = pt_bottom.astype(np.int) - text_pos[0] = max(0, text_pos[0] - 300) - text_pos[1] = max(0, text_pos[1] + 100) - text = f'Depth {stopsign_in_water:.1f}cm' - cv2.putText(viz_img, text, text_pos, cv2.FONT_HERSHEY_SIMPLEX, 1.2, (0, 0, 200), thickness=3) - # cv2.imshow('viz_img', viz_img) - # cv2.waitKey() - - # h, w = 
pred_masks[0].shape[:2] - # depth = self.calc_depth(stopsign_pt, stopsign_d, h, w) - # self.viz_dict['viz_img'] = viz_img - - return stopsign_in_waters, viz_img, raw_data_list - - -def waterdepth_by_skeleton(pred_keypoints, water_mask, keypoint_names, viz_img): - - key_centers = [] - key_depths = [] - thres_keypoint = 0 # 0.05 - bottom_region_size = 15 - bottom_region_area = 2 * (bottom_region_size ** 2) - water_thres = 0.05 - - raw_data_list = [] - for keypoints_per_instance in pred_keypoints: - - max_depth_keypoint_name = None - max_depth_x = 0 - max_depth_y = 0 - max_depth = 200 - - raw_data_dict = {} - for i, keypoint in enumerate(keypoints_per_instance): - x, y, prob = keypoint - raw_data_dict[keypoint_names[i]] = (x.item(), y.item(), prob.item()) - - # if prob < thres_keypoint: - # continue - - # x, y = int(x), int(y) - # bottom_region_l = x - bottom_region_size - # bottom_region_r = x + bottom_region_size - # bottom_region_t = y - bottom_region_size - # bottom_region_b = y + bottom_region_size - # bottom_region = water_mask[bottom_region_t:bottom_region_b, bottom_region_l:bottom_region_r] - # - # water_ratio = bottom_region.sum() / bottom_region_area - - # print(bottom_region.shape, water_ratio, self.keypoint_names[i]) - - # if water_ratio < water_thres: - # continue - - # if water_mask: - # cv2.circle(viz_img, (int(x.item()), int(y.item())), radius=2, color=(0, 200, 0), thickness=2) - - if not max_depth_keypoint_name or (max_depth > skeleton_meta[keypoint_names[i]]): - max_depth_keypoint_name = keypoint_names[i] - max_depth_x = x - max_depth_y = y - max_depth = skeleton_meta[keypoint_names[i]] - - raw_data_list.append(raw_data_dict) - - if max_depth_keypoint_name and water_mask is not None: - # key_centers.append([water_depth_x, water_depth_y]) - key_depths.append(max_depth) - - text_pos = (max(0, int(max_depth_x - 250)), max(0, int(max_depth_y - 25))) - text = f'{max_depth_keypoint_name}: Depth {max_depth:.1f}cm' - cv2.putText(viz_img, text, text_pos, cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 0, 200), thickness=2) - print('Est people in water', max_depth_keypoint_name, f'depth {max_depth}cm', 'pos', max_depth_x, max_depth_y) - - return key_depths, viz_img, raw_data_list - - -def waterdepth_by_skeleton2(instances, img, water_mask, out_dir, img_name): +def waterdepth_by_people(instances, img, water_mask, out_dir, img_name): img_h, img_w, img_c = img.shape + scale_ratio = 1.5 for person_idx, pred_box in enumerate(instances.pred_boxes): @@ -443,8 +211,8 @@ def waterdepth_by_skeleton2(instances, img, water_mask, out_dir, img_name): x1, y1, x2, y2 = pred_box.numpy().tolist() center_x, center_y = (x1 + x2) / 2, (y1 + y2) / 2 - bbox_w = 1.5 * (x2 - x1) - bbox_h = 1.5 * (y2 - y1) + bbox_w = scale_ratio * (x2 - x1) + bbox_h = scale_ratio * (y2 - y1) radius = max(bbox_w, bbox_h) radius = min(min(img_h, img_w), radius) / 2 @@ -493,29 +261,6 @@ def predict_boundary(y1: np.array, y2: np.array, resolution): # print(y2_bottom, y1_top) boundary = (y2_bottom + y1_top) // 2 - # Option2, Naive Bayesian Gaussian Distribution - # pi1 = len(y1) / (len(y1) + len(y2)) - # pi2 = len(y2) / (len(y1) + len(y2)) - # pi1 = pi2 = 1 - # mu1, sigma1 = y1.mean(), y1.std() - # mu2, sigma2 = y2.mean(), y2.std() - # sigma12, sigma22 = sigma1 ** 2, sigma2 ** 2 - # - # a = sigma22 - sigma12 - # b = 2 * (mu2 * sigma12 - mu1 * sigma22) - # c = mu1 ** 2 * sigma22 - mu2 ** 2 * sigma22 - 2 * sigma12 * sigma22 * np.log((pi1 * sigma2) / (pi2 * sigma1)) - # - # delta = np.sqrt(b ** 2 - 4 * a * c) - # x1 = (-b - delta) / (2 * a) - # x2 
= (-b + delta) / (2 * a) - # print(x1, x2) - # if 0 < x1 < resolution: - # boundary = x1 - # elif 0 < x2 < resolution: - # boundary = x2 - # else: - # raise ValueError('Can\'t estimate boundary.') - if np.isnan(boundary): return np.NaN, None else: @@ -526,8 +271,8 @@ def est_by_obj_detection(img_list, water_mask_list, out_dir, opt): if opt == 'stopsign': user_config = stopsign_config - elif opt == 'skeleton': - user_config = skeleton_config + elif opt == 'people': + user_config = people_config else: raise NotImplementedError(opt) @@ -543,7 +288,6 @@ def est_by_obj_detection(img_list, water_mask_list, out_dir, opt): cfg.MODEL.PANOPTIC_FPN.COMBINE.INSTANCES_CONFIDENCE_THRESH = user_config['conf_thres'] cfg.freeze() - metadata = MetadataCatalog.get(cfg.DATASETS.TEST[0]) det_model = DefaultPredictor(cfg) waterdepth_list = [] @@ -564,49 +308,24 @@ def est_by_obj_detection(img_list, water_mask_list, out_dir, opt): with torch.no_grad(): pred_obj = det_model(img) - # visualizer = Visualizer(img, metadata) instances = pred_obj['instances'].to(torch.device('cpu')) - # if opt == 'stopsign': - # # viz_img = draw_instances(img, instances) - # viz_img = img - # # visualizer.draw_instance_predictions(predictions=instances) - # else: - # for keypoints_per_instance in instances.pred_keypoints: - # visualizer.draw_and_connect_keypoints(keypoints_per_instance) - # viz_img = visualizer.output.get_image() - # - # if water_mask is not None: - # viz_img = myutils.add_overlay(viz_img, water_mask, myutils.color_palette) - if opt == 'stopsign': - submerge_ratio, waterdepth = waterdepth_by_stopsign2(img, instances, water_mask, result_dir, img_name) - # raw_data = { - # 'instances': raw_data_list, - # 'connection_rules': [('pole_top', 'pole_bottom', (100, 100, 100))] - # } + submerge_ratio, waterdepth = waterdepth_by_stopsign(img, instances, water_mask, result_dir, img_name) waterdepth_list.append((submerge_ratio, waterdepth)) else: - waterdepth_by_skeleton2(instances, img, water_mask, out_dir, img_name) - # raw_data = { - # 'instances': raw_data_list, - # 'connection_rules': metadata.get('keypoint_connection_rules') - # } - # cv2.imwrite(os.path.join(out_dir, f'{img_name}.png'), viz_img) - - # pred_res_path = os.path.join(out_dir, img_name + '.json') - # with open(pred_res_path, 'w') as f: - # json.dump(raw_data, f) + waterdepth_by_people(instances, img, water_mask, out_dir, img_name) + if opt == 'stopsign': with open(os.path.join(out_dir, f'waterdepth.txt'), 'w') as f: for i in trange(len(img_list)): img_name = os.path.basename(img_list[i])[:-4] f.write(f'{img_name}\t{waterdepth_list[i][0]:.4f}\t{waterdepth_list[i][1]:.4f}\n') - elif opt == 'skeleton': + elif opt == 'people': - cmd_str = f'cd /Ship03/Sources/MeshTransformer/ && ' \ - f'python3.8 /Ship03/Sources/MeshTransformer/metro/tools/inference_bodymesh.py ' \ + cmd_str = f'cd ./MeshTransformer/ && ' \ + f'python3.8 ./metro/tools/inference_bodymesh.py ' \ f'--resume_checkpoint=./models/metro_release/metro_3dpw_state_dict.bin ' \ f'--image_file_or_path={os.path.abspath(out_dir)}/input/' print('Execute', cmd_str) @@ -648,9 +367,7 @@ def est_by_obj_detection(img_list, water_mask_list, out_dir, opt): template_2d_under_water = template_3d[label_under_water] template_2d_above_water = template_3d[label_above_water] - # pred_2d_under_water = pred_2d_under_water[np.argsort(pred_2d_under_water[:, 1])] for j in range(pred_2d_under_water.shape[0]): - # cv2.circle(img, pred_2d_under_water[j], 0, [0, 0, 200], 1, lineType=cv2.FILLED) cv2.circle(canvas_est, 
pred_2d_under_water[j], 0, [0, 0, 200], 2, lineType=cv2.FILLED) water_boundary, under_water_indices = predict_boundary(template_2d_under_water[:, 1], template_2d_above_water[:, 1], resolution) @@ -658,10 +375,8 @@ def est_by_obj_detection(img_list, water_mask_list, out_dir, opt): warnings.warn('Cannot estimate the water boundary.') else: submerge_ratio = 1 - (water_boundary - template_3d_top) / template_3d_height - print(img_name, 'Estimate water boundary', water_boundary, f'submerge ratio {submerge_ratio:.3f}') + # print(img_name, 'Estimate water boundary', water_boundary, f'submerge ratio {submerge_ratio:.3f}') submerge_ratio_list.append(submerge_ratio) - # with open(os.path.join(result_dir, f'{img_name}_waterdepth.txt'), 'w') as f: - # f.write(str(submerge_ratio)) water_boundary_left = (int(resolution * 0.25), water_boundary) water_boundary_right = (int(resolution * 0.75), water_boundary) @@ -671,11 +386,6 @@ def est_by_obj_detection(img_list, water_mask_list, out_dir, opt): for j in range(template_2d_under_water.shape[0]): cv2.circle(canvas_template, (template_2d_under_water[j][0], template_2d_under_water[j][1]), 0, [0, 0, 200], 2, lineType=cv2.FILLED) - # cv2.imshow('img', img) - # cv2.imshow('canvas_est', canvas_est) - # cv2.imshow('canvas_template', canvas_template) - # cv2.imshow('overlay', overlay) - # cv2.waitKey() cv2.imwrite(os.path.join(result_dir, f'{img_name}_est.png'), canvas_est) cv2.imwrite(os.path.join(result_dir, f'{img_name}_template.png'), canvas_template) cv2.imwrite(os.path.join(result_dir, f'{img_name}_overlay.png'), overlay) @@ -685,25 +395,3 @@ def est_by_obj_detection(img_list, water_mask_list, out_dir, opt): img_name = os.path.basename(img_list[i])[:-4] waterdepth = submerge_ratio_list[i] * people_meta['man_height'] f.write(f'{img_name}\t{submerge_ratio_list[i]:.4f}\t{waterdepth:.4f}\n') - - # - # def calc_depth(self, key_centers, key_depths, h, w): - # if len(key_centers) == 0: - # return None - # elif len(key_centers) == 1: - # depth = np.ones((h, w)) * key_depths[0] - # return depth - # else: - # key_centers = np.array(key_centers) - # key_depths = np.array(key_depths) - # - # p = np.stack(np.meshgrid(np.arange(w), np.arange(h)), axis=2).reshape(-1, 2) - # - # d = cdist(p, key_centers, 'euclidean') - # d = np.exp(-d / self.d_var) - - # d = d / d.sum(axis=1, keepdims=True) - # - # depth = np.multiply(d, key_depths).sum(axis=1).reshape(h, w) - # - # return depth \ No newline at end of file diff --git a/scripts/download_MeshTransformer_models.sh b/scripts/download_MeshTransformer_models.sh new file mode 100644 index 0000000..86b96ca --- /dev/null +++ b/scripts/download_MeshTransformer_models.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +cd MeshTransformer +mkdir -p models +bash scripts/download_models.sh +cd .. + +cp scripts/inference_bodymesh.py MeshTransformer/metro/tools/ \ No newline at end of file diff --git a/scripts/inference_bodymesh.py b/scripts/inference_bodymesh.py new file mode 100644 index 0000000..badd1de --- /dev/null +++ b/scripts/inference_bodymesh.py @@ -0,0 +1,332 @@ +""" +Copyright (c) Microsoft Corporation. +Licensed under the MIT license. 
+ +End-to-end inference codes for +3D human body mesh reconstruction from an image +""" + +from __future__ import absolute_import, division, print_function +import argparse +import os +import os.path as op +import code +import json +import torch +import torchvision.models as models +from torchvision.utils import make_grid +import numpy as np +import cv2 +from tqdm import tqdm +from metro.modeling.bert import BertConfig, METRO +from metro.modeling.bert import METRO_Body_Network as METRO_Network +from metro.modeling._smpl import SMPL, Mesh +from metro.modeling.hrnet.hrnet_cls_net_featmaps import get_cls_net +from metro.modeling.hrnet.config import config as hrnet_config +from metro.modeling.hrnet.config import update_config as hrnet_update_config +import metro.modeling.data.config as cfg + +from metro.utils.renderer import Renderer, visualize_reconstruction, visualize_reconstruction_test, \ + visualize_reconstruction_no_text, visualize_reconstruction_and_att_local +from metro.utils.geometric_layers import orthographic_projection +from metro.utils.logger import setup_logger +from metro.utils.miscellaneous import mkdir, set_seed + +from PIL import Image +from torchvision import transforms + +transform = transforms.Compose([ + transforms.Resize(224), + transforms.CenterCrop(224), + transforms.ToTensor(), + transforms.Normalize( + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225])]) + +transform_visualize = transforms.Compose([ + transforms.Resize(224), + transforms.CenterCrop(224), + transforms.ToTensor()]) + + +def run_inference(args, image_list, _metro_network, smpl, renderer, mesh_sampler): + # switch to evaluate mode + _metro_network.eval() + + mesh_dir = os.path.dirname(image_list[0]) + image_list = sorted(image_list) + + for idx, image_file in enumerate(tqdm(image_list)): + if 'pred' not in image_file: + att_all = [] + img = Image.open(image_file) + img_tensor = transform(img) + img_visual = transform_visualize(img) + + batch_imgs = torch.unsqueeze(img_tensor, 0).cuda() + batch_visual_imgs = torch.unsqueeze(img_visual, 0).cuda() + # forward-pass + pred_camera, pred_3d_joints, pred_vertices_sub2, pred_vertices_sub, pred_vertices, hidden_states, att = _metro_network( + batch_imgs, smpl, mesh_sampler) + + # obtain 3d joints from full mesh + pred_3d_joints_from_smpl = smpl.get_h36m_joints(pred_vertices) + + pred_3d_pelvis = pred_3d_joints_from_smpl[:, cfg.H36M_J17_NAME.index('Pelvis'), :] + pred_3d_joints_from_smpl = pred_3d_joints_from_smpl[:, cfg.H36M_J17_TO_J14, :] + pred_3d_joints_from_smpl = pred_3d_joints_from_smpl - pred_3d_pelvis[:, None, :] + pred_vertices = pred_vertices - pred_3d_pelvis[:, None, :] + + # save attantion + att_max_value = att[-1] + att_cpu = np.asarray(att_max_value.cpu().detach()) + att_all.append(att_cpu) + + # obtain 3d joints, which are regressed from the full mesh + pred_3d_joints_from_smpl = smpl.get_h36m_joints(pred_vertices) + pred_3d_joints_from_smpl = pred_3d_joints_from_smpl[:, cfg.H36M_J17_TO_J14, :] + # obtain 2d joints, which are projected from 3d joints of smpl mesh + pred_2d_joints_from_smpl = orthographic_projection(pred_3d_joints_from_smpl, pred_camera) + pred_2d_431_vertices_from_smpl = orthographic_projection(pred_vertices_sub2, pred_camera) + visual_imgs_att = visualize_mesh_and_attention(renderer, batch_visual_imgs[0], + pred_vertices[0].detach(), + pred_vertices_sub2[0].detach(), + pred_2d_431_vertices_from_smpl[0].detach(), + pred_2d_joints_from_smpl[0].detach(), + pred_camera.detach(), + att[-1][0].detach()) + + visual_imgs = 
visual_imgs_att.transpose(1, 2, 0) + visual_imgs = np.asarray(visual_imgs) + + temp_fname = image_file[:-4] + '_metro_pred.jpg' + # print('Save img to ', temp_fname) + cv2.imwrite(temp_fname, np.asarray(visual_imgs[:, -225:, ::-1] * 255)) + + pred_2d = pred_2d_431_vertices_from_smpl[0].detach().cpu().numpy().tolist() + pred_result_path = image_file[:-4] + '_pred.txt' + # print('Save predictions to', pred_result_path) + with open(pred_result_path, 'w') as f: + json.dump(pred_2d, f) + + return + + +def visualize_mesh_and_attention(renderer, images, + pred_vertices_full, + pred_vertices, + pred_2d_vertices, + pred_2d_joints, + pred_camera, + attention): + """Tensorboard logging.""" + + img = images.cpu().numpy().transpose(1, 2, 0) + # Get predict vertices for the particular example + vertices_full = pred_vertices_full.cpu().numpy() + vertices = pred_vertices.cpu().numpy() + vertices_2d = pred_2d_vertices.cpu().numpy() + joints_2d = pred_2d_joints.cpu().numpy() + cam = pred_camera.cpu().numpy() + att = attention.cpu().numpy() + # Visualize reconstruction and attention + rend_img = visualize_reconstruction_and_att_local(img, 224, vertices_full, vertices, vertices_2d, cam, renderer, + joints_2d, att, color='pink') + rend_img = rend_img.transpose(2, 0, 1) + + return rend_img + + +def visualize_mesh_no_text(renderer, + images, + pred_vertices, + pred_camera): + """Tensorboard logging.""" + img = images.cpu().numpy().transpose(1, 2, 0) + # Get predict vertices for the particular example + vertices = pred_vertices.cpu().numpy() + cam = pred_camera.cpu().numpy() + # Visualize reconstruction only + rend_img = visualize_reconstruction_no_text(img, 224, vertices, cam, renderer, color='hand') + rend_img = rend_img.transpose(2, 0, 1) + return rend_img + + +def parse_args(): + parser = argparse.ArgumentParser() + ######################################################### + # Data related arguments + ######################################################### + parser.add_argument("--image_file_or_path", default='./test_images/human-body', type=str, + help="test data") + ######################################################### + # Loading/saving checkpoints + ######################################################### + parser.add_argument("--model_name_or_path", default='metro/modeling/bert/bert-base-uncased/', type=str, + required=False, + help="Path to pre-trained transformer model or model type.") + parser.add_argument("--resume_checkpoint", default=None, type=str, required=False, + help="Path to specific checkpoint for inference.") + parser.add_argument("--output_dir", default='output/', type=str, required=False, + help="The output directory to save checkpoint and test results.") + ######################################################### + # Model architectures + ######################################################### + parser.add_argument('-a', '--arch', default='hrnet-w64', + help='CNN backbone architecture: hrnet-w64, hrnet, resnet50') + parser.add_argument("--num_hidden_layers", default=4, type=int, required=False, + help="Update model config if given") + parser.add_argument("--hidden_size", default=-1, type=int, required=False, + help="Update model config if given") + parser.add_argument("--num_attention_heads", default=4, type=int, required=False, + help="Update model config if given. 
Note that the division of " + "hidden_size / num_attention_heads should be in integer.") + parser.add_argument("--intermediate_size", default=-1, type=int, required=False, + help="Update model config if given.") + parser.add_argument("--input_feat_dim", default='2051,512,128', type=str, + help="The Image Feature Dimension.") + parser.add_argument("--hidden_feat_dim", default='1024,256,128', type=str, + help="The Image Feature Dimension.") + parser.add_argument("--legacy_setting", default=True, action='store_true', ) + ######################################################### + # Others + ######################################################### + parser.add_argument("--device", type=str, default='cuda', + help="cuda or cpu") + parser.add_argument('--seed', type=int, default=88, + help="random seed for initialization.") + + args = parser.parse_args() + return args + + +def main(args): + global logger + # Setup CUDA, GPU & distributed training + args.num_gpus = int(os.environ['WORLD_SIZE']) if 'WORLD_SIZE' in os.environ else 1 + args.distributed = args.num_gpus > 1 + args.device = torch.device(args.device) + + mkdir(args.output_dir) + logger = setup_logger("METRO Inference", args.output_dir, 0) + set_seed(args.seed, args.num_gpus) + logger.info("Using {} GPUs".format(args.num_gpus)) + + # Mesh and SMPL utils + mesh_smpl = SMPL().to(args.device) + mesh_sampler = Mesh() + # Renderer for visualization + renderer = Renderer(faces=mesh_smpl.faces.cpu().numpy()) + # Load pretrained model + logger.info("Inference: Loading from checkpoint {}".format(args.resume_checkpoint)) + + if args.resume_checkpoint != None and args.resume_checkpoint != 'None' and 'state_dict' not in args.resume_checkpoint: + logger.info("Evaluation: Loading from checkpoint {}".format(args.resume_checkpoint)) + _metro_network = torch.load(args.resume_checkpoint) + else: + # Build model from scratch, and load weights from state_dict.bin + trans_encoder = [] + input_feat_dim = [int(item) for item in args.input_feat_dim.split(',')] + hidden_feat_dim = [int(item) for item in args.hidden_feat_dim.split(',')] + output_feat_dim = input_feat_dim[1:] + [3] + # init three transformer encoders in a loop + for i in range(len(output_feat_dim)): + config_class, model_class = BertConfig, METRO + config = config_class.from_pretrained(args.model_name_or_path) + + config.output_attentions = False + config.img_feature_dim = input_feat_dim[i] + config.output_feature_dim = output_feat_dim[i] + args.hidden_size = hidden_feat_dim[i] + + if args.legacy_setting == True: + # During our paper submission, we were using the original intermediate size, which is 3072 fixed + # We keep our legacy setting here + args.intermediate_size = -1 + else: + # We have recently tried to use an updated intermediate size, which is 4*hidden-size. 
+ # But we didn't find significant performance changes on Human3.6M (~36.7 PA-MPJPE) + args.intermediate_size = int(args.hidden_size * 4) + + # update model structure if specified in arguments + update_params = ['num_hidden_layers', 'hidden_size', 'num_attention_heads', 'intermediate_size'] + + for idx, param in enumerate(update_params): + arg_param = getattr(args, param) + config_param = getattr(config, param) + if arg_param > 0 and arg_param != config_param: + logger.info("Update config parameter {}: {} -> {}".format(param, config_param, arg_param)) + setattr(config, param, arg_param) + + # init a transformer encoder and append it to a list + assert config.hidden_size % config.num_attention_heads == 0 + model = model_class(config=config) + logger.info("Init model from scratch.") + trans_encoder.append(model) + + # init ImageNet pre-trained backbone model + if args.arch == 'hrnet': + hrnet_yaml = 'models/hrnet/cls_hrnet_w40_sgd_lr5e-2_wd1e-4_bs32_x100.yaml' + hrnet_checkpoint = 'models/hrnet/hrnetv2_w40_imagenet_pretrained.pth' + hrnet_update_config(hrnet_config, hrnet_yaml) + backbone = get_cls_net(hrnet_config, pretrained=hrnet_checkpoint) + logger.info('=> loading hrnet-v2-w40 model') + elif args.arch == 'hrnet-w64': + hrnet_yaml = 'models/hrnet/cls_hrnet_w64_sgd_lr5e-2_wd1e-4_bs32_x100.yaml' + hrnet_checkpoint = 'models/hrnet/hrnetv2_w64_imagenet_pretrained.pth' + hrnet_update_config(hrnet_config, hrnet_yaml) + backbone = get_cls_net(hrnet_config, pretrained=hrnet_checkpoint) + logger.info('=> loading hrnet-v2-w64 model') + else: + print("=> using pre-trained model '{}'".format(args.arch)) + backbone = models.__dict__[args.arch](pretrained=True) + # remove the last fc layer + backbone = torch.nn.Sequential(*list(backbone.children())[:-2]) + + trans_encoder = torch.nn.Sequential(*trans_encoder) + total_params = sum(p.numel() for p in trans_encoder.parameters()) + logger.info('Transformers total parameters: {}'.format(total_params)) + backbone_total_params = sum(p.numel() for p in backbone.parameters()) + logger.info('Backbone total parameters: {}'.format(backbone_total_params)) + + # build end-to-end METRO network (CNN backbone + multi-layer transformer encoder) + _metro_network = METRO_Network(args, config, backbone, trans_encoder, mesh_sampler) + + logger.info("Loading state dict from checkpoint {}".format(args.resume_checkpoint)) + cpu_device = torch.device('cpu') + state_dict = torch.load(args.resume_checkpoint, map_location=cpu_device) + _metro_network.load_state_dict(state_dict, strict=False) + del state_dict + + # update configs to enable attention outputs + setattr(_metro_network.trans_encoder[-1].config, 'output_attentions', True) + setattr(_metro_network.trans_encoder[-1].config, 'output_hidden_states', True) + _metro_network.trans_encoder[-1].bert.encoder.output_attentions = True + _metro_network.trans_encoder[-1].bert.encoder.output_hidden_states = True + for iter_layer in range(4): + _metro_network.trans_encoder[-1].bert.encoder.layer[iter_layer].attention.self.output_attentions = True + for inter_block in range(3): + setattr(_metro_network.trans_encoder[-1].config, 'device', args.device) + + _metro_network.to(args.device) + logger.info("Run inference") + + image_list = [] + if not args.image_file_or_path: + raise ValueError("image_file_or_path not specified") + if op.isfile(args.image_file_or_path): + image_list = [args.image_file_or_path] + elif op.isdir(args.image_file_or_path): + # should be a path with images only + for filename in os.listdir(args.image_file_or_path): 
+            if (filename.endswith(".png") or filename.endswith(".jpg")) and 'pred' not in filename:
+                image_list.append(args.image_file_or_path + '/' + filename)
+    else:
+        raise ValueError("Cannot find images at {}".format(args.image_file_or_path))
+
+    run_inference(args, image_list, _metro_network, mesh_smpl, renderer, mesh_sampler)
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    main(args)
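
Note (appended for clarity; not part of the patch): for the `people` option, `est_by_obj_detection` writes one `<image>\t<submerge_ratio>\t<waterdepth>` line per frame to `waterdepth.txt`, with `waterdepth = submerge_ratio * people_meta['man_height']`. The sketch below illustrates that conversion only; the 175 cm height and the 0.3 ratio are placeholder values (the real `man_height` is defined elsewhere in `object_detection.py`), and using the 215.9 cm stop-sign pole from `stopsign_meta` as the analogous reference length is an assumption, since the `stopsign` branch computes its depth inside `waterdepth_by_stopsign`.

```python
# Illustrative sketch of the submerge-ratio -> water-depth conversion behind waterdepth.txt.
# REFERENCE heights and DEMO_RATIO are placeholders, not values produced by this patch.

STOPSIGN_POLE_HEIGHT_CM = 215.9  # stopsign_meta['pole_height'] (85 in) in object_detection.py
PERSON_HEIGHT_CM = 175.0         # placeholder for people_meta['man_height']


def water_depth_cm(submerge_ratio: float, reference_height_cm: float) -> float:
    """Depth = fraction of the reference object below water x its full real-world height."""
    return submerge_ratio * reference_height_cm


if __name__ == "__main__":
    DEMO_RATIO = 0.3  # e.g. 30% of the reference object is estimated to be submerged
    print(f"stop-sign pole: {water_depth_cm(DEMO_RATIO, STOPSIGN_POLE_HEIGHT_CM):.1f} cm")
    print(f"person:         {water_depth_cm(DEMO_RATIO, PERSON_HEIGHT_CM):.1f} cm")
```

The per-frame quantity that is logged is the unit-free submergence ratio; it only becomes a depth in centimetres once multiplied by the known height of the detected reference object.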