From 98c81c23eb4c21202c4bb4bf875cd10717995a12 Mon Sep 17 00:00:00 2001 From: LyqSpace Date: Wed, 8 Jun 2022 00:18:23 -0500 Subject: [PATCH] update README and data --- .gitignore | 14 +- README.md | 62 +++- est_waterlevel.py | 4 +- estimation/object_detection.py | 366 ++------------------- scripts/download_MeshTransformer_models.sh | 8 + scripts/inference_bodymesh.py | 332 +++++++++++++++++++ 6 files changed, 421 insertions(+), 365 deletions(-) create mode 100644 scripts/download_MeshTransformer_models.sh create mode 100644 scripts/inference_bodymesh.py diff --git a/.gitignore b/.gitignore index a0f5de6..999f12c 100644 --- a/.gitignore +++ b/.gitignore @@ -1,17 +1,9 @@ .idea/ - __pycache__/ - +vflood/ logs/ - -records/cp_WaterNet.pth.tar -output/ -output2/ -overlay/ records/ +MeshTransformer/ -video_module/logs/ -image_module/WaterSegModels/ - -env/ \ No newline at end of file +output/ \ No newline at end of file diff --git a/README.md b/README.md index 95888c3..3b011bf 100644 --- a/README.md +++ b/README.md @@ -2,17 +2,24 @@ This is an official PyTorch implementation for paper "V-FloodNet: A Video Segmentation System for Urban Flood Detection and Quantification". -## Environments +## 1 Environments + +### 1.1 Code and packages We developed and tested the source code under Ubuntu 18.04 and PyTorch framework. The following packages are required to run the code. -First, a python virtual environment is recommended. +First, git clone this repository +```bash +git clone https://github.com/xmlyqing00/V-FloodNet.git +``` + +Second, a python virtual environment is recommended. I use `pip` to create a virtual environment named `env` and activate it. Then, recursively pull the submodules code. ```shell -python3 -m venv env -source env/bin/activate +python3 -m venv vflood +source vflood/bin/activate git submodule update --init --recursive ``` @@ -23,7 +30,7 @@ In the virtual environment, install the following required packages from their o - [Detectron2](https://github.com/facebookresearch/detectron2) for reference objects segmentation. - [MeshTransformer](https://github.com/microsoft/MeshTransformer) for human detection and 3D mesh alignment. -We provide the corresponding installation command here +We provide the corresponding installation command here, you can replace the version number that fit your environment. ```shell pip install torch==1.8.2+cu111 torchvision==0.9.2+cu111 torchaudio==0.8.2 -f https://download.pytorch.org/whl/lts/1.8/torch_lts.html @@ -39,27 +46,56 @@ Then install the rest packages indicated in `requirements.txt` pip install -r requirements.txt ``` -## Usage +### 1.2 Pretrained Models -Download and extract the pretrained weights, and put them in the folder `./records/`. Weights and groundtruths are stored in [Google Drive](https://drive.google.com/file/d/1r0YmT24t4uMwi4xtSLXD5jyaMIuMzorS/view?usp=sharing). +First, run the following script to download the pretrained models of MeshTransformer +```bash +sh scripts/download_MeshTransformer_models.sh +``` -### Water Image Segmentation -Put the testing images in `image_folder`, then +Second, download SMPL model `mpips_smplify_public_v2.zip` from the official website [SMPLify](http://smplify.is.tue.mpg.de/). 
Extract it and place the model file `basicModel_neutral_lbs_10_207_0_v1.0.0.pkl` at `./MeshTransformer/metro/modeling/data`. + + + + +Third, download the archives from [Google Drive](https://drive.google.com/drive/folders/1DURwcb_qhBeWYznTrpJ-7yGJTHxm7pxC?usp=sharing). +Extract the pretrained models for water segmentation `records.zip` and put them in the folder `./records/`. +Extract the water dataset `WaterDataset` in any path, which includes the training images and testing videos. + + +## 2 Usage + +### 2.1 Water Image Segmentation +Put the testing images in a folder then ```shell python test_image_seg.py \ --test_path=/path/to/image_folder --test_name= ``` The default output folder is `output/segs/` -### Water Video Segmentation +### 2.2 Water Video Segmentation If your input is a video, we provide a script `scripts/cvt_video_to_imgs.py` to extract frames of the video. -Put the extracted frames in `frame_folder`, then +Put the extracted frames in a folder then ```shell python test_video_seg.py \ --test-path=/path/to/frame_folder --test-name= ``` -### Water Depth Estimation +### 2.3 Water Depth Estimation We provide three options `stopsign`, `people`, and `ref` for `--opt` to specify three types reference objects. ```shell @@ -71,5 +107,5 @@ For input video, to compare the estimated water level with the groundtruths in ` python cmp_hydrograph.py --test-name= ``` -## Copyright +## 3 Copyright This paper is submitted to Elsevier Journal Computers, Environment and Urban Systems under review. The corresponding author is Xin Li (Xin Li ). All rights are reserved. diff --git a/est_waterlevel.py b/est_waterlevel.py index bceef4d..a458611 100644 --- a/est_waterlevel.py +++ b/est_waterlevel.py @@ -20,7 +20,7 @@ def get_parser(): parser.add_argument('--out-dir', default='output/waterlevel', help='A file or directory to save output results.') parser.add_argument('--opt', type=str, - help='Estimation options.') + help='Estimation options. 
"people", "stopsign", or "ref"') return parser.parse_args() @@ -33,7 +33,7 @@ def main(args): out_dir = os.path.join(args.out_dir, f'{args.test_name}_{args.opt}') os.makedirs(out_dir, exist_ok=True) - if args.opt in ['skeleton', 'stopsign']: + if args.opt in ['people', 'stopsign']: est_by_obj_detection(img_list, water_mask_list, out_dir, args.opt) elif args.opt == 'ref': est_by_reference(img_list, water_mask_list, out_dir, args.test_name) diff --git a/estimation/object_detection.py b/estimation/object_detection.py index 9b81693..6a6de15 100644 --- a/estimation/object_detection.py +++ b/estimation/object_detection.py @@ -1,9 +1,7 @@ from tqdm import trange import cv2 import os -from detectron2.data import MetadataCatalog from detectron2.config import get_cfg -from detectron2.utils.visualizer import Visualizer from detectron2.engine.defaults import DefaultPredictor from detectron2.projects.point_rend import add_pointrend_config from detectron2.structures import Instances @@ -20,15 +18,15 @@ 'opts': ['MODEL.WEIGHTS', 'https://dl.fbaipublicfiles.com/detectron2/PointRend/InstanceSegmentation/pointrend_rcnn_X_101_32x8d_FPN_3x_coco/28119989/model_final_ba17b9.pkl'], 'conf_thres': 0.5, } -skeleton_config = { +people_config = { 'config_file': 'estimation/configs/COCO-Keypoints/keypoint_rcnn_R_101_FPN_3x.yaml', 'opts': ['MODEL.WEIGHTS', 'https://dl.fbaipublicfiles.com/detectron2/COCO-Keypoints/keypoint_rcnn_R_101_FPN_3x/138363331/model_final_997cc7.pkl'], 'conf_thres': 0.7, } stopsign_meta = { - 'size': 76.2, # 36 inch 91.44cm, 30inch, 76.2 - 'pole_height': 243.84, # 7 feet 213.36 cm, 8 feet 243.84cm + 'size': 79, # 75cm + 2 * 2cm (white border) = 79cm + 'pole_height': 215.9 # 85in = 215.9cm } people_meta = { @@ -39,7 +37,7 @@ object_colors = { 'background': [0, 0, 0], 'stopsign': [128, 128, 0], - 'skeleton': [0, 128, 128] + 'people': [0, 128, 128] } water_label_id = 1 @@ -56,7 +54,7 @@ def draw_instances(img: np.array, instances: Instances): return img -def waterdepth_by_stopsign2(img, instances, water_mask, result_dir, img_name): +def waterdepth_by_stopsign(img, instances, water_mask, result_dir, img_name): # Constants thickness = 6 @@ -69,10 +67,11 @@ def waterdepth_by_stopsign2(img, instances, water_mask, result_dir, img_name): degree_step = np.deg2rad(360 / pts_n) degree_pos = degree_step / 2 plate_radius = 50 - plate_center = (200, 100) - template_size = (400, 400) - template_plate_height = plate_radius * (2 ** 0.5) - template_pole_height = template_plate_height / stopsign_meta['size'] * stopsign_meta['pole_height'] + plate_center = (150, 75) + template_size = (400, 300) + template_plate_height = np.cos(degree_pos) * plate_radius + template_pole_height = 2 * template_plate_height / stopsign_meta['size'] * stopsign_meta['pole_height'] + # print(plate_radius, template_plate_height, template_pole_height) plate_pts = [] for i in range(pts_n): @@ -86,6 +85,8 @@ def waterdepth_by_stopsign2(img, instances, water_mask, result_dir, img_name): template_pole_bottom = template_pole_top.copy() template_pole_bottom[1] += template_pole_height template_pole_top, template_pole_bottom = template_pole_top.astype(int), template_pole_bottom.astype(int) + # print('pts', plate_pts) + # print('pole', template_pole_top, template_pole_bottom) template_canvas = np.ones((template_size) + (3,)) * 255 template_plate_pts = template_plate_pts.astype(int) @@ -182,8 +183,8 @@ def waterdepth_by_stopsign2(img, instances, 
water_mask, result_dir, img_name): template_pole_bottom_water[1] += (1 - submerged_ratio) * template_pole_height template_pole_top, template_pole_bottom_water = template_pole_top.astype(int), template_pole_bottom_water.astype(int) - template_pole_bottom_water_left = (template_size[0] // 4, template_pole_bottom_water[1]) - template_pole_bottom_water_right = (template_size[0] * 3 // 4, template_pole_bottom_water[1]) + template_pole_bottom_water_left = (template_size[1] // 4, template_pole_bottom_water[1]) + template_pole_bottom_water_right = (template_size[1] * 3 // 4, template_pole_bottom_water[1]) cv2.line(template_canvas, template_pole_bottom_water, template_pole_bottom, submerged_color, thickness) cv2.line(template_canvas, template_pole_bottom_water_left, template_pole_bottom_water_right, water_color, thickness) @@ -194,246 +195,13 @@ def waterdepth_by_stopsign2(img, instances, water_mask, result_dir, img_name): break - print(submerged_ratio, waterdepth) return submerged_ratio, waterdepth -def waterdepth_by_stopsign(img, instances, water_mask, viz_img): - - # Extract poles - img_gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) - img_grad = cv2.convertScaleAbs(cv2.Sobel(img_gray, cv2.CV_16S, 1, 0, ksize=3, scale=0.6)) - ret, img_edge = cv2.threshold(img_grad, 50, 255, cv2.THRESH_BINARY) - - min_line_len = 100 - max_line_gap = 20 - lines = cv2.HoughLinesP(img_edge, 1, np.pi / 180, 50, minLineLength=min_line_len, maxLineGap=max_line_gap) - if lines is None: - print('Cannot detect lines in the image. Estimation by stop sign fails.') - return [], None - - lines = lines.squeeze() - dir = (abs(lines[:, 0] - lines[:, 2]) + 1) / (abs(lines[:, 1] - lines[:, 3]) + 1) # dx/dy - lines_vert = lines[dir < 0.5] - lines_vec = myutils.normalize(lines_vert[:, 2:] - lines_vert[:, :2]) - - # viz - # for x1, y1, x2, y2 in lines_vert: - # cv2.line(img, (x1, y1), (x2, y2), (0, 255, 0), 2) - # cv2.imshow('img', img) - # cv2.imshow('grad', img_grad) - # cv2.imshow('edge', img_edge) - # cv2.waitKey() - - stopsign_d = [] - stopsign_pt = [] - stopsign_in_waters = [] - raw_data_list = [] - - for i in range(len(instances.pred_classes)): - if instances.pred_classes[i] != 11: # class index for stopsign - continue - - edge_map = cv2.Canny(instances.pred_masks[i].numpy().astype(np.uint8) * 255, 75, 200) - cnts, hierarchy = cv2.findContours(edge_map, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) - cnts = sorted(cnts, key=cv2.contourArea, reverse=True) - - peri = cv2.arcLength(cnts[0], True) - approx = cv2.approxPolyDP(cnts[0], 0.02 * peri, True) - if approx.shape[0] < 8: - continue - - # stopsign geo - pt_center = np.mean(approx, axis=0) - rank_y = np.argsort(approx[:, 0, 1], axis=0) - pt_top = np.mean(approx[rank_y[:2]], axis=0)[0] - pt_bottom = np.mean(approx[rank_y[-2:]], axis=0)[0] - rank_x = np.argsort(approx[:, 0, 0], axis=0) - pt_left = np.mean(approx[rank_x[:2]], axis=0)[0] - pt_right = np.mean(approx[rank_x[-2:]], axis=0)[0] - - stopsign_h = myutils.dist(pt_bottom, pt_top, axis=0) - stopsign_w = myutils.dist(pt_left, pt_right, axis=0) - - # stopsign_h = pt_bottom[1] - pt_top[1] - # stopsign_w = pt_right[0] - pt_left[0] - - # stopsign_vec0 = myutils.normalize(pt_center - lines_vert[:, :2]) - stopsign_vec1 = myutils.normalize(pt_center - pt_bottom).reshape(1, 2) - - # direction - # cos_sim0 = np.abs(np.multiply(lines_vec, stopsign_vec0).sum(axis=1)) - cos_sim1 = np.abs(np.multiply(lines_vec, stopsign_vec1).sum(axis=1)) - # lines_parallel = lines_vert[np.bitwise_and(cos_sim0 > 0.995, cos_sim1 > 0.995)] - xpd0 = 
np.bitwise_and(pt_left[0] <= lines_vert[:, 0], lines_vert[:, 0] <= pt_right[0]) - xpd1 = np.bitwise_and(pt_left[0] <= lines_vert[:, 2], lines_vert[:, 2] <= pt_right[0]) - lines_parallel = lines_vert[np.bitwise_and(cos_sim1 > 0.9, np.bitwise_or(xpd0, xpd1))] - - # position - lines_end_flag0 = lines_parallel[:, 1] >= pt_bottom[1] - lines_end_flag1 = lines_parallel[:, 3] >= pt_bottom[1] - lines_parallel = lines_parallel[np.bitwise_or(lines_end_flag0, lines_end_flag1)] - - # dist - dist0 = abs(lines_parallel[:, 1] - pt_bottom[1]) < stopsign_h * 3 - dist1 = abs(lines_parallel[:, 3] - pt_bottom[1]) < stopsign_h * 3 - poles = lines_parallel[np.bitwise_and(dist0, dist1)] - - # viz - # for x1, y1, x2, y2 in poles: - # cv2.line(img, (x1, y1), (x2, y2), (0, 255, 0), 2) - # cv2.imshow('img', img) - # tmp = edge_map.copy() - # cv2.drawContours(tmp, cnts, -1, 255, 3) - # cv2.imshow('tmp', tmp) - # cv2.drawContours(edge_map, approx, -1, 255, 3) - # cv2.imshow('edge', edge_map) - # cv2.waitKey() - - poles_bottom_arr = [] - thres_faraway = 5 * stopsign_h - for x1, y1, x2, y2 in poles: - - if y1 < y2: - if y2 - pt_bottom[1] < thres_faraway: - poles_bottom_arr.append([x2, y2]) - else: - if y1 - pt_bottom[1] < thres_faraway: - poles_bottom_arr.append([x1, y1]) - - if len(poles_bottom_arr) == 0: - continue - - poles_bottom_arr = np.array(poles_bottom_arr) - - # remove outliers - # poles_bottom_bias = abs(poles_bottom_arr - poles_bottom_arr.mean(axis=0)).sum(axis=1) - # poles_bottom_bias_std = poles_bottom_bias.min() * 2 - # poles_bottom_arr = poles_bottom_arr[poles_bottom_bias < poles_bottom_bias_std] - - # select topk - # d = myutils.dist(poles_bottom_arr, pt_bottom.reshape(1, 2), axis=1) - # rank_d = np.argsort(d) - # topk = len(poles_bottom_arr) // 2 - # poles_bottom_arr = poles_bottom_arr[rank_d[topk:]] - - poles_bottom_pt = poles_bottom_arr.mean(axis=0).astype(np.int32) - - dx = (poles_bottom_pt[0] - pt_bottom[0]) / (poles_bottom_pt[1] - pt_bottom[1]) - pole_x, pole_y = poles_bottom_pt[0], poles_bottom_pt[1] - for y in range(poles_bottom_pt[1], water_mask.shape[0]): - if water_mask[y][np.round(pole_x).astype(np.int32)] == water_label_id: - pole_y = y - break - else: - pole_x += dx - - poles_bottom_pt = np.array([pole_x, pole_y]) - poles_bottom_d = myutils.dist(poles_bottom_pt, pt_bottom, axis=0) - # cos_ratio = (poles_bottom_pt[1] - pt_bottom[1]) / poles_bottom_d - raw_data_list.append({ - 'pole_top': (*pt_bottom, 1), - 'pole_bottom': (*poles_bottom_pt, 1) - }) - - # print(poles_bottom_pt) - - px2cm = stopsign_meta['size'] / stopsign_h - pole_h_cm = px2cm * poles_bottom_d - # pole_h_cm = pole_d_cm * cos_ratio - - stopsign_in_water = max(0, stopsign_meta['height_urban'] - pole_h_cm) - stopsign_in_waters.append(stopsign_in_water) - print('Est stopsign in water', stopsign_in_water) - - stopsign_pt.append(poles_bottom_pt) - stopsign_d.append(stopsign_in_water) - - # viz - cv2.line(viz_img, tuple(pt_bottom.astype(np.int)), tuple(pt_top.astype(np.int)), (0, 200, 0), 2) - cv2.line(viz_img, tuple(pt_left.astype(np.int)), tuple(pt_right.astype(np.int)), (0, 200, 0), 2) - cv2.line(viz_img, tuple(poles_bottom_pt.astype(np.int)), tuple(pt_bottom.astype(np.int)), (0, 0, 200), 2) - - text_pos = pt_bottom.astype(np.int) - text_pos[0] = max(0, text_pos[0] - 300) - text_pos[1] = max(0, text_pos[1] + 100) - text = f'Depth {stopsign_in_water:.1f}cm' - cv2.putText(viz_img, text, text_pos, cv2.FONT_HERSHEY_SIMPLEX, 1.2, (0, 0, 200), thickness=3) - # cv2.imshow('viz_img', viz_img) - # cv2.waitKey() - - # h, w = 
pred_masks[0].shape[:2] - # depth = self.calc_depth(stopsign_pt, stopsign_d, h, w) - # self.viz_dict['viz_img'] = viz_img - - return stopsign_in_waters, viz_img, raw_data_list - - -def waterdepth_by_skeleton(pred_keypoints, water_mask, keypoint_names, viz_img): - - key_centers = [] - key_depths = [] - thres_keypoint = 0 # 0.05 - bottom_region_size = 15 - bottom_region_area = 2 * (bottom_region_size ** 2) - water_thres = 0.05 - - raw_data_list = [] - for keypoints_per_instance in pred_keypoints: - - max_depth_keypoint_name = None - max_depth_x = 0 - max_depth_y = 0 - max_depth = 200 - - raw_data_dict = {} - for i, keypoint in enumerate(keypoints_per_instance): - x, y, prob = keypoint - raw_data_dict[keypoint_names[i]] = (x.item(), y.item(), prob.item()) - - # if prob < thres_keypoint: - # continue - - # x, y = int(x), int(y) - # bottom_region_l = x - bottom_region_size - # bottom_region_r = x + bottom_region_size - # bottom_region_t = y - bottom_region_size - # bottom_region_b = y + bottom_region_size - # bottom_region = water_mask[bottom_region_t:bottom_region_b, bottom_region_l:bottom_region_r] - # - # water_ratio = bottom_region.sum() / bottom_region_area - - # print(bottom_region.shape, water_ratio, self.keypoint_names[i]) - - # if water_ratio < water_thres: - # continue - - # if water_mask: - # cv2.circle(viz_img, (int(x.item()), int(y.item())), radius=2, color=(0, 200, 0), thickness=2) - - if not max_depth_keypoint_name or (max_depth > skeleton_meta[keypoint_names[i]]): - max_depth_keypoint_name = keypoint_names[i] - max_depth_x = x - max_depth_y = y - max_depth = skeleton_meta[keypoint_names[i]] - - raw_data_list.append(raw_data_dict) - - if max_depth_keypoint_name and water_mask is not None: - # key_centers.append([water_depth_x, water_depth_y]) - key_depths.append(max_depth) - - text_pos = (max(0, int(max_depth_x - 250)), max(0, int(max_depth_y - 25))) - text = f'{max_depth_keypoint_name}: Depth {max_depth:.1f}cm' - cv2.putText(viz_img, text, text_pos, cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 0, 200), thickness=2) - print('Est people in water', max_depth_keypoint_name, f'depth {max_depth}cm', 'pos', max_depth_x, max_depth_y) - - return key_depths, viz_img, raw_data_list - - -def waterdepth_by_skeleton2(instances, img, water_mask, out_dir, img_name): +def waterdepth_by_people(instances, img, water_mask, out_dir, img_name): img_h, img_w, img_c = img.shape + scale_ratio = 1.5 for person_idx, pred_box in enumerate(instances.pred_boxes): @@ -443,8 +211,8 @@ def waterdepth_by_skeleton2(instances, img, water_mask, out_dir, img_name): x1, y1, x2, y2 = pred_box.numpy().tolist() center_x, center_y = (x1 + x2) / 2, (y1 + y2) / 2 - bbox_w = 1.5 * (x2 - x1) - bbox_h = 1.5 * (y2 - y1) + bbox_w = scale_ratio * (x2 - x1) + bbox_h = scale_ratio * (y2 - y1) radius = max(bbox_w, bbox_h) radius = min(min(img_h, img_w), radius) / 2 @@ -493,29 +261,6 @@ def predict_boundary(y1: np.array, y2: np.array, resolution): # print(y2_bottom, y1_top) boundary = (y2_bottom + y1_top) // 2 - # Option2, Naive Bayesian Gaussian Distribution - # pi1 = len(y1) / (len(y1) + len(y2)) - # pi2 = len(y2) / (len(y1) + len(y2)) - # pi1 = pi2 = 1 - # mu1, sigma1 = y1.mean(), y1.std() - # mu2, sigma2 = y2.mean(), y2.std() - # sigma12, sigma22 = sigma1 ** 2, sigma2 ** 2 - # - # a = sigma22 - sigma12 - # b = 2 * (mu2 * sigma12 - mu1 * sigma22) - # c = mu1 ** 2 * sigma22 - mu2 ** 2 * sigma22 - 2 * sigma12 * sigma22 * np.log((pi1 * sigma2) / (pi2 * sigma1)) - # - # delta = np.sqrt(b ** 2 - 4 * a * c) - # x1 = (-b - delta) / (2 * a) - # x2 
= (-b + delta) / (2 * a) - # print(x1, x2) - # if 0 < x1 < resolution: - # boundary = x1 - # elif 0 < x2 < resolution: - # boundary = x2 - # else: - # raise ValueError('Can\'t estimate boundary.') - if np.isnan(boundary): return np.NaN, None else: @@ -526,8 +271,8 @@ def est_by_obj_detection(img_list, water_mask_list, out_dir, opt): if opt == 'stopsign': user_config = stopsign_config - elif opt == 'skeleton': - user_config = skeleton_config + elif opt == 'people': + user_config = people_config else: raise NotImplementedError(opt) @@ -543,7 +288,6 @@ def est_by_obj_detection(img_list, water_mask_list, out_dir, opt): cfg.MODEL.PANOPTIC_FPN.COMBINE.INSTANCES_CONFIDENCE_THRESH = user_config['conf_thres'] cfg.freeze() - metadata = MetadataCatalog.get(cfg.DATASETS.TEST[0]) det_model = DefaultPredictor(cfg) waterdepth_list = [] @@ -564,49 +308,24 @@ def est_by_obj_detection(img_list, water_mask_list, out_dir, opt): with torch.no_grad(): pred_obj = det_model(img) - # visualizer = Visualizer(img, metadata) instances = pred_obj['instances'].to(torch.device('cpu')) - # if opt == 'stopsign': - # # viz_img = draw_instances(img, instances) - # viz_img = img - # # visualizer.draw_instance_predictions(predictions=instances) - # else: - # for keypoints_per_instance in instances.pred_keypoints: - # visualizer.draw_and_connect_keypoints(keypoints_per_instance) - # viz_img = visualizer.output.get_image() - # - # if water_mask is not None: - # viz_img = myutils.add_overlay(viz_img, water_mask, myutils.color_palette) - if opt == 'stopsign': - submerge_ratio, waterdepth = waterdepth_by_stopsign2(img, instances, water_mask, result_dir, img_name) - # raw_data = { - # 'instances': raw_data_list, - # 'connection_rules': [('pole_top', 'pole_bottom', (100, 100, 100))] - # } + submerge_ratio, waterdepth = waterdepth_by_stopsign(img, instances, water_mask, result_dir, img_name) waterdepth_list.append((submerge_ratio, waterdepth)) else: - waterdepth_by_skeleton2(instances, img, water_mask, out_dir, img_name) - # raw_data = { - # 'instances': raw_data_list, - # 'connection_rules': metadata.get('keypoint_connection_rules') - # } - # cv2.imwrite(os.path.join(out_dir, f'{img_name}.png'), viz_img) - - # pred_res_path = os.path.join(out_dir, img_name + '.json') - # with open(pred_res_path, 'w') as f: - # json.dump(raw_data, f) + waterdepth_by_people(instances, img, water_mask, out_dir, img_name) + if opt == 'stopsign': with open(os.path.join(out_dir, f'waterdepth.txt'), 'w') as f: for i in trange(len(img_list)): img_name = os.path.basename(img_list[i])[:-4] f.write(f'{img_name}\t{waterdepth_list[i][0]:.4f}\t{waterdepth_list[i][1]:.4f}\n') - elif opt == 'skeleton': + elif opt == 'people': - cmd_str = f'cd /Ship03/Sources/MeshTransformer/ && ' \ - f'python3.8 /Ship03/Sources/MeshTransformer/metro/tools/inference_bodymesh.py ' \ + cmd_str = f'cd ./MeshTransformer/ && ' \ + f'python3.8 ./metro/tools/inference_bodymesh.py ' \ f'--resume_checkpoint=./models/metro_release/metro_3dpw_state_dict.bin ' \ f'--image_file_or_path={os.path.abspath(out_dir)}/input/' print('Execute', cmd_str) @@ -648,9 +367,7 @@ def est_by_obj_detection(img_list, water_mask_list, out_dir, opt): template_2d_under_water = template_3d[label_under_water] template_2d_above_water = template_3d[label_above_water] - # pred_2d_under_water = pred_2d_under_water[np.argsort(pred_2d_under_water[:, 1])] for j in range(pred_2d_under_water.shape[0]): - # cv2.circle(img, pred_2d_under_water[j], 0, [0, 0, 200], 1, lineType=cv2.FILLED) cv2.circle(canvas_est, 
pred_2d_under_water[j], 0, [0, 0, 200], 2, lineType=cv2.FILLED) water_boundary, under_water_indices = predict_boundary(template_2d_under_water[:, 1], template_2d_above_water[:, 1], resolution) @@ -658,10 +375,8 @@ def est_by_obj_detection(img_list, water_mask_list, out_dir, opt): warnings.warn('Cannot estimate the water boundary.') else: submerge_ratio = 1 - (water_boundary - template_3d_top) / template_3d_height - print(img_name, 'Estimate water boundary', water_boundary, f'submerge ratio {submerge_ratio:.3f}') + # print(img_name, 'Estimate water boundary', water_boundary, f'submerge ratio {submerge_ratio:.3f}') submerge_ratio_list.append(submerge_ratio) - # with open(os.path.join(result_dir, f'{img_name}_waterdepth.txt'), 'w') as f: - # f.write(str(submerge_ratio)) water_boundary_left = (int(resolution * 0.25), water_boundary) water_boundary_right = (int(resolution * 0.75), water_boundary) @@ -671,11 +386,6 @@ def est_by_obj_detection(img_list, water_mask_list, out_dir, opt): for j in range(template_2d_under_water.shape[0]): cv2.circle(canvas_template, (template_2d_under_water[j][0], template_2d_under_water[j][1]), 0, [0, 0, 200], 2, lineType=cv2.FILLED) - # cv2.imshow('img', img) - # cv2.imshow('canvas_est', canvas_est) - # cv2.imshow('canvas_template', canvas_template) - # cv2.imshow('overlay', overlay) - # cv2.waitKey() cv2.imwrite(os.path.join(result_dir, f'{img_name}_est.png'), canvas_est) cv2.imwrite(os.path.join(result_dir, f'{img_name}_template.png'), canvas_template) cv2.imwrite(os.path.join(result_dir, f'{img_name}_overlay.png'), overlay) @@ -685,25 +395,3 @@ def est_by_obj_detection(img_list, water_mask_list, out_dir, opt): img_name = os.path.basename(img_list[i])[:-4] waterdepth = submerge_ratio_list[i] * people_meta['man_height'] f.write(f'{img_name}\t{submerge_ratio_list[i]:.4f}\t{waterdepth:.4f}\n') - - # - # def calc_depth(self, key_centers, key_depths, h, w): - # if len(key_centers) == 0: - # return None - # elif len(key_centers) == 1: - # depth = np.ones((h, w)) * key_depths[0] - # return depth - # else: - # key_centers = np.array(key_centers) - # key_depths = np.array(key_depths) - # - # p = np.stack(np.meshgrid(np.arange(w), np.arange(h)), axis=2).reshape(-1, 2) - # - # d = cdist(p, key_centers, 'euclidean') - # d = np.exp(-d / self.d_var) - - # d = d / d.sum(axis=1, keepdims=True) - # - # depth = np.multiply(d, key_depths).sum(axis=1).reshape(h, w) - # - # return depth \ No newline at end of file diff --git a/scripts/download_MeshTransformer_models.sh b/scripts/download_MeshTransformer_models.sh new file mode 100644 index 0000000..86b96ca --- /dev/null +++ b/scripts/download_MeshTransformer_models.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +cd MeshTransformer +mkdir -p models +bash scripts/download_models.sh +cd .. + +cp scripts/inference_bodymesh.py MeshTransformer/metro/tools/ \ No newline at end of file diff --git a/scripts/inference_bodymesh.py b/scripts/inference_bodymesh.py new file mode 100644 index 0000000..badd1de --- /dev/null +++ b/scripts/inference_bodymesh.py @@ -0,0 +1,332 @@ +""" +Copyright (c) Microsoft Corporation. +Licensed under the MIT license. 
+ +End-to-end inference codes for +3D human body mesh reconstruction from an image +""" + +from __future__ import absolute_import, division, print_function +import argparse +import os +import os.path as op +import code +import json +import torch +import torchvision.models as models +from torchvision.utils import make_grid +import numpy as np +import cv2 +from tqdm import tqdm +from metro.modeling.bert import BertConfig, METRO +from metro.modeling.bert import METRO_Body_Network as METRO_Network +from metro.modeling._smpl import SMPL, Mesh +from metro.modeling.hrnet.hrnet_cls_net_featmaps import get_cls_net +from metro.modeling.hrnet.config import config as hrnet_config +from metro.modeling.hrnet.config import update_config as hrnet_update_config +import metro.modeling.data.config as cfg + +from metro.utils.renderer import Renderer, visualize_reconstruction, visualize_reconstruction_test, \ + visualize_reconstruction_no_text, visualize_reconstruction_and_att_local +from metro.utils.geometric_layers import orthographic_projection +from metro.utils.logger import setup_logger +from metro.utils.miscellaneous import mkdir, set_seed + +from PIL import Image +from torchvision import transforms + +transform = transforms.Compose([ + transforms.Resize(224), + transforms.CenterCrop(224), + transforms.ToTensor(), + transforms.Normalize( + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225])]) + +transform_visualize = transforms.Compose([ + transforms.Resize(224), + transforms.CenterCrop(224), + transforms.ToTensor()]) + + +def run_inference(args, image_list, _metro_network, smpl, renderer, mesh_sampler): + # switch to evaluate mode + _metro_network.eval() + + mesh_dir = os.path.dirname(image_list[0]) + image_list = sorted(image_list) + + for idx, image_file in enumerate(tqdm(image_list)): + if 'pred' not in image_file: + att_all = [] + img = Image.open(image_file) + img_tensor = transform(img) + img_visual = transform_visualize(img) + + batch_imgs = torch.unsqueeze(img_tensor, 0).cuda() + batch_visual_imgs = torch.unsqueeze(img_visual, 0).cuda() + # forward-pass + pred_camera, pred_3d_joints, pred_vertices_sub2, pred_vertices_sub, pred_vertices, hidden_states, att = _metro_network( + batch_imgs, smpl, mesh_sampler) + + # obtain 3d joints from full mesh + pred_3d_joints_from_smpl = smpl.get_h36m_joints(pred_vertices) + + pred_3d_pelvis = pred_3d_joints_from_smpl[:, cfg.H36M_J17_NAME.index('Pelvis'), :] + pred_3d_joints_from_smpl = pred_3d_joints_from_smpl[:, cfg.H36M_J17_TO_J14, :] + pred_3d_joints_from_smpl = pred_3d_joints_from_smpl - pred_3d_pelvis[:, None, :] + pred_vertices = pred_vertices - pred_3d_pelvis[:, None, :] + + # save attantion + att_max_value = att[-1] + att_cpu = np.asarray(att_max_value.cpu().detach()) + att_all.append(att_cpu) + + # obtain 3d joints, which are regressed from the full mesh + pred_3d_joints_from_smpl = smpl.get_h36m_joints(pred_vertices) + pred_3d_joints_from_smpl = pred_3d_joints_from_smpl[:, cfg.H36M_J17_TO_J14, :] + # obtain 2d joints, which are projected from 3d joints of smpl mesh + pred_2d_joints_from_smpl = orthographic_projection(pred_3d_joints_from_smpl, pred_camera) + pred_2d_431_vertices_from_smpl = orthographic_projection(pred_vertices_sub2, pred_camera) + visual_imgs_att = visualize_mesh_and_attention(renderer, batch_visual_imgs[0], + pred_vertices[0].detach(), + pred_vertices_sub2[0].detach(), + pred_2d_431_vertices_from_smpl[0].detach(), + pred_2d_joints_from_smpl[0].detach(), + pred_camera.detach(), + att[-1][0].detach()) + + visual_imgs = 
visual_imgs_att.transpose(1, 2, 0) + visual_imgs = np.asarray(visual_imgs) + + temp_fname = image_file[:-4] + '_metro_pred.jpg' + # print('Save img to ', temp_fname) + cv2.imwrite(temp_fname, np.asarray(visual_imgs[:, -225:, ::-1] * 255)) + + pred_2d = pred_2d_431_vertices_from_smpl[0].detach().cpu().numpy().tolist() + pred_result_path = image_file[:-4] + '_pred.txt' + # print('Save predictions to', pred_result_path) + with open(pred_result_path, 'w') as f: + json.dump(pred_2d, f) + + return + + +def visualize_mesh_and_attention(renderer, images, + pred_vertices_full, + pred_vertices, + pred_2d_vertices, + pred_2d_joints, + pred_camera, + attention): + """Tensorboard logging.""" + + img = images.cpu().numpy().transpose(1, 2, 0) + # Get predict vertices for the particular example + vertices_full = pred_vertices_full.cpu().numpy() + vertices = pred_vertices.cpu().numpy() + vertices_2d = pred_2d_vertices.cpu().numpy() + joints_2d = pred_2d_joints.cpu().numpy() + cam = pred_camera.cpu().numpy() + att = attention.cpu().numpy() + # Visualize reconstruction and attention + rend_img = visualize_reconstruction_and_att_local(img, 224, vertices_full, vertices, vertices_2d, cam, renderer, + joints_2d, att, color='pink') + rend_img = rend_img.transpose(2, 0, 1) + + return rend_img + + +def visualize_mesh_no_text(renderer, + images, + pred_vertices, + pred_camera): + """Tensorboard logging.""" + img = images.cpu().numpy().transpose(1, 2, 0) + # Get predict vertices for the particular example + vertices = pred_vertices.cpu().numpy() + cam = pred_camera.cpu().numpy() + # Visualize reconstruction only + rend_img = visualize_reconstruction_no_text(img, 224, vertices, cam, renderer, color='hand') + rend_img = rend_img.transpose(2, 0, 1) + return rend_img + + +def parse_args(): + parser = argparse.ArgumentParser() + ######################################################### + # Data related arguments + ######################################################### + parser.add_argument("--image_file_or_path", default='./test_images/human-body', type=str, + help="test data") + ######################################################### + # Loading/saving checkpoints + ######################################################### + parser.add_argument("--model_name_or_path", default='metro/modeling/bert/bert-base-uncased/', type=str, + required=False, + help="Path to pre-trained transformer model or model type.") + parser.add_argument("--resume_checkpoint", default=None, type=str, required=False, + help="Path to specific checkpoint for inference.") + parser.add_argument("--output_dir", default='output/', type=str, required=False, + help="The output directory to save checkpoint and test results.") + ######################################################### + # Model architectures + ######################################################### + parser.add_argument('-a', '--arch', default='hrnet-w64', + help='CNN backbone architecture: hrnet-w64, hrnet, resnet50') + parser.add_argument("--num_hidden_layers", default=4, type=int, required=False, + help="Update model config if given") + parser.add_argument("--hidden_size", default=-1, type=int, required=False, + help="Update model config if given") + parser.add_argument("--num_attention_heads", default=4, type=int, required=False, + help="Update model config if given. 
Note that the division of " + "hidden_size / num_attention_heads should be in integer.") + parser.add_argument("--intermediate_size", default=-1, type=int, required=False, + help="Update model config if given.") + parser.add_argument("--input_feat_dim", default='2051,512,128', type=str, + help="The Image Feature Dimension.") + parser.add_argument("--hidden_feat_dim", default='1024,256,128', type=str, + help="The Image Feature Dimension.") + parser.add_argument("--legacy_setting", default=True, action='store_true', ) + ######################################################### + # Others + ######################################################### + parser.add_argument("--device", type=str, default='cuda', + help="cuda or cpu") + parser.add_argument('--seed', type=int, default=88, + help="random seed for initialization.") + + args = parser.parse_args() + return args + + +def main(args): + global logger + # Setup CUDA, GPU & distributed training + args.num_gpus = int(os.environ['WORLD_SIZE']) if 'WORLD_SIZE' in os.environ else 1 + args.distributed = args.num_gpus > 1 + args.device = torch.device(args.device) + + mkdir(args.output_dir) + logger = setup_logger("METRO Inference", args.output_dir, 0) + set_seed(args.seed, args.num_gpus) + logger.info("Using {} GPUs".format(args.num_gpus)) + + # Mesh and SMPL utils + mesh_smpl = SMPL().to(args.device) + mesh_sampler = Mesh() + # Renderer for visualization + renderer = Renderer(faces=mesh_smpl.faces.cpu().numpy()) + # Load pretrained model + logger.info("Inference: Loading from checkpoint {}".format(args.resume_checkpoint)) + + if args.resume_checkpoint != None and args.resume_checkpoint != 'None' and 'state_dict' not in args.resume_checkpoint: + logger.info("Evaluation: Loading from checkpoint {}".format(args.resume_checkpoint)) + _metro_network = torch.load(args.resume_checkpoint) + else: + # Build model from scratch, and load weights from state_dict.bin + trans_encoder = [] + input_feat_dim = [int(item) for item in args.input_feat_dim.split(',')] + hidden_feat_dim = [int(item) for item in args.hidden_feat_dim.split(',')] + output_feat_dim = input_feat_dim[1:] + [3] + # init three transformer encoders in a loop + for i in range(len(output_feat_dim)): + config_class, model_class = BertConfig, METRO + config = config_class.from_pretrained(args.model_name_or_path) + + config.output_attentions = False + config.img_feature_dim = input_feat_dim[i] + config.output_feature_dim = output_feat_dim[i] + args.hidden_size = hidden_feat_dim[i] + + if args.legacy_setting == True: + # During our paper submission, we were using the original intermediate size, which is 3072 fixed + # We keep our legacy setting here + args.intermediate_size = -1 + else: + # We have recently tried to use an updated intermediate size, which is 4*hidden-size. 
+ # But we didn't find significant performance changes on Human3.6M (~36.7 PA-MPJPE) + args.intermediate_size = int(args.hidden_size * 4) + + # update model structure if specified in arguments + update_params = ['num_hidden_layers', 'hidden_size', 'num_attention_heads', 'intermediate_size'] + + for idx, param in enumerate(update_params): + arg_param = getattr(args, param) + config_param = getattr(config, param) + if arg_param > 0 and arg_param != config_param: + logger.info("Update config parameter {}: {} -> {}".format(param, config_param, arg_param)) + setattr(config, param, arg_param) + + # init a transformer encoder and append it to a list + assert config.hidden_size % config.num_attention_heads == 0 + model = model_class(config=config) + logger.info("Init model from scratch.") + trans_encoder.append(model) + + # init ImageNet pre-trained backbone model + if args.arch == 'hrnet': + hrnet_yaml = 'models/hrnet/cls_hrnet_w40_sgd_lr5e-2_wd1e-4_bs32_x100.yaml' + hrnet_checkpoint = 'models/hrnet/hrnetv2_w40_imagenet_pretrained.pth' + hrnet_update_config(hrnet_config, hrnet_yaml) + backbone = get_cls_net(hrnet_config, pretrained=hrnet_checkpoint) + logger.info('=> loading hrnet-v2-w40 model') + elif args.arch == 'hrnet-w64': + hrnet_yaml = 'models/hrnet/cls_hrnet_w64_sgd_lr5e-2_wd1e-4_bs32_x100.yaml' + hrnet_checkpoint = 'models/hrnet/hrnetv2_w64_imagenet_pretrained.pth' + hrnet_update_config(hrnet_config, hrnet_yaml) + backbone = get_cls_net(hrnet_config, pretrained=hrnet_checkpoint) + logger.info('=> loading hrnet-v2-w64 model') + else: + print("=> using pre-trained model '{}'".format(args.arch)) + backbone = models.__dict__[args.arch](pretrained=True) + # remove the last fc layer + backbone = torch.nn.Sequential(*list(backbone.children())[:-2]) + + trans_encoder = torch.nn.Sequential(*trans_encoder) + total_params = sum(p.numel() for p in trans_encoder.parameters()) + logger.info('Transformers total parameters: {}'.format(total_params)) + backbone_total_params = sum(p.numel() for p in backbone.parameters()) + logger.info('Backbone total parameters: {}'.format(backbone_total_params)) + + # build end-to-end METRO network (CNN backbone + multi-layer transformer encoder) + _metro_network = METRO_Network(args, config, backbone, trans_encoder, mesh_sampler) + + logger.info("Loading state dict from checkpoint {}".format(args.resume_checkpoint)) + cpu_device = torch.device('cpu') + state_dict = torch.load(args.resume_checkpoint, map_location=cpu_device) + _metro_network.load_state_dict(state_dict, strict=False) + del state_dict + + # update configs to enable attention outputs + setattr(_metro_network.trans_encoder[-1].config, 'output_attentions', True) + setattr(_metro_network.trans_encoder[-1].config, 'output_hidden_states', True) + _metro_network.trans_encoder[-1].bert.encoder.output_attentions = True + _metro_network.trans_encoder[-1].bert.encoder.output_hidden_states = True + for iter_layer in range(4): + _metro_network.trans_encoder[-1].bert.encoder.layer[iter_layer].attention.self.output_attentions = True + for inter_block in range(3): + setattr(_metro_network.trans_encoder[-1].config, 'device', args.device) + + _metro_network.to(args.device) + logger.info("Run inference") + + image_list = [] + if not args.image_file_or_path: + raise ValueError("image_file_or_path not specified") + if op.isfile(args.image_file_or_path): + image_list = [args.image_file_or_path] + elif op.isdir(args.image_file_or_path): + # should be a path with images only + for filename in os.listdir(args.image_file_or_path): 
+            if (filename.endswith(".png") or filename.endswith(".jpg")) and 'pred' not in filename:
+                image_list.append(args.image_file_or_path + '/' + filename)
+    else:
+        raise ValueError("Cannot find images at {}".format(args.image_file_or_path))
+
+    run_inference(args, image_list, _metro_network, mesh_smpl, renderer, mesh_sampler)
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    main(args)
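
Note (appended for clarity; not part of the patch): for the `people` option, `est_by_obj_detection` writes one `<image>\t<submerge_ratio>\t<waterdepth>` line per frame to `waterdepth.txt`, with `waterdepth = submerge_ratio * people_meta['man_height']`. The sketch below illustrates that conversion only; the 175 cm height and the 0.3 ratio are placeholder values (the real `man_height` is defined elsewhere in `object_detection.py`), and using the 215.9 cm stop-sign pole from `stopsign_meta` as the analogous reference length is an assumption, since the `stopsign` branch computes its depth inside `waterdepth_by_stopsign`.

```python
# Illustrative sketch of the submerge-ratio -> water-depth conversion behind waterdepth.txt.
# REFERENCE heights and DEMO_RATIO are placeholders, not values produced by this patch.

STOPSIGN_POLE_HEIGHT_CM = 215.9  # stopsign_meta['pole_height'] (85 in) in object_detection.py
PERSON_HEIGHT_CM = 175.0         # placeholder for people_meta['man_height']


def water_depth_cm(submerge_ratio: float, reference_height_cm: float) -> float:
    """Depth = fraction of the reference object below water x its full real-world height."""
    return submerge_ratio * reference_height_cm


if __name__ == "__main__":
    DEMO_RATIO = 0.3  # e.g. 30% of the reference object is estimated to be submerged
    print(f"stop-sign pole: {water_depth_cm(DEMO_RATIO, STOPSIGN_POLE_HEIGHT_CM):.1f} cm")
    print(f"person:         {water_depth_cm(DEMO_RATIO, PERSON_HEIGHT_CM):.1f} cm")
```

The per-frame quantity that is logged is the unit-free submergence ratio; it only becomes a depth in centimetres once multiplied by the known height of the detected reference object.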