Quantcast
Channel: TAO Toolkit - NVIDIA Developer Forums
Viewing all articles
Browse latest Browse all 497

TAOkit Training Task -

$
0
0

• A5000
• Faster-RCNN
• 5.0.0-tf1.15.5
• spec file:

random_seed: 42

verbose: True
model_config: {
  input_image_config: {
    image_type: RGB,
    image_channel_order: 'bgr',
    size_height_width: {
      height: 384,
      width: 1248
      },
    image_channel_mean: {
        key: 'b',
        value: 103.939
      },
    image_channel_mean: {
        key: 'g',
        value: 116.779
      },
    image_channel_mean: {
        key: 'r',
        value: 123.68
      },
    image_scaling_factor: 1.0,
    max_objects_num_per_image: 1000
    },
  arch: "resnet:50",
  anchor_box_config: {
    scale: 64.0,
    scale: 128.0,
    scale: 256.0,
    ratio: 1.0,
    ratio: 0.5,
    ratio: 2.0
    },
  freeze_bn: True,
  freeze_blocks: 0,
  freeze_blocks: 1,
  roi_mini_batch: 256,
  rpn_stride: 16,
  use_bias: False,
  roi_pooling_config: {
    pool_size: 7,
    pool_size_2x: False
    },
  all_projections: True,
  use_pooling: False
}

dataset_config: {
  data_sources: {
    tfrecords_path: "/workspace/tao-experiments/data/tfrecords/coco_trainval/coco_trainval-fold-000-of-001-shard-0000[0-9]-of-00020",
    image_directory_path: "/workspace/tao-experiments/train"
  },
  validation_data_source: {
    tfrecords_path: "/workspace/tao-experiments/data/tfrecords/coco_trainval/coco_trainval-fold-000-of-001-shard-0001[0-9]-of-00020",
    image_directory_path: "/workspace/tao-experiments/train"
  },
  image_extension: 'jpg',
  target_class_mapping: {
    key: 'person',
    value: 'person'
    },
  # validation_fold: 0
}

augmentation_config: {
  preprocessing: {
    output_image_width: 1248,
    output_image_height: 384,
    output_image_channel: 3,
    min_bbox_width: 1.0,
    min_bbox_height: 1.0,
    enable_auto_resize: True
    },
  spatial_augmentation: {
    hflip_probability: 0.5,
    vflip_probability: 0.0,
    zoom_min: 1.0,
    zoom_max: 1.0,
    translate_max_x: 0,
    translate_max_y: 0
    },
  color_augmentation: {
    hue_rotation_max: 0.0,
    saturation_shift_max: 0.0,
    contrast_scale_max: 0.0,
    contrast_center: 0.5
    }
}

training_config: {
  enable_augmentation: True,
  enable_qat: True,
  batch_size_per_gpu: 4,
  num_epochs: 15,
  pretrained_weights: "/workspace/tao-experiments/faster_rcnn/resnet_50.hdf5",
  # resume_from_model: "/data/TAO_TOOLKIT/tao_poc/frcnn_training/model/resnet_50.hdf5",
  rpn_min_overlap: 0.3,
  rpn_max_overlap: 0.7,
  classifier_min_overlap: 0.0,
  classifier_max_overlap: 0.5,
  gt_as_roi: False,
  std_scaling: 1.0,
  classifier_regr_std: {
    key: 'x',
    value: 10.0
    },
  classifier_regr_std: {
    key: 'y',
    value: 10.0
    },
  classifier_regr_std: {
    key: 'w',
    value: 5.0
    },
  classifier_regr_std: {
    key: 'h',
    value: 5.0
    },
  rpn_mini_batch: 256,
  rpn_pre_nms_top_N: 12000,
  rpn_nms_max_boxes: 2000,
  rpn_nms_overlap_threshold: 0.7,
  regularizer: {
    type: L2,
    weight: 1e-4
    },
  optimizer: {
    sgd: {
      lr: 0.02,
      momentum: 0.9,
      decay: 0.0,
      nesterov: False
      }
    },
  visualizer: {
    enabled: true
    clearml_config{
        project: "training"
        tags: "resnet50"
        tags: "tao_toolkit"
        tags: "unpruned"
        task: "taokit_test"
    }
  },
  learning_rate: {
    soft_start: {
      base_lr: 0.02,
      start_lr: 0.002,
      soft_start: 0.1,
      annealing_points: 0.8,
      annealing_points: 0.9,
      annealing_divider: 10.0
      }
    },
  lambda_rpn_regr: 1.0,
  lambda_rpn_class: 1.0,
  lambda_cls_regr: 1.0,
  lambda_cls_class: 1.0
}

inference_config: {
  images_dir: "/workspace/tao-experiments/test",
  model: '/workspace/tao-experiments/faster_rcnn/frcnn_coco_resnet50.epoch_15.hdf5', # Update this with final model
  batch_size: 1,
  detection_image_output_dir: '/workspace/tao-experiments/faster_rcnn/inference_results_imgs',
  labels_dump_dir: '/workspace/tao-experiments/faster_rcnn/inference_dump_labels',
  rpn_pre_nms_top_N: 6000,
  rpn_nms_max_boxes: 300,
  rpn_nms_overlap_threshold: 0.7,
  object_confidence_thres: 0.0001,
  bbox_visualize_threshold: 0.6,
  classifier_nms_max_boxes: 100,
  classifier_nms_overlap_threshold: 0.3
}

evaluation_config: {
  model: '/workspace/tao-experiments/faster_rcnn/frcnn_coco_resnet50.epoch_15.hdf5', # Update this with final model
  batch_size: 1,
  validation_period_during_training: 1,
  rpn_pre_nms_top_N: 6000,
  rpn_nms_max_boxes: 300,
  rpn_nms_overlap_threshold: 0.7,
  classifier_nms_max_boxes: 100,
  classifier_nms_overlap_threshold: 0.3,
  object_confidence_thres: 0.0001,
  use_voc07_11point_metric: False,
  gt_matching_iou_threshold: 0.5
}

• Ran the following command:

 !tao model faster_rcnn train \
              --gpus 4 \
              --gpu_index 0 1 2 3 \
              -e $SPECS_DIR/spec_resnet50.yaml \
              -r /workspace/tao-experiments/faster_rcnn

• This is how my local directories are mapped:

# Host-to-container directory mapping plus Docker run options.
# NOTE(review): presumably serialized to the TAO launcher mounts file --
# confirm against the notebook cell that consumes `drive_map`.
#
# Each mount is described by a pair of environment-variable names:
# (variable holding the host path, variable holding the container path).
# A container variable of None means the fixed experiment root is used.
# Every lookup uses os.environ[...], so a missing variable raises KeyError.
drive_map = {
    "Mounts": [
        {
            "source": os.environ[host_var],
            "destination": (
                "/workspace/tao-experiments"
                if container_var is None
                else os.environ[container_var]
            ),
        }
        for host_var, container_var in (
            # Project/data directory -> fixed experiment root.
            ("LOCAL_PROJECT_DIR", None),
            # Specs directory.
            ("LOCAL_SPECS_DIR", "SPECS_DIR"),
            # Train / test image directories.
            ("LOCAL_TRAIN_IMAGES_DIR", "DOCKER_TRAIN_IMAGES_DIR"),
            ("LOCAL_TEST_IMAGES_DIR", "DOCKER_TEST_IMAGES_DIR"),
            # Train / test annotation directories.
            ("LOCAL_TRAIN_ANNOTATIONS_DIR", "DOCKER_TRAIN_ANNOTATIONS_DIR"),
            ("LOCAL_TEST_ANNOTATIONS_DIR", "DOCKER_TEST_ANNOTATIONS_DIR"),
        )
    ],
    # "uid:gid" of the current process, passed as the Docker user option.
    "DockerOptions": {
        "user": f"{os.getuid()}:{os.getgid()}"
    },
}

# When ClearML credentials are flagged as available, append the ClearML
# connection settings to drive_map["Envs"], creating that list if it does
# not exist yet. Values come from os.getenv, so an unset variable is
# recorded with the value None rather than raising.
if CLEARML_LOGGED_IN:
    drive_map.setdefault("Envs", []).extend([
        {"variable": env_name, "value": os.getenv(env_name)}
        for env_name in (
            "CLEARML_WEB_HOST",
            "CLEARML_API_HOST",
            "CLEARML_FILES_HOST",
            "CLEARML_API_ACCESS_KEY",
            "CLEARML_API_SECRET_KEY",
        )
    ])

Once the training loop has started, I get a UCX warning: "failed to connect to vfs socket". After each epoch, the mAP continues to decrease until it hits 0. Why is this the case? How can I solve it?

Logs:

INFO: Starting Training Loop.
Epoch 1/15
[1701798639.992438] [ce76ee64c0c6:341  :f]        vfs_fuse.c:424  UCX  WARN  failed to connect to vfs socket '������': Invalid argument
[1701798640.409428] [ce76ee64c0c6:344  :f]        vfs_fuse.c:424  UCX  WARN  failed to connect to vfs socket '������': Invalid argument
[1701798640.483896] [ce76ee64c0c6:347  :f]        vfs_fuse.c:424  UCX  WARN  failed to connect to vfs socket '������': Invalid argument
[1701798641.621337] [ce76ee64c0c6:342  :f]        vfs_fuse.c:424  UCX  WARN  failed to connect to vfs socket '������': Invalid argument
 1/92 [..............................] - ETA: 1:58:30 - loss: 2.7451 - rpn_out_class_act_qdq_loss: 0.8070 - rpn_out_regress_qdq_loss: 0.3538 - dense_class_td_loss: 0.7456 - dense_regress_td_loss: 0.2469 1/92 [..............................] - ETA: 1:37:56 - loss: 2.6520 - rpn_out_class_act_qdq_loss: 0.5986 - rpn_out_regress_qdq_loss: 0.2623 - dense_class_td_loss: 1.0800 - dense_regress_td_loss: 0.1194 1/92 [..............................] - ETA: 2:01:07 - loss: 2.9666 - rpn_out_class_act_qdq_loss: 0.7850 - rpn_out_regress_qdq_loss: 0.3013 - dense_class_td_loss: 0.9165 - dense_regress_td_loss: 0.3722 1/92 [..............................] - ETA: 1:23:09 - loss: 2.8116 - rpn_out_class_act_qdq_loss: 0.8215 - rpn_out_regress_qdq_loss: 0.4443 - dense_class_td_loss: 0.7303 - dense_regress_td_loss: 0.2237WARNING:tensorflow:From /usr/local/lib/python3.8/dist-packages/nvidia_tao_tf1/cv/common/utils.py:199: The name tf.Summary is deprecated. Please use tf.compat.v1.Summary instead.

WARNING: From /usr/local/lib/python3.8/dist-packages/nvidia_tao_tf1/cv/common/utils.py:199: The name tf.Summary is deprecated. Please use tf.compat.v1.Summary instead.

92/92 [==============================] - 142s 2s/step - loss: 1.8313 - rpn_out_class_act_qdq_loss: 0.2323 - rpn_out_regress_qdq_loss: 0.1051 - dense_class_td_loss: 0.4038 - dense_regress_td_loss: 0.5004 [============================>.] - ETA: 1s - loss: 1.8369 - rpn_out_class_act_qdq_loss: 0.2327 - rpn_out_regress_qdq_loss: 0.1049 - dense_class_td_loss: 0.4027 - dense_regress_td_loss: 0.506390/92 [============================>.] - ETA: 3s - loss: 1.8101 - rpn_out_class_act_qdq_loss: 0.2270 - rpn_out_regress_qdq_loss: 0.0955 - dense_class_td_loss: 0.3972 - dense_regress_td_loss: 0.5004958621014318305 2/92 [..............................] - ETA: 1:18:08 -
92/92 [==============================] - 158s 2s/step - loss: 1.8062 - rpn_out_class_act_qdq_loss: 0.2253 - rpn_out_regress_qdq_loss: 0.0946 - dense_class_td_loss: 0.3965 - dense_regress_td_loss: 0.5002
92/92 [==============================] - 156s 2s/step - loss: 1.8452 - rpn_out_class_act_qdq_loss: 0.2350 - rpn_out_regress_qdq_loss: 0.1039 - dense_class_td_loss: 0.4030 - dense_regress_td_loss: 0.5136
92/92 [==============================] - 133s 1s/step - loss: 1.8390 - rpn_out_class_act_qdq_loss: 0.2325 - rpn_out_regress_qdq_loss: 0.1048 - dense_class_td_loss: 0.4037 - dense_regress_td_loss: 0.5082
Doing validation at epoch 1(1-based index)...
Doing validation at epoch 1(1-based index)...
Doing validation at epoch 1(1-based index)...
 78%|███████▊  | 1141/1472 [01:44<00:29, 11.18it/s]Doing validation at epoch 1(1-based index)...
100%|██████████| 1472/1472 [02:11<00:00, 11.20it/s]==========================================================================================
Class               AP                  precision           recall              RPN_recall          
------------------------------------------------------------------------------------------
100%|█████████▉| 1467/1472 [02:11<00:00, 11.86it/s]person              0.0284              0.0594              0.2321              0.3938              
------------------------------------------------------------------------------------------
mAP@0.5 = 0.0284              
Validation done!
Epoch 2/15
100%|██████████| 1472/1472 [02:12<00:00, 11.12it/s]==========================================================================================
Class               AP                  precision           recall              RPN_recall          
------------------------------------------------------------------------------------------
100%|██████████| 1472/1472 [02:12<00:00, 11.08it/s]==========================================================================================
Class               AP                  precision           recall              RPN_recall          
------------------------------------------------------------------------------------------
person              0.0288              0.0595              0.2327              0.3952              
------------------------------------------------------------------------------------------
mAP@0.5 = 0.0288              
 15%|█▍        | 217/1472 [00:25<01:51, 11.23it/s]Validation done!
Epoch 2/15
 15%|█▌        | 221/1472 [00:25<01:53, 10.98it/s]person              0.0285              0.0591              0.2311              0.3949              
------------------------------------------------------------------------------------------
mAP@0.5 = 0.0285              
Validation done!
Epoch 2/15
100%|██████████| 1472/1472 [02:09<00:00, 11.38it/s]==========================================================================================
Class               AP                  precision           recall              RPN_recall          
------------------------------------------------------------------------------------------
person              0.0282              0.0592              0.2314              0.3937              
------------------------------------------------------------------------------------------
mAP@0.5 = 0.0282              
Validation done!

INFO: Training loop in progress
Epoch 2/15
92/92 [==============================] - 156s 2s/step - loss: 10.0243 - rpn_out_class_act_qdq_loss: 0.2518 - rpn_out_regress_qdq_loss: 0.1060 - dense_class_td_loss: 0.7024 - dense_regress_td_loss: 6.8182[============================>.] - ETA: 1s - loss: 9.8168 - rpn_out_class_act_qdq_loss: 0.2518 - rpn_out_regress_qdq_loss: 0.1067 - dense_class_td_loss: 0.7039 - dense_regress_td_loss: 6.704890/92 [============================>.] - ETA: 3s - loss: 9.1867 - rpn_out_class_act_qdq_loss: 0.2577 - rpn_out_regress_qdq_loss: 0.1056 - dense_class_td_loss: 0.6917 - dense_regress_td_loss: 6.1
92/92 [==============================] - 158s 2s/step - loss: 9.8692 - rpn_out_class_act_qdq_loss: 0.2624 - rpn_out_regress_qdq_loss: 0.1064 - dense_class_td_loss: 0.6943 - dense_regress_td_loss: 6.6601
92/92 [==============================] - 157s 2s/step - loss: 9.7071 - rpn_out_class_act_qdq_loss: 0.2582 - rpn_out_regress_qdq_loss: 0.1046 - dense_class_td_loss: 0.6912 - dense_regress_td_loss: 6.5071
92/92 [==============================] - 52s 567ms/step - loss: 8.9768 - rpn_out_class_act_qdq_loss: 0.2496 - rpn_out_regress_qdq_loss: 0.1057 - dense_class_td_loss: 0.6595 - dense_regress_td_loss: 5.8160
Doing validation at epoch 2(1-based index)...
Doing validation at epoch 2(1-based index)...
Doing validation at epoch 2(1-based index)...

  0%|          | 0/1472 [00:00<?, ?it/s]
  0%|          | 0/1472 [00:00<?, ?it/s]
  1%|          | 18/1472 [00:01<01:58, 12.27it/s]Doing validation at epoch 2(1-based index)...
100%|██████████| 1472/1472 [01:57<00:00, 12.45it/s]==========================================================================================
Class               AP                  precision           recall              RPN_recall          
------------------------------------------------------------------------------------------
 99%|█████████▉| 1458/1472 [01:58<00:01, 12.52it/s]person              0.0000              0.0000              0.0000              0.0614              
------------------------------------------------------------------------------------------
mAP@0.5 = 0.0000              
Validation done!
Epoch 3/15
100%|██████████| 1472/1472 [01:59<00:00, 12.36it/s]==========================================================================================
Class               AP                  precision           recall              RPN_recall          
------------------------------------------------------------------------------------------
100%|██████████| 1472/1472 [01:57<00:00, 12.50it/s]==========================================================================================
Class               AP                  precision           recall              RPN_recall          
------------------------------------------------------------------------------------------
100%|█████████▉| 1470/1472 [01:59<00:00, 12.28it/s]person              0.0000              0.0000              0.0000              0.0582              
------------------------------------------------------------------------------------------
mAP@0.5 = 0.0000              
Validation done!
Epoch 3/15
100%|██████████| 1472/1472 [01:59<00:00, 12.31it/s]==========================================================================================
Class               AP                  precision           recall              RPN_recall          
------------------------------------------------------------------------------------------
person              0.0000              0.0000              0.0000              0.0611              
------------------------------------------------------------------------------------------
mAP@0.5 = 0.0000              
Validation done!

INFO: Training loop in progress
Epoch 3/15
person              0.0000              0.0000              0.0000              0.0614              
------------------------------------------------------------------------------------------
mAP@0.5 = 0.0000              
Validation done!
Epoch 3/15
92/92 [==============================] - 52s 566ms/step - loss: 29.4644 - rpn_out_class_act_qdq_loss: 0.3227 - rpn_out_regress_qdq_loss: 0.0680 - dense_class_td_loss: 0.6850 - dense_regress_td_loss: 15.6211============================>.] - ETA: 0s - loss: 29.6206 - rpn_out_class_act_qdq_loss: 0.3233 - rpn_out_regress_qdq_loss: 0.0681 - dense_class_td_loss: 0.6888 - dense_regress_td_loss: 15.773491/92 [============================>.] - ETA: 0s - loss: 30.6456 - rpn_out_class_act_qdq_loss: 0.3300 - rpn_out_regress_qdq_loss: 0.0742 - dense_class_td_loss: 0.7208 - dense_regress_td_loss: 16.75
92/92 [==============================] - 54s 584ms/step - loss: 31.6361 - rpn_out_class_act_qdq_loss: 0.3398 - rpn_out_regress_qdq_loss: 0.0809 - dense_class_td_loss: 0.7658 - dense_regress_td_loss: 17.6821
92/92 [==============================] - 53s 573ms/step - loss: 30.0760 - rpn_out_class_act_qdq_loss: 0.3273 - rpn_out_regress_qdq_loss: 0.0725 - dense_class_td_loss: 0.7164 - dense_regress_td_loss: 16.1922
92/92 [==============================] - 52s 569ms/step - loss: 30.5264 - rpn_out_class_act_qdq_loss: 0.3298 - rpn_out_regress_qdq_loss: 0.0741 - dense_class_td_loss: 0.7244 - dense_regress_td_loss: 16.6306
Doing validation at epoch 3(1-based index)...
Doing validation at epoch 3(1-based index)...
Doing validation at epoch 3(1-based index)...

  0%|          | 0/1472 [00:00<?, ?it/s]
  0%|          | 0/1472 [00:00<?, ?it/s]
  1%|          | 18/1472 [00:01<02:01, 11.98it/s]Doing validation at epoch 3(1-based index)...
100%|██████████| 1472/1472 [01:59<00:00, 12.31it/s]==========================================================================================
Class               AP                  precision           recall              RPN_recall          
------------------------------------------------------------------------------------------
100%|██████████| 1472/1472 [02:00<00:00, 12.26it/s]==========================================================================================
Class               AP                  precision           recall              RPN_recall          
------------------------------------------------------------------------------------------
 99%|█████████▉| 1464/1472 [02:00<00:00, 12.46it/s]person              0.0000              0.0000              0.0000              0.0576              
------------------------------------------------------------------------------------------
mAP@0.5 = 0.0000              
Validation done!
Epoch 4/15
100%|█████████▉| 1470/1472 [01:58<00:00, 12.20it/s]person              0.0000              0.0000              0.0000              0.0575              
------------------------------------------------------------------------------------------
mAP@0.5 = 0.0000              
100%|█████████▉| 1470/1472 [02:00<00:00, 12.11it/s]Validation done!
Epoch 4/15
100%|██████████| 1472/1472 [01:59<00:00, 12.37it/s]==========================================================================================
Class               AP                  precision           recall              RPN_recall          
------------------------------------------------------------------------------------------
100%|██████████| 1472/1472 [02:00<00:00, 12.18it/s]==========================================================================================
Class               AP                  precision           recall              RPN_recall          
------------------------------------------------------------------------------------------
person              0.0000              0.0000              0.0000              0.0571              
------------------------------------------------------------------------------------------
mAP@0.5 = 0.0000              
Validation done!

INFO: Training loop in progress
Epoch 4/15
........

2 posts - 2 participants

Read full topic


Viewing all articles
Browse latest Browse all 497

Trending Articles