• Hardware: A5000
• Network type: Faster-RCNN
• TAO Toolkit version: 5.0.0-tf1.15.5
• Training spec file:
random_seed: 42
verbose: True
model_config: {
input_image_config: {
image_type: RGB,
image_channel_order: 'bgr',
size_height_width: {
height: 384,
width: 1248
},
image_channel_mean: {
key: 'b',
value: 103.939
},
image_channel_mean: {
key: 'g',
value: 116.779
},
image_channel_mean: {
key: 'r',
value: 123.68
},
image_scaling_factor: 1.0,
max_objects_num_per_image: 1000
},
arch: "resnet:50",
anchor_box_config: {
scale: 64.0,
scale: 128.0,
scale: 256.0,
ratio: 1.0,
ratio: 0.5,
ratio: 2.0
},
freeze_bn: True,
freeze_blocks: 0,
freeze_blocks: 1,
roi_mini_batch: 256,
rpn_stride: 16,
use_bias: False,
roi_pooling_config: {
pool_size: 7,
pool_size_2x: False
},
all_projections: True,
use_pooling: False
}
dataset_config: {
data_sources: {
tfrecords_path: "/workspace/tao-experiments/data/tfrecords/coco_trainval/coco_trainval-fold-000-of-001-shard-0000[0-9]-of-00020",
image_directory_path: "/workspace/tao-experiments/train"
},
validation_data_source: {
tfrecords_path: "/workspace/tao-experiments/data/tfrecords/coco_trainval/coco_trainval-fold-000-of-001-shard-0001[0-9]-of-00020",
image_directory_path: "/workspace/tao-experiments/train"
},
image_extension: 'jpg',
target_class_mapping: {
key: 'person',
value: 'person'
},
# validation_fold: 0
}
augmentation_config: {
preprocessing: {
output_image_width: 1248,
output_image_height: 384,
output_image_channel: 3,
min_bbox_width: 1.0,
min_bbox_height: 1.0,
enable_auto_resize: True
},
spatial_augmentation: {
hflip_probability: 0.5,
vflip_probability: 0.0,
zoom_min: 1.0,
zoom_max: 1.0,
translate_max_x: 0,
translate_max_y: 0
},
color_augmentation: {
hue_rotation_max: 0.0,
saturation_shift_max: 0.0,
contrast_scale_max: 0.0,
contrast_center: 0.5
}
}
training_config: {
enable_augmentation: True,
enable_qat: True,
batch_size_per_gpu: 4,
num_epochs: 15,
pretrained_weights: "/workspace/tao-experiments/faster_rcnn/resnet_50.hdf5",
# resume_from_model: "/data/TAO_TOOLKIT/tao_poc/frcnn_training/model/resnet_50.hdf5",
rpn_min_overlap: 0.3,
rpn_max_overlap: 0.7,
classifier_min_overlap: 0.0,
classifier_max_overlap: 0.5,
gt_as_roi: False,
std_scaling: 1.0,
classifier_regr_std: {
key: 'x',
value: 10.0
},
classifier_regr_std: {
key: 'y',
value: 10.0
},
classifier_regr_std: {
key: 'w',
value: 5.0
},
classifier_regr_std: {
key: 'h',
value: 5.0
},
rpn_mini_batch: 256,
rpn_pre_nms_top_N: 12000,
rpn_nms_max_boxes: 2000,
rpn_nms_overlap_threshold: 0.7,
regularizer: {
type: L2,
weight: 1e-4
},
optimizer: {
sgd: {
lr: 0.02,
momentum: 0.9,
decay: 0.0,
nesterov: False
}
},
visualizer: {
enabled: true
clearml_config{
project: "training"
tags: "resnet50"
tags: "tao_toolkit"
tags: "unpruned"
task: "taokit_test"
}
},
learning_rate: {
soft_start: {
base_lr: 0.02,
start_lr: 0.002,
soft_start: 0.1,
annealing_points: 0.8,
annealing_points: 0.9,
annealing_divider: 10.0
}
},
lambda_rpn_regr: 1.0,
lambda_rpn_class: 1.0,
lambda_cls_regr: 1.0,
lambda_cls_class: 1.0
}
inference_config: {
images_dir: "/workspace/tao-experiments/test",
model: '/workspace/tao-experiments/faster_rcnn/frcnn_coco_resnet50.epoch_15.hdf5', # Update this with final model
batch_size: 1,
detection_image_output_dir: '/workspace/tao-experiments/faster_rcnn/inference_results_imgs',
labels_dump_dir: '/workspace/tao-experiments/faster_rcnn/inference_dump_labels',
rpn_pre_nms_top_N: 6000,
rpn_nms_max_boxes: 300,
rpn_nms_overlap_threshold: 0.7,
object_confidence_thres: 0.0001,
bbox_visualize_threshold: 0.6,
classifier_nms_max_boxes: 100,
classifier_nms_overlap_threshold: 0.3
}
evaluation_config: {
model: '/workspace/tao-experiments/faster_rcnn/frcnn_coco_resnet50.epoch_15.hdf5', # Update this with final model
batch_size: 1,
validation_period_during_training: 1,
rpn_pre_nms_top_N: 6000,
rpn_nms_max_boxes: 300,
rpn_nms_overlap_threshold: 0.7,
classifier_nms_max_boxes: 100,
classifier_nms_overlap_threshold: 0.3,
object_confidence_thres: 0.0001,
use_voc07_11point_metric: False,
gt_matching_iou_threshold: 0.5
}
• Ran the following command:
!tao model faster_rcnn train \
--gpus 4 \
--gpu_index 0 1 2 3 \
-e $SPECS_DIR/spec_resnet50.yaml \
-r /workspace/tao-experiments/faster_rcnn
• This is how my local directories are mapped:
# Host-to-container directory mappings plus container options.
# Every host path is read from the environment when this cell runs, so a
# missing variable raises KeyError here instead of surfacing later as a
# broken mount.
# NOTE(review): presumably this dict is serialized to the TAO launcher's
# mounts file further down in the notebook -- confirm against the caller.
drive_map = {
    "Mounts": [
        # Project directory, exposed as /workspace/tao-experiments.
        {
            "source": os.environ["LOCAL_PROJECT_DIR"],
            "destination": "/workspace/tao-experiments",
        },
        # Specs directory.
        {
            "source": os.environ["LOCAL_SPECS_DIR"],
            "destination": os.environ["SPECS_DIR"],
        },
        # Training images.
        {
            "source": os.environ["LOCAL_TRAIN_IMAGES_DIR"],
            "destination": os.environ["DOCKER_TRAIN_IMAGES_DIR"],
        },
        # Test images.
        {
            "source": os.environ["LOCAL_TEST_IMAGES_DIR"],
            "destination": os.environ["DOCKER_TEST_IMAGES_DIR"],
        },
        # Training annotations.
        {
            "source": os.environ["LOCAL_TRAIN_ANNOTATIONS_DIR"],
            "destination": os.environ["DOCKER_TRAIN_ANNOTATIONS_DIR"],
        },
        # Test annotations.
        {
            "source": os.environ["LOCAL_TEST_ANNOTATIONS_DIR"],
            "destination": os.environ["DOCKER_TEST_ANNOTATIONS_DIR"],
        },
    ],
    "DockerOptions": {
        # "uid:gid" of the user running this cell.
        "user": f"{os.getuid()}:{os.getgid()}",
    },
}

if CLEARML_LOGGED_IN:
    # Append the ClearML connection settings to the "Envs" list, creating the
    # list on first use. os.getenv yields None for any variable that is not
    # set on the host, so an unset variable is forwarded as None rather than
    # raising.
    clearml_variables = (
        "CLEARML_WEB_HOST",
        "CLEARML_API_HOST",
        "CLEARML_FILES_HOST",
        "CLEARML_API_ACCESS_KEY",
        "CLEARML_API_SECRET_KEY",
    )
    drive_map.setdefault("Envs", []).extend(
        {"variable": name, "value": os.getenv(name)}
        for name in clearml_variables
    )
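Once the training loop has started, I get the warning `UCX WARN failed to connect to vfs socket`. After each epoch, the mAP continues to decrease until it hits 0 (it is already 0 after epoch 2), while the training loss, mainly dense_regress_td_loss, keeps increasing. Why is this the case? How can I solve it?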
Logs:
INFO: Starting Training Loop.
Epoch 1/15
[1701798639.992438] [ce76ee64c0c6:341 :f] vfs_fuse.c:424 UCX WARN failed to connect to vfs socket '������': Invalid argument
[1701798640.409428] [ce76ee64c0c6:344 :f] vfs_fuse.c:424 UCX WARN failed to connect to vfs socket '������': Invalid argument
[1701798640.483896] [ce76ee64c0c6:347 :f] vfs_fuse.c:424 UCX WARN failed to connect to vfs socket '������': Invalid argument
[1701798641.621337] [ce76ee64c0c6:342 :f] vfs_fuse.c:424 UCX WARN failed to connect to vfs socket '������': Invalid argument
1/92 [..............................] - ETA: 1:58:30 - loss: 2.7451 - rpn_out_class_act_qdq_loss: 0.8070 - rpn_out_regress_qdq_loss: 0.3538 - dense_class_td_loss: 0.7456 - dense_regress_td_loss: 0.2469 1/92 [..............................] - ETA: 1:37:56 - loss: 2.6520 - rpn_out_class_act_qdq_loss: 0.5986 - rpn_out_regress_qdq_loss: 0.2623 - dense_class_td_loss: 1.0800 - dense_regress_td_loss: 0.1194 1/92 [..............................] - ETA: 2:01:07 - loss: 2.9666 - rpn_out_class_act_qdq_loss: 0.7850 - rpn_out_regress_qdq_loss: 0.3013 - dense_class_td_loss: 0.9165 - dense_regress_td_loss: 0.3722 1/92 [..............................] - ETA: 1:23:09 - loss: 2.8116 - rpn_out_class_act_qdq_loss: 0.8215 - rpn_out_regress_qdq_loss: 0.4443 - dense_class_td_loss: 0.7303 - dense_regress_td_loss: 0.2237WARNING:tensorflow:From /usr/local/lib/python3.8/dist-packages/nvidia_tao_tf1/cv/common/utils.py:199: The name tf.Summary is deprecated. Please use tf.compat.v1.Summary instead.
WARNING: From /usr/local/lib/python3.8/dist-packages/nvidia_tao_tf1/cv/common/utils.py:199: The name tf.Summary is deprecated. Please use tf.compat.v1.Summary instead.
92/92 [==============================] - 142s 2s/step - loss: 1.8313 - rpn_out_class_act_qdq_loss: 0.2323 - rpn_out_regress_qdq_loss: 0.1051 - dense_class_td_loss: 0.4038 - dense_regress_td_loss: 0.5004 [============================>.] - ETA: 1s - loss: 1.8369 - rpn_out_class_act_qdq_loss: 0.2327 - rpn_out_regress_qdq_loss: 0.1049 - dense_class_td_loss: 0.4027 - dense_regress_td_loss: 0.506390/92 [============================>.] - ETA: 3s - loss: 1.8101 - rpn_out_class_act_qdq_loss: 0.2270 - rpn_out_regress_qdq_loss: 0.0955 - dense_class_td_loss: 0.3972 - dense_regress_td_loss: 0.5004958621014318305 2/92 [..............................] - ETA: 1:18:08 -
92/92 [==============================] - 158s 2s/step - loss: 1.8062 - rpn_out_class_act_qdq_loss: 0.2253 - rpn_out_regress_qdq_loss: 0.0946 - dense_class_td_loss: 0.3965 - dense_regress_td_loss: 0.5002
92/92 [==============================] - 156s 2s/step - loss: 1.8452 - rpn_out_class_act_qdq_loss: 0.2350 - rpn_out_regress_qdq_loss: 0.1039 - dense_class_td_loss: 0.4030 - dense_regress_td_loss: 0.5136
92/92 [==============================] - 133s 1s/step - loss: 1.8390 - rpn_out_class_act_qdq_loss: 0.2325 - rpn_out_regress_qdq_loss: 0.1048 - dense_class_td_loss: 0.4037 - dense_regress_td_loss: 0.5082
Doing validation at epoch 1(1-based index)...
Doing validation at epoch 1(1-based index)...
Doing validation at epoch 1(1-based index)...
78%|███████▊ | 1141/1472 [01:44<00:29, 11.18it/s]Doing validation at epoch 1(1-based index)...
100%|██████████| 1472/1472 [02:11<00:00, 11.20it/s]==========================================================================================
Class AP precision recall RPN_recall
------------------------------------------------------------------------------------------
100%|█████████▉| 1467/1472 [02:11<00:00, 11.86it/s]person 0.0284 0.0594 0.2321 0.3938
------------------------------------------------------------------------------------------
mAP@0.5 = 0.0284
Validation done!
Epoch 2/15
100%|██████████| 1472/1472 [02:12<00:00, 11.12it/s]==========================================================================================
Class AP precision recall RPN_recall
------------------------------------------------------------------------------------------
100%|██████████| 1472/1472 [02:12<00:00, 11.08it/s]==========================================================================================
Class AP precision recall RPN_recall
------------------------------------------------------------------------------------------
person 0.0288 0.0595 0.2327 0.3952
------------------------------------------------------------------------------------------
mAP@0.5 = 0.0288
15%|█▍ | 217/1472 [00:25<01:51, 11.23it/s]Validation done!
Epoch 2/15
15%|█▌ | 221/1472 [00:25<01:53, 10.98it/s]person 0.0285 0.0591 0.2311 0.3949
------------------------------------------------------------------------------------------
mAP@0.5 = 0.0285
Validation done!
Epoch 2/15
100%|██████████| 1472/1472 [02:09<00:00, 11.38it/s]==========================================================================================
Class AP precision recall RPN_recall
------------------------------------------------------------------------------------------
person 0.0282 0.0592 0.2314 0.3937
------------------------------------------------------------------------------------------
mAP@0.5 = 0.0282
Validation done!
INFO: Training loop in progress
Epoch 2/15
92/92 [==============================] - 156s 2s/step - loss: 10.0243 - rpn_out_class_act_qdq_loss: 0.2518 - rpn_out_regress_qdq_loss: 0.1060 - dense_class_td_loss: 0.7024 - dense_regress_td_loss: 6.8182[============================>.] - ETA: 1s - loss: 9.8168 - rpn_out_class_act_qdq_loss: 0.2518 - rpn_out_regress_qdq_loss: 0.1067 - dense_class_td_loss: 0.7039 - dense_regress_td_loss: 6.704890/92 [============================>.] - ETA: 3s - loss: 9.1867 - rpn_out_class_act_qdq_loss: 0.2577 - rpn_out_regress_qdq_loss: 0.1056 - dense_class_td_loss: 0.6917 - dense_regress_td_loss: 6.1
92/92 [==============================] - 158s 2s/step - loss: 9.8692 - rpn_out_class_act_qdq_loss: 0.2624 - rpn_out_regress_qdq_loss: 0.1064 - dense_class_td_loss: 0.6943 - dense_regress_td_loss: 6.6601
92/92 [==============================] - 157s 2s/step - loss: 9.7071 - rpn_out_class_act_qdq_loss: 0.2582 - rpn_out_regress_qdq_loss: 0.1046 - dense_class_td_loss: 0.6912 - dense_regress_td_loss: 6.5071
92/92 [==============================] - 52s 567ms/step - loss: 8.9768 - rpn_out_class_act_qdq_loss: 0.2496 - rpn_out_regress_qdq_loss: 0.1057 - dense_class_td_loss: 0.6595 - dense_regress_td_loss: 5.8160
Doing validation at epoch 2(1-based index)...
Doing validation at epoch 2(1-based index)...
Doing validation at epoch 2(1-based index)...
0%| | 0/1472 [00:00<?, ?it/s]
0%| | 0/1472 [00:00<?, ?it/s]
1%| | 18/1472 [00:01<01:58, 12.27it/s]Doing validation at epoch 2(1-based index)...
100%|██████████| 1472/1472 [01:57<00:00, 12.45it/s]==========================================================================================
Class AP precision recall RPN_recall
------------------------------------------------------------------------------------------
99%|█████████▉| 1458/1472 [01:58<00:01, 12.52it/s]person 0.0000 0.0000 0.0000 0.0614
------------------------------------------------------------------------------------------
mAP@0.5 = 0.0000
Validation done!
Epoch 3/15
100%|██████████| 1472/1472 [01:59<00:00, 12.36it/s]==========================================================================================
Class AP precision recall RPN_recall
------------------------------------------------------------------------------------------
100%|██████████| 1472/1472 [01:57<00:00, 12.50it/s]==========================================================================================
Class AP precision recall RPN_recall
------------------------------------------------------------------------------------------
100%|█████████▉| 1470/1472 [01:59<00:00, 12.28it/s]person 0.0000 0.0000 0.0000 0.0582
------------------------------------------------------------------------------------------
mAP@0.5 = 0.0000
Validation done!
Epoch 3/15
100%|██████████| 1472/1472 [01:59<00:00, 12.31it/s]==========================================================================================
Class AP precision recall RPN_recall
------------------------------------------------------------------------------------------
person 0.0000 0.0000 0.0000 0.0611
------------------------------------------------------------------------------------------
mAP@0.5 = 0.0000
Validation done!
INFO: Training loop in progress
Epoch 3/15
person 0.0000 0.0000 0.0000 0.0614
------------------------------------------------------------------------------------------
mAP@0.5 = 0.0000
Validation done!
Epoch 3/15
92/92 [==============================] - 52s 566ms/step - loss: 29.4644 - rpn_out_class_act_qdq_loss: 0.3227 - rpn_out_regress_qdq_loss: 0.0680 - dense_class_td_loss: 0.6850 - dense_regress_td_loss: 15.6211============================>.] - ETA: 0s - loss: 29.6206 - rpn_out_class_act_qdq_loss: 0.3233 - rpn_out_regress_qdq_loss: 0.0681 - dense_class_td_loss: 0.6888 - dense_regress_td_loss: 15.773491/92 [============================>.] - ETA: 0s - loss: 30.6456 - rpn_out_class_act_qdq_loss: 0.3300 - rpn_out_regress_qdq_loss: 0.0742 - dense_class_td_loss: 0.7208 - dense_regress_td_loss: 16.75
92/92 [==============================] - 54s 584ms/step - loss: 31.6361 - rpn_out_class_act_qdq_loss: 0.3398 - rpn_out_regress_qdq_loss: 0.0809 - dense_class_td_loss: 0.7658 - dense_regress_td_loss: 17.6821
92/92 [==============================] - 53s 573ms/step - loss: 30.0760 - rpn_out_class_act_qdq_loss: 0.3273 - rpn_out_regress_qdq_loss: 0.0725 - dense_class_td_loss: 0.7164 - dense_regress_td_loss: 16.1922
92/92 [==============================] - 52s 569ms/step - loss: 30.5264 - rpn_out_class_act_qdq_loss: 0.3298 - rpn_out_regress_qdq_loss: 0.0741 - dense_class_td_loss: 0.7244 - dense_regress_td_loss: 16.6306
Doing validation at epoch 3(1-based index)...
Doing validation at epoch 3(1-based index)...
Doing validation at epoch 3(1-based index)...
0%| | 0/1472 [00:00<?, ?it/s]
0%| | 0/1472 [00:00<?, ?it/s]
1%| | 18/1472 [00:01<02:01, 11.98it/s]Doing validation at epoch 3(1-based index)...
100%|██████████| 1472/1472 [01:59<00:00, 12.31it/s]==========================================================================================
Class AP precision recall RPN_recall
------------------------------------------------------------------------------------------
100%|██████████| 1472/1472 [02:00<00:00, 12.26it/s]==========================================================================================
Class AP precision recall RPN_recall
------------------------------------------------------------------------------------------
99%|█████████▉| 1464/1472 [02:00<00:00, 12.46it/s]person 0.0000 0.0000 0.0000 0.0576
------------------------------------------------------------------------------------------
mAP@0.5 = 0.0000
Validation done!
Epoch 4/15
100%|█████████▉| 1470/1472 [01:58<00:00, 12.20it/s]person 0.0000 0.0000 0.0000 0.0575
------------------------------------------------------------------------------------------
mAP@0.5 = 0.0000
100%|█████████▉| 1470/1472 [02:00<00:00, 12.11it/s]Validation done!
Epoch 4/15
100%|██████████| 1472/1472 [01:59<00:00, 12.37it/s]==========================================================================================
Class AP precision recall RPN_recall
------------------------------------------------------------------------------------------
100%|██████████| 1472/1472 [02:00<00:00, 12.18it/s]==========================================================================================
Class AP precision recall RPN_recall
------------------------------------------------------------------------------------------
person 0.0000 0.0000 0.0000 0.0571
------------------------------------------------------------------------------------------
mAP@0.5 = 0.0000
Validation done!
INFO: Training loop in progress
Epoch 4/15
........
2 posts - 2 participants