Please provide the following information when requesting support.
• Hardware (NVIDIA RTX 3080 Ti)
• Network Type (Deformable_detr)
• TLT Version (output of `tao info`:
Configuration of the TAO Toolkit Instance
task_group: ['model', 'dataset', 'deploy']
format_version: 3.0
toolkit_version: 5.5.0
published_date: 08/26/2024
)
• Training spec file (
# Deformable DETR experiment spec (the train.yaml passed to scripts/train.py).
# The nesting was lost when the file was pasted; it is restored here so that
# each key sits under the section it belongs to. The grouping under
# train / train.optim / dataset / dataset.augmentation / model follows the
# resolved configuration printed by `--cfg job` further down in this post.
train:
  num_gpus: 1
  num_nodes: 1
  validation_interval: 1
  optim:
    lr_backbone: 2e-5
    lr: 2e-4
    lr_steps: [10, 20, 30, 40]
    momentum: 0.9
  num_epochs: 1
  precision: fp32
dataset:
  # Each data source is one list item pairing an image directory with its
  # annotation JSON, so json_file must be indented under the same "-" entry.
  train_data_sources:
    - image_dir: /tao-pt/nvidia_tao_pytorch/cv/deformable_detr/train2017
      json_file: /tao-pt/nvidia_tao_pytorch/cv/deformable_detr/instances_train2017.json
  val_data_sources:
    - image_dir: /tao-pt/nvidia_tao_pytorch/cv/deformable_detr/val2017/
      json_file: /tao-pt/nvidia_tao_pytorch/cv/deformable_detr/instances_val2017.json
  num_classes: 91
  batch_size: 4
  workers: 8
  augmentation:
    fixed_padding: false
model:
  backbone: resnet_50
  train_backbone: true
  pretrained_backbone_path: /tao-pt/nvidia_tao_pytorch/cv/deformable_detr/resnet50_nvimagenetv2.pth.tar
  num_feature_levels: 2
  return_interm_indices: [1, 2]
  dec_layers: 6
  enc_layers: 6
  num_queries: 300
  with_box_refine: true
  dropout_ratio: 0.3
)
• How to reproduce the issue? (
deformable_detr# python scripts/train.py --config-path /tao-pt/nvidia_tao_pytorch/cv/deformable_detr/experiment_specs --config-name train.yaml --cfg job
/tao-pt/nvidia_tao_pytorch/cv/deformable_detr/scripts/train.py:142: UserWarning:
'train.yaml' is validated against ConfigStore schema with the same name.
This behavior is deprecated in Hydra 1.1 and will be removed in Hydra 1.2.
See https://hydra.cc/docs/next/upgrades/1.0_to_1.1/automatic_schema_matching for migration instructions.
main()
encryption_key: null
results_dir: /results
wandb:
enable: true
project: TAO Toolkit
entity: ''
tags:
- training
- tao-toolkit
reinit: false
sync_tensorboard: false
save_code: false
name: TAO Toolkit training experiment
model:
pretrained_backbone_path: /tao-pt/nvidia_tao_pytorch/cv/deformable_detr/resnet50_nvimagenetv2.pth.tar
backbone: resnet_50
num_queries: 300
num_feature_levels: 2
cls_loss_coef: 2.0
bbox_loss_coef: 5.0
giou_loss_coef: 2.0
with_box_refine: true
num_select: 300
return_interm_indices:
- 1
- 2
focal_alpha: 0.25
clip_max_norm: 0.1
nheads: 8
dropout_ratio: 0.3
hidden_dim: 256
enc_layers: 6
dec_layers: 6
dim_feedforward: 1024
dec_n_points: 4
enc_n_points: 4
aux_loss: true
dilation: false
train_backbone: true
loss_types:
- labels
- boxes
backbone_names:
- backbone.0
linear_proj_names:
- reference_points
- sampling_offsets
dataset:
train_sampler: default_sampler
train_data_sources:
- image_dir: /tao-pt/nvidia_tao_pytorch/cv/deformable_detr/train2017
  json_file: /tao-pt/nvidia_tao_pytorch/cv/deformable_detr/instances_train2017.json
val_data_sources:
- image_dir: /tao-pt/nvidia_tao_pytorch/cv/deformable_detr/val2017/
  json_file: /tao-pt/nvidia_tao_pytorch/cv/deformable_detr/instances_val2017.json
test_data_sources: null
infer_data_sources: null
batch_size: 4
workers: 8
pin_memory: true
dataset_type: serialized
num_classes: 91
eval_class_ids: null
augmentation:
scales:
- 480
- 512
- 544
- 576
- 608
- 640
- 672
- 704
- 736
- 768
- 800
input_mean:
- 0.485
- 0.456
- 0.406
input_std:
- 0.229
- 0.224
- 0.225
train_random_resize:
- 400
- 500
- 600
horizontal_flip_prob: 0.5
train_random_crop_min: 384
train_random_crop_max: 600
random_resize_max_size: 1333
test_random_resize: 800
fixed_padding: false
fixed_random_crop: null
train:
num_gpus: 1
gpu_ids:
- 0
num_nodes: 1
seed: 1234
cudnn:
benchmark: false
deterministic: true
num_epochs: 1
checkpoint_interval: 1
validation_interval: 1
resume_training_checkpoint_path: null
results_dir: null
freeze:
pretrained_model_path: null
clip_grad_norm: 0.1
is_dry_run: false
optim:
optimizer: AdamW
monitor_name: val_loss
lr: 0.0002
lr_backbone: 2.0e-05
lr_linear_proj_mult: 0.1
momentum: 0.9
weight_decay: 0.0001
lr_scheduler: MultiStep
lr_steps:
- 10
- 20
- 30
- 40
lr_step_size: 40
lr_decay: 0.1
precision: fp32
distributed_strategy: ddp
activation_checkpoint: true
verbose: false
evaluate:
num_gpus: 1
gpu_ids:
- 0
num_nodes: 1
checkpoint: ???
results_dir: null
input_width: null
input_height: null
trt_engine: null
conf_threshold: 0.0
inference:
num_gpus: 1
gpu_ids:
- 0
num_nodes: 1
checkpoint: ???
results_dir: null
trt_engine: null
color_map: null
conf_threshold: 0.5
is_internal: false
input_width: null
input_height: null
outline_width: 3
export:
results_dir: null
gpu_id: 0
checkpoint: ???
onnx_file: ???
on_cpu: false
input_channel: 3
input_width: 960
input_height: 544
opset_version: 17
batch_size: -1
verbose: false
gen_trt_engine:
results_dir: null
gpu_id: 0
onnx_file: ???
trt_engine: null
input_channel: 3
input_width: 960
input_height: 544
opset_version: 17
batch_size: -1
verbose: false
tensorrt:
data_type: FP32
workspace_size: 1024
min_batch_size: 4
opt_batch_size: 4
max_batch_size: 4
calibration:
cal_image_dir: ???
cal_cache_file: ???
cal_batch_size: 1
cal_batches: 1
)
1 post - 1 participant