Transfer learning is the process of transferring learned features from one application to another. It is a commonly used training technique where you take a model trained on one task and re-train it for a different task. Train Adapt Optimize (TAO) Toolkit is a simple and easy-to-use Python-based AI toolkit for taking purpose-built AI models and customizing them with users' own data.
import os
import glob
import subprocess
import getpass
import uuid
import json
# Kubernetes namespace where the TAO Toolkit API service is deployed
namespace = 'default'
# SKIP this step IF you have already installed the TAO-Client wheel.
! pip3 install nvidia-tao-client
# View the version of the TAO-Client
! tao-client --version
# Create a new UUID
# All artifacts created by this notebook live under this user id on the shared volume.
user_id = str(uuid.uuid4())
print(user_id)
# Below selection of IP and port is assuming one runs the remote client [notebook] from a node within the cluster
api_service_ip_addr = subprocess.getoutput(f'kubectl get service tao-toolkit-api-service -n {namespace} -o jsonpath="{{.spec.clusterIP}}"')
api_service_port = subprocess.getoutput(f'kubectl get service tao-toolkit-api-service -n {namespace} -o jsonpath="{{.spec.ports[0].port}}"')
print(api_service_ip_addr, api_service_port)
# BASE_URL is the environment variable the tao-client CLI uses to reach the API server
%env BASE_URL=http://{api_service_ip_addr}:{api_service_port}/api/v1/user/{user_id}
! echo $BASE_URL
# Get PVC ID
# The backing volume name is needed to build the NFS export path for the mount below.
pvc_id = subprocess.getoutput(f'kubectl get pvc tao-toolkit-api-pvc -n {namespace} -o jsonpath="{{.spec.volumeName}}"')
print(pvc_id)
# Get NFS server info from the provisioner's helm values
# (plain string: the command has no placeholders, so the f-prefix was unnecessary)
provisioner = json.loads(subprocess.getoutput('helm get values nfs-subdir-external-provisioner -o json'))
nfs_server = provisioner['nfs']['server']
nfs_path = provisioner['nfs']['path']
print(nfs_server, nfs_path)
user = getpass.getuser()
home = os.path.expanduser('~')
! echo "Password for {user}"
# Prompt for the local sudo password (used below for apt-get and mount)
password = getpass.getpass()
# Mount shared volume
! mkdir -p ~/shared
command = "apt-get -y install nfs-common >> /dev/null"
# sudo -S reads the password from stdin; -k discards any cached credentials first
! echo {password} | sudo -S -k {command}
# NFS export path follows the provisioner's <namespace>-<pvc-name>-<volume-name> naming scheme
command = f"mount -t nfs {nfs_server}:{nfs_path}/{namespace}-tao-toolkit-api-pvc-{pvc_id} ~/shared"
! echo {password} | sudo -S -k {command} && echo DONE
We will be using the KITTI object detection dataset for this example. To find more details, please visit http://www.cvlibs.net/datasets/kitti/eval_object.php?obj_benchmark=2d. The images and the training labels can be downloaded from the URLs set in the next cell.
# FIXME: Update URLs as per emails (note that direct URLs are embedded/encoded within the links)
%env IMAGES_URL=https://s3.eu-central-1.amazonaws.com/avg-kitti/data_object_image_2.zip
%env LABELS_URL=https://s3.eu-central-1.amazonaws.com/avg-kitti/data_object_label_2.zip
! wget -O images.zip $IMAGES_URL
! wget -O labels.zip $LABELS_URL
! unzip -u images.zip
! unzip -u labels.zip
# Carve a held-out split out of training/: files matching 000*.{png,txt} move to evaluating/
! mkdir -p evaluating/image_2 && mv training/image_2/000*.png evaluating/image_2/
! mkdir -p evaluating/label_2 && mv training/label_2/000*.txt evaluating/label_2/
# A small unlabeled sample set for inference, taken from testing/
! mkdir -p test_samples/image_2 && mv testing/image_2/00000*.png test_samples/image_2/
# Register a new KITTI-format training dataset; the CLI prints the new dataset id
train_dataset_id = subprocess.getoutput("tao-client detectnet-v2 dataset-create --format kitti")
print(train_dataset_id)
# Upload images/labels into the dataset's folder on the shared volume
! rsync -ah --info=progress2 training/image_2/ ~/shared/users/{user_id}/datasets/{train_dataset_id}/images
! rsync -ah --info=progress2 training/label_2/ ~/shared/users/{user_id}/datasets/{train_dataset_id}/labels
! echo DONE
# Register a new KITTI-format evaluation dataset and upload the held-out split
eval_dataset_id = subprocess.getoutput("tao-client detectnet-v2 dataset-create --format kitti")
print(eval_dataset_id)
! rsync -ah --info=progress2 evaluating/image_2/ ~/shared/users/{user_id}/datasets/{eval_dataset_id}/images
! rsync -ah --info=progress2 evaluating/label_2/ ~/shared/users/{user_id}/datasets/{eval_dataset_id}/labels
! echo DONE
# Register a raw-format dataset (images only, no labels) for inference samples
infer_dataset_id = subprocess.getoutput("tao-client detectnet-v2 dataset-create --format raw")
print(infer_dataset_id)
! rsync -ah --info=progress2 test_samples/image_2/ ~/shared/users/{user_id}/datasets/{infer_dataset_id}/images
! echo DONE
# List every dataset registered under this user by reading each metadata.json
pattern = os.path.join(home, 'shared', 'users', user_id, 'datasets', '*', 'metadata.json')

def _read_json(path):
    # Small helper: load one JSON file and return its contents.
    with open(path) as fh:
        return json.load(fh)

datasets = [_read_json(p) for p in glob.glob(pattern)]
print(json.dumps(datasets, indent=2))
# Default train dataset specs
# `tee` both prints the server defaults and saves them to the dataset's specs/convert.json
! tao-client detectnet-v2 dataset-convert-defaults --id {train_dataset_id} | tee ~/shared/users/{user_id}/datasets/{train_dataset_id}/specs/convert.json
# Customize train dataset specs: the KITTI download ships PNG images
specs_path = os.path.join(home, 'shared', 'users', user_id, 'datasets', train_dataset_id, 'specs', 'convert.json')
with open(specs_path) as specs_file:
    specs = json.load(specs_file)
specs["kitti_config"]["image_extension"] = ".png"
with open(specs_path, "w") as specs_file:
    specs_file.write(json.dumps(specs, indent=2))
print(json.dumps(specs, indent=2))
# Default eval dataset specs
# `tee` both prints the server defaults and saves them to the dataset's specs/convert.json
! tao-client detectnet-v2 dataset-convert-defaults --id {eval_dataset_id} | tee ~/shared/users/{user_id}/datasets/{eval_dataset_id}/specs/convert.json
# Customize eval dataset specs: the KITTI download ships PNG images
specs_path = os.path.join(home, 'shared', 'users', user_id, 'datasets', eval_dataset_id, 'specs', 'convert.json')
with open(specs_path) as specs_file:
    specs = json.load(specs_file)
specs["kitti_config"]["image_extension"] = ".png"
with open(specs_path, "w") as specs_file:
    specs_file.write(json.dumps(specs, indent=2))
print(json.dumps(specs, indent=2))
# Kick off KITTI -> TFRecords conversion for the training dataset.
# f-string used for consistency with every other job launch in this notebook.
train_convert_job_id = subprocess.getoutput(f"tao-client detectnet-v2 dataset-convert --id {train_dataset_id}")
print(train_convert_job_id)
# Follow a job log on the shared volume until the backend writes the "EOF" sentinel line.
# logs_dir: directory holding job logs; log_file: "<job-id>.txt" file name.
def my_tail(logs_dir, log_file):
%env LOG_FILE={logs_dir}/{log_file}
! mkdir -p {logs_dir}
# Pre-create the log file (world-writable) so tail can start before the backend does
! [ ! -f "$LOG_FILE" ] && touch $LOG_FILE && chmod 666 $LOG_FILE
# Stream from the first line; pkill terminates the tail child once "EOF" is seen
! tail -f -n +1 $LOG_FILE | while read LINE; do echo "$LINE"; [[ "$LINE" == "EOF" ]] && pkill -P $$ tail; done
# Check status (the file won't exist until the backend Toolkit container is running -- can take several minutes)
logs_dir = os.path.join(home, 'shared', 'users', user_id, 'datasets', train_dataset_id, 'logs')
my_tail(logs_dir, f"{train_convert_job_id}.txt")
# Kick off KITTI -> TFRecords conversion for the evaluation dataset.
# f-string used for consistency with every other job launch in this notebook.
eval_convert_job_id = subprocess.getoutput(f"tao-client detectnet-v2 dataset-convert --id {eval_dataset_id}")
print(eval_convert_job_id)
# Check status (the file won't exist until the backend Toolkit container is running -- can take several minutes)
logs_dir = os.path.join(home, 'shared', 'users', user_id, 'datasets', eval_dataset_id, 'logs')
log_file = f"{eval_convert_job_id}.txt"
my_tail(logs_dir, log_file)
# Create a new DetectNet-v2 model entry; the CLI prints the new model id
model_id = subprocess.getoutput("tao-client detectnet-v2 model-create")
print(model_id)
# Locate the pretrained DetectNet-v2 ResNet-18 model among all users' models
pattern = os.path.join(home, 'shared', 'users', '*', 'models', '*', 'metadata.json')
ptm_id = None
for metadata_path in glob.glob(pattern):
    with open(metadata_path) as fh:
        meta = json.load(fh)
    # ngc_path may be absent/None; the `or ""` keeps the membership test safe
    if "detectnet_v2:resnet18" in (meta.get("ngc_path") or ""):
        ptm_id = meta["id"]
        break
print(ptm_id)
# Attach the datasets, pretrained model and encryption key to the new model's metadata
metadata_path = os.path.join(home, 'shared', 'users', user_id, 'models', model_id, 'metadata.json')
with open(metadata_path) as metadata_file:
    metadata = json.load(metadata_file)
metadata.update({
    "encryption_key": "tlt_encode",
    "train_datasets": [train_dataset_id],
    "eval_dataset": eval_dataset_id,
    "inference_dataset": infer_dataset_id,
    "ptm": ptm_id,
})
with open(metadata_path, "w") as metadata_file:
    metadata_file.write(json.dumps(metadata, indent=2))
print(json.dumps(metadata, indent=2))
# Default train model specs
# `tee` both prints the server defaults and saves them to the model's specs/train.json
! tao-client detectnet-v2 model-train-defaults --id {model_id} | tee ~/shared/users/{user_id}/models/{model_id}/specs/train.json
# Customize train model specs: short run, ResNet-18 backbone, PNG inputs
specs_path = os.path.join(home, 'shared', 'users', user_id, 'models', model_id, 'specs', 'train.json')
with open(specs_path) as specs_file:
    specs = json.load(specs_file)
overrides = {
    ("training_config", "num_epochs"): 10,
    ("model_config", "num_layers"): 18,
    ("dataset_config", "image_extension"): "png",
}
for (section, key), value in overrides.items():
    specs[section][key] = value
with open(specs_path, "w") as specs_file:
    specs_file.write(json.dumps(specs, indent=2))
print(json.dumps(specs, indent=2))
# Launch training for the model.
# f-string used for consistency with every other job launch in this notebook.
train_job_id = subprocess.getoutput(f"tao-client detectnet-v2 model-train --id {model_id}")
print(train_job_id)
# Check status (the file won't exist until the backend Toolkit container is running -- can take several minutes)
logs_dir = os.path.join(home, 'shared', 'users', user_id, 'models', model_id, 'logs')
log_file = f"{train_job_id}.txt"
my_tail(logs_dir, log_file)
# Default evaluate model specs
# `tee` both prints the server defaults and saves them to the model's specs/evaluate.json
! tao-client detectnet-v2 model-evaluate-defaults --id {model_id} | tee ~/shared/users/{user_id}/models/{model_id}/specs/evaluate.json
# Customize evaluate model specs to mirror the training configuration
specs_path = os.path.join(home, 'shared', 'users', user_id, 'models', model_id, 'specs', 'evaluate.json')
with open(specs_path) as specs_file:
    specs = json.load(specs_file)
overrides = {
    ("training_config", "num_epochs"): 10,
    ("model_config", "num_layers"): 18,
    ("dataset_config", "image_extension"): "png",
}
for (section, key), value in overrides.items():
    specs[section][key] = value
with open(specs_path, "w") as specs_file:
    specs_file.write(json.dumps(specs, indent=2))
print(json.dumps(specs, indent=2))
# Evaluate the model produced by the training job
eval_job_id = subprocess.getoutput(
    f"tao-client detectnet-v2 model-evaluate --id {model_id} --job {train_job_id}"
)
print(eval_job_id)
# Check status (the file won't exist until the backend Toolkit container is running -- can take several minutes)
my_tail(logs_dir, f"{eval_job_id}.txt")
# Default prune model specs
# `tee` both prints the server defaults and saves them to the model's specs/prune.json
! tao-client detectnet-v2 model-prune-defaults --id {model_id} | tee ~/shared/users/{user_id}/models/{model_id}/specs/prune.json
# Prune the trained model to shrink it before retraining
prune_job_id = subprocess.getoutput(
    f"tao-client detectnet-v2 model-prune --id {model_id} --job {train_job_id}"
)
print(prune_job_id)
# Check status (the file won't exist until the backend Toolkit container is running -- can take several minutes)
my_tail(logs_dir, f"{prune_job_id}.txt")
# Default retrain model specs
# `tee` both prints the server defaults and saves them to the model's specs/retrain.json
! tao-client detectnet-v2 model-retrain-defaults --id {model_id} | tee ~/shared/users/{user_id}/models/{model_id}/specs/retrain.json
# Customize retrain model specs to mirror the original training configuration
specs_path = os.path.join(home, 'shared', 'users', user_id, 'models', model_id, 'specs', 'retrain.json')
with open(specs_path) as specs_file:
    specs = json.load(specs_file)
overrides = {
    ("training_config", "num_epochs"): 10,
    ("model_config", "num_layers"): 18,
    ("dataset_config", "image_extension"): "png",
}
for (section, key), value in overrides.items():
    specs[section][key] = value
with open(specs_path, "w") as specs_file:
    specs_file.write(json.dumps(specs, indent=2))
print(json.dumps(specs, indent=2))
# Retrain from the pruned model to recover accuracy
retrain_job_id = subprocess.getoutput(
    f"tao-client detectnet-v2 model-retrain --id {model_id} --job {prune_job_id}"
)
print(retrain_job_id)
# Check status (the file won't exist until the backend Toolkit container is running -- can take several minutes)
my_tail(logs_dir, f"{retrain_job_id}.txt")
# Re-evaluate after the prune+retrain cycle
eval2_job_id = subprocess.getoutput(
    f"tao-client detectnet-v2 model-evaluate --id {model_id} --job {retrain_job_id}"
)
print(eval2_job_id)
# Check status (the file won't exist until the backend Toolkit container is running -- can take several minutes)
my_tail(logs_dir, f"{eval2_job_id}.txt")
# Default export model specs
# `tee` both prints the server defaults and saves them to the model's specs/export.json
! tao-client detectnet-v2 model-export-defaults --id {model_id} | tee ~/shared/users/{user_id}/models/{model_id}/specs/export.json
# Customize export model specs for FP32 (no INT8 calibration required)
specs_path = os.path.join(home, 'shared', 'users', user_id, 'models', model_id, 'specs', 'export.json')
with open(specs_path, "r") as specs_file:
    specs = json.load(specs_file)
specs["data_type"] = "fp32"
# Calibration-only keys are meaningless for FP32. pop() tolerates their absence,
# unlike `del`, which would raise KeyError if the server defaults ever omit them.
specs.pop("batch_size", None)
specs.pop("batches", None)
with open(specs_path, "w") as specs_file:
    json.dump(specs, specs_file, indent=2)
print(json.dumps(specs, indent=2))
# Export the trained model in FP32
fp32_export_job_id = subprocess.getoutput(
    f"tao-client detectnet-v2 model-export --id {model_id} --job {train_job_id}"
)
print(fp32_export_job_id)
# Check status (the file won't exist until the backend Toolkit container is running -- can take several minutes)
my_tail(logs_dir, f"{fp32_export_job_id}.txt")
# Default export model specs
# Re-fetch defaults; this overwrites specs/export.json before the INT8 customization
! tao-client detectnet-v2 model-export-defaults --id {model_id} | tee ~/shared/users/{user_id}/models/{model_id}/specs/export.json
# Customize export model specs for INT8 (calibration batch settings required)
specs_path = os.path.join(home, 'shared', 'users', user_id, 'models', model_id, 'specs', 'export.json')
with open(specs_path) as specs_file:
    specs = json.load(specs_file)
specs.update({
    "data_type": "int8",
    "batches": 10,
    "batch_size": 4,
    "max_batch_size": 4,
})
with open(specs_path, "w") as specs_file:
    specs_file.write(json.dumps(specs, indent=2))
print(json.dumps(specs, indent=2))
# Export the trained model in INT8
int8_export_job_id = subprocess.getoutput(
    f"tao-client detectnet-v2 model-export --id {model_id} --job {train_job_id}"
)
print(int8_export_job_id)
# Check status (the file won't exist until the backend Toolkit container is running -- can take several minutes)
my_tail(logs_dir, f"{int8_export_job_id}.txt")
# Default convert model specs
# `tee` both prints the server defaults and saves them to the model's specs/convert.json
! tao-client detectnet-v2 model-convert-defaults --id {model_id} | tee ~/shared/users/{user_id}/models/{model_id}/specs/convert.json
# Customize convert model specs (single-letter keys mirror the tao-converter CLI flags)
specs_path = os.path.join(home, 'shared', 'users', user_id, 'models', model_id, 'specs', 'convert.json')
with open(specs_path) as specs_file:
    specs = json.load(specs_file)
specs.update({
    "t": "int8",                                   # target precision
    "b": 10,                                       # calibration batches
    "m": 64,                                       # max batch size
    "d": "3,384,1248",                             # input dims (C,H,W)
    "o": "output_cov/Sigmoid,output_bbox/BiasAdd", # output tensor names
})
with open(specs_path, "w") as specs_file:
    specs_file.write(json.dumps(specs, indent=2))
print(json.dumps(specs, indent=2))
# Convert the INT8 export into a TensorRT engine
convert_job_id = subprocess.getoutput(
    f"tao-client detectnet-v2 model-convert --id {model_id} --job {int8_export_job_id}"
)
print(convert_job_id)
# Check status (the file won't exist until the backend Toolkit container is running -- can take several minutes)
my_tail(logs_dir, f"{convert_job_id}.txt")
# Default inference model specs
# `tee` both prints the server defaults and saves them to the model's specs/inference.json
! tao-client detectnet-v2 model-inference-defaults --id {model_id} | tee ~/shared/users/{user_id}/models/{model_id}/specs/inference.json
# Customize TAO inference specs (KITTI image resolution 1248x384)
specs_path = os.path.join(home, 'shared', 'users', user_id, 'models', model_id, 'specs', 'inference.json')
with open(specs_path) as specs_file:
    specs = json.load(specs_file)
specs["inferencer_config"].update({
    "image_height": 384,
    "image_width": 1248,
    "batch_size": 4,
})
with open(specs_path, "w") as specs_file:
    specs_file.write(json.dumps(specs, indent=2))
print(json.dumps(specs, indent=2))
# Run inference with the (unexported) TAO model from the training job
tlt_inference_job_id = subprocess.getoutput(
    f"tao-client detectnet-v2 model-inference --id {model_id} --job {train_job_id}"
)
print(tlt_inference_job_id)
# Check status (the file won't exist until the backend Toolkit container is running -- can take several minutes)
my_tail(logs_dir, f"{tlt_inference_job_id}.txt")
from IPython.display import Image
import glob
# Display one annotated detection result from the inference job.
# sorted() makes the choice deterministic -- glob's order is filesystem-dependent.
job_dir = f"{home}/shared/users/{user_id}/models/{model_id}/{tlt_inference_job_id}"
annotated = sorted(glob.glob(f"{job_dir}/images_annotated/*.png"))
sample_image = annotated[6]  # assumes at least 7 annotated images exist -- TODO confirm
Image(filename=sample_image)
# Default inference model specs
# Re-fetch defaults; this overwrites specs/inference.json before the TensorRT run
! tao-client detectnet-v2 model-inference-defaults --id {model_id} | tee ~/shared/users/{user_id}/models/{model_id}/specs/inference.json
# Customize inference specs for the TensorRT run (same KITTI resolution)
specs_path = os.path.join(home, 'shared', 'users', user_id, 'models', model_id, 'specs', 'inference.json')
with open(specs_path) as specs_file:
    specs = json.load(specs_file)
specs["inferencer_config"].update({
    "image_height": 384,
    "image_width": 1248,
    "batch_size": 4,
})
with open(specs_path, "w") as specs_file:
    specs_file.write(json.dumps(specs, indent=2))
print(json.dumps(specs, indent=2))
# Run inference with the exported INT8 model
trt_inference_job_id = subprocess.getoutput(
    f"tao-client detectnet-v2 model-inference --id {model_id} --job {int8_export_job_id}"
)
print(trt_inference_job_id)
# Check status (the file won't exist until the backend Toolkit container is running -- can take several minutes)
my_tail(logs_dir, f"{trt_inference_job_id}.txt")
! rm -rf ~/shared/users/{user_id}/models/{model_id}
! echo DONE
! rm -rf ~/shared/users/{user_id}/datasets/{train_dataset_id}
! rm -rf ~/shared/users/{user_id}/datasets/{eval_dataset_id}
! echo DONE
command = "umount ~/shared"
! echo {password} | sudo -S -k {command} && echo DONE
! pip3 uninstall -y nvidia-tao-client