Below is a video from a camera trap set in New Zealand for which we wish to detect the species of critter in the video.
Below is a video from a camera trap set in New Zealand for which we wish to detect the species of critter in the video
from IPython.display import Video

# Embed the raw camera-trap clip in the notebook output; as the last expression
# of the cell, the Video object is what gets displayed.
Video('vid0027.mp4', embed=True)
We load the required modules. This notebook requires tensorflow==2.2 and openCV_python==2.
import argparse import glob import os import statistics import sys import time import warnings import cv2 import json from model import efficientdet from utils import preprocess_image, postprocess_boxes import humanfriendly import numpy as np from tqdm import tqdm #from ct_utils import truncate_float #import visualization.visualization_utils as viz_utils
The following block contains the code used to load the model and generate the inference
# Silence PIL's "cannot read EXIF metainfo for the images" warnings.
warnings.filterwarnings('ignore', '(Possibly )?corrupt EXIF data', UserWarning)
# Silence "Metadata Warning, tag 256 had too many entries: 42, expected 1".
warnings.filterwarnings('ignore', 'Metadata warning', UserWarning)
# Silence the NumPy FutureWarnings emitted while importing TensorFlow; this
# filter must be installed before the import below.
warnings.filterwarnings('ignore', category=FutureWarning)

from tensorflow import keras
import tensorflow as tf

# Build a TF1-compat session configuration.
config = tf.compat.v1.ConfigProto()
config.gpu_options.allow_growth = True  # dynamically grow the memory used on the GPU
config.log_device_placement = True  # log which device each operation ran on
sess = tf.compat.v1.Session(config=config)

print('TensorFlow version:', tf.__version__)
# tf.test.is_gpu_available() is deprecated; TensorFlow's own deprecation notice
# recommends tf.config.list_physical_devices('GPU') instead.
print('Is GPU available? tf.config.list_physical_devices:',
      bool(tf.config.list_physical_devices('GPU')))
Device mapping: /job:localhost/replica:0/task:0/device:XLA_CPU:0 -> device: XLA_CPU device /job:localhost/replica:0/task:0/device:XLA_GPU:0 -> device: XLA_GPU device TensorFlow version: 2.2.0 WARNING:tensorflow:From <ipython-input-3-fd84f7977e8b>:21: is_gpu_available (from tensorflow.python.framework.test_util) is deprecated and will be removed in a future version. Instructions for updating: Use `tf.config.list_physical_devices('GPU')` instead. Is GPU available? tf.test.is_gpu_available: False
# BGR colours (OpenCV channel order) cycled through by category id when drawing.
_BOX_COLORS = (
    (0, 0, 255),
    (0, 255, 0),
    (255, 0, 0),
    (0, 255, 255),
    (255, 0, 255),
    (255, 255, 0),
)


def _truncate_float(value, precision=3):
    """Truncate ``value`` towards zero to ``precision`` decimal places.

    Local stand-in for ``ct_utils.truncate_float``: that import is commented
    out in this file, so calling the external helper raised ``NameError``.
    """
    factor = 10 ** precision
    return int(value * factor) / factor


def _draw_detections(frame, detections, label_map, confidence_threshold):
    """Draw a labelled rectangle on ``frame`` for each confident detection.

    Args:
        frame: image array as returned by ``cv2.VideoCapture.read``; it is
            modified in place.
        detections: list of dicts with ``category``, ``conf`` and ``bbox``
            (``[x, y, width, height]`` relative to the frame size).
        label_map: maps the ``category`` string to a display name; categories
            missing from the map are labelled with the raw category string.
        confidence_threshold: detections with ``conf`` below this are skipped.
    """
    frame_height, frame_width = frame.shape[:2]
    for detection in detections:
        if detection['conf'] < confidence_threshold:
            continue
        x, y, box_width, box_height = detection['bbox']
        left = int(round(x * frame_width))
        top = int(round(y * frame_height))
        right = int(round((x + box_width) * frame_width))
        bottom = int(round((y + box_height) * frame_height))

        category = detection['category']
        color = _BOX_COLORS[int(category) % len(_BOX_COLORS)]
        label = '{} {:.0%}'.format(label_map.get(category, category), detection['conf'])

        cv2.rectangle(frame, (left, top), (right, bottom), color, 2)
        # Keep the caption inside the frame when the box touches the top edge.
        cv2.putText(frame, label, (left, max(top - 5, 12)),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.6, color, 2)


class VideoPathUtils:
    """A collection of utility functions supporting this stand-alone script."""

    # Stick this into filenames before the extension for the rendered result
    DETECTION_FILENAME_INSERT = '_detections'

    # Lower-case extensions that are treated as video files
    video_extensions = ['.avi', '.mov']

    @staticmethod
    def is_video_file(s):
        """Return True if the extension of ``s`` (case-insensitive) is a known video extension."""
        ext = os.path.splitext(s)[1]
        return ext.lower() in VideoPathUtils.video_extensions

    @staticmethod
    def find_video_files(strings):
        """Return the entries of ``strings`` that look like video file names.

        The decision is based on the file extension only.
        """
        return [s for s in strings if VideoPathUtils.is_video_file(s)]

    @staticmethod
    def find_videos(dir_name, recursive=False):
        """Return the paths in ``dir_name`` that look like video files.

        Args:
            dir_name: directory to search.
            recursive: when True, also search all sub-directories.
        """
        if recursive:
            strings = glob.glob(os.path.join(dir_name, '**', '*.*'), recursive=True)
        else:
            strings = glob.glob(os.path.join(dir_name, '*.*'))
        return VideoPathUtils.find_video_files(strings)

    @staticmethod
    def unique_output_name(file_name, collision_counts):
        """Return a name based on ``file_name`` that has not been handed out before.

        The first request for a name returns it unchanged; later requests return
        ``'<n>_<file_name>'`` with the smallest unused counter ``n``.

        Args:
            file_name: the desired output file name.
            collision_counts: dict owned by the caller and updated in place; it
                records every name already handed out.
        """
        count = collision_counts.get(file_name)
        if count is None:
            collision_counts[file_name] = 0
            return file_name
        candidate = '{}_{}'.format(count, file_name)
        while candidate in collision_counts:
            count += 1
            candidate = '{}_{}'.format(count, file_name)
        collision_counts[file_name] = count + 1
        # Reserve the generated name too, so a later input cannot reuse it.
        collision_counts[candidate] = 0
        return candidate


class TFDetector:
    """
    A detector model loaded at the time of initialization. The model is built with
    the `efficientdet` factory imported by this file and weights are loaded from disk.
    The inference batch size is set to 1; code needs to be modified to support larger
    batch sizes, including resizing appropriately.
    """

    # Number of decimal places kept for confidence and bbox coordinates
    CONF_DIGITS = 3
    COORD_DIGITS = 4

    # Inference is run one frame at a time
    BATCH_SIZE = 1

    # An enumeration of failure reasons
    FAILURE_TF_INFER = 'Failure TF inference'
    FAILURE_IMAGE_OPEN = 'Failure image access'

    DEFAULT_RENDERING_CONFIDENCE_THRESHOLD = 0.4  # to render bounding boxes
    DEFAULT_OUTPUT_CONFIDENCE_THRESHOLD = 0.2  # to include in the output json file

    # NOTE(review): keys here start at '1', while detections use str(int(class_id))
    # straight from the model output; confirm the model's class ids are not
    # 0-based, otherwise labels are shifted by one.
    DEFAULT_DETECTOR_LABEL_MAP = {
        '1': 'rat',
        '2': 'possum',
        '3': 'stoat',
        '4': 'cat',
        '5': 'bird',
        '6': 'leaf'
    }

    # NOTE(review): the label map above has 6 entries and the model is built with
    # num_classes=6; this constant is not read anywhere in this file.
    NUM_DETECTOR_CATEGORIES = 5

    def __init__(self, model_path, phi):
        """Build the EfficientDet model for ``phi`` and load weights from ``model_path``.

        Args:
            model_path: path of the weights file passed to ``load_weights``.
            phi: index into the table of input sizes below (0-6); also passed
                to ``efficientdet``.

        Raises:
            IndexError: if ``phi`` is outside the table of input sizes.
        """
        # Only the second model returned by the factory is used for inference.
        _, detection_graph = efficientdet(phi,
                                          num_classes=6,
                                          num_anchors=9,
                                          weighted_bifpn=False,
                                          freeze_bn=False,
                                          detect_quadrangle=False
                                          )
        detection_graph.load_weights(model_path, by_name=True, skip_mismatch=True)
        self.model = detection_graph
        image_sizes = (512, 640, 768, 896, 1024, 1280, 1408)
        self.image_size = image_sizes[phi]

    @staticmethod
    def round_and_make_float(d, precision=4):
        """Convert ``d`` to a Python float truncated to ``precision`` decimal places."""
        return _truncate_float(float(d), precision=precision)

    @staticmethod
    def __convert_coords(np_array):
        """Convert a corner-format box to origin-plus-size format with Python floats.

        The output is ``[v0, v1, v2 - v0, v3 - v1]`` for an input ``[v0, v1, v2, v3]``.
        NOTE(review): this is correct for ``[x1, y1, x2, y2]`` input (giving
        ``[x1, y1, width, height]``); confirm that is the order the model emits.

        Args:
            np_array: the four predicted box coordinates.

        Returns:
            A list (not a numpy array) of four Python floats, truncated to
            ``COORD_DIGITS`` decimal places.
        """
        width_box = np_array[2] - np_array[0]
        height_box = np_array[3] - np_array[1]
        return [
            TFDetector.round_and_make_float(d, precision=TFDetector.COORD_DIGITS)
            for d in (np_array[0], np_array[1], width_box, height_box)
        ]

    def _generate_detections_one_image(self, image):
        """Preprocess ``image`` and run the model on it as a batch of one.

        Returns:
            The (boxes, scores, classes) outputs of ``self.model.predict``.
        """
        # NOTE(review): frames read with cv2 are BGR; confirm that is what
        # preprocess_image expects.
        # Only the first element returned by preprocess_image is the image.
        n_image = preprocess_image(image, self.image_size)
        n_image = tf.expand_dims(n_image[0], 0)
        box_tensor_out, score_tensor_out, class_tensor_out = self.model.predict(n_image)
        return box_tensor_out, score_tensor_out, class_tensor_out

    def generate_detections_one_image(self, image, image_id,
                                      detection_threshold=DEFAULT_OUTPUT_CONFIDENCE_THRESHOLD):
        """Apply the detector to one image.

        Args:
            image: the image to analyse (in this file, a frame read by
                ``cv2.VideoCapture``).
            image_id: identifier stored in the ``frame`` field of the result.
            detection_threshold: only detections with a confidence strictly
                above this value are included.

        Returns:
            A dict with the following fields:
            - frame (always present)
            - max_detection_conf
            - detections, a list of dicts containing `category`, `conf` and `bbox`
            - failure (only when inference raised; the other two are then absent)
        """
        result = {
            'frame': image_id
        }
        try:
            b_box, b_score, b_class = self._generate_detections_one_image(image)

            # our batch size is 1; need to loop the batch dim if supporting batch size > 1
            boxes, scores, classes = b_box[0], b_score[0], b_class[0]

            detections_cur_image = []  # stays empty when nothing is confident enough
            max_detection_conf = 0.0
            for b, s, c in zip(boxes, scores, classes):
                if not s > detection_threshold:
                    continue
                bbox2 = TFDetector.__convert_coords(b)
                # Scale model-input pixels to relative coordinates.
                # NOTE(review): the 16/9 factor hard-codes the frame aspect
                # ratio; confirm for videos that are not 16:9.
                bbox2[0] = bbox2[0] / self.image_size
                bbox2[1] = bbox2[1] / self.image_size * 16 / 9
                bbox2[2] = bbox2[2] / self.image_size
                bbox2[3] = bbox2[3] / self.image_size * 16 / 9
                detections_cur_image.append({
                    'category': str(int(c)),  # string, not int, for the class label
                    'conf': _truncate_float(float(s),  # cast to float for json serialization
                                            precision=TFDetector.CONF_DIGITS),
                    'bbox': bbox2
                })
                if s > max_detection_conf:
                    max_detection_conf = s

            result['max_detection_conf'] = _truncate_float(float(max_detection_conf),
                                                           precision=TFDetector.CONF_DIGITS)
            result['detections'] = detections_cur_image

        except Exception as e:
            # Best effort: report the failure for this frame and let the caller continue.
            result['failure'] = TFDetector.FAILURE_TF_INFER
            print('TFDetector: image {} failed during inference: {}'.format(image_id, str(e)))

        return result


#%% Main function

def load_and_run_detector(model_file, phi, video_file_names, output_dir,
                          render_confidence_threshold=TFDetector.DEFAULT_RENDERING_CONFIDENCE_THRESHOLD):
    """Run the detector on every frame of each video and save the results.

    For each input video, an annotated MJPG ``.avi`` and a ``.json`` file with
    the per-frame detections are written to ``output_dir``. A video that fails
    is reported on stdout and recorded with a ``failure`` entry in its JSON
    file; processing then continues with the next video.

    Args:
        model_file: path of the weights file for ``TFDetector``.
        phi: EfficientDet size index for ``TFDetector``.
        video_file_names: list of video paths to process.
        output_dir: directory receiving the output files.
        render_confidence_threshold: minimum confidence for a detection to be
            drawn on the annotated video.
    """
    if not video_file_names:
        print('Warning: no files available')
        return

    # load and run detector on target videos, and visualize the results
    start_time = time.time()
    gpus = tf.config.experimental.list_physical_devices('GPU')
    if gpus:
        try:
            # Currently, memory growth needs to be the same across GPUs
            for gpu in gpus:
                tf.config.experimental.set_memory_growth(gpu, True)
            logical_gpus = tf.config.experimental.list_logical_devices('GPU')
            print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
        except RuntimeError as e:
            # Memory growth must be set before GPUs have been initialized
            print(e)
    tf_detector = TFDetector(model_file, phi)
    elapsed = time.time() - start_time
    print('Loaded model in {}'.format(humanfriendly.format_timespan(elapsed)))

    # since we'll be writing a bunch of files to the same folder, rename
    # as necessary to avoid collisions
    output_file_names = {}

    for vid_file in tqdm(video_file_names):
        detection_results = []
        vid_images = None
        out = None
        json_full_path = None
        try:
            fn = os.path.basename(vid_file).lower()
            name, _ = os.path.splitext(fn)
            # save all as AVI
            fn = '{}{}{}'.format(name, VideoPathUtils.DETECTION_FILENAME_INSERT, '.avi')
            fn = VideoPathUtils.unique_output_name(fn, output_file_names)
            output_full_path = os.path.join(output_dir, fn)
            json_full_path = os.path.join(output_dir, os.path.splitext(fn)[0] + '.json')

            vid_images = cv2.VideoCapture(vid_file)
            if not vid_images.isOpened():
                # Fail loudly instead of silently writing an empty output video.
                raise IOError('could not open video file')
            frame_width = int(vid_images.get(cv2.CAP_PROP_FRAME_WIDTH))
            frame_height = int(vid_images.get(cv2.CAP_PROP_FRAME_HEIGHT))

            # The first JSON entry describes the video; one entry per frame follows.
            detection_results.append({
                'file': vid_file,
                'width': frame_width,
                'height': frame_height
            })

            # The output frame rate is fixed at 10 fps regardless of the source.
            out = cv2.VideoWriter(output_full_path,
                                  cv2.VideoWriter_fourcc('M', 'J', 'P', 'G'),
                                  10, (frame_width, frame_height))
            frame_no = 0  # frames are numbered from 1 in the output
            while vid_images.isOpened():
                frame_no += 1
                ret, image = vid_images.read()
                if not ret:
                    break

                result = tf_detector.generate_detections_one_image(image, frame_no)
                detection_results.append(result)

                # A frame whose inference failed has no 'detections' entry and
                # is written to the output video without annotations.
                _draw_detections(image, result.get('detections', []),
                                 TFDetector.DEFAULT_DETECTOR_LABEL_MAP,
                                 render_confidence_threshold)
                out.write(image)

        except Exception as e:
            print('Video {} cannot be loaded. Exception: {}'.format(vid_file, e))
            detection_results.append({
                'file': vid_file,
                'failure': TFDetector.FAILURE_IMAGE_OPEN
            })
        finally:
            # Always release the OpenCV handles, even when a frame failed.
            if out is not None:
                out.release()
            if vid_images is not None:
                vid_images.release()

        if json_full_path is not None:
            try:
                with open(json_full_path, 'w') as jsonoutfile:
                    json.dump(detection_results, jsonoutfile)
            except OSError as e:
                print('Could not write {}: {}'.format(json_full_path, e))
The following block runs the code. Append additional files to video_file_names to include them in the inference.
# Videos to process; add more paths to this list to include them in the run.
video_file_names = ['vid0027.AVI']

# Run the detector with the phi=3 weights, writing the outputs to the current
# directory and drawing only detections with confidence of at least 0.6.
load_and_run_detector(
    model_file='d3_updated.h5',
    phi=3,
    video_file_names=video_file_names,
    output_dir='./',
    render_confidence_threshold=0.6,
)
WARNING:tensorflow:Layer boxes is casting an input tensor from dtype float64 to the layer's dtype of float32, which is new behavior in TensorFlow 2. The layer has dtype float32 because it's dtype defaults to floatx. If you intended to run this layer in float32, you can safely ignore this warning. If in doubt, this warning is likely only an issue if you are porting a TensorFlow 1.X model to TensorFlow 2. To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor. 100%|██████████| 1/1 [00:00<00:00, 6.05it/s] Loaded model in 10.58 seconds
We then visualize the annotated video of the detection in the notebook
# Embed the annotated video in the notebook output.
# NOTE(review): load_and_run_detector in this notebook writes its result with an
# '.avi' extension; presumably this '.mp4' was converted separately - confirm.
Video('vid0027_detections.mp4', embed=True)