Scaling up Benthic Primary Productivity Estimates in a Large Intertidal Estuary using Remote Sensing

Zhanchao Shao, Prof. Karin Bryan, Dr Moritz Lemann, Georgina Flowers and Prof. Conrad Pilditch have been working together to develop a new methodology using machine learning and remote sensing data to map accurate and detailed spatial distribution of gross primary productivity in Tauranga Harbour. More information is coming soon. The codes are imported from Google Colab. If anyone needs the original data, please kindly contact Zhanchao. Thanks to Sanne Vaassen for providing detailed instructions and comments to make the codes easier to understand. Please read the instructions before starting. Here is the link: <>

To provide maps of the productivity of seagrass and unvegetated flats (MPB) in Tauranga Harbour.
Zhanchao Shao, Karin R. Bryan, Moritz Lehmann, Georgina Flowers and Conrad Pildtich (
To Be Determined

Chapter 1: Using supervised classification with random forest to detect seagrass and unvegetated flats.

Step 1 #connect google colab to google drive.

from google.colab import drive drive.mount('/content/gdrive', force_remount=True) root_dir = '/content/gdrive/My Drive/'

Step 2 #Import all the libraries we need.

import xlrd import statsmodels.api as sm ## gdal from osgeo import gdal, gdal_array import numpy as np from numpy.linalg import inv from scipy.stats.distributions import chi2 from itertools import chain import matplotlib.pyplot as plt from osgeo import ogr ## AI from sklearn.linear_model import LinearRegression from sklearn.model_selection import train_test_split from sklearn.ensemble import RandomForestClassifier from sklearn.metrics import classification_report, confusion_matrix, cohen_kappa_score, accuracy_score, f1_score, precision_score, recall_score

Step 3 #Convert your manually-labeled shapefile data to raster

raster = gdal.Open('/content/gdrive/My Drive/Reflectance/Rrs20220224/Rrs20220224_cl.tif'##Please merge your individual band to one Gtiff if you are using Sentinel-2 data vector = ogr.Open('/content/gdrive/My Drive/Supervised_classification/classification_reproject_summer1.shp') lyr = vector.GetLayer() geot = raster.GetGeoTransform() proj = raster.GetProjectionRef() vec_raster = gdal.GetDriverByName('GTiff') chn_ras = vec_raster.Create('/content/gdrive/My Drive/Supervised_classification/supervised_classification_ras_Summer.tif',raster.RasterXSize,raster.RasterYSize,1,gdal.GDT_Byte) chn_ras.SetGeoTransform(geot) chn_ras.SetProjection(proj) gdal.RasterizeLayer(chn_ras, [1], lyr, options=['ATTRIBUTE=id']) # the field name should correspond to the classificaiton that you would like to use for supervised classification chn_ras.GetRasterBand(1).SetNoDataValue(0) chn_ras = None*## This process can also be done in ArcGIS or QGIS.* # import the raster data we have just converted from shapefile\ vector_raster = gdal.Open('/content/gdrive/My Drive/Supervised_classification/supervised_classification_ras_Summer.tif') vec = vector_raster.GetRasterBand(1).ReadAsArray() #Create a container to save the result width = raster.RasterYSize; lenth = raster.RasterXSize; number = raster.RasterCount container = np.ones((width, lenth, number),gdal_array.GDALTypeCodeToNumericTypeCode(raster.GetRasterBand(1).DataType)) for i in range(number): container[:,:,i] = raster.GetRasterBand(i+1).ReadAsArray().astype(np.float32) fit_X  = container[vec>0,:] fit_y  = vec[vec>0]

Step 4 #Split the data into training and testing

fit_X_train, fit_X_test, fit_y_train, fit_y_test = train_test_split(fit_X,fit_y,test_size = 0.7)

Step 5 #Validation (please use the fivefold cross-validation to obtain the optimal values for parameters of RF)

The optimal values: n_estimators = 280,min_samples_split = 4,min_samples_leaf = 4,max_depth = 90

Step 6 #Test the model

RandomForest = RandomForestClassifier(n_estimators=280,min_samples_split=4,min_samples_leaf=4,max_depth=90)` randomforest =,fit_y_train) y_predict = randomforest.predict(fit_X_test)


Here is the classification report: (Class 1, 2 and 3 are sparse seagrass, dense seagrass and unvegetated flats, respectively).

Step 7 #Apply the model

Final_container = (container.shape[0]*container.shape[1],container.shape[2]) Final_container_re = container[:,:,:4].reshape(Final_container) supervised_classification = randomforest.predict(Final_container_re) supervised_classification = supervised_classification.reshape(container[:,:,0].shape) # reshape the results` plt.imshow(supervised_classification) # show the results` plt.colorbar()


Step 8 #Save the model and results

import joblib joblib.dump(randomforest,"/content/gdrive/My Drive/Supervised_classification/model_summer.joblib") raster = gdal.Open('/content/gdrive/My Drive/Reflectance/Rrs20211231/Rrs20220224_cl.tif') ##This tif file is used for georeference` geot = raster.GetGeoTransform() proj = raster.GetProjectionRef() filename = '/content/gdrive/My Drive/Supervised_classification/sc20220221.tif' xsize = np.shape(supervised_classification)[1] ysize = np.shape(supervised_classification)[0] driver = gdal.GetDriverByName('GTiff') dataset = driver.Create(filename, xsize, ysize, bands = 1, eType = gdal.GDT_UInt16) dataset.GetRasterBand(1).WriteArray(supervised_classification) dataset.SetGeoTransform(geot) dataset.SetProjection(proj) dataset.FlushCache() dataset=None
Chapter 2: Using ANN, SVR and RFR to estimate seagrass percentage coverage

Section 1: ANN

Step 1: #Connect google colab to google drive

from google.colab import drive drive.mount('/content/gdrive', force_remount=True) root_dir = '/content/gdrive/My Drive/'


Step 2: #Import the libraries we need

from tensorflow.keras import models, layers, utils, backend as K import matplotlib.pyplot as plt import shap import numpy as np from sklearn.model_selection import train_test_split from keras import models from keras import layers import tensorflow as tf import cv2 import xlrd


Step 3: # Read and split the data.

table = xlrd.open_workbook('/content/gdrive/My Drive/seagrass_coverage/original_data.xlsx') sheet = table.sheets()[0] x1 = sheet.col_values(6,1) #Band Blue x2 = sheet.col_values(7,1) #Band Green x3 = sheet.col_values(8,1) #Band Red x4 = sheet.col_values(9,1) #Band NIR x5 = sheet.col_values(10,1) #NDWI x6 = sheet.col_values(11,1) #NDVI x = np.array([x1,x2,x3,x4,x5,x6]).T #Transpose the array y = np.array(sheet.col_values(5,1))/100 #seagrass percentage coverage x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2) #Split the data; 80% for training x_test, x_val, y_test, y_val = train_test_split(x_test,y_test,test_size = 0.5) #10% for validation and 10% for testing


Step 4: # Validation

from sklearn.metrics import mean_squared_error from sklearn.metrics import r2_score import math result = [] for m in range(2,40,2): tf.keras.backend.clear_session() model = models.Sequential() tf.random.set_seed(1) model.add(layers.Dense(m, activation = 'relu', input_shape = (x_train.shape[1], ))) ##Change different activation functions to find out the best one. model.add(layers.Dense(m, activation = 'relu')), model.add(layers.Dense(1,)) opt = tf.keras.optimizers.Adam(learning_rate=0.01) ##Change different Learning rate to find out the best one. model.compile(opt, loss = 'mse', metrics = ['mse']),y_train,epochs = 1000,batch_size = 300, verbose = 0) y_val_test = model.predict(x_val) r2 = r2_score(y_val,y_val_test) #model.summary() #The optimal values: #Number of hidden layers: 2 [Please check the methods in the paper] #Number of neurons : 10 #Activation function: ReLu #Learning rate: 0.01


Step 5: # Testing

tf.random.set_seed(1) model = models.Sequential() tf.keras.backend.clear_session() model.add(layers.Dense(10, activation = 'relu', input_shape = (x_train.shape[1], ))) model.add(layers.Dense(10,activation = 'relu')), model.add(layers.Dense(1,)) opt = tf.keras.optimizers.Adam(learning_rate=0.01) model.compile(opt, loss = 'mse', metrics = ['mse']) reuslt =,y_train,epochs = 1000,batch_size = 300, verbose = 0) y_test_predict = model.predict(x_test) plt.figure(figsize=[5,5]) plt.scatter(y_test,y_test_predict) xx = [0,1] yy = [0,1] plt.plot(xx,yy,'-') plt.xlim([0,1]) plt.ylim([0,1]) from sklearn.metrics import mean_squared_error from sklearn.metrics import r2_score import math print(math.sqrt(mean_squared_error(y_test_predict.flatten(),y_test.flatten()))) 0.11175243324504268 print(r2_score(y_test_predict,y_test)) 0.7106385173472587


Section 2: SVR

Step 1: Import the library we need

from sklearn.svm import SVR

## Because we still use the same training, validating and testing data as we used in ANN, we skip splitting the dataset.

Step 2: Validation

result = [] for i in range(1,20): for m in np.arange(0.1, 2, 0.1): regressor = SVR(kernel = 'rbf', C =i, gamma=m) ##By changing the type of kernel function, we can find out the optimal one.,y_train) y_val_predict = regressor.predict(x_val) print (i, "+", m) print(mean_squared_error(y_val.flatten(),y_val_predict.flatten())) print(r2_score(y_val,y_val_predict)) result.append([i,m,r2_score(y_val,y_val_predict)])

The optimal values:

Kerel function: RBF

Gamma: 0.1

C: 13

Step 3: Testing

regressor = SVR(kernel = 'rbf', C =13, gamma=0.1),y_train) y_predict = regressor.predict(x_test) print((math.sqrt(mean_squared_error(y_test.flatten(),y_predict.flatten())))) #0.1256412972939246 print(r2_score(y_test,y_predict)) #0.7231141583032319 plt.figure(figsize=[8,8]) plt.scatter(y_test,y_predict) plt.ylim([0,1]) plt.xlim([0,1])


Section 3: RFR

Step 1: Import the libraries we need

from sklearn.ensemble import RandomForestRegressor

from sklearn.datasets import make_regression

## Because we still use the same training, validating and testing data as we used in ANN, we skip splitting the dataset.

Step 2: Validation

Accuracy = [] for i in range(20,500,20):##find out the optimum tree numbers for supervised classification for j in range(2,10,2):##find out the optimum min_samples_split for k in range(1,5,1):##find out the optimum min_samples_leaf for l in range(10,120,10):##find out the optimum max depth RandomForest = RandomForestRegressor(n_estimators=i,min_samples_split=j,min_samples_leaf=k,max_depth=l) randomforest =,y_train) y_val_predict = randomforest.predict(x_val) AS = mean_squared_error(y_val_predict.flatten(),y_val.flatten()) r2 = r2_score(y_val,y_val_predict) result = [AS, r2, i, j, k, l] Accuracy.append(result) r = [] for i in range(len(Accuracy)): r.append(Accuracy[i][0]) print(np.nanmin(r)) idx = r.index(min(r)) print("The optimum estimator, min sample split, min sample leaf, max depth are",(Accuracy[idx][2]),(Accuracy[idx][3]),(Accuracy[idx][4]),(Accuracy[idx][5]),"respectively") print('R2 value is ',(Accuracy[idx][1]))


MSE is 0.022766173925363092

The optimum estimator, min sample split, min sample leaf, max depth are 20 6 4 110 respectively

R2 value is 0.6290120379529942

Step 3: Testing

y_predict_test_RFR = regr.predict(x_test)







0.1317344161085005 0.6956071641374675



Section 4: Apply the optimal regression model (ANN) to each seagrass pixel

seagrass_clip = cv2.imread('/content/gdrive/My Drive/Supervised_classification/export/sc20220224.tif_seagrass.tif',-1) ##We have removed other classes. You can easily use np.where to exclude the values we do not need. import gdal YG_dataset = gdal.Open(r'/content/gdrive/My Drive/Reflectance/Rrs20220221/Rrs20220224_cl.tif') #We can composite the four bands into one tif file in ArcGIS. You can easily find this tool (Composite bands) in the tool box #import the one remote sensing image, this image is used as a basis to set size, resolution, geosystem.... YG_width = YG_dataset.RasterXSize YG_height = YG_dataset.RasterYSizev2.imread('/content/gdrive/My Drive/Supervised_classification/export/sc20220224.tif_seagrass.tif',-1) YG_geotrans = YG_dataset.GetGeoTransform() YG_proj = YG_dataset.GetProjection() YG_data = YG_dataset.ReadAsArray(0, 0, YG_width, YG_height) Band2 = YG_data[0,0:YG_height,0:YG_width] #Read Band Blue (The order of each band corresponds to the ones that you set in the Composite Bands tool. Band2 = np.array(np.where(Band2<0,np.nan,Band2)) Band2 = Band2*seagrass_clip ##We only need Band3 = YG_data[1,0:YG_height,0:YG_width] Band3 = np.array(np.where(Band3<0,np.nan,Band3)) Band3 = Band3*seagrass_clip Band4 = YG_data[2,0:YG_height,0:YG_width] Band4 = np.array(np.where(Band4<0,np.nan,Band4)) Band4 = Band4*seagrass_clip Band8 = YG_data[3,0:YG_height,0:YG_width] Band8 = np.array(np.where(Band8<0,np.nan,Band8)) Band8 = Band8*seagrass_clip NDVI = (Band8-Band4)/(Band8+Band4) ## Calculate NDVI NDVI = np.where(NDVI==0,np.nan,NDVI) NDWI = (Band3-Band8)/(Band3+Band8) ##Calculate NDWI NDWI = np.where(NDWI==0,np.nan,NDWI)

Run model on each pixel

percent = [] for i in range(len(Band2)): result = [] percent.append(result) for j in range(len(Band2[i])): if np.isnan(Band2[i][j]) == True: Y = np.nan result.append(Y) else: X = np.array([[Band2[i,j],Band3[i,j],Band4[i,j],Band8[i,j],NDVI[i,j],NDWI[i,j]],[Band2[i,j],Band3[i,j],Band4[i,j],Band8[i,j],NDVI[i,j],NDWI[i,j]]]) Y = model.predict(X)[0][0] result.append(Y) print([i,j,Y])


Save the result as a tif file

raster = gdal.Open('/content/gdrive/My Drive/Reflectance/Rrs

geot = raster.GetGeoTransform()

proj = raster.GetProjectionRef()

filename = '/content/gdrive/My Drive/Supervised_classification/seagrass/sg20220224.tif'

xsize = 2841

ysize = 3276

driver = gdal.GetDriverByName('GTiff')

dataset = driver.Create(filename, xsize, ysize,1,gdal.GDT_Float32)




Chapter 3: Calculate the GPP of seagrass in Tauranga Harbour

Step 1: Import the libraries we need

!pip uninstall xlr

!pip install xlrd==1.2.0


!pip install fiona

!pip install rasterio

!pip install geopandas

import fiona

import rasterio

import rasterio.mask

from rasterio.plot import show

from rasterio.warp import calculate_default_transform, reproject, Resampling

from osgeo import gdal

import geopandas as gpd

import os

import numpy as np

import matplotlib.pyplot as plt

import xlrd

import pandas as pd

import requests

import datetime

import json

import math

import cv2

!pip install pyproj

from pyproj import Proj,transform

from scipy.optimize import curve_fit

from dateutil.parser import parse

import seaborn as sns

import pandas as pd

from xlrd import xldate_as_tuple

from datetime import datetime

import scipy

Step 2: Read the light data

xlrd.xlsx.ensure_elementtree_imported(False, None) xlrd.xlsx.Element_has_iter = True table = xlrd.open_workbook('/content/gdrive/My Drive/light/Light_tide.xlsx') sheet = table.sheets()[0] time = np.array(sheet.col_values(0,1)) ttime = [] for i in range(len(time)): cellValue1 = sheet.cell(i+1,0).value cellValue2 = xldate_as_tuple(cellValue1,0) cellValue3 = datetime(*cellValue2).strftime("%Y-%m-%d %H:%M:%S") ttime.append(cellValue3) tide = np.array(sheet.col_values(1,1)) solar = np.array(sheet.col_values(2,1)) d = pd.DataFrame() d['time'] = pd.to_datetime(ttime) d['tide'] = tide d['solar'] = solar*4.6/0.0036 ##Convert from MJ to mol


Step 3: Define some functions we need to use

def LI(I0,kd,z): #Calculate Light intensity at a specific water depth return I0*np.exp(-kd*z) def period(A): #Help recognize the time year = str(A)[0:4] month = int(str(A)[4:6]) if 3 <= month <=5: pos_start = year+'-'+'03'+'-'+'01' +' 00:00:00' pos_end = year+'-'+'05'+'-'+'31'+' 23:00:00' if month <=8 and month>=6: pos_start = year+'-'+'06'+'-'+'01'+' 00:00:00' pos_end = year+'-'+'08'+'-'+'31'+' 23:00:00' if month <=11 and month>=9: pos_start = year+'-'+'09'+'-'+'01'+' 00:00:00' pos_end = year+'-'+'11'+'-'+'30'+' 23:00:00' if month == 12: pos_start = year+'-'+'12'+'-'+'01'+' 00:00:00' pos_end = str(int(year)+1)+'-'+'02'+'-'+'28'+' 23:00:00' if month == 1 or month == 2: pos_start = str(int(year)-1)+'-'+'12'+'-01'+' 00:00:00' pos_end = year+'-'+'02'+'-'+'28'+' 23:00:00' mask1 = d['time'] == pos_start mask2 = d['time'] == pos_end Start_index = np.flatnonzero(mask1)[0] End_index = np.flatnonzero(mask2)[0] return [Start_index, End_index] def LightAmount(A,bathymetric,kdpar): ##Calculate the GPP over a season TTime = A[2:10] index = period(TTime) Time = range(index[0],index[1]) PP = [] for m in range(len(Time)): if d['solar'][m]<0.0001: light = 0 PP.append(light) else: if d['tide'][m]+ bathymetric < 0: light = d['solar'][m] P = eme_seagrass(light) PP.append(P) else: light = LI(d['solar'][m],kdpar,d['tide'][m]+bathymetric) P = sub_seagrass(light) PP.append(P) PP = np.array(PP) Timee = range(0,index[1]-index[0]) return scipy.integrate.trapz(PP,Timee) def neareast(x): #The nearest neighbourhood intepolation Result = [] for i in range(len(x)): s = [] Result.append(s) for j in range(len(x[i])): if np.isnan(x[i][j])==True: try: m = np.nanmean([x[i-1][j-1],x[i-1][j],x[i-1][j+1], x[i][j-1],x[i][j],x[i][j+1], x[i+1][j-1],x[i+1][j],x[i+1][j+1]]) s.append(m) except: s.append(np.nan) else: s.append(x[i][j]) return np.array(Result) def sub_seagrass(x): ## PI curve for seagrass return (6240*(1-np.exp(-32.4*x/6240))) def eme_seagrass(x): return (4519*(1-np.exp(-22.5*x/4519)))


Step 4 Import the data we need

kd = cv2.imread('/content/gdrive/My Drive/Kd_chapter2/Resample/20220224_clip_re.tif',-1)

kd = neareast(kd)

bathy = cv2.imread('/content/gdrive/My Drive/Kd_chapter2/Resample/bathy_re.tif',-1)

bathy = np.where(bathy< -100,np.nan, bathy)

Step 5 Run the model and save the results

result = [] for i in range(len(bathy)): s = [] result.append(s) for j in range(len(bathy[i])): kk = LightAmount("sg20220224",bathy[i][j],kd[i][j]) print([i,j]) s.append(kk) result = np.array(result).astype(int) result = np.where(result<0,np.nan,result) result = result/1000000*12.0107 ##Convert the unit to gC m-2 h-1 # You will get a map of GPP if seagrass is distributed in every pixel in the Harbour. data = gdal.Open('/content/gdrive/My Drive/Kd_chapter2/New folder/2019_autumn.tif') im_width = data.RasterXSize im_height = data.RasterYSize im_bands = data.RasterCount im_geotrans = data.GetGeoTransform() im_proj = data.GetProjection() im_data = data.GetRasterBand(1).ReadAsArray(xoff=0,yoff=0,win_xsize=im_width,win_ysize=im_height) driver = gdal.GetDriverByName("GTiff") output = driver.Create("/content/gdrive/My Drive/Kd_chapter2/GPP_Seagrass/202200224.tif", im_width, im_height, 1, gdal.GDT_Float32) output.SetGeoTransform(im_geotrans) output.SetProjection(im_proj) output.GetRasterBand(1).WriteArray(result)


Step 6 Use Raster calculator from QGIS or ARCGIS to mutiply seagrass coverage and GPP.