Zhanchao Shao, Prof. Karin Bryan, Dr Moritz Lemann, Georgina Flowers and Prof. Conrad Pilditch have developed a methodology that combines machine learning and remote sensing data to map the spatial distribution of gross primary productivity (GPP) in Tauranga Harbour in detail. More information is coming soon.

The code is exported from Google Colab. If you need the original data, please contact Zhanchao. Thanks to Sanne Vaassen for providing detailed instructions and comments that make the code easier to understand.

Please read the instructions before starting. Here is the link: <https://docs.google.com/document/d/e/2PACX-1vQ8rWOALsJWAha6clCJFZ2sXyLj6QW5jvW3XgAs5WxkcC6dE3Yv1wRzNfDdpQkAFl-r9ZFNdp9mkAgx/pub>

We have also applied the method to other estuaries in New Zealand. Here is the link: <https://drive.google.com/drive/folders/1Fs8pqC4V-j4355lq_rXwddAqYow8zCfk?usp=drive_link>
Supervised classification (Random Forest)

Step 1: Connect Google Colab to Google Drive
```python
from google.colab import drive

drive.mount('/content/gdrive', force_remount=True)
root_dir = '/content/gdrive/My Drive/'
```
Step 2: Import all the libraries we need
```python
import xlrd
import statsmodels.api as sm

## gdal
from osgeo import gdal, gdal_array, ogr
import numpy as np
from numpy.linalg import inv
from scipy.stats.distributions import chi2
from itertools import chain
import matplotlib.pyplot as plt

## AI
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (classification_report, confusion_matrix, cohen_kappa_score,
                             accuracy_score, f1_score, precision_score, recall_score)
```
Step 3: Convert your manually labeled shapefile data to a raster
```python
# Please merge your individual bands into one GTiff if you are using Sentinel-2 data.
raster = gdal.Open('/content/gdrive/My Drive/Reflectance/Rrs20220224/Rrs20220224_cl.tif')
vector = ogr.Open('/content/gdrive/My Drive/Supervised_classification/classification_reproject_summer1.shp')
lyr = vector.GetLayer()
geot = raster.GetGeoTransform()
proj = raster.GetProjectionRef()

vec_raster = gdal.GetDriverByName('GTiff')
chn_ras = vec_raster.Create('/content/gdrive/My Drive/Supervised_classification/supervised_classification_ras_Summer.tif',
                            raster.RasterXSize, raster.RasterYSize, 1, gdal.GDT_Byte)
chn_ras.SetGeoTransform(geot)
chn_ras.SetProjection(proj)
# The attribute field should correspond to the classification that you would like
# to use for supervised classification.
gdal.RasterizeLayer(chn_ras, [1], lyr, options=['ATTRIBUTE=id'])
chn_ras.GetRasterBand(1).SetNoDataValue(0)
chn_ras = None  # This process can also be done in ArcGIS or QGIS.

# Import the raster data we have just converted from the shapefile
vector_raster = gdal.Open('/content/gdrive/My Drive/Supervised_classification/supervised_classification_ras_Summer.tif')
vec = vector_raster.GetRasterBand(1).ReadAsArray()

# Create a container to hold the image bands
width = raster.RasterYSize
length = raster.RasterXSize
number = raster.RasterCount
container = np.ones((width, length, number),
                    gdal_array.GDALTypeCodeToNumericTypeCode(raster.GetRasterBand(1).DataType))
for i in range(number):
    container[:, :, i] = raster.GetRasterBand(i+1).ReadAsArray().astype(np.float32)

# Keep only the labeled pixels as training data
fit_X = container[vec > 0, :]
fit_y = vec[vec > 0]
```
Step 4: Split the data into training and testing sets
```python
# test_size=0.7: 30% of the labeled pixels train the model; 70% are held out for testing.
fit_X_train, fit_X_test, fit_y_train, fit_y_test = train_test_split(fit_X, fit_y, test_size=0.7)
```
Step 5: Validation (please use fivefold cross-validation to obtain the optimal values for the RF parameters)
The optimal values: n_estimators = 280, min_samples_split = 4, min_samples_leaf = 4, max_depth = 90.
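The search itself is not shown in this notebook. A minimal sketch of a fivefold cross-validated grid search with scikit-learn's `GridSearchCV` might look like the following; the parameter grids below are illustrative assumptions, not the exact ranges used in the study:

```python
from sklearn.model_selection import GridSearchCV

# Hypothetical sketch: fivefold cross-validated search over RF hyperparameters.
# The grids are assumptions for illustration only.
param_grid = {
    'n_estimators': [200, 240, 280, 320],
    'min_samples_split': [2, 4, 6],
    'min_samples_leaf': [2, 4, 6],
    'max_depth': [70, 90, 110],
}
search = GridSearchCV(RandomForestClassifier(), param_grid, cv=5, n_jobs=-1)
search.fit(fit_X_train, fit_y_train)
print(search.best_params_)  # should recover values close to those listed above
```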
Step 6: Test the model
```python
RandomForest = RandomForestClassifier(n_estimators=280, min_samples_split=4,
                                      min_samples_leaf=4, max_depth=90)
randomforest = RandomForest.fit(fit_X_train, fit_y_train)
y_predict = randomforest.predict(fit_X_test)
```
Here is the classification report (classes 1, 2 and 3 are sparse seagrass, dense seagrass and unvegetated flats, respectively).
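The report itself is not reproduced in this README; it can be regenerated from the held-out test predictions with the metrics already imported in Step 2:

```python
# Regenerate the accuracy summary from the held-out test set.
print(classification_report(fit_y_test, y_predict))
print(confusion_matrix(fit_y_test, y_predict))
print('Overall accuracy:', accuracy_score(fit_y_test, y_predict))
print("Cohen's kappa:", cohen_kappa_score(fit_y_test, y_predict))
```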
Step 7: Apply the model
```python
Final_container = (container.shape[0]*container.shape[1], container.shape[2])
Final_container_re = container[:, :, :4].reshape(Final_container)
supervised_classification = randomforest.predict(Final_container_re)
supervised_classification = supervised_classification.reshape(container[:, :, 0].shape)  # reshape the results

plt.imshow(supervised_classification)  # show the results
plt.colorbar()
```
Step 8: Save the model and results
```python
import joblib

joblib.dump(randomforest, "/content/gdrive/My Drive/Supervised_classification/model_summer.joblib")

raster = gdal.Open('/content/gdrive/My Drive/Reflectance/Rrs20211231/Rrs20220224_cl.tif')  # This tif file is used for georeferencing
geot = raster.GetGeoTransform()
proj = raster.GetProjectionRef()
filename = '/content/gdrive/My Drive/Supervised_classification/sc20220221.tif'
xsize = np.shape(supervised_classification)[1]
ysize = np.shape(supervised_classification)[0]
driver = gdal.GetDriverByName('GTiff')
dataset = driver.Create(filename, xsize, ysize, bands=1, eType=gdal.GDT_UInt16)
dataset.GetRasterBand(1).WriteArray(supervised_classification)
dataset.SetGeoTransform(geot)
dataset.SetProjection(proj)
dataset.FlushCache()
dataset = None
```
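If the classifier is needed again later (for example, to classify another date's image), it can be reloaded from the saved file:

```python
# Reload the trained classifier from Google Drive.
import joblib
randomforest = joblib.load("/content/gdrive/My Drive/Supervised_classification/model_summer.joblib")
```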
Section 1: ANN
Step 1: Connect Google Colab to Google Drive
```python
from google.colab import drive

drive.mount('/content/gdrive', force_remount=True)
root_dir = '/content/gdrive/My Drive/'
```
Step 2: Import the libraries we need
```python
import numpy as np
import matplotlib.pyplot as plt
import cv2
import xlrd
import shap
import tensorflow as tf
from tensorflow.keras import models, layers, utils, backend as K
from sklearn.model_selection import train_test_split
```
Step 3: Read and split the data
```python
table = xlrd.open_workbook('/content/gdrive/My Drive/seagrass_coverage/original_data.xlsx')
sheet = table.sheets()[0]
x1 = sheet.col_values(6, 1)   # Band Blue
x2 = sheet.col_values(7, 1)   # Band Green
x3 = sheet.col_values(8, 1)   # Band Red
x4 = sheet.col_values(9, 1)   # Band NIR
x5 = sheet.col_values(10, 1)  # NDWI
x6 = sheet.col_values(11, 1)  # NDVI
x = np.array([x1, x2, x3, x4, x5, x6]).T  # Transpose so each row is one sample
y = np.array(sheet.col_values(5, 1))/100  # Seagrass percentage coverage

# Split the data: 80% for training, 10% for validation, 10% for testing
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
x_test, x_val, y_test, y_val = train_test_split(x_test, y_test, test_size=0.5)
```
Step 4: Validation
```python
from sklearn.metrics import mean_squared_error, r2_score
import math

result = []
for m in range(2, 40, 2):
    tf.keras.backend.clear_session()
    tf.random.set_seed(1)
    model = models.Sequential()
    # Change the activation function here to find out the best one.
    model.add(layers.Dense(m, activation='relu', input_shape=(x_train.shape[1],)))
    model.add(layers.Dense(m, activation='relu'))
    model.add(layers.Dense(1))
    # Change the learning rate here to find out the best one.
    opt = tf.keras.optimizers.Adam(learning_rate=0.01)
    model.compile(opt, loss='mse', metrics=['mse'])
    model.fit(x_train, y_train, epochs=1000, batch_size=300, verbose=0)
    y_val_test = model.predict(x_val)
    r2 = r2_score(y_val, y_val_test)
    result.append([m, r2])  # record the validation score for each layer width
    # model.summary()

# The optimal values [please check the methods in the paper]:
# Number of hidden layers: 2
# Number of neurons: 10
# Activation function: ReLU
# Learning rate: 0.01
```
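With the scores stored in `result`, the best-performing layer width can be read off directly:

```python
# Pick the layer width with the highest validation R2 from the search above.
best_m, best_r2 = max(result, key=lambda row: row[1])
print(best_m, best_r2)
```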
Step 5: Testing
```python
tf.keras.backend.clear_session()
tf.random.set_seed(1)
model = models.Sequential()
model.add(layers.Dense(10, activation='relu', input_shape=(x_train.shape[1],)))
model.add(layers.Dense(10, activation='relu'))
model.add(layers.Dense(1))
opt = tf.keras.optimizers.Adam(learning_rate=0.01)
model.compile(opt, loss='mse', metrics=['mse'])
result = model.fit(x_train, y_train, epochs=1000, batch_size=300, verbose=0)
y_test_predict = model.predict(x_test)

# Predicted vs observed coverage with a 1:1 line
plt.figure(figsize=[5, 5])
plt.scatter(y_test, y_test_predict)
xx = [0, 1]
yy = [0, 1]
plt.plot(xx, yy, '-')
plt.xlim([0, 1])
plt.ylim([0, 1])

from sklearn.metrics import mean_squared_error, r2_score
import math

print(math.sqrt(mean_squared_error(y_test_predict.flatten(), y_test.flatten())))
# 0.11175243324504268
print(r2_score(y_test_predict, y_test))
# 0.7106385173472587
```
Section 2: SVR
Step 1: Import the library we need
```python
from sklearn.svm import SVR
```
Because we use the same training, validation and testing data as for the ANN, we skip splitting the dataset.
Step 2: Validation
```python
result = []
for i in range(1, 20):
    for m in np.arange(0.1, 2, 0.1):
        # Change the kernel type here to find out the optimal one.
        regressor = SVR(kernel='rbf', C=i, gamma=m)
        regressor.fit(x_train, y_train)
        y_val_predict = regressor.predict(x_val)
        print(i, "+", m)
        print(mean_squared_error(y_val.flatten(), y_val_predict.flatten()))
        print(r2_score(y_val, y_val_predict))
        result.append([i, m, r2_score(y_val, y_val_predict)])
```
The optimal values:
Kernel function: RBF
Gamma: 0.1
C: 13
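Rather than scanning the printout, the optimal pair can also be pulled straight from `result`:

```python
# Select the (C, gamma) pair with the highest validation R2 from the grid above.
best_C, best_gamma, best_r2 = max(result, key=lambda row: row[2])
print(best_C, best_gamma, best_r2)
```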
Step 3: Testing
```python
regressor = SVR(kernel='rbf', C=13, gamma=0.1)
regressor.fit(x_train, y_train)
y_predict = regressor.predict(x_test)

print(math.sqrt(mean_squared_error(y_test.flatten(), y_predict.flatten())))
# 0.1256412972939246
print(r2_score(y_test, y_predict))
# 0.7231141583032319

plt.figure(figsize=[8, 8])
plt.scatter(y_test, y_predict)
plt.ylim([0, 1])
plt.xlim([0, 1])
```
Section 3: RFR
Step 1: Import the libraries we need
```python
from sklearn.ensemble import RandomForestRegressor
```
Because we use the same training, validation and testing data as for the ANN, we skip splitting the dataset.
Step 2: Validation
```python
Accuracy = []
for i in range(20, 500, 20):              # candidate tree numbers (n_estimators)
    for j in range(2, 10, 2):             # candidate min_samples_split
        for k in range(1, 5, 1):          # candidate min_samples_leaf
            for l in range(10, 120, 10):  # candidate max_depth
                RandomForest = RandomForestRegressor(n_estimators=i, min_samples_split=j,
                                                     min_samples_leaf=k, max_depth=l)
                randomforest = RandomForest.fit(x_train, y_train)
                y_val_predict = randomforest.predict(x_val)
                AS = mean_squared_error(y_val_predict.flatten(), y_val.flatten())
                r2 = r2_score(y_val, y_val_predict)
                Accuracy.append([AS, r2, i, j, k, l])

# Find the combination with the lowest validation MSE
r = [row[0] for row in Accuracy]
print(np.nanmin(r))
# MSE is 0.022766173925363092
idx = r.index(min(r))
print("The optimum estimator, min sample split, min sample leaf, max depth are",
      Accuracy[idx][2], Accuracy[idx][3], Accuracy[idx][4], Accuracy[idx][5], "respectively")
# The optimum estimator, min sample split, min sample leaf, max depth are 20 6 4 110 respectively
print('R2 value is', Accuracy[idx][1])
# R2 value is 0.6290120379529942
```
Step 3: Testing
```python
# Refit the RFR with the optimal parameters found above, then test it.
regr = RandomForestRegressor(n_estimators=20, min_samples_split=6,
                             min_samples_leaf=4, max_depth=110)
regr.fit(x_train, y_train)
y_predict_test_RFR = regr.predict(x_test)

plt.figure(figsize=[8, 8])
plt.scatter(y_test, y_predict_test_RFR)
plt.ylim([0, 1])
plt.xlim([0, 1])
plt.plot(xx, yy, '-')

print(math.sqrt(mean_squared_error(y_test.flatten(), y_predict_test_RFR.flatten())))
# 0.1317344161085005
print(r2_score(y_test, y_predict_test_RFR))
# 0.6956071641374675
```
Section 4: Apply the optimal regression model (ANN) to each seagrass pixel
```python
# We have removed the other classes from this mask; np.where can be used to
# exclude the values that are not needed.
seagrass_clip = cv2.imread('/content/gdrive/My Drive/Supervised_classification/export/sc20220224.tif_seagrass.tif', -1)

from osgeo import gdal

# Import one remote sensing image; it is used as the basis for size, resolution
# and georeferencing. The four bands can be composited into one tif file with
# the Composite Bands tool in the ArcGIS toolbox.
YG_dataset = gdal.Open(r'/content/gdrive/My Drive/Reflectance/Rrs20220221/Rrs20220224_cl.tif')
YG_width = YG_dataset.RasterXSize
YG_height = YG_dataset.RasterYSize
YG_geotrans = YG_dataset.GetGeoTransform()
YG_proj = YG_dataset.GetProjection()
YG_data = YG_dataset.ReadAsArray(0, 0, YG_width, YG_height)

# Read each band (the band order corresponds to the order set in the Composite
# Bands tool), mask negative values, and keep only seagrass pixels.
Band2 = YG_data[0, 0:YG_height, 0:YG_width]  # Band Blue
Band2 = np.array(np.where(Band2 < 0, np.nan, Band2))
Band2 = Band2*seagrass_clip
Band3 = YG_data[1, 0:YG_height, 0:YG_width]  # Band Green
Band3 = np.array(np.where(Band3 < 0, np.nan, Band3))
Band3 = Band3*seagrass_clip
Band4 = YG_data[2, 0:YG_height, 0:YG_width]  # Band Red
Band4 = np.array(np.where(Band4 < 0, np.nan, Band4))
Band4 = Band4*seagrass_clip
Band8 = YG_data[3, 0:YG_height, 0:YG_width]  # Band NIR
Band8 = np.array(np.where(Band8 < 0, np.nan, Band8))
Band8 = Band8*seagrass_clip

NDVI = (Band8-Band4)/(Band8+Band4)  # Calculate NDVI
NDVI = np.where(NDVI == 0, np.nan, NDVI)
NDWI = (Band3-Band8)/(Band3+Band8)  # Calculate NDWI
NDWI = np.where(NDWI == 0, np.nan, NDWI)
```
Run the model on each pixel
```python
percent = []
for i in range(len(Band2)):
    result = []
    percent.append(result)
    for j in range(len(Band2[i])):
        if np.isnan(Band2[i][j]):
            Y = np.nan
            result.append(Y)
        else:
            # Keras expects a 2-D batch, so the pixel's predictors are stacked
            # twice and only the first prediction is kept.
            X = np.array([[Band2[i, j], Band3[i, j], Band4[i, j], Band8[i, j], NDVI[i, j], NDWI[i, j]],
                          [Band2[i, j], Band3[i, j], Band4[i, j], Band8[i, j], NDVI[i, j], NDWI[i, j]]])
            Y = model.predict(X)[0][0]
            result.append(Y)
            print([i, j, Y])
```
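Calling `model.predict` once per pixel is slow on large rasters. A sketch of a batched alternative, assuming the same six predictors, is:

```python
# Hypothetical faster alternative: predict all valid pixels in one batched call.
stack = np.stack([Band2, Band3, Band4, Band8, NDVI, NDWI], axis=-1)  # (rows, cols, 6)
flat = stack.reshape(-1, 6)
valid = ~np.isnan(flat).any(axis=1)          # skip pixels with any NaN predictor
pred = np.full(flat.shape[0], np.nan)
pred[valid] = model.predict(flat[valid]).flatten()
percent = pred.reshape(stack.shape[:2])      # back to the image grid
```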
Save the result as a tif file
```python
raster = gdal.Open('/content/gdrive/My Drive/Reflectance/Rrs
geot = raster.GetGeoTransform()
proj = raster.GetProjectionRef()
filename = '/content/gdrive/My Drive/Supervised_classification/seagrass/sg20220224.tif'
xsize = 2841
ysize = 3276
driver = gdal.GetDriverByName('GTiff')
dataset = driver.Create(filename, xsize, ysize, 1, gdal.GDT_Float32)
dataset.GetRasterBand(1).WriteArray(np.array(percent))  # WriteArray expects a numpy array
dataset.SetGeoTransform(geot)
dataset.SetProjection(proj)
dataset.FlushCache()
dataset = None
```
Section 5: Calculate seasonal GPP from light, tide and bathymetry

Step 1: Import the libraries we need
```python
# xlrd 1.2.0 is the last version that can read .xlsx files
!pip uninstall -y xlrd
!pip install xlrd==1.2.0
```
```python
!pip install fiona
!pip install rasterio
!pip install geopandas
!pip install pyproj

import os
import math
import json
import requests
import scipy
import scipy.integrate  # needed for scipy.integrate.trapz below
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import cv2
import xlrd
from xlrd import xldate_as_tuple
import fiona
import rasterio
import rasterio.mask
from rasterio.plot import show
from rasterio.warp import calculate_default_transform, reproject, Resampling
from osgeo import gdal
import geopandas as gpd
from pyproj import Proj, transform
from scipy.optimize import curve_fit
from dateutil.parser import parse
from datetime import datetime
```
Step 2: Read the light data
```python
# Work-arounds so xlrd 1.2.0 can read .xlsx files on newer Python versions
xlrd.xlsx.ensure_elementtree_imported(False, None)
xlrd.xlsx.Element_has_iter = True

table = xlrd.open_workbook('/content/gdrive/My Drive/light/Light_tide.xlsx')
sheet = table.sheets()[0]
time = np.array(sheet.col_values(0, 1))
ttime = []
for i in range(len(time)):
    cellValue1 = sheet.cell(i+1, 0).value
    cellValue2 = xldate_as_tuple(cellValue1, 0)
    cellValue3 = datetime(*cellValue2).strftime("%Y-%m-%d %H:%M:%S")
    ttime.append(cellValue3)
tide = np.array(sheet.col_values(1, 1))
solar = np.array(sheet.col_values(2, 1))

d = pd.DataFrame()
d['time'] = pd.to_datetime(ttime)
d['tide'] = tide
d['solar'] = solar*4.6/0.0036  # Convert from MJ to mol
```
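An optional quick check that the hourly series parsed as expected:

```python
# Optional sanity check on the parsed light/tide table.
print(d.head())
print(d['time'].min(), 'to', d['time'].max())
```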
``
Step 3: Define some functions we need to use
```python
def LI(I0, kd, z):
    # Light intensity at water depth z (Beer-Lambert attenuation)
    return I0*np.exp(-kd*z)

def period(A):
    # Return the [start, end] row indices of the season that contains
    # the date A (formatted yyyymmdd)
    year = str(A)[0:4]
    month = int(str(A)[4:6])
    if 3 <= month <= 5:    # autumn (Southern Hemisphere)
        pos_start = year + '-03-01 00:00:00'
        pos_end = year + '-05-31 23:00:00'
    if 6 <= month <= 8:    # winter
        pos_start = year + '-06-01 00:00:00'
        pos_end = year + '-08-31 23:00:00'
    if 9 <= month <= 11:   # spring
        pos_start = year + '-09-01 00:00:00'
        pos_end = year + '-11-30 23:00:00'
    if month == 12:        # summer spans the year boundary
        pos_start = year + '-12-01 00:00:00'
        pos_end = str(int(year)+1) + '-02-28 23:00:00'
    if month == 1 or month == 2:
        pos_start = str(int(year)-1) + '-12-01 00:00:00'
        pos_end = year + '-02-28 23:00:00'
    mask1 = d['time'] == pos_start
    mask2 = d['time'] == pos_end
    Start_index = np.flatnonzero(mask1)[0]
    End_index = np.flatnonzero(mask2)[0]
    return [Start_index, End_index]

def LightAmount(A, bathymetric, kdpar):
    # Integrate hourly GPP over the season containing image A
    TTime = A[2:10]
    index = period(TTime)
    Time = range(index[0], index[1])
    PP = []
    for m in Time:  # m indexes the hourly rows of d within the seasonal window
        if d['solar'][m] < 0.0001:
            PP.append(0)  # night: no light, no production
        elif d['tide'][m] + bathymetric < 0:
            # Pixel is emerged: seagrass receives the full surface light
            P = eme_seagrass(d['solar'][m])
            PP.append(P)
        else:
            # Pixel is submerged: attenuate surface light down to the seabed
            light = LI(d['solar'][m], kdpar, d['tide'][m] + bathymetric)
            P = sub_seagrass(light)
            PP.append(P)
    PP = np.array(PP)
    Timee = range(0, index[1] - index[0])
    return scipy.integrate.trapz(PP, Timee)

def neareast(x):
    # Nearest-neighbourhood interpolation: fill each NaN cell with the mean
    # of its 3x3 neighbourhood
    Result = []
    for i in range(len(x)):
        s = []
        Result.append(s)
        for j in range(len(x[i])):
            if np.isnan(x[i][j]):
                try:
                    m = np.nanmean([x[i-1][j-1], x[i-1][j], x[i-1][j+1],
                                    x[i][j-1],   x[i][j],   x[i][j+1],
                                    x[i+1][j-1], x[i+1][j], x[i+1][j+1]])
                    s.append(m)
                except IndexError:
                    s.append(np.nan)  # pixels on the image edge stay NaN
            else:
                s.append(x[i][j])
    return np.array(Result)

def sub_seagrass(x):
    # PI curve for submerged seagrass
    return 6240*(1 - np.exp(-32.4*x/6240))

def eme_seagrass(x):
    # PI curve for emerged seagrass
    return 4519*(1 - np.exp(-22.5*x/4519))
```
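As a usage sketch, `LightAmount` can be called for a single pixel; the bathymetry and Kd(PAR) values below are purely illustrative, not values from the study:

```python
# Illustrative single-pixel call: "sg20220224" selects the season via its date
# stamp; 0.5 (bathymetry, so depth = tide + 0.5) and 0.8 m-1 (Kd) are hypothetical.
gpp_pixel = LightAmount("sg20220224", 0.5, 0.8)
print(gpp_pixel)
```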
Step 4: Import the data we need
```python
kd = cv2.imread('/content/gdrive/My Drive/Kd_chapter2/Resample/20220224_clip_re.tif', -1)
kd = neareast(kd)  # fill gaps from the 3x3 neighbourhood
bathy = cv2.imread('/content/gdrive/My Drive/Kd_chapter2/Resample/bathy_re.tif', -1)
bathy = np.where(bathy < -100, np.nan, bathy)  # mask no-data values
```
Step 5: Run the model and save the results
```python
result = []
for i in range(len(bathy)):
    s = []
    result.append(s)
    for j in range(len(bathy[i])):
        kk = LightAmount("sg20220224", bathy[i][j], kd[i][j])
        print([i, j])
        s.append(kk)

# Casting to int turns NaN pixels into large negative numbers, which the next
# line masks back to NaN.
result = np.array(result).astype(int)
result = np.where(result < 0, np.nan, result)
result = result/1000000*12.0107  # Convert the unit to gC m-2 h-1

# This gives a map of GPP as if seagrass were distributed in every pixel in the Harbour.
data = gdal.Open('/content/gdrive/My Drive/Kd_chapter2/New folder/2019_autumn.tif')
im_width = data.RasterXSize
im_height = data.RasterYSize
im_bands = data.RasterCount
im_geotrans = data.GetGeoTransform()
im_proj = data.GetProjection()
im_data = data.GetRasterBand(1).ReadAsArray(xoff=0, yoff=0, win_xsize=im_width, win_ysize=im_height)

driver = gdal.GetDriverByName("GTiff")
output = driver.Create("/content/gdrive/My Drive/Kd_chapter2/GPP_Seagrass/202200224.tif",
                       im_width, im_height, 1, gdal.GDT_Float32)
output.SetGeoTransform(im_geotrans)
output.SetProjection(im_proj)
output.GetRasterBand(1).WriteArray(result)
output.FlushCache()
output = None
```
Step 6: Use the raster calculator in QGIS or ArcGIS to multiply seagrass coverage by GPP. The same step can also be done in Python; see the sketch below.
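A minimal Python sketch, assuming the coverage raster from Section 4 and the GPP raster from Step 5 are aligned on the same grid (the output path is a placeholder):

```python
# Hypothetical alternative to a GIS raster calculator: multiply the fractional
# seagrass-coverage raster by the GPP raster. The two inputs are assumed to
# share extent, resolution and projection.
from osgeo import gdal
import numpy as np

cover_ds = gdal.Open('/content/gdrive/My Drive/Supervised_classification/seagrass/sg20220224.tif')
gpp_ds = gdal.Open('/content/gdrive/My Drive/Kd_chapter2/GPP_Seagrass/202200224.tif')
cover = cover_ds.GetRasterBand(1).ReadAsArray()
gpp = gpp_ds.GetRasterBand(1).ReadAsArray()

seagrass_gpp = cover*gpp  # GPP weighted by fractional seagrass coverage

driver = gdal.GetDriverByName('GTiff')
out = driver.Create('/content/gdrive/My Drive/Kd_chapter2/GPP_Seagrass/gpp_coverage_20220224.tif',
                    cover_ds.RasterXSize, cover_ds.RasterYSize, 1, gdal.GDT_Float32)
out.SetGeoTransform(cover_ds.GetGeoTransform())
out.SetProjection(cover_ds.GetProjection())
out.GetRasterBand(1).WriteArray(seagrass_gpp)
out.FlushCache()
out = None
```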