I have Landsat 8 preprocessed image I want to classify using random forest(RF) classification in python. The size of the image is 3,721,804 pixels with 7 bands. I have 250 training data shapefiles which were rasterized and yielded y (labels) and trainingData. The training labels(y) have five classes [1,2,3,4,5] with (250,) dimension. The trainingData has a dimension of (250,7) i.e. 250 sampling rasters each with 7 band.
I am encountering memoryerror when trying to predict the classes of the unknown (class)pixels because the process consumes so much RAM memory attributable to the byte-size of noSamples (rows x columns) in the input Landsat Image. I have read that Random Forest does not do well on sparse matrix (+1000 columns). But others have produced a classification using RF method.
How can I solve this problem?
The Code:
import numpy as np import os import pandas as pd from osgeo import gdal,gdal_array,ogr from sklearn import metrics from sklearn.ensemble import RandomForestClassifier from matplotlib import pyplot as plt # Get the current working directory print(os.getcwd()) # Open the dataset from the file dataset = ogr.Open('./training/training_data.shp') # Make sure the dataset exists -- it would be None if we couldn't open it if not dataset: print('Error: could not open dataset') #Let's get the driver from this file driver = dataset.GetDriver() print('Dataset driver is: {n}\n'.format(n=driver.name)) #How many layers are contained in this Shapefile? layer_count = dataset.GetLayerCount() print('The shapefile has {n} layer(s)\n'.format(n=layer_count)) #What is the name of the 1 layer? layer = dataset.GetLayerByIndex(0) print('The layer is named: {n}\n'.format(n=layer.GetName())) # Get the spatial reference spatial_ref = layer.GetSpatialRef() # Export this spatial reference to something we can read... like the Proj4 proj4 = spatial_ref.ExportToProj4() print('Layer projection is: {proj4}\n'.format(proj4=proj4)) ### How many features are in the layer? feature_count = layer.GetFeatureCount() print('Layer has {n} features\n'.format(n=feature_count)) ### How many fields are in the shapefile, and what are their names? # First we need to capture the layer definition defn = layer.GetLayerDefn() # How many fields field_count = defn.GetFieldCount() print('Layer has {n} fields'.format(n=field_count)) # What are their names? print('Their names are: ') for i in range(field_count): field_defn = defn.GetFieldDefn(i) print('\t{name} - {datatype}'.format(name=field_defn.GetName(), datatype=field_defn.GetTypeName())) #In order to pair up our vector data with our input image raster pixels, #we will need a way of co-aligning the datasets in space. #First we will open our input raster image, to understand how we will want to rasterize our training vector raster_ds = gdal.Open('./Landsat8_GeoTiff_Image/Sangamon_Image_Clip.tif', gdal.GA_ReadOnly) # Fetch number of rows and columns ncol = raster_ds.RasterXSize nrow = raster_ds.RasterYSize # Fetch projection and extent proj = raster_ds.GetProjectionRef() ext = raster_ds.GetGeoTransform() raster_ds = None # Create the raster dataset memory_driver = gdal.GetDriverByName('GTiff') out_raster_ds = memory_driver.Create('./training/training_data.gtif', ncol, nrow, 1, gdal.GDT_Byte) # Set the ROI image(rasterized training vector)'s projection and extent to our input raster's projection and extent out_raster_ds.SetProjection(proj) out_raster_ds.SetGeoTransform(ext) # Fill our output band with the 0 blank, no class label, value b = out_raster_ds.GetRasterBand(1) b.Fill(0) # Rasterize the shapefile layer to our new dataset status = gdal.RasterizeLayer(out_raster_ds, # output to our new dataset [1], # output to our new dataset's first band layer, # rasterize this layer None, None, # don't worry about transformations since we're in same projection [0], # burn value 0 ['ALL_TOUCHED=TRUE', # rasterize all pixels touched by polygons 'ATTRIBUTE=itree_New_'] # put raster values according to the 'itree_New_' field values ) # Close dataset out_raster_ds = None # Check rasterized layer roi_ds = gdal.Open('./training/training_data.gtif', gdal.GA_ReadOnly) roi = roi_ds.GetRasterBand(1).ReadAsArray() if status != 0: print("I don't think it worked...") else: print("Success") # How many pixels are in each class? classes = np.unique(roi) # Iterate over all class labels in the ROI image, printing out some information for c in classes: print('Class {c} contains {n} pixels'.format(c=c, n=(roi == c).sum())) # Tell GDAL to throw Python exceptions, and register all drivers gdal.UseExceptions() gdal.AllRegister() #Read in our image and ROI image img_ds = gdal.Open('./Landsat8_GeoTiff_Image/Sangamon_Image_Clip.tif', gdal.GA_ReadOnly) roi_ds = gdal.Open('./training/training_data.gtif', gdal.GA_ReadOnly) img = np.zeros((img_ds.RasterYSize, img_ds.RasterXSize, img_ds.RasterCount), gdal_array.GDALTypeCodeToNumericTypeCode(img_ds.GetRasterBand(1).DataType)) for b in range(img.shape[2]): img[:, :, b] = img_ds.GetRasterBand(b + 1).ReadAsArray() roi = roi_ds.GetRasterBand(1).ReadAsArray().astype(np.uint8) # In order to plot a multiband image , we need to normalize # the array of each band and also, you should convert the data to float32 or uInt8 for matplotlib. def normalize(img): ''' Function to normalize an input array to 0-1 ''' img_min = img.min() img_max = img.max() return (img - img_min) / (img_max - img_min) normalize(img) index=np.array([4,3,2]) colors=img[:,:,index].astype(np.float64) # Display them plt.subplot(121) plt.imshow(colors,cmap=plt.cm.Spectral) plt.colorbar() #plt.imshow(img[:, :, 4], cmap=plt.cm.Greys_r) plt.title('Normalized Multiband Image') plt.subplot(122) plt.imshow(roi,cmap=plt.cm.Greys_r) plt.title('ROI Training Data') plt.show() def createGeotiff(outRaster, data, geo_transform, projection): # Create a GeoTIFF file with the given data driver = gdal.GetDriverByName('GTiff') rows, cols = data.shape rasterDS = driver.Create(outRaster, cols, rows, 1, gdal.GDT_Byte) rasterDS.SetGeoTransform(geo_transform) rasterDS.SetProjection(projection) band = img_ds.GetRasterBand(1) band.WriteArray(data) dataset = None '''Now, We have the ROI Image(rasterized training point shapefiles) and the input multiband image, let's pair Y with X. The image we want to classify is our X feature inputs, and the ROI with the land cover labels is our Y labeled data, we need to pair them up in NumPy arrays so we may feed them to Random Forest:''' # Find how many non-zero entries we have -- i.e. how many training data samples? n_samples = (roi > 0).sum() print('We have {n} samples'.format(n=n_samples)) # What are our classification labels? labels = np.unique(roi[roi > 0]) print('The training data include {n} classes: {classes}'.format(n=labels.size, classes=labels)) # We will need a "X" matrix containing our features, and a "y" array containing our labels # These will have n_samples rows # In other languages we would need to allocate these and them loop to fill them, but NumPy can be faster X = img[roi > 0, :] # include 8th band, which is Fmask, for now y = roi[roi > 0] # The matrix is number of samples(250) x number of bands(7) print('Our X matrix is sized: {sz}'.format(sz=X.shape)) print('Our y array is sized: {sz}'.format(sz=y.shape)) '''Training the Random Forest Now that we have our X matrix of feature inputs (the spectral bands) and our y array (the labels), we can train our model.''' # Initialize our model with 500 trees rf = RandomForestClassifier(n_estimators=1000, oob_score=True) # Fit our model to training data rf = rf.fit(X, y) print('Our OOB("Out-Of-Bag") prediction of accuracy is: {oob}%'.format(oob=rf.oob_score_ * 100)) '''To help us get an idea of which spectral bands were important, we can look at the feature importance scores:''' bands = [1, 2, 3, 4, 5, 7, 6] for b, imp in zip(bands, rf.feature_importances_): print('Band {b} importance: {imp}'.format(b=b, imp=imp)) # Extract band's data and transform into a numpy array bandsData = [] for b in range(1, img_ds.RasterCount+1): band = img_ds.GetRasterBand(b) bandsData.append(band.ReadAsArray()) bandsData = np.dstack(bandsData) rows, cols, noBands = bandsData.shape # Prepare training data (set of pixels used for training) and labels # The nonzero pixels are training data print("Training Data dimension",X.shape) print("Labels data dimension",y.shape) # Train a Random Forest classifier classifier = RandomForestClassifier(n_jobs=1, n_estimators=10) classifier.fit(X, y) # Predict class label of unknown pixels noSamples = rows*cols print ("The number of pixel Samples from the input image is:", noSamples,"with", noBands, "bands") flat_pixels = bandsData.reshape((noSamples, noBands)) result = classifier.predict(flat_pixels) classification = result.reshape((rows, cols)) # Create a GeoTIFF file with the given data createGeotiff(outRaster, classification, geo_transform, projection) img = Image.open('randomForest.tiff') img.save('randomForest.png','png')