{ "cells": [ { "cell_type": "markdown", "id": "2fe4c392-fd2c-4dc5-8ed2-866a26447165", "metadata": {}, "source": [ "# Classification in Python using pyjeo and sklearn" ] }, { "cell_type": "code", "execution_count": null, "id": "cbe6356c-b543-4886-a321-f3e2b71495c2", "metadata": {}, "outputs": [], "source": [ "from IPython.display import display\n", "import geopandas as gpd" ] }, { "cell_type": "code", "execution_count": null, "id": "633dfbb3-d00a-45d2-8ebf-2ad9d0984b6a", "metadata": {}, "outputs": [], "source": [ "from sklearn.ensemble import RandomForestClassifier\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.metrics import confusion_matrix\n", "from sklearn.metrics import accuracy_score" ] }, { "cell_type": "code", "execution_count": null, "id": "0a414281-01c7-45b7-b303-2df17a488653", "metadata": {}, "outputs": [], "source": [ "from datetime import datetime\n", "import matplotlib.pyplot as plt\n", "import numpy as np\n", "import pyjeo as pj\n", "import pandas as pd\n", "import geopandas as gpd" ] }, { "cell_type": "markdown", "id": "a5d851ff-d154-4481-8872-63e67e491766", "metadata": {}, "source": [ "## Create reference data for training" ] }, { "cell_type": "code", "execution_count": null, "id": "64fea62b", "metadata": {}, "outputs": [], "source": [ "# ! wget -P /media/sf_LVM_shared/my_SE_data/exercise https://github.com/ec-jrc/jeolib-pyjeo/blob/master/tests/data/modis_ndvi_2010.tif\n", "# ! wget -P /media/sf_LVM_shared/my_SE_data/exercise https://github.com/ec-jrc/jeolib-pyjeo/blob/master/tests/data/modis_ndvi_training.sqlite\n", "\n", "\n", "! curl -H 'Accept: application/vnd.github.v3.raw' -O -L 'https://github.com/ec-jrc/jeolib-pyjeo/raw/master/tests/data/modis_ndvi_training.sqlite'\n", "! curl -H 'Accept: application/vnd.github.v3.raw' -O -L 'https://github.com/ec-jrc/jeolib-pyjeo/raw/master/tests/data/modis_ndvi_2010.tif'\n", "\n", "\n", "!pwd\n" ] }, { "cell_type": "code", "execution_count": null, "id": "ba2afdd4-03a1-4862-b360-476c6c46c0fa", "metadata": {}, "outputs": [], "source": [ "reference = pj.JimVect('modis_ndvi_training.sqlite')\n", "jim = pj.Jim('modis_ndvi_2010.tif', band2plane=True)" ] }, { "cell_type": "code", "execution_count": null, "id": "7d372258-7e66-4582-9e03-42678d5cf07e", "metadata": {}, "outputs": [], "source": [ "dates = [datetime.strptime('01-' + str(month) + '-2010', \"%d-%m-%Y\") for month in range(1, 13)]\n", "jim.properties.setDimension({'band': ['NDVI'], 'plane': dates})" ] }, { "cell_type": "code", "execution_count": null, "id": "d9b362c0-d50a-48ec-9004-bc852d7164e7", "metadata": {}, "outputs": [], "source": [ "jim.xr()" ] }, { "cell_type": "code", "execution_count": null, "id": "d8fbc58f-61f3-4f57-9b0a-49bd491f02f0", "metadata": {}, "outputs": [], "source": [ "jim.xr().NDVI.plot(col='time', col_wrap=6)" ] }, { "cell_type": "code", "execution_count": null, "id": "63f8de4a-0780-4f35-9ffe-301dd55bde46", "metadata": {}, "outputs": [], "source": [ "pd.DataFrame(reference.dict())" ] }, { "cell_type": "code", "execution_count": null, "id": "0b61a3c4-1289-400a-bb71-3ce6b03c5ac9", "metadata": {}, "outputs": [], "source": [ "featurevect = pj.geometry.extract(reference, jim, rule=['allpoints'],\n", " output='/vsimem/features.sqlite',\n", " oformat='SQLite',\n", " co=['OVERWRITE=YES'],\n", " classes=[1, 2],\n", " copy='label')\n", "gdf = gpd.read_file('/vsimem/features.sqlite')\n", "gdf" ] }, { "cell_type": "code", "execution_count": null, "id": "39f069fa-399d-43a9-817d-dab7f4886011", "metadata": {}, "outputs": [], "source": [ "plt.figure(figsize=(16, 8))\n", "ax = plt.subplot()\n", "jim.xr().NDVI.isel(time = 0).plot(ax = ax)\n", "gdf.plot(column = 'label', ax = ax, legend = True, categorical=True, cmap='Set1')" ] }, { "cell_type": "code", "execution_count": null, "id": "89f69fb9-f9d3-4161-878e-cdb1d2938993", "metadata": {}, "outputs": [], "source": [ "pd.DataFrame(featurevect.dict())" ] }, { "cell_type": "markdown", "id": "f199150c-ec9e-47e9-a921-da6ce3b4eca2", "metadata": {}, "source": [ "## Train the model" ] }, { "cell_type": "code", "execution_count": null, "id": "428e7974-5a8b-4884-b415-e69dc5e3c98d", "metadata": {}, "outputs": [], "source": [ "x = featurevect.np()[:, 1:]\n", "y = featurevect.np()[:, 0:1]\n", "x_train, x_test, y_train, y_test = train_test_split(x, y,\n", " test_size=0.33,\n", " random_state=42)\n", "rfModel = RandomForestClassifier(n_estimators=100,\n", " max_depth=9,\n", " min_samples_leaf=5,\n", " min_samples_split=3,\n", " criterion='gini')\n", "rfModel.fit(x_train, y_train.ravel())" ] }, { "cell_type": "code", "execution_count": null, "id": "87c1348f-1cb9-49e7-8863-3e19c70c93af", "metadata": {}, "outputs": [], "source": [ "y_predict = rfModel.predict(x_test)\n", "print(confusion_matrix(y_test, y_predict))\n", "print('accuracy score: {}'.format(accuracy_score(y_test, y_predict)))" ] }, { "cell_type": "markdown", "id": "ba85ab47-957a-4321-a349-6aa282620710", "metadata": {}, "source": [ "## Prediction" ] }, { "cell_type": "code", "execution_count": null, "id": "d71fbc73-2972-4c8f-a09f-69f1dfdec7fd", "metadata": {}, "outputs": [], "source": [ "x = jim.np()\n", "x = x.reshape(jim.properties.nrOfPlane(), jim.properties.nrOfRow() * \\\n", " jim.properties.nrOfCol()).T\n", "\n", "jim_class = pj.Jim(ncol=jim.properties.nrOfCol(),\n", " nrow=jim.properties.nrOfRow(),\n", " otype='Byte')\n", "jim_class.properties.copyGeoReference(jim)\n", "jim_class.np()[:] = rfModel.predict(x).astype(np.dtype(np.uint8)).\\\n", " reshape(jim.properties.nrOfRow(), jim.properties.nrOfCol())\n", "jim_class.properties.setDimension(['water'], 'band')" ] }, { "cell_type": "code", "execution_count": null, "id": "0abc73c2-e3f9-4a6a-8782-7d45fbdd256c", "metadata": {}, "outputs": [], "source": [ "plt.figure(figsize=(20, 10))\n", "ax1 = plt.subplot(121)\n", "jim.xr().NDVI.isel(time = 0).plot(ax = ax1)\n", "ax2 = plt.subplot(122)\n", "jim_class.xr().water.plot(cmap = 'Set2', levels = [1, 2], ax = ax2)" ] }, { "cell_type": "markdown", "id": "0bc990f9-4c84-4881-85d7-6e92557bfbae", "metadata": { "tags": [] }, "source": [ "## Exercise 1\n", "Use a single feature to train the classifier (e.g., month of June only) \n", "Check the accuracy" ] }, { "cell_type": "markdown", "id": "2c11a3e7-882d-4a30-bb84-a96f2fdb7cac", "metadata": {}, "source": [ "What is the accuracy?" ] }, { "cell_type": "markdown", "id": "3a7c5ec4-fcc5-4feb-a87f-24811e644873", "metadata": { "tags": [] }, "source": [ "## Exercise 2\n", "Replace the Random Forest with a Support Vector Machine\n", "(hint: use the `preprocessing.MinMaxScaler` to scale the input data)" ] }, { "cell_type": "code", "execution_count": null, "id": "d7b8afb3-15ab-434c-9293-44895a8b6cdc", "metadata": {}, "outputs": [], "source": [ "from sklearn.svm import SVC\n", "from sklearn import preprocessing" ] }, { "cell_type": "markdown", "id": "a46d633b-9c7b-4cb3-a221-19915b75699b", "metadata": {}, "source": [ "What is the accuracy?" ] }, { "cell_type": "code", "execution_count": null, "id": "76b1e8d6-05df-4d47-aff8-d4dcabaec18e", "metadata": {}, "outputs": [], "source": [ "y_predict = svmModel.predict(preprocessing.MinMaxScaler().fit_transform(x_test))\n", "print(confusion_matrix(y_test, y_predict))\n", "print('accuracy score: {}'.format(accuracy_score(y_test, y_predict)))" ] }, { "cell_type": "markdown", "id": "96e45eb6-b556-4449-9e16-ac50d9fab2a7", "metadata": {}, "source": [ "Prediction" ] }, { "cell_type": "code", "execution_count": null, "id": "655ca3c4-d2d5-4db7-b3a4-bc9d1d931756", "metadata": {}, "outputs": [], "source": [ "x = jim.np()\n", "x = x.reshape(jim.properties.nrOfPlane(), jim.properties.nrOfRow() * \\\n", " jim.properties.nrOfCol()).T\n", "\n", "\n", "jim_class = pj.Jim(ncol=jim.properties.nrOfCol(),\n", " nrow=jim.properties.nrOfRow(), otype='Byte')\n", "jim_class.properties.copyGeoReference(jim)\n", "jim_class.np()[:] = \n", "jim_class.properties.setDimension(['water'], 'band')" ] }, { "cell_type": "markdown", "id": "f3440c6c-ff2d-46b2-b2de-934e16a3c495", "metadata": {}, "source": [ "Plot" ] }, { "cell_type": "code", "execution_count": null, "id": "653c0ae1-ec5c-487d-a3f7-bc0c7a1d64f7", "metadata": {}, "outputs": [], "source": [ "plt.figure(figsize=(20, 10))\n", "ax1 = plt.subplot(121)\n", "jim.xr().NDVI.isel(time = 0).plot(ax = ax1)\n", "ax2 = plt.subplot(122)\n", "jim_class.xr().water.plot(cmap = 'Set2', levels = [1, 2], ax = ax2)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.12" } }, "nbformat": 4, "nbformat_minor": 5 }