AWS-Big-Data-Projects
diff --git a/‎.gitignore‎
Lines changed: 103 additions & 0 deletions b/‎.gitignore‎
Lines changed: 103 additions & 0 deletions
diff --git a/‎AWS_SageMaker/Demo.ipynb‎
Lines changed: 233 additions & 0 deletions b/‎AWS_SageMaker/Demo.ipynb‎
Lines changed: 233 additions & 0 deletions
@@ -0,0 +1,103 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# PyInstaller
+# Usually these files are written by a Python script from a template
+# before PyInstaller builds the exe, so as to inject the date and other information into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
@@ -0,0 +1,233 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Demo Notebook for SageMaker Endpoint\n",
+ "\n",
+ "- Demonstrates invoking a deployed SageMaker endpoint for forest fire cause prediction"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import boto3\n",
+ "\n",
+ "# Name of the endpoint that the invoke cell sends requests to.\n",
+ "endpoint_name = \"sagemaker/endpoint/model\"\n",
+ "\n",
+ "# SageMaker runtime client, pinned to the us-east-2 region.\n",
+ "runtime = boto3.Session().client(\n",
+ "    region_name='us-east-2',\n",
+ "    service_name='sagemaker-runtime',\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Preprocess the raw data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "# test data, just take first row for sample\n",
+ "bucket='your/bucket/name'\n",
+ "data_key = 'the/etl/output/test/set'\n",
+ "data_location = 's3://{}/{}'.format(bucket, data_key)\n",
+ "print(data_location)\n",
+ "\n",
+ "test_df = pd.read_csv(data_location)\n",
+ "test_df_orig = test_df.copy()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sample_data = test_df_orig.iloc[:2]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sample_data.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sklearn import tree, preprocessing\n",
+ "\n",
+ "# Preprocessing\n",
+ "# discovery_date is handled as a Julian day number: subtracting the Julian date of the\n",
+ "# Unix epoch leaves days since 1970-01-01, which to_datetime(unit='D') can parse.\n",
+ "test_df['DATE'] = pd.to_datetime(test_df['discovery_date'] - pd.Timestamp(0).to_julian_date(), unit='D')\n",
+ "test_df['MONTH'] = pd.DatetimeIndex(test_df['DATE']).month\n",
+ "# Series.dt.weekday_name was removed in pandas 1.0; day_name() yields the same English names.\n",
+ "test_df['DAY_OF_WEEK'] = test_df['DATE'].dt.day_name()\n",
+ "\n",
+ "# NOTE(review): the encoder is re-fit on the test set, so the integer codes depend on the\n",
+ "# values present in this file -- confirm they match the encoding used for training.\n",
+ "le = preprocessing.LabelEncoder()\n",
+ "test_df['STATE'] = le.fit_transform(test_df['state'])\n",
+ "test_df['DAY_OF_WEEK'] = le.fit_transform(test_df['DAY_OF_WEEK'])\n",
+ "\n",
+ "\n",
+ "def set_label(cat):\n",
+ "    \"\"\"Map a cause description to an integer cause category.\n",
+ "\n",
+ "    Returns 1 (natural), 2 (accidental), 3 (malicious) or 4 (anything else,\n",
+ "    e.g. 'Missing/Undefined' or 'Miscellaneous').\n",
+ "    \"\"\"\n",
+ "    natural = ['Lightning']\n",
+ "    accidental = ['Structure', 'Fireworks', 'Powerline', 'Railroad', 'Smoking',\n",
+ "                  'Children', 'Campfire', 'Equipment Use', 'Debris Burning']\n",
+ "    malicious = ['Arson']\n",
+ "    if cat in natural:\n",
+ "        return 1\n",
+ "    if cat in accidental:\n",
+ "        return 2\n",
+ "    if cat in malicious:\n",
+ "        return 3\n",
+ "    # Every remaining value falls into the catch-all category.\n",
+ "    return 4\n",
+ "\n",
+ "\n",
+ "# test_df_orig (copied when the CSV was loaded) still holds the untouched columns.\n",
+ "test_df['LABEL'] = test_df['stat_cause_descr'].apply(set_label)\n",
+ "# Drop the label source and the raw columns that were replaced or are not used as features.\n",
+ "test_df = test_df.drop(['stat_cause_descr', 'state', 'fire_size_class',\n",
+ "                        'discovery_date', 'DATE', 'cont_date'], axis=1)\n",
+ "test_df = test_df.dropna()\n",
+ "\n",
+ "# First two rows that remain after dropna(): feature matrix and matching labels.\n",
+ "sample_test_X = test_df.drop(['LABEL'], axis=1).values[:2]\n",
+ "sample_test_y = test_df['LABEL'].values[:2]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sample_data.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "print(sample_test_X)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Makes sense, because Lightning is encoded to label/category 1 as natural disasters\n",
+ "sample_test_y"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_payload = pd.DataFrame(sample_test_X)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import io\n",
+ "\n",
+ "# Serialize the sample rows to an in-memory CSV with no header row and no index column.\n",
+ "# to_csv documents header/index as booleans, so pass False rather than relying on None.\n",
+ "payload_file = io.StringIO()\n",
+ "df_payload.to_csv(payload_file, header=False, index=False)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Invoke Endpoint\n",
+ "\n",
+ "- SageMaker Scikit-learn model server provides a default implementation of input_fn. This function deserializes JSON, CSV, or NPY encoded data into a NumPy array."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# text/csv didn't work because of reshaping (sklearn requires 2 dimensions)\n",
+ "# application/jsonlines not available\n",
+ "# application/json tries to convert JSON to float\n",
+ "\n",
+ "response = runtime.invoke_endpoint(EndpointName=endpoint_name, \n",
+ " ContentType='text/csv', \n",
+ " Body=payload_file.getvalue())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Print predictions\n",
+ "print(response['Body'].read().decode())\n",
+ "print(\"[natural, natural]\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Print actual labels\n",
+ "# A bare expression is only displayed when it is the last line of a cell, so print it explicitly.\n",
+ "print(sample_test_y.tolist())\n",
+ "# Hand-written reading of the labels above.\n",
+ "print(\"[natural, misc.]\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.5"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}