chrischia06
diff --git a/‎.DS_Store‎
10 KB b/‎.DS_Store‎
10 KB
diff --git a/‎.gitattributes‎
Lines changed: 2 additions & 0 deletions b/‎.gitattributes‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎.ipynb_checkpoints/Feature Engineering-checkpoint.ipynb‎
Lines changed: 239 additions & 0 deletions b/‎.ipynb_checkpoints/Feature Engineering-checkpoint.ipynb‎
Lines changed: 239 additions & 0 deletions
@@ -0,0 +1,2 @@
+.csv filter=lfs diff=lfs merge=lfs -text
+*.csv filter=lfs diff=lfs merge=lfs -text
@@ -0,0 +1,239 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "import matplotlib.pyplot as plt\n",
+ "import seaborn as sns\n",
+ "%matplotlib inline\n",
+ "\n",
+ "pd.options.display.max_rows=100\n",
+ "pd.options.display.max_columns=100"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df = pd.read_csv(\"data-training.csv\")\n",
+ "# Zero-fill missing values in positional columns 15..29 only; every other column keeps its NaNs.\n",
+ "size_block = slice(15, 30)\n",
+ "df.iloc[:, size_block] = df.iloc[:, size_block].fillna(0)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Hand seaborn the labelled correlation frame (not the bare .values array)\n",
+ "# so both heatmap axes are annotated with the column names.\n",
+ "sns.heatmap(df.corr())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Correlation between the level-0 ask rate and the level-0 ask size.\n",
+ "ask_rate0 = df['askRate0']\n",
+ "ask_size0 = df['askSize0']\n",
+ "ask_rate0.corr(ask_size0)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Summary statistics for the columns of df.\n",
+ "summary_stats = df.describe()\n",
+ "summary_stats"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Count each distinct value of y once, ordered by value, then draw the bars.\n",
+ "# (Previously value_counts().sort_index() was evaluated twice.)\n",
+ "y_counts = df['y'].value_counts().sort_index()\n",
+ "plt.bar(y_counts.index, y_counts.values)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Plot y against the row index on an explicitly created 20x10 figure.\n",
+ "fig, ax = plt.subplots(figsize=(20, 10))\n",
+ "df['y'].plot(ax=ax)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Share of level-0 size that sits on the ask side: askSize0 / (askSize0 + bidSize0).\n",
+ "ask_size0 = df['askSize0']\n",
+ "bid_size0 = df['bidSize0']\n",
+ "ask_fraction = ask_size0 / (ask_size0 + bid_size0)\n",
+ "ask_fraction.hist(bins=100)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Average of askRate0 and bidRate0, first-differenced, correlated with y.\n",
+ "mid_rate = (df['askRate0'] + df['bidRate0']) / 2\n",
+ "mid_rate_change = mid_rate.diff(1)\n",
+ "mid_rate_change.corr(df['y'])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Scale askSize0..9 and bidSize0..9 by their own column maximum and keep the\n",
+ "# maxima so the same scaling can be reapplied later.\n",
+ "# maxVolumes was never initialised, so this cell raised NameError on a fresh kernel.\n",
+ "maxVolumes = {}  # column name -> maximum value seen in that column\n",
+ "for i in range(10):\n",
+ "    for side in ('askSize', 'bidSize'):\n",
+ "        col = side + str(i)\n",
+ "        # Look the column up by name (not by position) so the maximum always\n",
+ "        # belongs to the column that is being divided.\n",
+ "        maxVolumes[col] = df[col].max()\n",
+ "        df[col] /= maxVolumes[col]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "scrolled": false
+ },
+ "outputs": [],
+ "source": [
+ "# Median difference between askRate5 and askRate4.\n",
+ "level_gap = df['askRate5'] - df['askRate4']\n",
+ "level_gap.median()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Walk askRate1..askRate14 in order; where a level is missing, use the previous\n",
+ "# level's value plus 0.5 (the previous level may itself have just been filled).\n",
+ "for level in range(1, 15):\n",
+ "    rate_col = 'askRate' + str(level)\n",
+ "    prev_col = 'askRate' + str(level - 1)\n",
+ "    missing = df[rate_col].isna()\n",
+ "    df.loc[missing, rate_col] = df.loc[missing, prev_col] + 0.5"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sklearn.decomposition import PCA\n",
+ "\n",
+ "# Positional columns 15..22 are the inputs to the decomposition.\n",
+ "ask_size_block = df.iloc[:, 15:23]\n",
+ "pca = PCA()\n",
+ "askVolumes = pca.fit_transform(ask_size_block)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Scatter of bidRate0 (x axis) against y (y axis).\n",
+ "x_vals, y_vals = df['bidRate0'], df['y']\n",
+ "plt.scatter(x_vals, y_vals)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0.001696283864909054"
+ ]
+ },
+ "execution_count": 15,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from sklearn.linear_model import LinearRegression\n",
+ "from sklearn.model_selection import cross_val_score\n",
+ "\n",
+ "# 3-fold cross-validated score of a plain linear model fitted on askVolumes.\n",
+ "target = df['y'].values.reshape(-1,)\n",
+ "lr = LinearRegression()\n",
+ "scores = cross_val_score(lr, askVolumes, target, cv=3)\n",
+ "scores.mean()\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "askSize0 -0.044874\n",
+ "askSize1 -0.029520\n",
+ "askSize2 -0.018320\n",
+ "askSize3 -0.014050\n",
+ "askSize4 -0.018614\n",
+ "askSize5 -0.015285\n",
+ "askSize6 -0.013968\n",
+ "askSize7 -0.009049\n",
+ "askSize8 -0.005798\n",
+ "askSize9 -0.001430\n",
+ "askSize10 0.001433\n",
+ "askSize11 0.002797\n",
+ "askSize12 0.004413\n",
+ "askSize13 -0.001598\n",
+ "askSize14 -0.002608\n",
+ "dtype: float64"
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Correlation of each of positional columns 15..29 with y.\n",
+ "ask_size_cols = df.iloc[:, 15:30]\n",
+ "ask_size_cols.corrwith(df['y'])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.7.4"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+.csv filter=lfs diff=lfs merge=lfs -text`
	`2`	`+*.csv filter=lfs diff=lfs merge=lfs -text`