Prise de note cours 2 + mise au propre

2018-12-19 18:23:17 +01:00 · 2018-12-19 18:23:17 +01:00 · e261a531bb
parent 46fe592e96
commit e261a531bb
9 changed files with 1676 additions and 2 deletions
--- a/Applications/Exercices/TP1/.ipynb_checkpoints/Exercice
+++ b/Applications/Exercices/TP1/.ipynb_checkpoints/Exercice
--- a/Applications/Exercices/TP1/.ipynb_checkpoints/Exercice
+++ b/Applications/Exercices/TP1/.ipynb_checkpoints/Exercice
--- a/Applications/Exercices/TP1/Exercice
+++ b/Applications/Exercices/TP1/Exercice
--- a/Applications/Exercices/TP1/Exercice
+++ b/Applications/Exercices/TP1/Exercice
--- a/Applications/Exercices/TP2/.ipynb_checkpoints/Exercice
+++ b/Applications/Exercices/TP2/.ipynb_checkpoints/Exercice
@ -0,0 +1,326 @@
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'data': array([[6.3200e-03, 1.8000e+01, 2.3100e+00, ..., 1.5300e+01, 3.9690e+02,\n",
       "         4.9800e+00],\n",
       "        [2.7310e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9690e+02,\n",
       "         9.1400e+00],\n",
       "        [2.7290e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9283e+02,\n",
       "         4.0300e+00],\n",
       "        ...,\n",
       "        [6.0760e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,\n",
       "         5.6400e+00],\n",
       "        [1.0959e-01, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9345e+02,\n",
       "         6.4800e+00],\n",
       "        [4.7410e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,\n",
       "         7.8800e+00]]),\n",
       " 'target': array([24. , 21.6, 34.7, 33.4, 36.2, 28.7, 22.9, 27.1, 16.5, 18.9, 15. ,\n",
       "        18.9, 21.7, 20.4, 18.2, 19.9, 23.1, 17.5, 20.2, 18.2, 13.6, 19.6,\n",
       "        15.2, 14.5, 15.6, 13.9, 16.6, 14.8, 18.4, 21. , 12.7, 14.5, 13.2,\n",
       "        13.1, 13.5, 18.9, 20. , 21. , 24.7, 30.8, 34.9, 26.6, 25.3, 24.7,\n",
       "        21.2, 19.3, 20. , 16.6, 14.4, 19.4, 19.7, 20.5, 25. , 23.4, 18.9,\n",
       "        35.4, 24.7, 31.6, 23.3, 19.6, 18.7, 16. , 22.2, 25. , 33. , 23.5,\n",
       "        19.4, 22. , 17.4, 20.9, 24.2, 21.7, 22.8, 23.4, 24.1, 21.4, 20. ,\n",
       "        20.8, 21.2, 20.3, 28. , 23.9, 24.8, 22.9, 23.9, 26.6, 22.5, 22.2,\n",
       "        23.6, 28.7, 22.6, 22. , 22.9, 25. , 20.6, 28.4, 21.4, 38.7, 43.8,\n",
       "        33.2, 27.5, 26.5, 18.6, 19.3, 20.1, 19.5, 19.5, 20.4, 19.8, 19.4,\n",
       "        21.7, 22.8, 18.8, 18.7, 18.5, 18.3, 21.2, 19.2, 20.4, 19.3, 22. ,\n",
       "        20.3, 20.5, 17.3, 18.8, 21.4, 15.7, 16.2, 18. , 14.3, 19.2, 19.6,\n",
       "        23. , 18.4, 15.6, 18.1, 17.4, 17.1, 13.3, 17.8, 14. , 14.4, 13.4,\n",
       "        15.6, 11.8, 13.8, 15.6, 14.6, 17.8, 15.4, 21.5, 19.6, 15.3, 19.4,\n",
       "        17. , 15.6, 13.1, 41.3, 24.3, 23.3, 27. , 50. , 50. , 50. , 22.7,\n",
       "        25. , 50. , 23.8, 23.8, 22.3, 17.4, 19.1, 23.1, 23.6, 22.6, 29.4,\n",
       "        23.2, 24.6, 29.9, 37.2, 39.8, 36.2, 37.9, 32.5, 26.4, 29.6, 50. ,\n",
       "        32. , 29.8, 34.9, 37. , 30.5, 36.4, 31.1, 29.1, 50. , 33.3, 30.3,\n",
       "        34.6, 34.9, 32.9, 24.1, 42.3, 48.5, 50. , 22.6, 24.4, 22.5, 24.4,\n",
       "        20. , 21.7, 19.3, 22.4, 28.1, 23.7, 25. , 23.3, 28.7, 21.5, 23. ,\n",
       "        26.7, 21.7, 27.5, 30.1, 44.8, 50. , 37.6, 31.6, 46.7, 31.5, 24.3,\n",
       "        31.7, 41.7, 48.3, 29. , 24. , 25.1, 31.5, 23.7, 23.3, 22. , 20.1,\n",
       "        22.2, 23.7, 17.6, 18.5, 24.3, 20.5, 24.5, 26.2, 24.4, 24.8, 29.6,\n",
       "        42.8, 21.9, 20.9, 44. , 50. , 36. , 30.1, 33.8, 43.1, 48.8, 31. ,\n",
       "        36.5, 22.8, 30.7, 50. , 43.5, 20.7, 21.1, 25.2, 24.4, 35.2, 32.4,\n",
       "        32. , 33.2, 33.1, 29.1, 35.1, 45.4, 35.4, 46. , 50. , 32.2, 22. ,\n",
       "        20.1, 23.2, 22.3, 24.8, 28.5, 37.3, 27.9, 23.9, 21.7, 28.6, 27.1,\n",
       "        20.3, 22.5, 29. , 24.8, 22. , 26.4, 33.1, 36.1, 28.4, 33.4, 28.2,\n",
       "        22.8, 20.3, 16.1, 22.1, 19.4, 21.6, 23.8, 16.2, 17.8, 19.8, 23.1,\n",
       "        21. , 23.8, 23.1, 20.4, 18.5, 25. , 24.6, 23. , 22.2, 19.3, 22.6,\n",
       "        19.8, 17.1, 19.4, 22.2, 20.7, 21.1, 19.5, 18.5, 20.6, 19. , 18.7,\n",
       "        32.7, 16.5, 23.9, 31.2, 17.5, 17.2, 23.1, 24.5, 26.6, 22.9, 24.1,\n",
       "        18.6, 30.1, 18.2, 20.6, 17.8, 21.7, 22.7, 22.6, 25. , 19.9, 20.8,\n",
       "        16.8, 21.9, 27.5, 21.9, 23.1, 50. , 50. , 50. , 50. , 50. , 13.8,\n",
       "        13.8, 15. , 13.9, 13.3, 13.1, 10.2, 10.4, 10.9, 11.3, 12.3,  8.8,\n",
       "         7.2, 10.5,  7.4, 10.2, 11.5, 15.1, 23.2,  9.7, 13.8, 12.7, 13.1,\n",
       "        12.5,  8.5,  5. ,  6.3,  5.6,  7.2, 12.1,  8.3,  8.5,  5. , 11.9,\n",
       "        27.9, 17.2, 27.5, 15. , 17.2, 17.9, 16.3,  7. ,  7.2,  7.5, 10.4,\n",
       "         8.8,  8.4, 16.7, 14.2, 20.8, 13.4, 11.7,  8.3, 10.2, 10.9, 11. ,\n",
       "         9.5, 14.5, 14.1, 16.1, 14.3, 11.7, 13.4,  9.6,  8.7,  8.4, 12.8,\n",
       "        10.5, 17.1, 18.4, 15.4, 10.8, 11.8, 14.9, 12.6, 14.1, 13. , 13.4,\n",
       "        15.2, 16.1, 17.8, 14.9, 14.1, 12.7, 13.5, 14.9, 20. , 16.4, 17.7,\n",
       "        19.5, 20.2, 21.4, 19.9, 19. , 19.1, 19.1, 20.1, 19.9, 19.6, 23.2,\n",
       "        29.8, 13.8, 13.3, 16.7, 12. , 14.6, 21.4, 23. , 23.7, 25. , 21.8,\n",
       "        20.6, 21.2, 19.1, 20.6, 15.2,  7. ,  8.1, 13.6, 20.1, 21.8, 24.5,\n",
       "        23.1, 19.7, 18.3, 21.2, 17.5, 16.8, 22.4, 20.6, 23.9, 22. , 11.9]),\n",
       " 'feature_names': array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD',\n",
       "        'TAX', 'PTRATIO', 'B', 'LSTAT'], dtype='<U7'),\n",
       " 'DESCR': \"Boston House Prices dataset\\n===========================\\n\\nNotes\\n------\\nData Set Characteristics:  \\n\\n    :Number of Instances: 506 \\n\\n    :Number of Attributes: 13 numeric/categorical predictive\\n    \\n    :Median Value (attribute 14) is usually the target\\n\\n    :Attribute Information (in order):\\n        - CRIM     per capita crime rate by town\\n        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.\\n        - INDUS    proportion of non-retail business acres per town\\n        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)\\n        - NOX      nitric oxides concentration (parts per 10 million)\\n        - RM       average number of rooms per dwelling\\n        - AGE      proportion of owner-occupied units built prior to 1940\\n        - DIS      weighted distances to five Boston employment centres\\n        - RAD      index of accessibility to radial highways\\n        - TAX      full-value property-tax rate per $10,000\\n        - PTRATIO  pupil-teacher ratio by town\\n        - B        1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town\\n        - LSTAT    % lower status of the population\\n        - MEDV     Median value of owner-occupied homes in $1000's\\n\\n    :Missing Attribute Values: None\\n\\n    :Creator: Harrison, D. and Rubinfeld, D.L.\\n\\nThis is a copy of UCI ML housing dataset.\\nhttp://archive.ics.uci.edu/ml/datasets/Housing\\n\\n\\nThis dataset was taken from the StatLib library which is maintained at Carnegie Mellon University.\\n\\nThe Boston house-price data of Harrison, D. and Rubinfeld, D.L. 'Hedonic\\nprices and the demand for clean air', J. Environ. Economics & Management,\\nvol.5, 81-102, 1978.   Used in Belsley, Kuh & Welsch, 'Regression diagnostics\\n...', Wiley, 1980.   N.B. Various transformations are used in the table on\\npages 244-261 of the latter.\\n\\nThe Boston house-price data has been used in many machine learning papers that address regression\\nproblems.   \\n     \\n**References**\\n\\n   - Belsley, Kuh & Welsch, 'Regression diagnostics: Identifying Influential Data and Sources of Collinearity', Wiley, 1980. 244-261.\\n   - Quinlan,R. (1993). Combining Instance-Based and Model-Based Learning. In Proceedings on the Tenth International Conference of Machine Learning, 236-243, University of Massachusetts, Amherst. Morgan Kaufmann.\\n   - many more! (see http://archive.ics.uci.edu/ml/datasets/Housing)\\n\"}"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import matplotlib\n",
    "\n",
    "matplotlib.use(\"TkAgg\")\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "\n",
    "from sklearn.datasets import load_boston\n",
    "from sklearn.linear_model import LinearRegression\n",
    "\n",
    "boston = load_boston()\n",
    "\n",
    "boston\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "dict_keys(['data', 'target', 'feature_names', 'DESCR'])"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "boston.keys()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "\"Boston House Prices dataset\\n===========================\\n\\nNotes\\n------\\nData Set Characteristics:  \\n\\n    :Number of Instances: 506 \\n\\n    :Number of Attributes: 13 numeric/categorical predictive\\n    \\n    :Median Value (attribute 14) is usually the target\\n\\n    :Attribute Information (in order):\\n        - CRIM     per capita crime rate by town\\n        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.\\n        - INDUS    proportion of non-retail business acres per town\\n        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)\\n        - NOX      nitric oxides concentration (parts per 10 million)\\n        - RM       average number of rooms per dwelling\\n        - AGE      proportion of owner-occupied units built prior to 1940\\n        - DIS      weighted distances to five Boston employment centres\\n        - RAD      index of accessibility to radial highways\\n        - TAX      full-value property-tax rate per $10,000\\n        - PTRATIO  pupil-teacher ratio by town\\n        - B        1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town\\n        - LSTAT    % lower status of the population\\n        - MEDV     Median value of owner-occupied homes in $1000's\\n\\n    :Missing Attribute Values: None\\n\\n    :Creator: Harrison, D. and Rubinfeld, D.L.\\n\\nThis is a copy of UCI ML housing dataset.\\nhttp://archive.ics.uci.edu/ml/datasets/Housing\\n\\n\\nThis dataset was taken from the StatLib library which is maintained at Carnegie Mellon University.\\n\\nThe Boston house-price data of Harrison, D. and Rubinfeld, D.L. 'Hedonic\\nprices and the demand for clean air', J. Environ. Economics & Management,\\nvol.5, 81-102, 1978.   Used in Belsley, Kuh & Welsch, 'Regression diagnostics\\n...', Wiley, 1980.   N.B. Various transformations are used in the table on\\npages 244-261 of the latter.\\n\\nThe Boston house-price data has been used in many machine learning papers that address regression\\nproblems.   \\n     \\n**References**\\n\\n   - Belsley, Kuh & Welsch, 'Regression diagnostics: Identifying Influential Data and Sources of Collinearity', Wiley, 1980. 244-261.\\n   - Quinlan,R. (1993). Combining Instance-Based and Model-Based Learning. In Proceedings on the Tenth International Conference of Machine Learning, 236-243, University of Massachusetts, Amherst. Morgan Kaufmann.\\n   - many more! (see http://archive.ics.uci.edu/ml/datasets/Housing)\\n\""
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "boston['DESCR']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD',\n",
       "       'TAX', 'PTRATIO', 'B', 'LSTAT'], dtype='<U7')"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "boston['feature_names']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(506, 13)"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X = boston['data']\n",
    "Y = boston['target']\n",
    "X.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(506,)"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "Y.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(300, 13)"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "training = X[0:300]\n",
    "validation = X[300:400]\n",
    "test = X[400:]\n",
    "training.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(100, 13)"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "validation.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(106, 13)"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 2. What is the range of the target variable? Describe the target variable?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "Y.shape\n",
    "# Y il the median value of owner occupied homes in 1000$ ??"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 3. Find three different ways of normalizing the target variable and write the their associated functions."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# plot target VS CRIM\n",
    "plt.plot(X[:,0], Y, 'bc')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
 }
--- a/Applications/Exercices/TP2/Exercice
+++ b/Applications/Exercices/TP2/Exercice
@ -0,0 +1,380 @@
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'data': array([[6.3200e-03, 1.8000e+01, 2.3100e+00, ..., 1.5300e+01, 3.9690e+02,\n",
       "         4.9800e+00],\n",
       "        [2.7310e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9690e+02,\n",
       "         9.1400e+00],\n",
       "        [2.7290e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9283e+02,\n",
       "         4.0300e+00],\n",
       "        ...,\n",
       "        [6.0760e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,\n",
       "         5.6400e+00],\n",
       "        [1.0959e-01, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9345e+02,\n",
       "         6.4800e+00],\n",
       "        [4.7410e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,\n",
       "         7.8800e+00]]),\n",
       " 'target': array([24. , 21.6, 34.7, 33.4, 36.2, 28.7, 22.9, 27.1, 16.5, 18.9, 15. ,\n",
       "        18.9, 21.7, 20.4, 18.2, 19.9, 23.1, 17.5, 20.2, 18.2, 13.6, 19.6,\n",
       "        15.2, 14.5, 15.6, 13.9, 16.6, 14.8, 18.4, 21. , 12.7, 14.5, 13.2,\n",
       "        13.1, 13.5, 18.9, 20. , 21. , 24.7, 30.8, 34.9, 26.6, 25.3, 24.7,\n",
       "        21.2, 19.3, 20. , 16.6, 14.4, 19.4, 19.7, 20.5, 25. , 23.4, 18.9,\n",
       "        35.4, 24.7, 31.6, 23.3, 19.6, 18.7, 16. , 22.2, 25. , 33. , 23.5,\n",
       "        19.4, 22. , 17.4, 20.9, 24.2, 21.7, 22.8, 23.4, 24.1, 21.4, 20. ,\n",
       "        20.8, 21.2, 20.3, 28. , 23.9, 24.8, 22.9, 23.9, 26.6, 22.5, 22.2,\n",
       "        23.6, 28.7, 22.6, 22. , 22.9, 25. , 20.6, 28.4, 21.4, 38.7, 43.8,\n",
       "        33.2, 27.5, 26.5, 18.6, 19.3, 20.1, 19.5, 19.5, 20.4, 19.8, 19.4,\n",
       "        21.7, 22.8, 18.8, 18.7, 18.5, 18.3, 21.2, 19.2, 20.4, 19.3, 22. ,\n",
       "        20.3, 20.5, 17.3, 18.8, 21.4, 15.7, 16.2, 18. , 14.3, 19.2, 19.6,\n",
       "        23. , 18.4, 15.6, 18.1, 17.4, 17.1, 13.3, 17.8, 14. , 14.4, 13.4,\n",
       "        15.6, 11.8, 13.8, 15.6, 14.6, 17.8, 15.4, 21.5, 19.6, 15.3, 19.4,\n",
       "        17. , 15.6, 13.1, 41.3, 24.3, 23.3, 27. , 50. , 50. , 50. , 22.7,\n",
       "        25. , 50. , 23.8, 23.8, 22.3, 17.4, 19.1, 23.1, 23.6, 22.6, 29.4,\n",
       "        23.2, 24.6, 29.9, 37.2, 39.8, 36.2, 37.9, 32.5, 26.4, 29.6, 50. ,\n",
       "        32. , 29.8, 34.9, 37. , 30.5, 36.4, 31.1, 29.1, 50. , 33.3, 30.3,\n",
       "        34.6, 34.9, 32.9, 24.1, 42.3, 48.5, 50. , 22.6, 24.4, 22.5, 24.4,\n",
       "        20. , 21.7, 19.3, 22.4, 28.1, 23.7, 25. , 23.3, 28.7, 21.5, 23. ,\n",
       "        26.7, 21.7, 27.5, 30.1, 44.8, 50. , 37.6, 31.6, 46.7, 31.5, 24.3,\n",
       "        31.7, 41.7, 48.3, 29. , 24. , 25.1, 31.5, 23.7, 23.3, 22. , 20.1,\n",
       "        22.2, 23.7, 17.6, 18.5, 24.3, 20.5, 24.5, 26.2, 24.4, 24.8, 29.6,\n",
       "        42.8, 21.9, 20.9, 44. , 50. , 36. , 30.1, 33.8, 43.1, 48.8, 31. ,\n",
       "        36.5, 22.8, 30.7, 50. , 43.5, 20.7, 21.1, 25.2, 24.4, 35.2, 32.4,\n",
       "        32. , 33.2, 33.1, 29.1, 35.1, 45.4, 35.4, 46. , 50. , 32.2, 22. ,\n",
       "        20.1, 23.2, 22.3, 24.8, 28.5, 37.3, 27.9, 23.9, 21.7, 28.6, 27.1,\n",
       "        20.3, 22.5, 29. , 24.8, 22. , 26.4, 33.1, 36.1, 28.4, 33.4, 28.2,\n",
       "        22.8, 20.3, 16.1, 22.1, 19.4, 21.6, 23.8, 16.2, 17.8, 19.8, 23.1,\n",
       "        21. , 23.8, 23.1, 20.4, 18.5, 25. , 24.6, 23. , 22.2, 19.3, 22.6,\n",
       "        19.8, 17.1, 19.4, 22.2, 20.7, 21.1, 19.5, 18.5, 20.6, 19. , 18.7,\n",
       "        32.7, 16.5, 23.9, 31.2, 17.5, 17.2, 23.1, 24.5, 26.6, 22.9, 24.1,\n",
       "        18.6, 30.1, 18.2, 20.6, 17.8, 21.7, 22.7, 22.6, 25. , 19.9, 20.8,\n",
       "        16.8, 21.9, 27.5, 21.9, 23.1, 50. , 50. , 50. , 50. , 50. , 13.8,\n",
       "        13.8, 15. , 13.9, 13.3, 13.1, 10.2, 10.4, 10.9, 11.3, 12.3,  8.8,\n",
       "         7.2, 10.5,  7.4, 10.2, 11.5, 15.1, 23.2,  9.7, 13.8, 12.7, 13.1,\n",
       "        12.5,  8.5,  5. ,  6.3,  5.6,  7.2, 12.1,  8.3,  8.5,  5. , 11.9,\n",
       "        27.9, 17.2, 27.5, 15. , 17.2, 17.9, 16.3,  7. ,  7.2,  7.5, 10.4,\n",
       "         8.8,  8.4, 16.7, 14.2, 20.8, 13.4, 11.7,  8.3, 10.2, 10.9, 11. ,\n",
       "         9.5, 14.5, 14.1, 16.1, 14.3, 11.7, 13.4,  9.6,  8.7,  8.4, 12.8,\n",
       "        10.5, 17.1, 18.4, 15.4, 10.8, 11.8, 14.9, 12.6, 14.1, 13. , 13.4,\n",
       "        15.2, 16.1, 17.8, 14.9, 14.1, 12.7, 13.5, 14.9, 20. , 16.4, 17.7,\n",
       "        19.5, 20.2, 21.4, 19.9, 19. , 19.1, 19.1, 20.1, 19.9, 19.6, 23.2,\n",
       "        29.8, 13.8, 13.3, 16.7, 12. , 14.6, 21.4, 23. , 23.7, 25. , 21.8,\n",
       "        20.6, 21.2, 19.1, 20.6, 15.2,  7. ,  8.1, 13.6, 20.1, 21.8, 24.5,\n",
       "        23.1, 19.7, 18.3, 21.2, 17.5, 16.8, 22.4, 20.6, 23.9, 22. , 11.9]),\n",
       " 'feature_names': array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD',\n",
       "        'TAX', 'PTRATIO', 'B', 'LSTAT'], dtype='<U7'),\n",
       " 'DESCR': \"Boston House Prices dataset\\n===========================\\n\\nNotes\\n------\\nData Set Characteristics:  \\n\\n    :Number of Instances: 506 \\n\\n    :Number of Attributes: 13 numeric/categorical predictive\\n    \\n    :Median Value (attribute 14) is usually the target\\n\\n    :Attribute Information (in order):\\n        - CRIM     per capita crime rate by town\\n        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.\\n        - INDUS    proportion of non-retail business acres per town\\n        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)\\n        - NOX      nitric oxides concentration (parts per 10 million)\\n        - RM       average number of rooms per dwelling\\n        - AGE      proportion of owner-occupied units built prior to 1940\\n        - DIS      weighted distances to five Boston employment centres\\n        - RAD      index of accessibility to radial highways\\n        - TAX      full-value property-tax rate per $10,000\\n        - PTRATIO  pupil-teacher ratio by town\\n        - B        1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town\\n        - LSTAT    % lower status of the population\\n        - MEDV     Median value of owner-occupied homes in $1000's\\n\\n    :Missing Attribute Values: None\\n\\n    :Creator: Harrison, D. and Rubinfeld, D.L.\\n\\nThis is a copy of UCI ML housing dataset.\\nhttp://archive.ics.uci.edu/ml/datasets/Housing\\n\\n\\nThis dataset was taken from the StatLib library which is maintained at Carnegie Mellon University.\\n\\nThe Boston house-price data of Harrison, D. and Rubinfeld, D.L. 'Hedonic\\nprices and the demand for clean air', J. Environ. Economics & Management,\\nvol.5, 81-102, 1978.   Used in Belsley, Kuh & Welsch, 'Regression diagnostics\\n...', Wiley, 1980.   N.B. Various transformations are used in the table on\\npages 244-261 of the latter.\\n\\nThe Boston house-price data has been used in many machine learning papers that address regression\\nproblems.   \\n     \\n**References**\\n\\n   - Belsley, Kuh & Welsch, 'Regression diagnostics: Identifying Influential Data and Sources of Collinearity', Wiley, 1980. 244-261.\\n   - Quinlan,R. (1993). Combining Instance-Based and Model-Based Learning. In Proceedings on the Tenth International Conference of Machine Learning, 236-243, University of Massachusetts, Amherst. Morgan Kaufmann.\\n   - many more! (see http://archive.ics.uci.edu/ml/datasets/Housing)\\n\"}"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import matplotlib\n",
    "\n",
    "matplotlib.use(\"TkAgg\")\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "\n",
    "from sklearn.datasets import load_boston\n",
    "from sklearn.linear_model import LinearRegression\n",
    "\n",
    "boston = load_boston()\n",
    "\n",
    "boston\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "dict_keys(['data', 'target', 'feature_names', 'DESCR'])"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "boston.keys()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "\"Boston House Prices dataset\\n===========================\\n\\nNotes\\n------\\nData Set Characteristics:  \\n\\n    :Number of Instances: 506 \\n\\n    :Number of Attributes: 13 numeric/categorical predictive\\n    \\n    :Median Value (attribute 14) is usually the target\\n\\n    :Attribute Information (in order):\\n        - CRIM     per capita crime rate by town\\n        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.\\n        - INDUS    proportion of non-retail business acres per town\\n        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)\\n        - NOX      nitric oxides concentration (parts per 10 million)\\n        - RM       average number of rooms per dwelling\\n        - AGE      proportion of owner-occupied units built prior to 1940\\n        - DIS      weighted distances to five Boston employment centres\\n        - RAD      index of accessibility to radial highways\\n        - TAX      full-value property-tax rate per $10,000\\n        - PTRATIO  pupil-teacher ratio by town\\n        - B        1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town\\n        - LSTAT    % lower status of the population\\n        - MEDV     Median value of owner-occupied homes in $1000's\\n\\n    :Missing Attribute Values: None\\n\\n    :Creator: Harrison, D. and Rubinfeld, D.L.\\n\\nThis is a copy of UCI ML housing dataset.\\nhttp://archive.ics.uci.edu/ml/datasets/Housing\\n\\n\\nThis dataset was taken from the StatLib library which is maintained at Carnegie Mellon University.\\n\\nThe Boston house-price data of Harrison, D. and Rubinfeld, D.L. 'Hedonic\\nprices and the demand for clean air', J. Environ. Economics & Management,\\nvol.5, 81-102, 1978.   Used in Belsley, Kuh & Welsch, 'Regression diagnostics\\n...', Wiley, 1980.   N.B. Various transformations are used in the table on\\npages 244-261 of the latter.\\n\\nThe Boston house-price data has been used in many machine learning papers that address regression\\nproblems.   \\n     \\n**References**\\n\\n   - Belsley, Kuh & Welsch, 'Regression diagnostics: Identifying Influential Data and Sources of Collinearity', Wiley, 1980. 244-261.\\n   - Quinlan,R. (1993). Combining Instance-Based and Model-Based Learning. In Proceedings on the Tenth International Conference of Machine Learning, 236-243, University of Massachusetts, Amherst. Morgan Kaufmann.\\n   - many more! (see http://archive.ics.uci.edu/ml/datasets/Housing)\\n\""
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "boston['DESCR']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD',\n",
       "       'TAX', 'PTRATIO', 'B', 'LSTAT'], dtype='<U7')"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "boston['feature_names']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(506, 13)"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X = boston['data']\n",
    "Y = boston['target']\n",
    "X.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(506,)"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "Y.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(300, 13)"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "training = X[0:300]\n",
    "validation = X[300:400]\n",
    "test = X[400:]\n",
    "training.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(100, 13)"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "validation.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(106, 13)"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 2. What is the range of the target variable? Describe the target variable?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(506,)"
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "Y.shape\n",
    "# Y il the median value of owner occupied homes in 1000$ ??"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 3. Find three different ways of normalizing the target variable and write the their associated functions."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [
    {
     "ename": "KeyboardInterrupt",
     "evalue": "",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mKeyboardInterrupt\u001b[0m                         Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-44-e9075341953f>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[0;31m# plot target VS CRIM\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      2\u001b[0m \u001b[0mplt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mscatter\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mY\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0mplt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshow\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[0;32m~/applications/anaconda3/lib/python3.7/site-packages/matplotlib/pyplot.py\u001b[0m in \u001b[0;36mshow\u001b[0;34m(*args, **kw)\u001b[0m\n\u001b[1;32m    251\u001b[0m     \"\"\"\n\u001b[1;32m    252\u001b[0m     \u001b[0;32mglobal\u001b[0m \u001b[0m_show\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 253\u001b[0;31m     \u001b[0;32mreturn\u001b[0m \u001b[0m_show\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkw\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    254\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    255\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/applications/anaconda3/lib/python3.7/site-packages/matplotlib/backend_bases.py\u001b[0m in \u001b[0;36mshow\u001b[0;34m(cls, block)\u001b[0m\n\u001b[1;32m    206\u001b[0m                 \u001b[0mblock\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    207\u001b[0m         \u001b[0;32mif\u001b[0m \u001b[0mblock\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 208\u001b[0;31m             \u001b[0mcls\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmainloop\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    209\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    210\u001b[0m     \u001b[0;31m# This method is the one actually exporting the required methods.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/applications/anaconda3/lib/python3.7/site-packages/matplotlib/backends/_backend_tk.py\u001b[0m in \u001b[0;36mmainloop\u001b[0;34m()\u001b[0m\n\u001b[1;32m   1073\u001b[0m     \u001b[0;34m@\u001b[0m\u001b[0mstaticmethod\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1074\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0mmainloop\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1075\u001b[0;31m         \u001b[0mTk\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmainloop\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[0;32m~/applications/anaconda3/lib/python3.7/tkinter/__init__.py\u001b[0m in \u001b[0;36mmainloop\u001b[0;34m(n)\u001b[0m\n\u001b[1;32m    555\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mmainloop\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mn\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    556\u001b[0m     \u001b[0;34m\"\"\"Run the main loop of Tcl.\"\"\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 557\u001b[0;31m     \u001b[0m_default_root\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtk\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmainloop\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mn\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    558\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    559\u001b[0m \u001b[0mgetint\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mint\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mKeyboardInterrupt\u001b[0m: "
     ]
    }
   ],
   "source": [
    "# plot target VS CRIM\n",
    "plt.scatter(X[:,0], Y)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [
    {
     "ename": "KeyboardInterrupt",
     "evalue": "",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mKeyboardInterrupt\u001b[0m                         Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-45-104d7c01bc69>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[0;31m# c'est inexploitable, on log\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      2\u001b[0m \u001b[0mplt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mscatter\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlog\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mY\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0mplt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshow\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[0;32m~/applications/anaconda3/lib/python3.7/site-packages/matplotlib/pyplot.py\u001b[0m in \u001b[0;36mshow\u001b[0;34m(*args, **kw)\u001b[0m\n\u001b[1;32m    251\u001b[0m     \"\"\"\n\u001b[1;32m    252\u001b[0m     \u001b[0;32mglobal\u001b[0m \u001b[0m_show\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 253\u001b[0;31m     \u001b[0;32mreturn\u001b[0m \u001b[0m_show\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkw\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    254\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    255\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/applications/anaconda3/lib/python3.7/site-packages/matplotlib/backend_bases.py\u001b[0m in \u001b[0;36mshow\u001b[0;34m(cls, block)\u001b[0m\n\u001b[1;32m    206\u001b[0m                 \u001b[0mblock\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    207\u001b[0m         \u001b[0;32mif\u001b[0m \u001b[0mblock\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 208\u001b[0;31m             \u001b[0mcls\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmainloop\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    209\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    210\u001b[0m     \u001b[0;31m# This method is the one actually exporting the required methods.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/applications/anaconda3/lib/python3.7/site-packages/matplotlib/backends/_backend_tk.py\u001b[0m in \u001b[0;36mmainloop\u001b[0;34m()\u001b[0m\n\u001b[1;32m   1073\u001b[0m     \u001b[0;34m@\u001b[0m\u001b[0mstaticmethod\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1074\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0mmainloop\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1075\u001b[0;31m         \u001b[0mTk\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmainloop\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[0;32m~/applications/anaconda3/lib/python3.7/tkinter/__init__.py\u001b[0m in \u001b[0;36mmainloop\u001b[0;34m(n)\u001b[0m\n\u001b[1;32m    555\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mmainloop\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mn\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    556\u001b[0m     \u001b[0;34m\"\"\"Run the main loop of Tcl.\"\"\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 557\u001b[0;31m     \u001b[0m_default_root\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtk\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmainloop\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mn\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    558\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    559\u001b[0m \u001b[0mgetint\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mint\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mKeyboardInterrupt\u001b[0m: "
     ]
    }
   ],
   "source": [
    "# c'est inexploitable, on log \n",
    "plt.scatter(np.log(X[:,0]), Y)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
 }
--- a/Applications/Exercices/TP2/Exercice
+++ b/Applications/Exercices/TP2/Exercice
@ -0,0 +1,138 @@
 # -*- coding: utf-8 -*-
 """
 Exercice 1 du cours 2 de machine learning avec F.Baradel
 """
 import matplotlib
 matplotlib.use("TkAgg")
 import matplotlib.pyplot as plt
 import numpy as np
 from sklearn.datasets import load_boston
 from sklearn.linear_model import LinearRegression
 boston = load_boston()
 boston
 X = boston['data']
 Y = boston['target']
 X.shape
 n_train = 300
 crim = X[:, 0].copy()
 """ 4. Right now we will be using only the first features called CRIM for 
 modelling the target variable. Plot CRIM vs target with and without normalizing your data. 
 What do you observe?
 """
 # plot target VS CRIM
 plt.scatter(crim, Y)
 plt.show()
 # c'est inexploitable, on log 
 plt.scatter(np.log(crim), Y)
 plt.show()
 # C'est mieux
 #on écrit la fonction de normalisation
 def normalize (y):
    return (y - np.min(y)) / (np.max(y) - np.min(y))
 """5. Use LinearRegression() for modelling target with CRIM on the training set and compute the predicted
 values for the validation set."""
 model = LinearRegression().fit(crim[:n_train].reshape(-1, 1), Y[:n_train])
 w, b = model.coef_, model.intercept_
 print(w, b)
 """ 6. Plot the predictions and the actual ground-truth for the training and the validation set."""
 print("entrainement")
 preds = model.predict(crim[:n_train].reshape(-1, 1))
 plt.scatter(crim[:n_train], preds, color="green")
 plt.scatter(crim[:n_train], Y[:n_train], color="red")
 plt.show()
 n_valid = 400
 print("validation")
 valids = model.predict(crim[n_train:n_valid].reshape(-1,1))
 plt.scatter(crim[n_train:n_valid], valids, color="green")
 plt.scatter(crim[n_train:n_valid], Y[n_train:n_valid], color="orange")
 plt.show()
 # On utilise maintenant le log . Je m'a planté et ait cru que c'était la normalisation d'ou les noms fallacieux.
 # Ne pas oublier de caler l'exponentielle (réciproque de log) pour remettre valeur dans le bon champ
 print(" avec log")
 normY = np.log(Y)
 model = LinearRegression().fit(crim[:n_train].reshape(-1, 1), normY[:n_train])
 print("entrainement")
 normPreds = np.exp(model.predict(crim[:n_train].reshape(-1, 1)))
 plt.scatter(crim[:n_train], normPreds, color="green")
 plt.scatter(crim[:n_train], np.exp(normY[:n_train]), color="red")
 plt.show()
 print("validation")
 normValids = np.exp(model.predict(crim[n_train:n_valid].reshape(-1,1)))
 plt.scatter(crim[n_train:n_valid], normValids, color="green")
 plt.scatter(crim[n_train:n_valid], np.exp(normY[n_train:n_valid]), color="orange")
 plt.show
 """
 7. Implement a function which is computing the Root-Mean-Square-Error (RMSE). What is the RMSE on
 the training set and on the validation set?
 On veut calculer un score, pour savoir si le modèle est bon ou non
 on définit une fonction qui prend mes prédiciton,s mes valeurs, et retourne le mean square error
 (ŷi-yi)²
 à calculer sur le trainig set et le validation set
 Ensuite modéliser log(target) = w +b.CRIM et MSE train/val
 """
 def mse(preds, vals):
    return np.mean((preds - vals)**2)
 mse(preds, Y[:n_train])
 mse(valids, Y[n_train:n_valid])
 # meme chose avec log   
 mse(normPreds, Y[:n_train])
 mse(normValids, Y[n_train:n_valid])
 # Ensuite, faire la meme chose avec une deuxièm variabl du tableau X : ZN
 crimZn = X[:, :2]
 model = LinearRegression().fit(crimZn[:n_train].reshape(-1, 2), Y[:n_train])
 w, b = model.coef_, model.intercept_
 print(w, b)
 print("entrainement")
 preds = model.predict(crim[:n_train].reshape(-1, 2))
 plt.scatter(crimZn[:n_train], preds, color="green")
 plt.scatter(crimZn[:n_train], Y[:n_train], color="red")
 plt.show()
 n_valid = 400
 print("validation")
 valids = model.predict(crim[n_train:n_valid].reshape(-1,1))
 plt.scatter(crim[n_train:n_valid], valids, color="green")
 plt.scatter(crim[n_train:n_valid], Y[n_train:n_valid], color="orange")
 plt.show()
--- a/Applications/Exercices/TP2/tp_2.pdf
+++ b/Applications/Exercices/TP2/tp_2.pdf
--- a/Applications/Introduction
+++ b/Applications/Introduction
@ -145,7 +145,7 @@ Comme dans la réalité on a beaucoup trop de données pour s'appuyer sur toutes
 ### Exemple
-Nous sommes de charmants vendeurs de gales ambulants. On se pose la question : "Quand température est de $x$, combien je vais vendre de glace ?". On s'appuie sur plusieurs expériences de cas réels où on a vendu $y$ glaces alors que la température était de $x$.
+Nous sommes de charmants vendeurs de glaces ambulants. On se pose la question : "Quand la température est de $x$, combien je vais vendre de glace ?". On s'appuie sur plusieurs expériences de cas réels où on a vendu $y$ glaces alors que la température était de $x$.
 On va résoudre le problème en partant du principe qu'on va vendre $wx + b$ glaces, où x est la température. Si $x = 0$ on va vendre $b$ glaces. $b$ est appelé l'intercepte, et $w$ la pente.
@ -160,4 +160,49 @@ Le chapeau indique la prédiction. Pour trouver le coût moyen (de tous les poin
 À la fin des exercies on mate un graph avec trois méthodes : closed-form, gradient descend et stochastic gradient descent. On remarque que les 3 sont très proches.
-On voit ensuite comment faire une descente de gradient en 3 lignes de code : scikit-learn
+On voit ensuite comment faire une descente de gradient en 3 lignes de code : scikit-learn
 ### Régression linéaire multivariée
 Même principe que linéaire, mais avec un nombre *p* de variables et un concept de vecteurs.
 ## Overfitting
 La problématique en ML c'est de généraliser. Nous, humains, on peut généraliser et extrapoler vite (on voit deux chats on peut vite dire si ce qu'on voit ensuite est un chat ou non). En ML, y a besoin de beaucoup plus de données. 
 Pour résumer la généralisation: 
 On divise le dataset en 3 : donnés d’apprentissage, de validation et de test (genre 70%/15%/15%). On entraîne la machine uniquement avec les données d'apprentissage. Une fois qu'elle a suffisamment appris, on essaye ses déductions sur la base de validation (on donne x on demande de trouver ŷ (on connaît y nous, on peut donc comparer avec)). On retravaille ensuite l'algo sur les données d'apprentissage et puis on recommence la validation. Au bout d'un certain temps, on passe au test : on fait calculer le total des ŷ et on voit le pourcentage de réussite  global, sans avoir accès aux ŷ trouvés. 
 L'objectif, c'est d'éviter l'underfitting et l'overfitting. L'over c'est de trop coller aux données qu'on a, et l'under c'est d'en être trop éloigné. Donc la solution va consister à alterner entre l'un et l'autre pour se rapprocher d'un modèle optimal : "good fit".
 On va donc pour ça partir de fonctions complexes, et tâcher de simplifier ces fonctions.
 ### Tips and tricks
 La normalisation : dans un wx + b, on tente de minimiser le w. Il y a normalisation L2 et normalisation L1. Pour la L2, on obtient toujours quelque chose de convexe. La L1, non. Donc la L1 est plutôt utile pour faire de la sélection de variable.
 Si on ajoute L1 et L2, c'est l'"Elastic Net". Qu'on ajoute L1, L2 ou L1+L2 à la fonction de coût, on obtient des résultats différents. Il faut aussi pondérer l'hyperparamètre C pour pas qu'il soit trop petit ou trop grand.
 #### Numerical and Categorical variable
 Il s'agit de regrouper des variables ou de changer leur type en leur assignant des nombres. Sur une variable constante par exemple (de 0 à n), on peut décréter que de x à x' on est dans la variable [1,0,0], de x'' à x''' [O,1,0] et tout le reste [0,0,1].
 #### Normalisation, standardisation
 L'idée est de ne pas se retrouver à donner plus d'importance à un paramètre qui a un champ plus large que d'autres (genre l'age de 1 à 100 va valoir plus que la taille de 0 à 2) on ramène tout à "de 0 à 1". 
 Formule de normalisation de j pour sa donnée x : $\bar x^j = \frac{x^j - min^j}{max^j - min^j}$
 Le Z-score, normalise en fonction de la moyenne, non plus en fonction min et max : c'est plus robuste : $\hat x = \frac{x^j - \mu^j}{\sigma^j}$
 #### Transformation de cible
 L'idée est, sur des paramètres difficiles à expliquer, d'arriver sur des données plus "jolies à voir", plus facilement exploitables pour nous. Dans l'exemple sur le PDF, on a juste appliqué la fonction log aux données.
 #### Interaction entre les variables
 Parfois il ne suffit pas de prendre les paramètres mais de regarder les relations entre eux : on appelle ça une interaction. Les interactions consistent à rajouter un terme (paramètre) qui est le produit des deux paramètres.
 $y = b +w_1.age + w_2 .taille +w_3.age.taille$