{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "feature_sets.ipynb",
"provenance": [],
"collapsed_sections": [
"JndnmDMp66FL",
"IGINhMIJ5Wyt",
"pZa8miwu6_tQ"
]
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "JndnmDMp66FL",
"colab_type": "text"
},
"source": [
"#### Copyright 2017 Google LLC."
]
},
{
"cell_type": "code",
"metadata": {
"id": "hMqWDc_m6rUC",
"colab_type": "code",
"cellView": "both",
"colab": {}
},
"source": [
"# Licensed under the Apache License, Version 2.0 (the \"License\");\n",
"# you may not use this file except in compliance with the License.\n",
"# You may obtain a copy of the License at\n",
"#\n",
"# https://www.apache.org/licenses/LICENSE-2.0\n",
"#\n",
"# Unless required by applicable law or agreed to in writing, software\n",
"# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
"# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
"# See the License for the specific language governing permissions and\n",
"# limitations under the License."
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "zbIgBK-oXHO7",
"colab_type": "text"
},
"source": [
"# Feature Sets"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "bL04rAQwH3pH",
"colab_type": "text"
},
"source": [
"**Learning Objective:** Create a minimal set of features that performs just as well as a more complex feature set"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "F8Hci6tAH3pH",
"colab_type": "text"
},
"source": [
"So far, we've thrown all of our features into the model. Models with fewer features use fewer resources and are easier to maintain. Let's see if we can build a model on a minimal set of housing features that will perform equally as well as one that uses all the features in the data set."
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "F5ZjVwK_qOyR",
"colab_type": "text"
},
"source": [
"## Setup\n",
"\n",
"As before, let's load and prepare the California housing data."
]
},
{
"cell_type": "code",
"metadata": {
"id": "SrOYRILAH3pJ",
"colab_type": "code",
"colab": {}
},
"source": [
"from __future__ import print_function\n",
"\n",
"import math\n",
"\n",
"from IPython import display\n",
"from matplotlib import cm\n",
"from matplotlib import gridspec\n",
"from matplotlib import pyplot as plt\n",
"import numpy as np\n",
"import pandas as pd\n",
"from sklearn import metrics\n",
"import tensorflow as tf\n",
"from tensorflow.python.data import Dataset\n",
"\n",
"tf.logging.set_verbosity(tf.logging.ERROR)\n",
"pd.options.display.max_rows = 10\n",
"pd.options.display.float_format = '{:.1f}'.format\n",
"\n",
"california_housing_dataframe = pd.read_csv(\"https://download.mlcc.google.com/mledu-datasets/california_housing_train.csv\", sep=\",\")\n",
"\n",
"california_housing_dataframe = california_housing_dataframe.reindex(\n",
"    np.random.permutation(california_housing_dataframe.index))"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "dGnXo7flH3pM",
"colab_type": "code",
"colab": {}
},
"source": [
"def preprocess_features(california_housing_dataframe):\n",
"  \"\"\"Prepares input features from California housing data set.\n",
"\n",
"  Args:\n",
"    california_housing_dataframe: A Pandas DataFrame expected to contain data\n",
"      from the California housing data set.\n",
"  Returns:\n",
"    A DataFrame that contains the features to be used for the model, including\n",
"    synthetic features.\n",
"  \"\"\"\n",
"  selected_features = california_housing_dataframe[\n",
"    [\"latitude\",\n",
"     \"longitude\",\n",
"     \"housing_median_age\",\n",
"     \"total_rooms\",\n",
"     \"total_bedrooms\",\n",
"     \"population\",\n",
"     \"households\",\n",
"     \"median_income\"]]\n",
"  processed_features = selected_features.copy()\n",
"  # Create a synthetic feature.\n",
"  processed_features[\"rooms_per_person\"] = (\n",
"    california_housing_dataframe[\"total_rooms\"] /\n",
"    california_housing_dataframe[\"population\"])\n",
"  return processed_features\n",
"\n",
"def preprocess_targets(california_housing_dataframe):\n",
"  \"\"\"Prepares target features (i.e., labels) from California housing data set.\n",
"\n",
"  Args:\n",
"    california_housing_dataframe: A Pandas DataFrame expected to contain data\n",
"      from the California housing data set.\n",
"  Returns:\n",
"    A DataFrame that contains the target feature.\n",
"  \"\"\"\n",
"  output_targets = pd.DataFrame()\n",
"  # Scale the target to be in units of thousands of dollars.\n",
"  output_targets[\"median_house_value\"] = (\n",
"    california_housing_dataframe[\"median_house_value\"] / 1000.0)\n",
"  return output_targets"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "jLXC8y4AqsIy",
"colab_type": "code",
"colab": {}
},
"source": [
"# Choose the first 12000 (out of 17000) examples for training.\n",
"training_examples = preprocess_features(california_housing_dataframe.head(12000))\n",
"training_targets = preprocess_targets(california_housing_dataframe.head(12000))\n",
"\n",
"# Choose the last 5000 (out of 17000) examples for validation.\n",
"validation_examples = preprocess_features(california_housing_dataframe.tail(5000))\n",
"validation_targets = preprocess_targets(california_housing_dataframe.tail(5000))\n",
"\n",
"# Double-check that we've done the right thing.\n",
"print(\"Training examples summary:\")\n",
"display.display(training_examples.describe())\n",
"print(\"Validation examples summary:\")\n",
"display.display(validation_examples.describe())\n",
"\n",
"print(\"Training targets summary:\")\n",
"display.display(training_targets.describe())\n",
"print(\"Validation targets summary:\")\n",
"display.display(validation_targets.describe())"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "hLvmkugKLany",
"colab_type": "text"
},
"source": [
"## Task 1: Develop a Good Feature Set\n",
"\n",
"**What's the best performance you can get with just 2 or 3 features?**\n",
"\n",
"A **correlation matrix** shows pairwise correlations, both for each feature compared to the target and for each feature compared to other features.\n",
"\n",
"Here, correlation is defined as the [Pearson correlation coefficient](https://en.wikipedia.org/wiki/Pearson_product-moment_correlation_coefficient). You don't have to understand the mathematical details for this exercise.\n",
"\n",
"Correlation values have the following meanings:\n",
"\n",
"  * `-1.0`: perfect negative correlation\n",
"  * `0.0`: no correlation\n",
"  * `1.0`: perfect positive correlation"
]
},
{
"cell_type": "code",
"metadata": {
"id": "UzoZUSdLIolF",
"colab_type": "code",
"cellView": "both",
"colab": {
"test": {
"output": "ignore",
"timeout": 600
}
}
},
"source": [
"correlation_dataframe = training_examples.copy()\n",
"correlation_dataframe[\"target\"] = training_targets[\"median_house_value\"]\n",
"\n",
"correlation_dataframe.corr()"
],
"execution_count": 0,
"outputs": []
},
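{
"cell_type": "markdown",
"metadata": {
"id": "corr-ranking-note",
"colab_type": "text"
},
"source": [
"The full matrix can be hard to scan. As a convenience (a small sketch of ours, not part of the original exercise), the cell below sorts the `target` column of the matrix to rank each feature by its Pearson correlation with `median_house_value`."
]
},
{
"cell_type": "code",
"metadata": {
"id": "corr-ranking-sketch",
"colab_type": "code",
"colab": {}
},
"source": [
"# A minimal sketch: rank features by their correlation with the target.\n",
"# Assumes `correlation_dataframe` from the previous cell is in scope.\n",
"correlation_dataframe.corr()[\"target\"].drop(\"target\").sort_values()"
],
"execution_count": 0,
"outputs": []
},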
{
"cell_type": "markdown",
"metadata": {
"id": "RQpktkNpia2P",
"colab_type": "text"
},
"source": [
"Features that have strong positive or negative correlations with the target will add information to our model. We can use the correlation matrix to find such strongly correlated features.\n",
"\n",
"We'd also like to have features that aren't so strongly correlated with each other, so that they add independent information.\n",
"\n",
"Use this information to try removing features. You can also try developing additional synthetic features, such as ratios of two raw features; one such ratio is sketched in the cell below.\n",
"\n",
"For convenience, we've also included the training code from the previous exercise."
]
},
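{
"cell_type": "markdown",
"metadata": {
"id": "ratio-feature-note",
"colab_type": "text"
},
"source": [
"For example, here is a minimal sketch of one candidate ratio feature (our own illustration, not a prescribed answer): `bedrooms_per_room`, i.e. `total_bedrooms / total_rooms`. You can check its correlation with the target before deciding whether to keep it."
]
},
{
"cell_type": "code",
"metadata": {
"id": "ratio-feature-sketch",
"colab_type": "code",
"colab": {}
},
"source": [
"# A minimal sketch, assuming `training_examples` and `training_targets` exist.\n",
"# The feature name `bedrooms_per_room` is our own choice for illustration.\n",
"ratio_dataframe = training_examples.copy()\n",
"ratio_dataframe[\"bedrooms_per_room\"] = (\n",
"    training_examples[\"total_bedrooms\"] / training_examples[\"total_rooms\"])\n",
"ratio_dataframe[\"target\"] = training_targets[\"median_house_value\"]\n",
"\n",
"# Check how the new ratio correlates with the target and the raw features.\n",
"ratio_dataframe.corr()[\"bedrooms_per_room\"]"
],
"execution_count": 0,
"outputs": []
},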
{
"cell_type": "code",
"metadata": {
"id": "bjR5jWpFr2xs",
"colab_type": "code",
"colab": {}
},
"source": [
"def construct_feature_columns(input_features):\n",
"  \"\"\"Construct the TensorFlow Feature Columns.\n",
"\n",
"  Args:\n",
"    input_features: The names of the numerical input features to use.\n",
"  Returns:\n",
"    A set of feature columns\n",
"  \"\"\"\n",
"  return set([tf.feature_column.numeric_column(my_feature)\n",
"              for my_feature in input_features])"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "jsvKHzRciH9T",
"colab_type": "code",
"colab": {}
},
"source": [
"def my_input_fn(features, targets, batch_size=1, shuffle=True, num_epochs=None):\n",
" \"\"\"Trains a linear regression model.\n",
"\n",
"  Args:\n",
"    features: pandas DataFrame of features\n",
"    targets: pandas DataFrame of targets\n",
"    batch_size: Size of batches to be passed to the model\n",
"    shuffle: True or False. Whether to shuffle the data.\n",
"    num_epochs: Number of epochs for which data should be repeated. None = repeat indefinitely\n",
"  Returns:\n",
"    Tuple of (features, labels) for next data batch\n",
"  \"\"\"\n",
"\n",
"  # Convert pandas data into a dict of np arrays.\n",
"  features = {key: np.array(value) for key, value in dict(features).items()}\n",
"\n",
"  # Construct a dataset, and configure batching/repeating.\n",
"  ds = Dataset.from_tensor_slices((features, targets))  # warning: 2GB limit\n",
"  ds = ds.batch(batch_size).repeat(num_epochs)\n",
"\n",
" # Shuffle the data, if specified.\n",
"  if shuffle:\n",
"    ds = ds.shuffle(10000)\n",
"\n",
"  # Return the next batch of data.\n",
"  features, labels = ds.make_one_shot_iterator().get_next()\n",
"  return features, labels"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "g3kjQV9WH3pb",
"colab_type": "code",
"colab": {}
},
"source": [
"def train_model(\n",
"    learning_rate,\n",
"    steps,\n",
"    batch_size,\n",
"    training_examples,\n",
"    training_targets,\n",
"    validation_examples,\n",
"    validation_targets):\n",
"  \"\"\"Trains a linear regression model.\n",
"\n",
"  In addition to training, this function also prints training progress information,\n",
"  as well as a plot of the training and validation loss over time.\n",
"\n",
"  Args:\n",
"    learning_rate: A `float`, the learning rate.\n",
"    steps: A non-zero `int`, the total number of training steps. A training step\n",
"      consists of a forward and backward pass using a single batch.\n",
"    batch_size: A non-zero `int`, the batch size.\n",
"    training_examples: A `DataFrame` containing one or more columns from\n",
"      `california_housing_dataframe` to use as input features for training.\n",
"    training_targets: A `DataFrame` containing exactly one column from\n",
"      `california_housing_dataframe` to use as target for training.\n",
"    validation_examples: A `DataFrame` containing one or more columns from\n",
"      `california_housing_dataframe` to use as input features for validation.\n",
"    validation_targets: A `DataFrame` containing exactly one column from\n",
"      `california_housing_dataframe` to use as target for validation.\n",
"\n",
"  Returns:\n",
"    A `LinearRegressor` object trained on the training data.\n",
"  \"\"\"\n",
"\n",
"  periods = 10\n",
"  steps_per_period = steps / periods\n",
"\n",
"  # Create a linear regressor object.\n",
"  my_optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)\n",
"  my_optimizer = tf.contrib.estimator.clip_gradients_by_norm(my_optimizer, 5.0)\n",
"  linear_regressor = tf.estimator.LinearRegressor(\n",
"      feature_columns=construct_feature_columns(training_examples),\n",
"      optimizer=my_optimizer\n",
"  )\n",
"\n",
"  # Create input functions.\n",
"  training_input_fn = lambda: my_input_fn(training_examples,\n",
"                                          training_targets[\"median_house_value\"],\n",
"                                          batch_size=batch_size)\n",
"  predict_training_input_fn = lambda: my_input_fn(training_examples,\n",
"                                                  training_targets[\"median_house_value\"],\n",
"                                                  num_epochs=1,\n",
"                                                  shuffle=False)\n",
"  predict_validation_input_fn = lambda: my_input_fn(validation_examples,\n",
"                                                    validation_targets[\"median_house_value\"],\n",
"                                                    num_epochs=1,\n",
"                                                    shuffle=False)\n",
"\n",
"  # Train the model, but do so inside a loop so that we can periodically assess\n",
"  # loss metrics.\n",
"  print(\"Training model...\")\n",
"  print(\"RMSE (on training data):\")\n",
"  training_rmse = []\n",
"  validation_rmse = []\n",
"  for period in range(0, periods):\n",
"    # Train the model, starting from the prior state.\n",
"    linear_regressor.train(\n",
"        input_fn=training_input_fn,\n",
"        steps=steps_per_period,\n",
"    )\n",
"    # Take a break and compute predictions.\n",
"    training_predictions = linear_regressor.predict(input_fn=predict_training_input_fn)\n",
"    training_predictions = np.array([item['predictions'][0] for item in training_predictions])\n",
"\n",
"    validation_predictions = linear_regressor.predict(input_fn=predict_validation_input_fn)\n",
"    validation_predictions = np.array([item['predictions'][0] for item in validation_predictions])\n",
"\n",
"    # Compute training and validation loss.\n",
"    training_root_mean_squared_error = math.sqrt(\n",
"        metrics.mean_squared_error(training_predictions, training_targets))\n",
"    validation_root_mean_squared_error = math.sqrt(\n",
"        metrics.mean_squared_error(validation_predictions, validation_targets))\n",
"    # Occasionally print the current loss.\n",
"    print(\"  period %02d : %0.2f\" % (period, training_root_mean_squared_error))\n",
"    # Add the loss metrics from this period to our list.\n",
"    training_rmse.append(training_root_mean_squared_error)\n",
"    validation_rmse.append(validation_root_mean_squared_error)\n",
"  print(\"Model training finished.\")\n",
"\n",
"  # Output a graph of loss metrics over periods.\n",
"  plt.ylabel(\"RMSE\")\n",
"  plt.xlabel(\"Periods\")\n",
"  plt.title(\"Root Mean Squared Error vs. Periods\")\n",
"  plt.tight_layout()\n",
"  plt.plot(training_rmse, label=\"training\")\n",
"  plt.plot(validation_rmse, label=\"validation\")\n",
"  plt.legend()\n",
"\n",
"  return linear_regressor"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "varLu7RNH3pf",
"colab_type": "text"
},
"source": [
"Spend 5 minutes searching for a good set of features and training parameters. Then check the solution to see what we chose. Don't forget that different features may require different learning parameters."
]
},
{
"cell_type": "code",
"metadata": {
"id": "DSgUxRIlH3pg",
"colab_type": "code",
"colab": {}
},
"source": [
"#\n",
"# Your code here: add your features of choice as a list of quoted strings.\n",
"#\n",
"minimal_features = [\n",
"]\n",
"\n",
"assert minimal_features, \"You must select at least one feature!\"\n",
"\n",
"minimal_training_examples = training_examples[minimal_features]\n",
"minimal_validation_examples = validation_examples[minimal_features]\n",
"\n",
"#\n",
"# Don't forget to adjust these parameters.\n",
"#\n",
"train_model(\n",
"    learning_rate=0.001,\n",
"    steps=500,\n",
"    batch_size=5,\n",
"    training_examples=minimal_training_examples,\n",
"    training_targets=training_targets,\n",
"    validation_examples=minimal_validation_examples,\n",
"    validation_targets=validation_targets)"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "IGINhMIJ5Wyt",
"colab_type": "text"
},
"source": [
"### Solution\n",
"\n",
"Click below for a solution."
]
},
{
"cell_type": "code",
"metadata": {
"id": "BAGoXFPZ5ZE3",
"colab_type": "code",
"colab": {}
},
"source": [
"minimal_features = [\n",
"  \"median_income\",\n",
"  \"latitude\",\n",
"]\n",
"\n",
"minimal_training_examples = training_examples[minimal_features]\n",
"minimal_validation_examples = validation_examples[minimal_features]\n",
"\n",
"_ = train_model(\n",
"    learning_rate=0.01,\n",
"    steps=500,\n",
"    batch_size=5,\n",
"    training_examples=minimal_training_examples,\n",
"    training_targets=training_targets,\n",
"    validation_examples=minimal_validation_examples,\n",
"    validation_targets=validation_targets)"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "RidI9YhKOiY2",
"colab_type": "text"
},
"source": [
"## Task 2: Make Better Use of Latitude\n",
"\n",
"Plotting `latitude` vs. `median_house_value` shows that there really isn't a linear relationship there.\n",
"\n",
"Instead, there are a couple of peaks, which roughly correspond to Los Angeles and San Francisco."
]
},
{
"cell_type": "code",
"metadata": {
"id": "hfGUKj2IR_F1",
"colab_type": "code",
"cellView": "both",
"colab": {
"test": {
"output": "ignore",
"timeout": 600
}
}
},
"source": [
"plt.scatter(training_examples[\"latitude\"], training_targets[\"median_house_value\"])"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "6N0p91k2iFCP",
"colab_type": "text"
},
"source": [
"**Try creating some synthetic features that do a better job with latitude.**\n",
"\n",
"For example, you could have a feature that maps `latitude` to a value of `|latitude - 38|`, and call this `distance_from_san_francisco`.\n",
"\n",
"Or you could break the space into 10 different buckets. `latitude_32_to_33`, `latitude_33_to_34`, etc., each showing a value of `1.0` if `latitude` is within that bucket range and a value of `0.0` otherwise.\n",
"\n",
"Use the correlation matrix to help guide development, and then add them to your model if you find something that looks good.\n",
"\n",
"What's the best validation performance you can get?"
]
},
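{
"cell_type": "markdown",
"metadata": {
"id": "distance-feature-note",
"colab_type": "text"
},
"source": [
"As a starting point, here is a minimal sketch of the first idea above (our own illustration; latitude `38` is only a rough stand-in for San Francisco's latitude of about `37.8`). It builds the `distance_from_san_francisco` feature and checks its correlation with the target; you would still need to train a model on it to judge it properly."
]
},
{
"cell_type": "code",
"metadata": {
"id": "distance-feature-sketch",
"colab_type": "code",
"colab": {}
},
"source": [
"# A minimal sketch, assuming `training_examples` and `training_targets` exist.\n",
"distance_dataframe = pd.DataFrame()\n",
"distance_dataframe[\"distance_from_san_francisco\"] = (\n",
"    (training_examples[\"latitude\"] - 38).abs())\n",
"distance_dataframe[\"target\"] = training_targets[\"median_house_value\"]\n",
"\n",
"# How does the new feature correlate with the target?\n",
"distance_dataframe.corr()"
],
"execution_count": 0,
"outputs": []
},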
{
"cell_type": "code",
"metadata": {
"id": "wduJ2B28yMFl",
"colab_type": "code",
"cellView": "form",
"colab": {}
},
"source": [
"#\n",
"# YOUR CODE HERE: Train on a new data set that includes synthetic features based on latitude.\n",
"#"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "pZa8miwu6_tQ",
"colab_type": "text"
},
"source": [
"### Solution\n",
"\n",
"Click below for a solution."
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "PzABdyjq7IZU",
"colab_type": "text"
},
"source": [
"Aside from `latitude`, we'll also keep `median_income`, to compare with the previous results.\n",
"\n",
"We decided to bucketize the latitude. This is fairly straightforward in Pandas using `Series.apply`."
]
},
{
"cell_type": "code",
"metadata": {
"id": "xdVF8siZ7Lup",
"colab_type": "code",
"colab": {}
},
"source": [
"def select_and_transform_features(source_df):\n",
"  LATITUDE_RANGES = zip(range(32, 44), range(33, 45))\n",
"  selected_examples = pd.DataFrame()\n",
"  selected_examples[\"median_income\"] = source_df[\"median_income\"]\n",
"  for r in LATITUDE_RANGES:\n",
"    selected_examples[\"latitude_%d_to_%d\" % r] = source_df[\"latitude\"].apply(\n",
"      lambda l: 1.0 if l >= r[0] and l < r[1] else 0.0)\n",
"  return selected_examples\n",
"\n",
"selected_training_examples = select_and_transform_features(training_examples)\n",
"selected_validation_examples = select_and_transform_features(validation_examples)"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "U4iAdY6t7Pkh",
"colab_type": "code",
"colab": {}
},
"source": [
"_ = train_model(\n",
"    learning_rate=0.01,\n",
"    steps=500,\n",
"    batch_size=5,\n",
"    training_examples=selected_training_examples,\n",
"    training_targets=training_targets,\n",
"    validation_examples=selected_validation_examples,\n",
"    validation_targets=validation_targets)"
],
"execution_count": 0,
"outputs": []
}
]
} |