diff --git a/.DS_Store b/.DS_Store
index 4d80356..ab0bf70 100644
Binary files a/.DS_Store and b/.DS_Store differ
diff --git a/Docker_Images/Dockerfile b/Caffee/Dockerfile
similarity index 100%
rename from Docker_Images/Dockerfile
rename to Caffee/Dockerfile
diff --git a/Caffee/README.md b/Caffee/README.md
new file mode 100644
index 0000000..f9c8d2b
--- /dev/null
+++ b/Caffee/README.md
@@ -0,0 +1,31 @@
+## caffe deep learning example
+
+
+
+build with:
+
+
+
+```
+docker build -t caffe:cpu .
+```
+
+
+
+test with:
+
+
+
+```
+docker run -ti caffe:cpu bash -c "cd /opt/caffe/build; make runtest"
+```
+
+
+
+play with:
+
+
+
+```
+docker run -ti --volume=$(pwd):/workspace caffe:cpu
+```
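+
+
+
+once inside the container, a quick sanity check (a minimal sketch, assuming the image built pycaffe, as the BVLC CPU Dockerfile does):
+
+
+
+```
+python -c "import caffe; print(caffe.__version__)"
+```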
diff --git a/Deep_Art/deepdream/README.md b/Deep_Art/README.md
similarity index 99%
rename from Deep_Art/deepdream/README.md
rename to Deep_Art/README.md
index ecd37b2..edee2ea 100644
--- a/Deep_Art/deepdream/README.md
+++ b/Deep_Art/README.md
@@ -1,6 +1,6 @@
# Deep Dream
-## Running it at AWS
+## Running it in AWS
Create an AWS ```g2.2xlarge``` instance with the AMI [```cs231n_caffe_torch7_keras_lasagne_v2```](http://cs231n.github.io/aws-tutorial/), AMI ID: ```ami-125b2c72```, in the ```us-west-1``` region.
* It comes with Caffe, Torch7, Theano, Keras, and Lasagne pre-installed.
diff --git a/Deep_Art/deepdream/bat-country-test/a.sh b/Deep_Art/bat-country-test/a.sh
similarity index 100%
rename from Deep_Art/deepdream/bat-country-test/a.sh
rename to Deep_Art/bat-country-test/a.sh
diff --git a/Deep_Art/deepdream/bat-country-test/bat_dream.py b/Deep_Art/bat-country-test/bat_dream.py
similarity index 100%
rename from Deep_Art/deepdream/bat-country-test/bat_dream.py
rename to Deep_Art/bat-country-test/bat_dream.py
diff --git a/Deep_Art/deepdream/bat-country-test/bat_dream2.py b/Deep_Art/bat-country-test/bat_dream2.py
similarity index 100%
rename from Deep_Art/deepdream/bat-country-test/bat_dream2.py
rename to Deep_Art/bat-country-test/bat_dream2.py
diff --git a/Deep_Art/deepdream/bat-country-test/saturn.jpg b/Deep_Art/bat-country-test/saturn.jpg
similarity index 100%
rename from Deep_Art/deepdream/bat-country-test/saturn.jpg
rename to Deep_Art/bat-country-test/saturn.jpg
diff --git a/Deep_Art/deepdream/caffe/bvlc_googlenet.caffemodel b/Deep_Art/caffe/bvlc_googlenet.caffemodel
similarity index 100%
rename from Deep_Art/deepdream/caffe/bvlc_googlenet.caffemodel
rename to Deep_Art/caffe/bvlc_googlenet.caffemodel
diff --git a/Deep_Art/deepdream/docker/Dockerfile b/Deep_Art/docker/Dockerfile
similarity index 100%
rename from Deep_Art/deepdream/docker/Dockerfile
rename to Deep_Art/docker/Dockerfile
diff --git a/Deep_Art/deepdream/docker/dream.ipynb b/Deep_Art/docker/dream.ipynb
similarity index 100%
rename from Deep_Art/deepdream/docker/dream.ipynb
rename to Deep_Art/docker/dream.ipynb
diff --git a/Deep_Art/deepdream/docker/flowers.jpg b/Deep_Art/docker/flowers.jpg
similarity index 100%
rename from Deep_Art/deepdream/docker/flowers.jpg
rename to Deep_Art/docker/flowers.jpg
diff --git a/Deep_Art/deepdream/docker/sky1024px.jpg b/Deep_Art/docker/sky1024px.jpg
similarity index 100%
rename from Deep_Art/deepdream/docker/sky1024px.jpg
rename to Deep_Art/docker/sky1024px.jpg
diff --git a/Deep_Art/deepdream/dream/dream.ipynb b/Deep_Art/dream/dream.ipynb
similarity index 100%
rename from Deep_Art/deepdream/dream/dream.ipynb
rename to Deep_Art/dream/dream.ipynb
diff --git a/Deep_Art/deepdream/dream/flowers.jpg b/Deep_Art/dream/flowers.jpg
similarity index 100%
rename from Deep_Art/deepdream/dream/flowers.jpg
rename to Deep_Art/dream/flowers.jpg
diff --git a/Deep_Art/deepdream/dream/sky1024px.jpg b/Deep_Art/dream/sky1024px.jpg
similarity index 100%
rename from Deep_Art/deepdream/dream/sky1024px.jpg
rename to Deep_Art/dream/sky1024px.jpg
diff --git a/Deep_Art/deepdream/examples/00-classification.ipynb b/Deep_Art/examples/00-classification.ipynb
similarity index 100%
rename from Deep_Art/deepdream/examples/00-classification.ipynb
rename to Deep_Art/examples/00-classification.ipynb
diff --git a/Deep_Art/deepdream/examples/01-learning-lenet.ipynb b/Deep_Art/examples/01-learning-lenet.ipynb
similarity index 100%
rename from Deep_Art/deepdream/examples/01-learning-lenet.ipynb
rename to Deep_Art/examples/01-learning-lenet.ipynb
diff --git a/Deep_Art/deepdream/examples/02-brewing-logreg.ipynb b/Deep_Art/examples/02-brewing-logreg.ipynb
similarity index 100%
rename from Deep_Art/deepdream/examples/02-brewing-logreg.ipynb
rename to Deep_Art/examples/02-brewing-logreg.ipynb
diff --git a/Deep_Art/deepdream/examples/03-fine-tuning.ipynb b/Deep_Art/examples/03-fine-tuning.ipynb
similarity index 100%
rename from Deep_Art/deepdream/examples/03-fine-tuning.ipynb
rename to Deep_Art/examples/03-fine-tuning.ipynb
diff --git a/Deep_Art/deepdream/examples/CMakeLists.txt b/Deep_Art/examples/CMakeLists.txt
similarity index 100%
rename from Deep_Art/deepdream/examples/CMakeLists.txt
rename to Deep_Art/examples/CMakeLists.txt
diff --git a/Deep_Art/deepdream/examples/bgk1.jpg b/Deep_Art/examples/bgk1.jpg
similarity index 100%
rename from Deep_Art/deepdream/examples/bgk1.jpg
rename to Deep_Art/examples/bgk1.jpg
diff --git a/Deep_Art/deepdream/examples/bgk2.jpg b/Deep_Art/examples/bgk2.jpg
similarity index 100%
rename from Deep_Art/deepdream/examples/bgk2.jpg
rename to Deep_Art/examples/bgk2.jpg
diff --git a/Deep_Art/deepdream/examples/bgk3.jpg b/Deep_Art/examples/bgk3.jpg
similarity index 100%
rename from Deep_Art/deepdream/examples/bgk3.jpg
rename to Deep_Art/examples/bgk3.jpg
diff --git a/Deep_Art/deepdream/examples/ctl1.jpg b/Deep_Art/examples/ctl1.jpg
similarity index 100%
rename from Deep_Art/deepdream/examples/ctl1.jpg
rename to Deep_Art/examples/ctl1.jpg
diff --git a/Deep_Art/deepdream/examples/d1.jpg b/Deep_Art/examples/d1.jpg
similarity index 100%
rename from Deep_Art/deepdream/examples/d1.jpg
rename to Deep_Art/examples/d1.jpg
diff --git a/Deep_Art/deepdream/examples/d2.jpg b/Deep_Art/examples/d2.jpg
similarity index 100%
rename from Deep_Art/deepdream/examples/d2.jpg
rename to Deep_Art/examples/d2.jpg
diff --git a/Deep_Art/deepdream/examples/d3.jpg b/Deep_Art/examples/d3.jpg
similarity index 100%
rename from Deep_Art/deepdream/examples/d3.jpg
rename to Deep_Art/examples/d3.jpg
diff --git a/Deep_Art/deepdream/examples/d4.jpeg b/Deep_Art/examples/d4.jpeg
similarity index 100%
rename from Deep_Art/deepdream/examples/d4.jpeg
rename to Deep_Art/examples/d4.jpeg
diff --git a/Deep_Art/deepdream/examples/d5.jpg b/Deep_Art/examples/d5.jpg
similarity index 100%
rename from Deep_Art/deepdream/examples/d5.jpg
rename to Deep_Art/examples/d5.jpg
diff --git a/Deep_Art/deepdream/examples/d6.jpg b/Deep_Art/examples/d6.jpg
similarity index 100%
rename from Deep_Art/deepdream/examples/d6.jpg
rename to Deep_Art/examples/d6.jpg
diff --git a/Deep_Art/deepdream/examples/d7.jpeg b/Deep_Art/examples/d7.jpeg
similarity index 100%
rename from Deep_Art/deepdream/examples/d7.jpeg
rename to Deep_Art/examples/d7.jpeg
diff --git a/Deep_Art/deepdream/examples/detection.ipynb b/Deep_Art/examples/detection.ipynb
similarity index 100%
rename from Deep_Art/deepdream/examples/detection.ipynb
rename to Deep_Art/examples/detection.ipynb
diff --git a/Deep_Art/deepdream/examples/dream.ipynb b/Deep_Art/examples/dream.ipynb
similarity index 100%
rename from Deep_Art/deepdream/examples/dream.ipynb
rename to Deep_Art/examples/dream.ipynb
diff --git a/Deep_Art/deepdream/examples/flowers.jpg b/Deep_Art/examples/flowers.jpg
similarity index 100%
rename from Deep_Art/deepdream/examples/flowers.jpg
rename to Deep_Art/examples/flowers.jpg
diff --git a/Deep_Art/deepdream/examples/net_surgery.ipynb b/Deep_Art/examples/net_surgery.ipynb
similarity index 100%
rename from Deep_Art/deepdream/examples/net_surgery.ipynb
rename to Deep_Art/examples/net_surgery.ipynb
diff --git a/Deep_Art/deepdream/examples/sky1024px.jpg b/Deep_Art/examples/sky1024px.jpg
similarity index 100%
rename from Deep_Art/deepdream/examples/sky1024px.jpg
rename to Deep_Art/examples/sky1024px.jpg
diff --git a/Deep_Art/deepdream/examples/tmp.prototxt b/Deep_Art/examples/tmp.prototxt
similarity index 100%
rename from Deep_Art/deepdream/examples/tmp.prototxt
rename to Deep_Art/examples/tmp.prototxt
diff --git a/Docker_Images/README.md b/Docker_Images/README.md
deleted file mode 100644
index 8a0e776..0000000
--- a/Docker_Images/README.md
+++ /dev/null
@@ -1,17 +0,0 @@
-### Build with:
-
-```
-$ docker build -t caffe:cpu .
-```
-
-### You can test with:
-
-```
-$ docker run -ti caffe:cpu bash -c "cd /opt/caffe/build; make runtest"
-```
-
-### Or play with:
-
-```
-$ docker run -ti --volume=$(pwd):/workspace caffe:cpu
-```
diff --git a/LICENSE b/LICENSE
deleted file mode 100644
index 23cb790..0000000
--- a/LICENSE
+++ /dev/null
@@ -1,339 +0,0 @@
- GNU GENERAL PUBLIC LICENSE
- Version 2, June 1991
-
- Copyright (C) 1989, 1991 Free Software Foundation, Inc.,
- 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- Everyone is permitted to copy and distribute verbatim copies
- of this license document, but changing it is not allowed.
-
- Preamble
-
- The licenses for most software are designed to take away your
-freedom to share and change it. By contrast, the GNU General Public
-License is intended to guarantee your freedom to share and change free
-software--to make sure the software is free for all its users. This
-General Public License applies to most of the Free Software
-Foundation's software and to any other program whose authors commit to
-using it. (Some other Free Software Foundation software is covered by
-the GNU Lesser General Public License instead.) You can apply it to
-your programs, too.
-
- When we speak of free software, we are referring to freedom, not
-price. Our General Public Licenses are designed to make sure that you
-have the freedom to distribute copies of free software (and charge for
-this service if you wish), that you receive source code or can get it
-if you want it, that you can change the software or use pieces of it
-in new free programs; and that you know you can do these things.
-
- To protect your rights, we need to make restrictions that forbid
-anyone to deny you these rights or to ask you to surrender the rights.
-These restrictions translate to certain responsibilities for you if you
-distribute copies of the software, or if you modify it.
-
- For example, if you distribute copies of such a program, whether
-gratis or for a fee, you must give the recipients all the rights that
-you have. You must make sure that they, too, receive or can get the
-source code. And you must show them these terms so they know their
-rights.
-
- We protect your rights with two steps: (1) copyright the software, and
-(2) offer you this license which gives you legal permission to copy,
-distribute and/or modify the software.
-
- Also, for each author's protection and ours, we want to make certain
-that everyone understands that there is no warranty for this free
-software. If the software is modified by someone else and passed on, we
-want its recipients to know that what they have is not the original, so
-that any problems introduced by others will not reflect on the original
-authors' reputations.
-
- Finally, any free program is threatened constantly by software
-patents. We wish to avoid the danger that redistributors of a free
-program will individually obtain patent licenses, in effect making the
-program proprietary. To prevent this, we have made it clear that any
-patent must be licensed for everyone's free use or not licensed at all.
-
- The precise terms and conditions for copying, distribution and
-modification follow.
-
- GNU GENERAL PUBLIC LICENSE
- TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
-
- 0. This License applies to any program or other work which contains
-a notice placed by the copyright holder saying it may be distributed
-under the terms of this General Public License. The "Program", below,
-refers to any such program or work, and a "work based on the Program"
-means either the Program or any derivative work under copyright law:
-that is to say, a work containing the Program or a portion of it,
-either verbatim or with modifications and/or translated into another
-language. (Hereinafter, translation is included without limitation in
-the term "modification".) Each licensee is addressed as "you".
-
-Activities other than copying, distribution and modification are not
-covered by this License; they are outside its scope. The act of
-running the Program is not restricted, and the output from the Program
-is covered only if its contents constitute a work based on the
-Program (independent of having been made by running the Program).
-Whether that is true depends on what the Program does.
-
- 1. You may copy and distribute verbatim copies of the Program's
-source code as you receive it, in any medium, provided that you
-conspicuously and appropriately publish on each copy an appropriate
-copyright notice and disclaimer of warranty; keep intact all the
-notices that refer to this License and to the absence of any warranty;
-and give any other recipients of the Program a copy of this License
-along with the Program.
-
-You may charge a fee for the physical act of transferring a copy, and
-you may at your option offer warranty protection in exchange for a fee.
-
- 2. You may modify your copy or copies of the Program or any portion
-of it, thus forming a work based on the Program, and copy and
-distribute such modifications or work under the terms of Section 1
-above, provided that you also meet all of these conditions:
-
- a) You must cause the modified files to carry prominent notices
- stating that you changed the files and the date of any change.
-
- b) You must cause any work that you distribute or publish, that in
- whole or in part contains or is derived from the Program or any
- part thereof, to be licensed as a whole at no charge to all third
- parties under the terms of this License.
-
- c) If the modified program normally reads commands interactively
- when run, you must cause it, when started running for such
- interactive use in the most ordinary way, to print or display an
- announcement including an appropriate copyright notice and a
- notice that there is no warranty (or else, saying that you provide
- a warranty) and that users may redistribute the program under
- these conditions, and telling the user how to view a copy of this
- License. (Exception: if the Program itself is interactive but
- does not normally print such an announcement, your work based on
- the Program is not required to print an announcement.)
-
-These requirements apply to the modified work as a whole. If
-identifiable sections of that work are not derived from the Program,
-and can be reasonably considered independent and separate works in
-themselves, then this License, and its terms, do not apply to those
-sections when you distribute them as separate works. But when you
-distribute the same sections as part of a whole which is a work based
-on the Program, the distribution of the whole must be on the terms of
-this License, whose permissions for other licensees extend to the
-entire whole, and thus to each and every part regardless of who wrote it.
-
-Thus, it is not the intent of this section to claim rights or contest
-your rights to work written entirely by you; rather, the intent is to
-exercise the right to control the distribution of derivative or
-collective works based on the Program.
-
-In addition, mere aggregation of another work not based on the Program
-with the Program (or with a work based on the Program) on a volume of
-a storage or distribution medium does not bring the other work under
-the scope of this License.
-
- 3. You may copy and distribute the Program (or a work based on it,
-under Section 2) in object code or executable form under the terms of
-Sections 1 and 2 above provided that you also do one of the following:
-
- a) Accompany it with the complete corresponding machine-readable
- source code, which must be distributed under the terms of Sections
- 1 and 2 above on a medium customarily used for software interchange; or,
-
- b) Accompany it with a written offer, valid for at least three
- years, to give any third party, for a charge no more than your
- cost of physically performing source distribution, a complete
- machine-readable copy of the corresponding source code, to be
- distributed under the terms of Sections 1 and 2 above on a medium
- customarily used for software interchange; or,
-
- c) Accompany it with the information you received as to the offer
- to distribute corresponding source code. (This alternative is
- allowed only for noncommercial distribution and only if you
- received the program in object code or executable form with such
- an offer, in accord with Subsection b above.)
-
-The source code for a work means the preferred form of the work for
-making modifications to it. For an executable work, complete source
-code means all the source code for all modules it contains, plus any
-associated interface definition files, plus the scripts used to
-control compilation and installation of the executable. However, as a
-special exception, the source code distributed need not include
-anything that is normally distributed (in either source or binary
-form) with the major components (compiler, kernel, and so on) of the
-operating system on which the executable runs, unless that component
-itself accompanies the executable.
-
-If distribution of executable or object code is made by offering
-access to copy from a designated place, then offering equivalent
-access to copy the source code from the same place counts as
-distribution of the source code, even though third parties are not
-compelled to copy the source along with the object code.
-
- 4. You may not copy, modify, sublicense, or distribute the Program
-except as expressly provided under this License. Any attempt
-otherwise to copy, modify, sublicense or distribute the Program is
-void, and will automatically terminate your rights under this License.
-However, parties who have received copies, or rights, from you under
-this License will not have their licenses terminated so long as such
-parties remain in full compliance.
-
- 5. You are not required to accept this License, since you have not
-signed it. However, nothing else grants you permission to modify or
-distribute the Program or its derivative works. These actions are
-prohibited by law if you do not accept this License. Therefore, by
-modifying or distributing the Program (or any work based on the
-Program), you indicate your acceptance of this License to do so, and
-all its terms and conditions for copying, distributing or modifying
-the Program or works based on it.
-
- 6. Each time you redistribute the Program (or any work based on the
-Program), the recipient automatically receives a license from the
-original licensor to copy, distribute or modify the Program subject to
-these terms and conditions. You may not impose any further
-restrictions on the recipients' exercise of the rights granted herein.
-You are not responsible for enforcing compliance by third parties to
-this License.
-
- 7. If, as a consequence of a court judgment or allegation of patent
-infringement or for any other reason (not limited to patent issues),
-conditions are imposed on you (whether by court order, agreement or
-otherwise) that contradict the conditions of this License, they do not
-excuse you from the conditions of this License. If you cannot
-distribute so as to satisfy simultaneously your obligations under this
-License and any other pertinent obligations, then as a consequence you
-may not distribute the Program at all. For example, if a patent
-license would not permit royalty-free redistribution of the Program by
-all those who receive copies directly or indirectly through you, then
-the only way you could satisfy both it and this License would be to
-refrain entirely from distribution of the Program.
-
-If any portion of this section is held invalid or unenforceable under
-any particular circumstance, the balance of the section is intended to
-apply and the section as a whole is intended to apply in other
-circumstances.
-
-It is not the purpose of this section to induce you to infringe any
-patents or other property right claims or to contest validity of any
-such claims; this section has the sole purpose of protecting the
-integrity of the free software distribution system, which is
-implemented by public license practices. Many people have made
-generous contributions to the wide range of software distributed
-through that system in reliance on consistent application of that
-system; it is up to the author/donor to decide if he or she is willing
-to distribute software through any other system and a licensee cannot
-impose that choice.
-
-This section is intended to make thoroughly clear what is believed to
-be a consequence of the rest of this License.
-
- 8. If the distribution and/or use of the Program is restricted in
-certain countries either by patents or by copyrighted interfaces, the
-original copyright holder who places the Program under this License
-may add an explicit geographical distribution limitation excluding
-those countries, so that distribution is permitted only in or among
-countries not thus excluded. In such case, this License incorporates
-the limitation as if written in the body of this License.
-
- 9. The Free Software Foundation may publish revised and/or new versions
-of the General Public License from time to time. Such new versions will
-be similar in spirit to the present version, but may differ in detail to
-address new problems or concerns.
-
-Each version is given a distinguishing version number. If the Program
-specifies a version number of this License which applies to it and "any
-later version", you have the option of following the terms and conditions
-either of that version or of any later version published by the Free
-Software Foundation. If the Program does not specify a version number of
-this License, you may choose any version ever published by the Free Software
-Foundation.
-
- 10. If you wish to incorporate parts of the Program into other free
-programs whose distribution conditions are different, write to the author
-to ask for permission. For software which is copyrighted by the Free
-Software Foundation, write to the Free Software Foundation; we sometimes
-make exceptions for this. Our decision will be guided by the two goals
-of preserving the free status of all derivatives of our free software and
-of promoting the sharing and reuse of software generally.
-
- NO WARRANTY
-
- 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
-FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
-OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
-PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
-OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
-MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS
-TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE
-PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
-REPAIR OR CORRECTION.
-
- 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
-WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
-REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
-INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
-OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
-TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
-YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
-PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
-POSSIBILITY OF SUCH DAMAGES.
-
- END OF TERMS AND CONDITIONS
-
- How to Apply These Terms to Your New Programs
-
- If you develop a new program, and you want it to be of the greatest
-possible use to the public, the best way to achieve this is to make it
-free software which everyone can redistribute and change under these terms.
-
- To do so, attach the following notices to the program. It is safest
-to attach them to the start of each source file to most effectively
-convey the exclusion of warranty; and each file should have at least
-the "copyright" line and a pointer to where the full notice is found.
-
- {description}
- Copyright (C) {year} {fullname}
-
- This program is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation; either version 2 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License along
- with this program; if not, write to the Free Software Foundation, Inc.,
- 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-Also add information on how to contact you by electronic and paper mail.
-
-If the program is interactive, make it output a short notice like this
-when it starts in an interactive mode:
-
- Gnomovision version 69, Copyright (C) year name of author
- Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
- This is free software, and you are welcome to redistribute it
- under certain conditions; type `show c' for details.
-
-The hypothetical commands `show w' and `show c' should show the appropriate
-parts of the General Public License. Of course, the commands you use may
-be called something other than `show w' and `show c'; they could even be
-mouse-clicks or menu items--whatever suits your program.
-
-You should also get your employer (if you work as a programmer) or your
-school, if any, to sign a "copyright disclaimer" for the program, if
-necessary. Here is a sample; alter the names:
-
- Yoyodyne, Inc., hereby disclaims all copyright interest in the program
- `Gnomovision' (which makes passes at compilers) written by James Hacker.
-
- {signature of Ty Coon}, 1 April 1989
- Ty Coon, President of Vice
-
-This General Public License does not permit incorporating your program into
-proprietary programs. If your program is a subroutine library, you may
-consider it more useful to permit linking proprietary applications with the
-library. If this is what you want to do, use the GNU Lesser General
-Public License instead of this License.
diff --git a/Notebooks/PrinciplesOfAlgorithmicProblemSolvingDRAFT.pdf b/Notebooks/PrinciplesOfAlgorithmicProblemSolvingDRAFT.pdf
new file mode 100644
index 0000000..3d36502
Binary files /dev/null and b/Notebooks/PrinciplesOfAlgorithmicProblemSolvingDRAFT.pdf differ
diff --git a/Notebooks/README.md b/Notebooks/README.md
index 367ce6f..065fe0b 100644
--- a/Notebooks/README.md
+++ b/Notebooks/README.md
@@ -1,25 +1,38 @@
-## Jupyter Notebooks
+## my old ml jupyter notebooks
-### Installing
+
-Install any dependences for [Jupyter](http://jupyter.readthedocs.io/en/latest/install.html):
+
+#### 👾 **[my deep dreams (with caffe) adapted, on notebook.community](https://notebook.community/bt3gl/Machine-Learning-Resources/machine_learning_examples/deep_art/deepdream/examples/dream)**
+
+
+
+---
+
+### installing and running these notebooks
+
+
+
+
+
+install any dependencies for [jupyter](http://jupyter.readthedocs.io/en/latest/install.html). for example, on a debian-based linux box:
+
+
```shell
-$ apt-get install build-essential python3-dev
-$ pip install jupyter
+apt-get install build-essential python3-dev
+pip install jupyter
```
-### Running
+
-On the notebook directory:
+in the notebook directory, simply run:
+
+
```shell
-$ jupyter notebook
+jupyter notebook
```
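+
+
+if the box is remote, you can skip opening a browser (a minimal variant, using standard jupyter flags):
+
+
+```shell
+jupyter notebook --no-browser --port=8888
+```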
-
-### Basics
-
-* A notebook is made up of a number of cells with Python code. You can execute a cell by clicking on it and pressing ```Shift-Enter```.
diff --git a/Notebooks/basics.ipynb b/Notebooks/basics.ipynb
new file mode 100644
index 0000000..d5ccd18
--- /dev/null
+++ b/Notebooks/basics.ipynb
@@ -0,0 +1,467 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text",
+ "id": "iPpI7RaYoZuE"
+ },
+ "source": [
+ "##### Copyright 2018 The TensorFlow Authors."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 0,
+ "metadata": {
+ "cellView": "form",
+ "colab": {},
+ "colab_type": "code",
+ "id": "hro2InpHobKk"
+ },
+ "outputs": [],
+ "source": [
+ "#@title Licensed under the Apache License, Version 2.0 (the \"License\");\n",
+ "# you may not use this file except in compliance with the License.\n",
+ "# You may obtain a copy of the License at\n",
+ "#\n",
+ "# https://www.apache.org/licenses/LICENSE-2.0\n",
+ "#\n",
+ "# Unless required by applicable law or agreed to in writing, software\n",
+ "# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
+ "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
+ "# See the License for the specific language governing permissions and\n",
+ "# limitations under the License."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text",
+ "id": "U9i2Dsh-ziXr"
+ },
+ "source": [
+ "# Customization basics: tensors and operations"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text",
+ "id": "Hndw-YcxoOJK"
+ },
+ "source": [
+ "\u003ctable class=\"tfo-notebook-buttons\" align=\"left\"\u003e\n",
+ " \u003ctd\u003e\n",
+ " \u003ca target=\"_blank\" href=\"https://www.tensorflow.org/tutorials/customization/basics\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/tf_logo_32px.png\" /\u003eView on TensorFlow.org\u003c/a\u003e\n",
+ " \u003c/td\u003e\n",
+ " \u003ctd\u003e\n",
+ " \u003ca target=\"_blank\" href=\"https://colab.research.google.com/github/tensorflow/docs/blob/master/site/en/tutorials/customization/basics.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" /\u003eRun in Google Colab\u003c/a\u003e\n",
+ " \u003c/td\u003e\n",
+ " \u003ctd\u003e\n",
+ " \u003ca target=\"_blank\" href=\"https://github.com/tensorflow/docs/blob/master/site/en/tutorials/customization/basics.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" /\u003eView source on GitHub\u003c/a\u003e\n",
+ " \u003c/td\u003e\n",
+ " \u003ctd\u003e\n",
+ " \u003ca href=\"https://storage.googleapis.com/tensorflow_docs/docs/site/en/tutorials/customization/basics.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/download_logo_32px.png\" /\u003eDownload notebook\u003c/a\u003e\n",
+ " \u003c/td\u003e\n",
+ "\u003c/table\u003e"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text",
+ "id": "6sILUVbHoSgH"
+ },
+ "source": [
+ "This is an introductory TensorFlow tutorial shows how to:\n",
+ "\n",
+ "* Import the required package\n",
+ "* Create and use tensors\n",
+ "* Use GPU acceleration\n",
+ "* Demonstrate `tf.data.Dataset`"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 0,
+ "metadata": {
+ "colab": {},
+ "colab_type": "code",
+ "id": "miTaGiqV9RjO"
+ },
+ "outputs": [],
+ "source": [
+ "from __future__ import absolute_import, division, print_function\n",
+ "\n",
+ "try:\n",
+ " # %tensorflow_version only exists in Colab.\n",
+ " %tensorflow_version 2.x\n",
+ "except Exception:\n",
+ " pass\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text",
+ "id": "z1JcS5iBXMRO"
+ },
+ "source": [
+ "## Import TensorFlow\n",
+ "\n",
+ "To get started, import the `tensorflow` module. As of TensorFlow 2.0, eager execution is turned on by default. This enables a more interactive frontend to TensorFlow, the details of which we will discuss much later."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 0,
+ "metadata": {
+ "colab": {},
+ "colab_type": "code",
+ "id": "vjBPmYjLdFmk"
+ },
+ "outputs": [],
+ "source": [
+ "import tensorflow as tf"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text",
+ "id": "H9UySOPLXdaw"
+ },
+ "source": [
+ "## Tensors\n",
+ "\n",
+ "A Tensor is a multi-dimensional array. Similar to NumPy `ndarray` objects, `tf.Tensor` objects have a data type and a shape. Additionally, `tf.Tensor`s can reside in accelerator memory (like a GPU). TensorFlow offers a rich library of operations ([tf.add](https://www.tensorflow.org/api_docs/python/tf/add), [tf.matmul](https://www.tensorflow.org/api_docs/python/tf/matmul), [tf.linalg.inv](https://www.tensorflow.org/api_docs/python/tf/linalg/inv) etc.) that consume and produce `tf.Tensor`s. These operations automatically convert native Python types, for example:\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 0,
+ "metadata": {
+ "cellView": "code",
+ "colab": {},
+ "colab_type": "code",
+ "id": "ngUe237Wt48W"
+ },
+ "outputs": [],
+ "source": [
+ "print(tf.add(1, 2))\n",
+ "print(tf.add([1, 2], [3, 4]))\n",
+ "print(tf.square(5))\n",
+ "print(tf.reduce_sum([1, 2, 3]))\n",
+ "\n",
+ "# Operator overloading is also supported\n",
+ "print(tf.square(2) + tf.square(3))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text",
+ "id": "IDY4WsYRhP81"
+ },
+ "source": [
+ "Each `tf.Tensor` has a shape and a datatype:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 0,
+ "metadata": {
+ "colab": {},
+ "colab_type": "code",
+ "id": "srYWH1MdJNG7"
+ },
+ "outputs": [],
+ "source": [
+ "x = tf.matmul([[1]], [[2, 3]])\n",
+ "print(x)\n",
+ "print(x.shape)\n",
+ "print(x.dtype)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text",
+ "id": "eBPw8e8vrsom"
+ },
+ "source": [
+ "The most obvious differences between NumPy arrays and `tf.Tensor`s are:\n",
+ "\n",
+ "1. Tensors can be backed by accelerator memory (like GPU, TPU).\n",
+ "2. Tensors are immutable."
+ ]
+ },
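+ {
+ "cell_type": "code",
+ "execution_count": 0,
+ "metadata": {
+ "colab": {},
+ "colab_type": "code",
+ "id": "immutabilityDemo"
+ },
+ "outputs": [],
+ "source": [
+ "# A minimal sketch of point 2 above (cell added for illustration): tensors\n",
+ "# are immutable, so item assignment raises a TypeError, unlike an ndarray.\n",
+ "x = tf.constant([1, 2, 3])\n",
+ "try:\n",
+ "  x[0] = 9\n",
+ "except TypeError as e:\n",
+ "  print(e)"
+ ]
+ },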
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text",
+ "id": "Dwi1tdW3JBw6"
+ },
+ "source": [
+ "### NumPy Compatibility\n",
+ "\n",
+ "Converting between a TensorFlow `tf.Tensor`s and a NumPy `ndarray` is easy:\n",
+ "\n",
+ "* TensorFlow operations automatically convert NumPy ndarrays to Tensors.\n",
+ "* NumPy operations automatically convert Tensors to NumPy ndarrays.\n",
+ "\n",
+ "Tensors are explicitly converted to NumPy ndarrays using their `.numpy()` method. These conversions are typically cheap since the array and `tf.Tensor` share the underlying memory representation, if possible. However, sharing the underlying representation isn't always possible since the `tf.Tensor` may be hosted in GPU memory while NumPy arrays are always backed by host memory, and the conversion involves a copy from GPU to host memory."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 0,
+ "metadata": {
+ "colab": {},
+ "colab_type": "code",
+ "id": "lCUWzso6mbqR"
+ },
+ "outputs": [],
+ "source": [
+ "import numpy as np\n",
+ "\n",
+ "ndarray = np.ones([3, 3])\n",
+ "\n",
+ "print(\"TensorFlow operations convert numpy arrays to Tensors automatically\")\n",
+ "tensor = tf.multiply(ndarray, 42)\n",
+ "print(tensor)\n",
+ "\n",
+ "\n",
+ "print(\"And NumPy operations convert Tensors to numpy arrays automatically\")\n",
+ "print(np.add(tensor, 1))\n",
+ "\n",
+ "print(\"The .numpy() method explicitly converts a Tensor to a numpy array\")\n",
+ "print(tensor.numpy())"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text",
+ "id": "PBNP8yTRfu_X"
+ },
+ "source": [
+ "## GPU acceleration\n",
+ "\n",
+ "Many TensorFlow operations are accelerated using the GPU for computation. Without any annotations, TensorFlow automatically decides whether to use the GPU or CPU for an operation—copying the tensor between CPU and GPU memory, if necessary. Tensors produced by an operation are typically backed by the memory of the device on which the operation executed, for example:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 0,
+ "metadata": {
+ "cellView": "code",
+ "colab": {},
+ "colab_type": "code",
+ "id": "3Twf_Rw-gQFM"
+ },
+ "outputs": [],
+ "source": [
+ "x = tf.random.uniform([3, 3])\n",
+ "\n",
+ "print(\"Is there a GPU available: \"),\n",
+ "print(tf.config.experimental.list_physical_devices(\"GPU\"))\n",
+ "\n",
+ "print(\"Is the Tensor on GPU #0: \"),\n",
+ "print(x.device.endswith('GPU:0'))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text",
+ "id": "vpgYzgVXW2Ud"
+ },
+ "source": [
+ "### Device Names\n",
+ "\n",
+ "The `Tensor.device` property provides a fully qualified string name of the device hosting the contents of the tensor. This name encodes many details, such as an identifier of the network address of the host on which this program is executing and the device within that host. This is required for distributed execution of a TensorFlow program. The string ends with `GPU:\u003cN\u003e` if the tensor is placed on the `N`-th GPU on the host."
+ ]
+ },
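+ {
+ "cell_type": "code",
+ "execution_count": 0,
+ "metadata": {
+ "colab": {},
+ "colab_type": "code",
+ "id": "deviceNameDemo"
+ },
+ "outputs": [],
+ "source": [
+ "# A quick look at the `Tensor.device` property described above (cell added\n",
+ "# for illustration); the exact string depends on the host and its devices.\n",
+ "x = tf.random.uniform([3, 3])\n",
+ "print(x.device)"
+ ]
+ },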
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text",
+ "id": "ZWZQCimzuqyP"
+ },
+ "source": [
+ "\n",
+ "\n",
+ "### Explicit Device Placement\n",
+ "\n",
+ "In TensorFlow, *placement* refers to how individual operations are assigned (placed on) a device for execution. As mentioned, when there is no explicit guidance provided, TensorFlow automatically decides which device to execute an operation and copies tensors to that device, if needed. However, TensorFlow operations can be explicitly placed on specific devices using the `tf.device` context manager, for example:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 0,
+ "metadata": {
+ "colab": {},
+ "colab_type": "code",
+ "id": "RjkNZTuauy-Q"
+ },
+ "outputs": [],
+ "source": [
+ "import time\n",
+ "\n",
+ "def time_matmul(x):\n",
+ " start = time.time()\n",
+ " for loop in range(10):\n",
+ " tf.matmul(x, x)\n",
+ "\n",
+ " result = time.time()-start\n",
+ "\n",
+ " print(\"10 loops: {:0.2f}ms\".format(1000*result))\n",
+ "\n",
+ "# Force execution on CPU\n",
+ "print(\"On CPU:\")\n",
+ "with tf.device(\"CPU:0\"):\n",
+ " x = tf.random.uniform([1000, 1000])\n",
+ " assert x.device.endswith(\"CPU:0\")\n",
+ " time_matmul(x)\n",
+ "\n",
+ "# Force execution on GPU #0 if available\n",
+ "if tf.config.experimental.list_physical_devices(\"GPU\"):\n",
+ " print(\"On GPU:\")\n",
+ " with tf.device(\"GPU:0\"): # Or GPU:1 for the 2nd GPU, GPU:2 for the 3rd etc.\n",
+ " x = tf.random.uniform([1000, 1000])\n",
+ " assert x.device.endswith(\"GPU:0\")\n",
+ " time_matmul(x)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text",
+ "id": "o1K4dlhhHtQj"
+ },
+ "source": [
+ "## Datasets\n",
+ "\n",
+ "This section uses the [`tf.data.Dataset` API](https://www.tensorflow.org/guide/datasets) to build a pipeline for feeding data to your model. The `tf.data.Dataset` API is used to build performant, complex input pipelines from simple, re-usable pieces that will feed your model's training or evaluation loops."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text",
+ "id": "zI0fmOynH-Ne"
+ },
+ "source": [
+ "### Create a source `Dataset`\n",
+ "\n",
+ "Create a *source* dataset using one of the factory functions like [`Dataset.from_tensors`](https://www.tensorflow.org/api_docs/python/tf/data/Dataset#from_tensors), [`Dataset.from_tensor_slices`](https://www.tensorflow.org/api_docs/python/tf/data/Dataset#from_tensor_slices), or using objects that read from files like [`TextLineDataset`](https://www.tensorflow.org/api_docs/python/tf/data/TextLineDataset) or [`TFRecordDataset`](https://www.tensorflow.org/api_docs/python/tf/data/TFRecordDataset). See the [TensorFlow Dataset guide](https://www.tensorflow.org/guide/datasets#reading_input_data) for more information."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 0,
+ "metadata": {
+ "colab": {},
+ "colab_type": "code",
+ "id": "F04fVOHQIBiG"
+ },
+ "outputs": [],
+ "source": [
+ "ds_tensors = tf.data.Dataset.from_tensor_slices([1, 2, 3, 4, 5, 6])\n",
+ "\n",
+ "# Create a CSV file\n",
+ "import tempfile\n",
+ "_, filename = tempfile.mkstemp()\n",
+ "\n",
+ "with open(filename, 'w') as f:\n",
+ " f.write(\"\"\"Line 1\n",
+ "Line 2\n",
+ "Line 3\n",
+ " \"\"\")\n",
+ "\n",
+ "ds_file = tf.data.TextLineDataset(filename)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text",
+ "id": "vbxIhC-5IPdf"
+ },
+ "source": [
+ "### Apply transformations\n",
+ "\n",
+ "Use the transformations functions like [`map`](https://www.tensorflow.org/api_docs/python/tf/data/Dataset#map), [`batch`](https://www.tensorflow.org/api_docs/python/tf/data/Dataset#batch), and [`shuffle`](https://www.tensorflow.org/api_docs/python/tf/data/Dataset#shuffle) to apply transformations to dataset records."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 0,
+ "metadata": {
+ "colab": {},
+ "colab_type": "code",
+ "id": "uXSDZWE-ISsd"
+ },
+ "outputs": [],
+ "source": [
+ "ds_tensors = ds_tensors.map(tf.square).shuffle(2).batch(2)\n",
+ "\n",
+ "ds_file = ds_file.batch(2)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text",
+ "id": "A8X1GNfoIZKJ"
+ },
+ "source": [
+ "### Iterate\n",
+ "\n",
+ "`tf.data.Dataset` objects support iteration to loop over records:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 0,
+ "metadata": {
+ "colab": {},
+ "colab_type": "code",
+ "id": "ws-WKRk5Ic6-"
+ },
+ "outputs": [],
+ "source": [
+ "print('Elements of ds_tensors:')\n",
+ "for x in ds_tensors:\n",
+ " print(x)\n",
+ "\n",
+ "print('\\nElements in ds_file:')\n",
+ "for x in ds_file:\n",
+ " print(x)"
+ ]
+ }
+ ],
+ "metadata": {
+ "accelerator": "GPU",
+ "colab": {
+ "collapsed_sections": [],
+ "last_runtime": {
+ "build_target": "",
+ "kind": "local"
+ },
+ "name": "basics.ipynb",
+ "private_outputs": true,
+ "provenance": [],
+ "toc_visible": true,
+ "version": "0.3.2"
+ },
+ "kernelspec": {
+ "display_name": "Python 3",
+ "name": "python3"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
diff --git a/Notebooks/feature_crosses.ipynb b/Notebooks/feature_crosses.ipynb
new file mode 100644
index 0000000..282cfbb
--- /dev/null
+++ b/Notebooks/feature_crosses.ipynb
@@ -0,0 +1,929 @@
+{
+ "nbformat": 4,
+ "nbformat_minor": 0,
+ "metadata": {
+ "colab": {
+ "name": "feature_crosses.ipynb",
+ "provenance": [],
+ "collapsed_sections": [
+ "JndnmDMp66FL",
+ "ZTDHHM61NPTw",
+ "0i7vGo9PTaZl"
+ ]
+ },
+ "kernelspec": {
+ "name": "python3",
+ "display_name": "Python 3"
+ }
+ },
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "g4T-_IsVbweU",
+ "colab_type": "text"
+ },
+ "source": [
+ "# Feature Crosses"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "JndnmDMp66FL",
+ "colab_type": "text"
+ },
+ "source": [
+ "#### Copyright 2017 Google LLC."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "hMqWDc_m6rUC",
+ "colab_type": "code",
+ "cellView": "both",
+ "colab": {}
+ },
+ "source": [
+ "# Licensed under the Apache License, Version 2.0 (the \"License\");\n",
+ "# you may not use this file except in compliance with the License.\n",
+ "# You may obtain a copy of the License at\n",
+ "#\n",
+ "# https://www.apache.org/licenses/LICENSE-2.0\n",
+ "#\n",
+ "# Unless required by applicable law or agreed to in writing, software\n",
+ "# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
+ "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
+ "# See the License for the specific language governing permissions and\n",
+ "# limitations under the License."
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "F7dke6skIK-k",
+ "colab_type": "text"
+ },
+ "source": [
+ "**Learning Objectives:**\n",
+ " * Improve a linear regression model with the addition of additional synthetic features (this is a continuation of the previous exercise)\n",
+ " * Use an input function to convert pandas `DataFrame` objects to `Tensors` and invoke the input function in `fit()` and `predict()` operations\n",
+ " * Use the FTRL optimization algorithm for model training\n",
+ " * Create new synthetic features through one-hot encoding, binning, and feature crosses"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "NS_fcQRd8B97",
+ "colab_type": "text"
+ },
+ "source": [
+ "## Setup"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "4IdzD8IdIK-l",
+ "colab_type": "text"
+ },
+ "source": [
+ "First, as we've done in previous exercises, let's define the input and create the data-loading code."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "CsfdiLiDIK-n",
+ "colab_type": "code",
+ "colab": {}
+ },
+ "source": [
+ "from __future__ import print_function\n",
+ "\n",
+ "import math\n",
+ "\n",
+ "from IPython import display\n",
+ "from matplotlib import cm\n",
+ "from matplotlib import gridspec\n",
+ "from matplotlib import pyplot as plt\n",
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "from sklearn import metrics\n",
+ "import tensorflow as tf\n",
+ "from tensorflow.python.data import Dataset\n",
+ "\n",
+ "tf.logging.set_verbosity(tf.logging.ERROR)\n",
+ "pd.options.display.max_rows = 10\n",
+ "pd.options.display.float_format = '{:.1f}'.format\n",
+ "\n",
+ "california_housing_dataframe = pd.read_csv(\"https://download.mlcc.google.com/mledu-datasets/california_housing_train.csv\", sep=\",\")\n",
+ "\n",
+ "california_housing_dataframe = california_housing_dataframe.reindex(\n",
+ " np.random.permutation(california_housing_dataframe.index))"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "10rhoflKIK-s",
+ "colab_type": "code",
+ "colab": {}
+ },
+ "source": [
+ "def preprocess_features(california_housing_dataframe):\n",
+ " \"\"\"Prepares input features from California housing data set.\n",
+ "\n",
+ " Args:\n",
+ " california_housing_dataframe: A Pandas DataFrame expected to contain data\n",
+ " from the California housing data set.\n",
+ " Returns:\n",
+ " A DataFrame that contains the features to be used for the model, including\n",
+ " synthetic features.\n",
+ " \"\"\"\n",
+ " selected_features = california_housing_dataframe[\n",
+ " [\"latitude\",\n",
+ " \"longitude\",\n",
+ " \"housing_median_age\",\n",
+ " \"total_rooms\",\n",
+ " \"total_bedrooms\",\n",
+ " \"population\",\n",
+ " \"households\",\n",
+ " \"median_income\"]]\n",
+ " processed_features = selected_features.copy()\n",
+ " # Create a synthetic feature.\n",
+ " processed_features[\"rooms_per_person\"] = (\n",
+ " california_housing_dataframe[\"total_rooms\"] /\n",
+ " california_housing_dataframe[\"population\"])\n",
+ " return processed_features\n",
+ "\n",
+ "def preprocess_targets(california_housing_dataframe):\n",
+ " \"\"\"Prepares target features (i.e., labels) from California housing data set.\n",
+ "\n",
+ " Args:\n",
+ " california_housing_dataframe: A Pandas DataFrame expected to contain data\n",
+ " from the California housing data set.\n",
+ " Returns:\n",
+ " A DataFrame that contains the target feature.\n",
+ " \"\"\"\n",
+ " output_targets = pd.DataFrame()\n",
+ " # Scale the target to be in units of thousands of dollars.\n",
+ " output_targets[\"median_house_value\"] = (\n",
+ " california_housing_dataframe[\"median_house_value\"] / 1000.0)\n",
+ " return output_targets"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "ufplEkjN8KUp",
+ "colab_type": "code",
+ "colab": {}
+ },
+ "source": [
+ "# Choose the first 12000 (out of 17000) examples for training.\n",
+ "training_examples = preprocess_features(california_housing_dataframe.head(12000))\n",
+ "training_targets = preprocess_targets(california_housing_dataframe.head(12000))\n",
+ "\n",
+ "# Choose the last 5000 (out of 17000) examples for validation.\n",
+ "validation_examples = preprocess_features(california_housing_dataframe.tail(5000))\n",
+ "validation_targets = preprocess_targets(california_housing_dataframe.tail(5000))\n",
+ "\n",
+ "# Double-check that we've done the right thing.\n",
+ "print(\"Training examples summary:\")\n",
+ "display.display(training_examples.describe())\n",
+ "print(\"Validation examples summary:\")\n",
+ "display.display(validation_examples.describe())\n",
+ "\n",
+ "print(\"Training targets summary:\")\n",
+ "display.display(training_targets.describe())\n",
+ "print(\"Validation targets summary:\")\n",
+ "display.display(validation_targets.describe())"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "oJlrB4rJ_2Ma",
+ "colab_type": "code",
+ "colab": {}
+ },
+ "source": [
+ "def construct_feature_columns(input_features):\n",
+ " \"\"\"Construct the TensorFlow Feature Columns.\n",
+ "\n",
+ " Args:\n",
+ " input_features: The names of the numerical input features to use.\n",
+ " Returns:\n",
+ " A set of feature columns\n",
+ " \"\"\"\n",
+ " return set([tf.feature_column.numeric_column(my_feature)\n",
+ " for my_feature in input_features])"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "NBxoAfp2AcB6",
+ "colab_type": "code",
+ "colab": {}
+ },
+ "source": [
+ "def my_input_fn(features, targets, batch_size=1, shuffle=True, num_epochs=None):\n",
+ " \"\"\"Trains a linear regression model.\n",
+ " \n",
+ " Args:\n",
+ " features: pandas DataFrame of features\n",
+ " targets: pandas DataFrame of targets\n",
+ " batch_size: Size of batches to be passed to the model\n",
+ " shuffle: True or False. Whether to shuffle the data.\n",
+ " num_epochs: Number of epochs for which data should be repeated. None = repeat indefinitely\n",
+ " Returns:\n",
+ " Tuple of (features, labels) for next data batch\n",
+ " \"\"\"\n",
+ " \n",
+ " # Convert pandas data into a dict of np arrays.\n",
+ " features = {key:np.array(value) for key,value in dict(features).items()} \n",
+ " \n",
+ " # Construct a dataset, and configure batching/repeating.\n",
+ " ds = Dataset.from_tensor_slices((features,targets)) # warning: 2GB limit\n",
+ " ds = ds.batch(batch_size).repeat(num_epochs)\n",
+ " \n",
+ " # Shuffle the data, if specified.\n",
+ " if shuffle:\n",
+ " ds = ds.shuffle(10000)\n",
+ " \n",
+ " # Return the next batch of data.\n",
+ " features, labels = ds.make_one_shot_iterator().get_next()\n",
+ " return features, labels"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "hweDyy31LBsV",
+ "colab_type": "text"
+ },
+ "source": [
+ "## FTRL Optimization Algorithm\n",
+ "\n",
+ "High dimensional linear models benefit from using a variant of gradient-based optimization called FTRL. This algorithm has the benefit of scaling the learning rate differently for different coefficients, which can be useful if some features rarely take non-zero values (it also is well suited to support L1 regularization). We can apply FTRL using the [FtrlOptimizer](https://www.tensorflow.org/api_docs/python/tf/train/FtrlOptimizer)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "S0SBf1X1IK_O",
+ "colab_type": "code",
+ "colab": {}
+ },
+ "source": [
+ "def train_model(\n",
+ " learning_rate,\n",
+ " steps,\n",
+ " batch_size,\n",
+ " feature_columns,\n",
+ " training_examples,\n",
+ " training_targets,\n",
+ " validation_examples,\n",
+ " validation_targets):\n",
+ " \"\"\"Trains a linear regression model.\n",
+ " \n",
+ " In addition to training, this function also prints training progress information,\n",
+ " as well as a plot of the training and validation loss over time.\n",
+ " \n",
+ " Args:\n",
+ " learning_rate: A `float`, the learning rate.\n",
+ " steps: A non-zero `int`, the total number of training steps. A training step\n",
+ " consists of a forward and backward pass using a single batch.\n",
+ " feature_columns: A `set` specifying the input feature columns to use.\n",
+ " training_examples: A `DataFrame` containing one or more columns from\n",
+ " `california_housing_dataframe` to use as input features for training.\n",
+ " training_targets: A `DataFrame` containing exactly one column from\n",
+ " `california_housing_dataframe` to use as target for training.\n",
+ " validation_examples: A `DataFrame` containing one or more columns from\n",
+ " `california_housing_dataframe` to use as input features for validation.\n",
+ " validation_targets: A `DataFrame` containing exactly one column from\n",
+ " `california_housing_dataframe` to use as target for validation.\n",
+ " \n",
+ " Returns:\n",
+ " A `LinearRegressor` object trained on the training data.\n",
+ " \"\"\"\n",
+ "\n",
+ " periods = 10\n",
+ " steps_per_period = steps / periods\n",
+ "\n",
+ " # Create a linear regressor object.\n",
+ " my_optimizer = tf.train.FtrlOptimizer(learning_rate=learning_rate)\n",
+ " my_optimizer = tf.contrib.estimator.clip_gradients_by_norm(my_optimizer, 5.0)\n",
+ " linear_regressor = tf.estimator.LinearRegressor(\n",
+ " feature_columns=feature_columns,\n",
+ " optimizer=my_optimizer\n",
+ " )\n",
+ " \n",
+ " training_input_fn = lambda: my_input_fn(training_examples, \n",
+ " training_targets[\"median_house_value\"], \n",
+ " batch_size=batch_size)\n",
+ " predict_training_input_fn = lambda: my_input_fn(training_examples, \n",
+ " training_targets[\"median_house_value\"], \n",
+ " num_epochs=1, \n",
+ " shuffle=False)\n",
+ " predict_validation_input_fn = lambda: my_input_fn(validation_examples, \n",
+ " validation_targets[\"median_house_value\"], \n",
+ " num_epochs=1, \n",
+ " shuffle=False)\n",
+ "\n",
+ " # Train the model, but do so inside a loop so that we can periodically assess\n",
+ " # loss metrics.\n",
+ " print(\"Training model...\")\n",
+ " print(\"RMSE (on training data):\")\n",
+ " training_rmse = []\n",
+ " validation_rmse = []\n",
+ " for period in range (0, periods):\n",
+ " # Train the model, starting from the prior state.\n",
+ " linear_regressor.train(\n",
+ " input_fn=training_input_fn,\n",
+ " steps=steps_per_period\n",
+ " )\n",
+ " # Take a break and compute predictions.\n",
+ " training_predictions = linear_regressor.predict(input_fn=predict_training_input_fn)\n",
+ " training_predictions = np.array([item['predictions'][0] for item in training_predictions])\n",
+ " validation_predictions = linear_regressor.predict(input_fn=predict_validation_input_fn)\n",
+ " validation_predictions = np.array([item['predictions'][0] for item in validation_predictions])\n",
+ " \n",
+ " # Compute training and validation loss.\n",
+ " training_root_mean_squared_error = math.sqrt(\n",
+ " metrics.mean_squared_error(training_predictions, training_targets))\n",
+ " validation_root_mean_squared_error = math.sqrt(\n",
+ " metrics.mean_squared_error(validation_predictions, validation_targets))\n",
+ " # Occasionally print the current loss.\n",
+ " print(\" period %02d : %0.2f\" % (period, training_root_mean_squared_error))\n",
+ " # Add the loss metrics from this period to our list.\n",
+ " training_rmse.append(training_root_mean_squared_error)\n",
+ " validation_rmse.append(validation_root_mean_squared_error)\n",
+ " print(\"Model training finished.\")\n",
+ "\n",
+ " \n",
+ " # Output a graph of loss metrics over periods.\n",
+ " plt.ylabel(\"RMSE\")\n",
+ " plt.xlabel(\"Periods\")\n",
+ " plt.title(\"Root Mean Squared Error vs. Periods\")\n",
+ " plt.tight_layout()\n",
+ " plt.plot(training_rmse, label=\"training\")\n",
+ " plt.plot(validation_rmse, label=\"validation\")\n",
+ " plt.legend()\n",
+ "\n",
+ " return linear_regressor"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "1Cdr02tLIK_Q",
+ "colab_type": "code",
+ "colab": {}
+ },
+ "source": [
+ "_ = train_model(\n",
+ " learning_rate=1.0,\n",
+ " steps=500,\n",
+ " batch_size=100,\n",
+ " feature_columns=construct_feature_columns(training_examples),\n",
+ " training_examples=training_examples,\n",
+ " training_targets=training_targets,\n",
+ " validation_examples=validation_examples,\n",
+ " validation_targets=validation_targets)"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "i4lGvqajDWlw",
+ "colab_type": "text"
+ },
+ "source": [
+ "## One-Hot Encoding for Discrete Features\n",
+ "\n",
+ "Discrete (i.e. strings, enumerations, integers) features are usually converted into families of binary features before training a logistic regression model.\n",
+ "\n",
+ "For example, suppose we created a synthetic feature that can take any of the values `0`, `1` or `2`, and that we have a few training points:\n",
+ "\n",
+ "| # | feature_value |\n",
+ "|---|---------------|\n",
+ "| 0 | 2 |\n",
+ "| 1 | 0 |\n",
+ "| 2 | 1 |\n",
+ "\n",
+ "For each possible categorical value, we make a new **binary** feature of **real values** that can take one of just two possible values: 1.0 if the example has that value, and 0.0 if not. In the example above, the categorical feature would be converted into three features, and the training points now look like:\n",
+ "\n",
+ "| # | feature_value_0 | feature_value_1 | feature_value_2 |\n",
+ "|---|-----------------|-----------------|-----------------|\n",
+ "| 0 | 0.0 | 0.0 | 1.0 |\n",
+ "| 1 | 1.0 | 0.0 | 0.0 |\n",
+ "| 2 | 0.0 | 1.0 | 0.0 |"
+ ]
+ },
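+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text"
+ },
+ "source": [
+ "As an illustrative sketch (not part of the original exercise), the following cell one-hot encodes the three training points from the table above using pandas' `get_dummies`; the resulting columns match the `feature_value_0` to `feature_value_2` table. It assumes pandas has been imported as `pd` in the setup cell."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "colab_type": "code",
+ "colab": {}
+ },
+ "source": [
+ "# Illustrative sketch: one-hot encode the toy feature_value example above.\n",
+ "# (Assumes pandas has been imported as pd in the setup cell.)\n",
+ "toy_df = pd.DataFrame({\"feature_value\": [2, 0, 1]})\n",
+ "\n",
+ "# get_dummies creates one binary column per distinct value.\n",
+ "one_hot = pd.get_dummies(toy_df[\"feature_value\"], prefix=\"feature_value\")\n",
+ "print(one_hot.astype(float))"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },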
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "KnssXowblKm7",
+ "colab_type": "text"
+ },
+ "source": [
+ "## Bucketized (Binned) Features\n",
+ "\n",
+ "Bucketization is also known as binning.\n",
+ "\n",
+ "We can bucketize `population` into the following 3 buckets (for instance):\n",
+ "- `bucket_0` (`< 5000`): corresponding to less populated blocks\n",
+ "- `bucket_1` (`5000 - 25000`): corresponding to mid populated blocks\n",
+ "- `bucket_2` (`> 25000`): corresponding to highly populated blocks\n",
+ "\n",
+ "Given the preceding bucket definitions, the following `population` vector:\n",
+ "\n",
+ " [[10001], [42004], [2500], [18000]]\n",
+ "\n",
+ "becomes the following bucketized feature vector:\n",
+ "\n",
+ " [[1], [2], [0], [1]]\n",
+ "\n",
+ "The feature values are now the bucket indices. Note that these indices are considered to be discrete features. Typically, these will be further converted in one-hot representations as above, but this is done transparently.\n",
+ "\n",
+ "To define feature columns for bucketized features, instead of using `numeric_column`, we can use [`bucketized_column`](https://www.tensorflow.org/api_docs/python/tf/feature_column/bucketized_column), which takes a numeric column as input and transforms it to a bucketized feature using the bucket boundaries specified in the `boundaries` argument. The following code defines bucketized feature columns for `households` and `longitude`; the `get_quantile_based_boundaries` function calculates boundaries based on quantiles, so that each bucket contains an equal number of elements."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "cc9qZrtRy-ED",
+ "colab_type": "code",
+ "colab": {}
+ },
+ "source": [
+ "def get_quantile_based_boundaries(feature_values, num_buckets):\n",
+ " boundaries = np.arange(1.0, num_buckets) / num_buckets\n",
+ " quantiles = feature_values.quantile(boundaries)\n",
+ " return [quantiles[q] for q in quantiles.keys()]\n",
+ "\n",
+ "# Divide households into 7 buckets.\n",
+ "households = tf.feature_column.numeric_column(\"households\")\n",
+ "bucketized_households = tf.feature_column.bucketized_column(\n",
+ " households, boundaries=get_quantile_based_boundaries(\n",
+ " california_housing_dataframe[\"households\"], 7))\n",
+ "\n",
+ "# Divide longitude into 10 buckets.\n",
+ "longitude = tf.feature_column.numeric_column(\"longitude\")\n",
+ "bucketized_longitude = tf.feature_column.bucketized_column(\n",
+ " longitude, boundaries=get_quantile_based_boundaries(\n",
+ " california_housing_dataframe[\"longitude\"], 10))"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "U-pQDAa0MeN3",
+ "colab_type": "text"
+ },
+ "source": [
+ "## Task 1: Train the Model on Bucketized Feature Columns\n",
+ "**Bucketize all the real valued features in our example, train the model and see if the results improve.**\n",
+ "\n",
+ "In the preceding code block, two real valued columns (namely `households` and `longitude`) have been transformed into bucketized feature columns. Your task is to bucketize the rest of the columns, then run the code to train the model. There are various heuristics to find the range of the buckets. This exercise uses a quantile-based technique, which chooses the bucket boundaries in such a way that each bucket has the same number of examples."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "YFXV9lyMLedy",
+ "colab_type": "code",
+ "colab": {}
+ },
+ "source": [
+ "def construct_feature_columns():\n",
+ " \"\"\"Construct the TensorFlow Feature Columns.\n",
+ "\n",
+ " Returns:\n",
+ " A set of feature columns\n",
+ " \"\"\" \n",
+ " households = tf.feature_column.numeric_column(\"households\")\n",
+ " longitude = tf.feature_column.numeric_column(\"longitude\")\n",
+ " latitude = tf.feature_column.numeric_column(\"latitude\")\n",
+ " housing_median_age = tf.feature_column.numeric_column(\"housing_median_age\")\n",
+ " median_income = tf.feature_column.numeric_column(\"median_income\")\n",
+ " rooms_per_person = tf.feature_column.numeric_column(\"rooms_per_person\")\n",
+ " \n",
+ " # Divide households into 7 buckets.\n",
+ " bucketized_households = tf.feature_column.bucketized_column(\n",
+ " households, boundaries=get_quantile_based_boundaries(\n",
+ " training_examples[\"households\"], 7))\n",
+ "\n",
+ " # Divide longitude into 10 buckets.\n",
+ " bucketized_longitude = tf.feature_column.bucketized_column(\n",
+ " longitude, boundaries=get_quantile_based_boundaries(\n",
+ " training_examples[\"longitude\"], 10))\n",
+ "\n",
+ " #\n",
+ " # YOUR CODE HERE: bucketize the following columns, following the example above:\n",
+ " #\n",
+ " bucketized_latitude = \n",
+ " bucketized_housing_median_age = \n",
+ " bucketized_median_income =\n",
+ " bucketized_rooms_per_person =\n",
+ " \n",
+ " feature_columns = set([\n",
+ " bucketized_longitude,\n",
+ " bucketized_latitude,\n",
+ " bucketized_housing_median_age,\n",
+ " bucketized_households,\n",
+ " bucketized_median_income,\n",
+ " bucketized_rooms_per_person])\n",
+ " \n",
+ " return feature_columns\n"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "0FfUytOTNJhL",
+ "colab_type": "code",
+ "colab": {}
+ },
+ "source": [
+ "_ = train_model(\n",
+ " learning_rate=1.0,\n",
+ " steps=500,\n",
+ " batch_size=100,\n",
+ " feature_columns=construct_feature_columns(),\n",
+ " training_examples=training_examples,\n",
+ " training_targets=training_targets,\n",
+ " validation_examples=validation_examples,\n",
+ " validation_targets=validation_targets)"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "ZTDHHM61NPTw",
+ "colab_type": "text"
+ },
+ "source": [
+ "### Solution\n",
+ "\n",
+ "Click below for a solution."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "JQHnUhL_NRwA",
+ "colab_type": "text"
+ },
+ "source": [
+ "You may be wondering how to determine how many buckets to use. That is of course data-dependent. Here, we just selected arbitrary values so as to obtain a not-too-large model."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "Ro5civQ3Ngh_",
+ "colab_type": "code",
+ "colab": {}
+ },
+ "source": [
+ "def construct_feature_columns():\n",
+ " \"\"\"Construct the TensorFlow Feature Columns.\n",
+ "\n",
+ " Returns:\n",
+ " A set of feature columns\n",
+ " \"\"\" \n",
+ " households = tf.feature_column.numeric_column(\"households\")\n",
+ " longitude = tf.feature_column.numeric_column(\"longitude\")\n",
+ " latitude = tf.feature_column.numeric_column(\"latitude\")\n",
+ " housing_median_age = tf.feature_column.numeric_column(\"housing_median_age\")\n",
+ " median_income = tf.feature_column.numeric_column(\"median_income\")\n",
+ " rooms_per_person = tf.feature_column.numeric_column(\"rooms_per_person\")\n",
+ " \n",
+ " # Divide households into 7 buckets.\n",
+ " bucketized_households = tf.feature_column.bucketized_column(\n",
+ " households, boundaries=get_quantile_based_boundaries(\n",
+ " training_examples[\"households\"], 7))\n",
+ "\n",
+ " # Divide longitude into 10 buckets.\n",
+ " bucketized_longitude = tf.feature_column.bucketized_column(\n",
+ " longitude, boundaries=get_quantile_based_boundaries(\n",
+ " training_examples[\"longitude\"], 10))\n",
+ " \n",
+ " # Divide latitude into 10 buckets.\n",
+ " bucketized_latitude = tf.feature_column.bucketized_column(\n",
+ " latitude, boundaries=get_quantile_based_boundaries(\n",
+ " training_examples[\"latitude\"], 10))\n",
+ "\n",
+ " # Divide housing_median_age into 7 buckets.\n",
+ " bucketized_housing_median_age = tf.feature_column.bucketized_column(\n",
+ " housing_median_age, boundaries=get_quantile_based_boundaries(\n",
+ " training_examples[\"housing_median_age\"], 7))\n",
+ " \n",
+ " # Divide median_income into 7 buckets.\n",
+ " bucketized_median_income = tf.feature_column.bucketized_column(\n",
+ " median_income, boundaries=get_quantile_based_boundaries(\n",
+ " training_examples[\"median_income\"], 7))\n",
+ " \n",
+ " # Divide rooms_per_person into 7 buckets.\n",
+ " bucketized_rooms_per_person = tf.feature_column.bucketized_column(\n",
+ " rooms_per_person, boundaries=get_quantile_based_boundaries(\n",
+ " training_examples[\"rooms_per_person\"], 7))\n",
+ " \n",
+ " feature_columns = set([\n",
+ " bucketized_longitude,\n",
+ " bucketized_latitude,\n",
+ " bucketized_housing_median_age,\n",
+ " bucketized_households,\n",
+ " bucketized_median_income,\n",
+ " bucketized_rooms_per_person])\n",
+ " \n",
+ " return feature_columns"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "RNgfYk6OO8Sy",
+ "colab_type": "code",
+ "colab": {}
+ },
+ "source": [
+ "_ = train_model(\n",
+ " learning_rate=1.0,\n",
+ " steps=500,\n",
+ " batch_size=100,\n",
+ " feature_columns=construct_feature_columns(),\n",
+ " training_examples=training_examples,\n",
+ " training_targets=training_targets,\n",
+ " validation_examples=validation_examples,\n",
+ " validation_targets=validation_targets)"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "AFJ1qoZPlQcs",
+ "colab_type": "text"
+ },
+ "source": [
+ "## Feature Crosses\n",
+ "\n",
+ "Crossing two (or more) features is a clever way to learn non-linear relations using a linear model. In our problem, if we just use the feature `latitude` for learning, the model might learn that city blocks at a particular latitude (or within a particular range of latitudes since we have bucketized it) are more likely to be expensive than others. Similarly for the feature `longitude`. However, if we cross `longitude` by `latitude`, the crossed feature represents a well defined city block. If the model learns that certain city blocks (within range of latitudes and longitudes) are more likely to be more expensive than others, it is a stronger signal than two features considered individually.\n",
+ "\n",
+ "Currently, the feature columns API only supports discrete features for crosses. To cross two continuous values, like `latitude` or `longitude`, we can bucketize them.\n",
+ "\n",
+ "If we cross the `latitude` and `longitude` features (supposing, for example, that `longitude` was bucketized into `2` buckets, while `latitude` has `3` buckets), we actually get six crossed binary features. Each of these features will get its own separate weight when we train the model."
+ ]
+ },
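+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text"
+ },
+ "source": [
+ "As a small illustrative sketch (not part of the original exercise), the following cell enumerates the crossed features for the 2-by-3 bucket example above. Note that, in practice, `crossed_column` hashes each combination into one of `hash_bucket_size` buckets rather than indexing the combinations directly."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "colab_type": "code",
+ "colab": {}
+ },
+ "source": [
+ "# Illustrative sketch: crossing 2 longitude buckets with 3 latitude buckets\n",
+ "# yields 2 * 3 = 6 distinct crossed binary features.\n",
+ "num_lat_buckets = 3\n",
+ "for long_bucket in range(2):\n",
+ "  for lat_bucket in range(num_lat_buckets):\n",
+ "    cross_index = long_bucket * num_lat_buckets + lat_bucket\n",
+ "    print(\"longitude bucket %d x latitude bucket %d -> crossed feature %d\"\n",
+ "          % (long_bucket, lat_bucket, cross_index))"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },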
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "-Rk0c1oTYaVH",
+ "colab_type": "text"
+ },
+ "source": [
+ "## Task 2: Train the Model Using Feature Crosses\n",
+ "\n",
+ "**Add a feature cross of `longitude` and `latitude` to your model, train it, and determine whether the results improve.**\n",
+ "\n",
+ "Refer to the TensorFlow API docs for [`crossed_column()`](https://www.tensorflow.org/api_docs/python/tf/feature_column/crossed_column) to build the feature column for your cross. Use a `hash_bucket_size` of `1000`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "-eYiVEGeYhUi",
+ "colab_type": "code",
+ "cellView": "both",
+ "colab": {}
+ },
+ "source": [
+ "def construct_feature_columns():\n",
+ " \"\"\"Construct the TensorFlow Feature Columns.\n",
+ "\n",
+ " Returns:\n",
+ " A set of feature columns\n",
+ " \"\"\" \n",
+ " households = tf.feature_column.numeric_column(\"households\")\n",
+ " longitude = tf.feature_column.numeric_column(\"longitude\")\n",
+ " latitude = tf.feature_column.numeric_column(\"latitude\")\n",
+ " housing_median_age = tf.feature_column.numeric_column(\"housing_median_age\")\n",
+ " median_income = tf.feature_column.numeric_column(\"median_income\")\n",
+ " rooms_per_person = tf.feature_column.numeric_column(\"rooms_per_person\")\n",
+ " \n",
+ " # Divide households into 7 buckets.\n",
+ " bucketized_households = tf.feature_column.bucketized_column(\n",
+ " households, boundaries=get_quantile_based_boundaries(\n",
+ " training_examples[\"households\"], 7))\n",
+ "\n",
+ " # Divide longitude into 10 buckets.\n",
+ " bucketized_longitude = tf.feature_column.bucketized_column(\n",
+ " longitude, boundaries=get_quantile_based_boundaries(\n",
+ " training_examples[\"longitude\"], 10))\n",
+ " \n",
+ " # Divide latitude into 10 buckets.\n",
+ " bucketized_latitude = tf.feature_column.bucketized_column(\n",
+ " latitude, boundaries=get_quantile_based_boundaries(\n",
+ " training_examples[\"latitude\"], 10))\n",
+ "\n",
+ " # Divide housing_median_age into 7 buckets.\n",
+ " bucketized_housing_median_age = tf.feature_column.bucketized_column(\n",
+ " housing_median_age, boundaries=get_quantile_based_boundaries(\n",
+ " training_examples[\"housing_median_age\"], 7))\n",
+ " \n",
+ " # Divide median_income into 7 buckets.\n",
+ " bucketized_median_income = tf.feature_column.bucketized_column(\n",
+ " median_income, boundaries=get_quantile_based_boundaries(\n",
+ " training_examples[\"median_income\"], 7))\n",
+ " \n",
+ " # Divide rooms_per_person into 7 buckets.\n",
+ " bucketized_rooms_per_person = tf.feature_column.bucketized_column(\n",
+ " rooms_per_person, boundaries=get_quantile_based_boundaries(\n",
+ " training_examples[\"rooms_per_person\"], 7))\n",
+ " \n",
+ " # YOUR CODE HERE: Make a feature column for the long_x_lat feature cross\n",
+ " long_x_lat = \n",
+ " \n",
+ " feature_columns = set([\n",
+ " bucketized_longitude,\n",
+ " bucketized_latitude,\n",
+ " bucketized_housing_median_age,\n",
+ " bucketized_households,\n",
+ " bucketized_median_income,\n",
+ " bucketized_rooms_per_person,\n",
+ " long_x_lat])\n",
+ " \n",
+ " return feature_columns"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "xZuZMp3EShkM",
+ "colab_type": "code",
+ "colab": {}
+ },
+ "source": [
+ "_ = train_model(\n",
+ " learning_rate=1.0,\n",
+ " steps=500,\n",
+ " batch_size=100,\n",
+ " feature_columns=construct_feature_columns(),\n",
+ " training_examples=training_examples,\n",
+ " training_targets=training_targets,\n",
+ " validation_examples=validation_examples,\n",
+ " validation_targets=validation_targets)"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "0i7vGo9PTaZl",
+ "colab_type": "text"
+ },
+ "source": [
+ "### Solution\n",
+ "\n",
+ "Click below for the solution."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "3tAWu8qSTe2v",
+ "colab_type": "code",
+ "colab": {}
+ },
+ "source": [
+ "def construct_feature_columns():\n",
+ " \"\"\"Construct the TensorFlow Feature Columns.\n",
+ "\n",
+ " Returns:\n",
+ " A set of feature columns\n",
+ " \"\"\" \n",
+ " households = tf.feature_column.numeric_column(\"households\")\n",
+ " longitude = tf.feature_column.numeric_column(\"longitude\")\n",
+ " latitude = tf.feature_column.numeric_column(\"latitude\")\n",
+ " housing_median_age = tf.feature_column.numeric_column(\"housing_median_age\")\n",
+ " median_income = tf.feature_column.numeric_column(\"median_income\")\n",
+ " rooms_per_person = tf.feature_column.numeric_column(\"rooms_per_person\")\n",
+ " \n",
+ " # Divide households into 7 buckets.\n",
+ " bucketized_households = tf.feature_column.bucketized_column(\n",
+ " households, boundaries=get_quantile_based_boundaries(\n",
+ " training_examples[\"households\"], 7))\n",
+ "\n",
+ " # Divide longitude into 10 buckets.\n",
+ " bucketized_longitude = tf.feature_column.bucketized_column(\n",
+ " longitude, boundaries=get_quantile_based_boundaries(\n",
+ " training_examples[\"longitude\"], 10))\n",
+ " \n",
+ " # Divide latitude into 10 buckets.\n",
+ " bucketized_latitude = tf.feature_column.bucketized_column(\n",
+ " latitude, boundaries=get_quantile_based_boundaries(\n",
+ " training_examples[\"latitude\"], 10))\n",
+ "\n",
+ " # Divide housing_median_age into 7 buckets.\n",
+ " bucketized_housing_median_age = tf.feature_column.bucketized_column(\n",
+ " housing_median_age, boundaries=get_quantile_based_boundaries(\n",
+ " training_examples[\"housing_median_age\"], 7))\n",
+ " \n",
+ " # Divide median_income into 7 buckets.\n",
+ " bucketized_median_income = tf.feature_column.bucketized_column(\n",
+ " median_income, boundaries=get_quantile_based_boundaries(\n",
+ " training_examples[\"median_income\"], 7))\n",
+ " \n",
+ " # Divide rooms_per_person into 7 buckets.\n",
+ " bucketized_rooms_per_person = tf.feature_column.bucketized_column(\n",
+ " rooms_per_person, boundaries=get_quantile_based_boundaries(\n",
+ " training_examples[\"rooms_per_person\"], 7))\n",
+ " \n",
+ " # YOUR CODE HERE: Make a feature column for the long_x_lat feature cross\n",
+ " long_x_lat = tf.feature_column.crossed_column(\n",
+ " set([bucketized_longitude, bucketized_latitude]), hash_bucket_size=1000) \n",
+ " \n",
+ " feature_columns = set([\n",
+ " bucketized_longitude,\n",
+ " bucketized_latitude,\n",
+ " bucketized_housing_median_age,\n",
+ " bucketized_households,\n",
+ " bucketized_median_income,\n",
+ " bucketized_rooms_per_person,\n",
+ " long_x_lat])\n",
+ " \n",
+ " return feature_columns"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "-_vvNYIyTtPC",
+ "colab_type": "code",
+ "colab": {}
+ },
+ "source": [
+ "_ = train_model(\n",
+ " learning_rate=1.0,\n",
+ " steps=500,\n",
+ " batch_size=100,\n",
+ " feature_columns=construct_feature_columns(),\n",
+ " training_examples=training_examples,\n",
+ " training_targets=training_targets,\n",
+ " validation_examples=validation_examples,\n",
+ " validation_targets=validation_targets)"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "ymlHJ-vrhLZw",
+ "colab_type": "text"
+ },
+ "source": [
+ "## Optional Challenge: Try Out More Synthetic Features\n",
+ "\n",
+ "So far, we've tried simple bucketized columns and feature crosses, but there are many more combinations that could potentially improve the results. For example, you could cross multiple columns. What happens if you vary the number of buckets? What other synthetic features can you think of? Do they improve the model?"
+ ]
+ }
+ ]
+}
\ No newline at end of file
diff --git a/Notebooks/feature_sets.ipynb b/Notebooks/feature_sets.ipynb
new file mode 100644
index 0000000..d49f042
--- /dev/null
+++ b/Notebooks/feature_sets.ipynb
@@ -0,0 +1,661 @@
+{
+ "nbformat": 4,
+ "nbformat_minor": 0,
+ "metadata": {
+ "colab": {
+ "name": "feature_sets.ipynb",
+ "provenance": [],
+ "collapsed_sections": [
+ "JndnmDMp66FL",
+ "IGINhMIJ5Wyt",
+ "pZa8miwu6_tQ"
+ ]
+ },
+ "kernelspec": {
+ "name": "python3",
+ "display_name": "Python 3"
+ }
+ },
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "JndnmDMp66FL",
+ "colab_type": "text"
+ },
+ "source": [
+ "#### Copyright 2017 Google LLC."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "hMqWDc_m6rUC",
+ "colab_type": "code",
+ "cellView": "both",
+ "colab": {}
+ },
+ "source": [
+ "# Licensed under the Apache License, Version 2.0 (the \"License\");\n",
+ "# you may not use this file except in compliance with the License.\n",
+ "# You may obtain a copy of the License at\n",
+ "#\n",
+ "# https://www.apache.org/licenses/LICENSE-2.0\n",
+ "#\n",
+ "# Unless required by applicable law or agreed to in writing, software\n",
+ "# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
+ "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
+ "# See the License for the specific language governing permissions and\n",
+ "# limitations under the License."
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "zbIgBK-oXHO7",
+ "colab_type": "text"
+ },
+ "source": [
+ "# Feature Sets"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "bL04rAQwH3pH",
+ "colab_type": "text"
+ },
+ "source": [
+ "**Learning Objective:** Create a minimal set of features that performs just as well as a more complex feature set"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "F8Hci6tAH3pH",
+ "colab_type": "text"
+ },
+ "source": [
+ "So far, we've thrown all of our features into the model. Models with fewer features use fewer resources and are easier to maintain. Let's see if we can build a model on a minimal set of housing features that will perform equally as well as one that uses all the features in the data set."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "F5ZjVwK_qOyR",
+ "colab_type": "text"
+ },
+ "source": [
+ "## Setup\n",
+ "\n",
+ "As before, let's load and prepare the California housing data."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "SrOYRILAH3pJ",
+ "colab_type": "code",
+ "colab": {}
+ },
+ "source": [
+ "from __future__ import print_function\n",
+ "\n",
+ "import math\n",
+ "\n",
+ "from IPython import display\n",
+ "from matplotlib import cm\n",
+ "from matplotlib import gridspec\n",
+ "from matplotlib import pyplot as plt\n",
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "from sklearn import metrics\n",
+ "import tensorflow as tf\n",
+ "from tensorflow.python.data import Dataset\n",
+ "\n",
+ "tf.logging.set_verbosity(tf.logging.ERROR)\n",
+ "pd.options.display.max_rows = 10\n",
+ "pd.options.display.float_format = '{:.1f}'.format\n",
+ "\n",
+ "california_housing_dataframe = pd.read_csv(\"https://download.mlcc.google.com/mledu-datasets/california_housing_train.csv\", sep=\",\")\n",
+ "\n",
+ "california_housing_dataframe = california_housing_dataframe.reindex(\n",
+ " np.random.permutation(california_housing_dataframe.index))"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "dGnXo7flH3pM",
+ "colab_type": "code",
+ "colab": {}
+ },
+ "source": [
+ "def preprocess_features(california_housing_dataframe):\n",
+ " \"\"\"Prepares input features from California housing data set.\n",
+ "\n",
+ " Args:\n",
+ " california_housing_dataframe: A Pandas DataFrame expected to contain data\n",
+ " from the California housing data set.\n",
+ " Returns:\n",
+ " A DataFrame that contains the features to be used for the model, including\n",
+ " synthetic features.\n",
+ " \"\"\"\n",
+ " selected_features = california_housing_dataframe[\n",
+ " [\"latitude\",\n",
+ " \"longitude\",\n",
+ " \"housing_median_age\",\n",
+ " \"total_rooms\",\n",
+ " \"total_bedrooms\",\n",
+ " \"population\",\n",
+ " \"households\",\n",
+ " \"median_income\"]]\n",
+ " processed_features = selected_features.copy()\n",
+ " # Create a synthetic feature.\n",
+ " processed_features[\"rooms_per_person\"] = (\n",
+ " california_housing_dataframe[\"total_rooms\"] /\n",
+ " california_housing_dataframe[\"population\"])\n",
+ " return processed_features\n",
+ "\n",
+ "def preprocess_targets(california_housing_dataframe):\n",
+ " \"\"\"Prepares target features (i.e., labels) from California housing data set.\n",
+ "\n",
+ " Args:\n",
+ " california_housing_dataframe: A Pandas DataFrame expected to contain data\n",
+ " from the California housing data set.\n",
+ " Returns:\n",
+ " A DataFrame that contains the target feature.\n",
+ " \"\"\"\n",
+ " output_targets = pd.DataFrame()\n",
+ " # Scale the target to be in units of thousands of dollars.\n",
+ " output_targets[\"median_house_value\"] = (\n",
+ " california_housing_dataframe[\"median_house_value\"] / 1000.0)\n",
+ " return output_targets"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "jLXC8y4AqsIy",
+ "colab_type": "code",
+ "colab": {}
+ },
+ "source": [
+ "# Choose the first 12000 (out of 17000) examples for training.\n",
+ "training_examples = preprocess_features(california_housing_dataframe.head(12000))\n",
+ "training_targets = preprocess_targets(california_housing_dataframe.head(12000))\n",
+ "\n",
+ "# Choose the last 5000 (out of 17000) examples for validation.\n",
+ "validation_examples = preprocess_features(california_housing_dataframe.tail(5000))\n",
+ "validation_targets = preprocess_targets(california_housing_dataframe.tail(5000))\n",
+ "\n",
+ "# Double-check that we've done the right thing.\n",
+ "print(\"Training examples summary:\")\n",
+ "display.display(training_examples.describe())\n",
+ "print(\"Validation examples summary:\")\n",
+ "display.display(validation_examples.describe())\n",
+ "\n",
+ "print(\"Training targets summary:\")\n",
+ "display.display(training_targets.describe())\n",
+ "print(\"Validation targets summary:\")\n",
+ "display.display(validation_targets.describe())"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "hLvmkugKLany",
+ "colab_type": "text"
+ },
+ "source": [
+ "## Task 1: Develop a Good Feature Set\n",
+ "\n",
+ "**What's the best performance you can get with just 2 or 3 features?**\n",
+ "\n",
+ "A **correlation matrix** shows pairwise correlations, both for each feature compared to the target and for each feature compared to other features.\n",
+ "\n",
+ "Here, correlation is defined as the [Pearson correlation coefficient](https://en.wikipedia.org/wiki/Pearson_product-moment_correlation_coefficient). You don't have to understand the mathematical details for this exercise.\n",
+ "\n",
+ "Correlation values have the following meanings:\n",
+ "\n",
+ " * `-1.0`: perfect negative correlation\n",
+ " * `0.0`: no correlation\n",
+ " * `1.0`: perfect positive correlation"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "UzoZUSdLIolF",
+ "colab_type": "code",
+ "cellView": "both",
+ "colab": {
+ "test": {
+ "output": "ignore",
+ "timeout": 600
+ }
+ }
+ },
+ "source": [
+ "correlation_dataframe = training_examples.copy()\n",
+ "correlation_dataframe[\"target\"] = training_targets[\"median_house_value\"]\n",
+ "\n",
+ "correlation_dataframe.corr()"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "RQpktkNpia2P",
+ "colab_type": "text"
+ },
+ "source": [
+ "Features that have strong positive or negative correlations with the target will add information to our model. We can use the correlation matrix to find such strongly correlated features.\n",
+ "\n",
+ "We'd also like to have features that aren't so strongly correlated with each other, so that they add independent information.\n",
+ "\n",
+ "Use this information to try removing features. You can also try developing additional synthetic features, such as ratios of two raw features.\n",
+ "\n",
+ "For convenience, we've included the training code from the previous exercise."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "bjR5jWpFr2xs",
+ "colab_type": "code",
+ "colab": {}
+ },
+ "source": [
+ "def construct_feature_columns(input_features):\n",
+ " \"\"\"Construct the TensorFlow Feature Columns.\n",
+ "\n",
+ " Args:\n",
+ " input_features: The names of the numerical input features to use.\n",
+ " Returns:\n",
+ " A set of feature columns\n",
+ " \"\"\" \n",
+ " return set([tf.feature_column.numeric_column(my_feature)\n",
+ " for my_feature in input_features])"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "jsvKHzRciH9T",
+ "colab_type": "code",
+ "colab": {}
+ },
+ "source": [
+ "def my_input_fn(features, targets, batch_size=1, shuffle=True, num_epochs=None):\n",
+ " \"\"\"Trains a linear regression model.\n",
+ " \n",
+ " Args:\n",
+ " features: pandas DataFrame of features\n",
+ " targets: pandas DataFrame of targets\n",
+ " batch_size: Size of batches to be passed to the model\n",
+ " shuffle: True or False. Whether to shuffle the data.\n",
+ " num_epochs: Number of epochs for which data should be repeated. None = repeat indefinitely\n",
+ " Returns:\n",
+ " Tuple of (features, labels) for next data batch\n",
+ " \"\"\"\n",
+ " \n",
+ " # Convert pandas data into a dict of np arrays.\n",
+ " features = {key:np.array(value) for key,value in dict(features).items()} \n",
+ " \n",
+ " # Construct a dataset, and configure batching/repeating.\n",
+ " ds = Dataset.from_tensor_slices((features,targets)) # warning: 2GB limit\n",
+ " ds = ds.batch(batch_size).repeat(num_epochs)\n",
+ "\n",
+ " # Shuffle the data, if specified.\n",
+ " if shuffle:\n",
+ " ds = ds.shuffle(10000)\n",
+ " \n",
+ " # Return the next batch of data.\n",
+ " features, labels = ds.make_one_shot_iterator().get_next()\n",
+ " return features, labels"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "g3kjQV9WH3pb",
+ "colab_type": "code",
+ "colab": {}
+ },
+ "source": [
+ "def train_model(\n",
+ " learning_rate,\n",
+ " steps,\n",
+ " batch_size,\n",
+ " training_examples,\n",
+ " training_targets,\n",
+ " validation_examples,\n",
+ " validation_targets):\n",
+ " \"\"\"Trains a linear regression model.\n",
+ " \n",
+ " In addition to training, this function also prints training progress information,\n",
+ " as well as a plot of the training and validation loss over time.\n",
+ " \n",
+ " Args:\n",
+ " learning_rate: A `float`, the learning rate.\n",
+ " steps: A non-zero `int`, the total number of training steps. A training step\n",
+ " consists of a forward and backward pass using a single batch.\n",
+ " batch_size: A non-zero `int`, the batch size.\n",
+ " training_examples: A `DataFrame` containing one or more columns from\n",
+ " `california_housing_dataframe` to use as input features for training.\n",
+ " training_targets: A `DataFrame` containing exactly one column from\n",
+ " `california_housing_dataframe` to use as target for training.\n",
+ " validation_examples: A `DataFrame` containing one or more columns from\n",
+ " `california_housing_dataframe` to use as input features for validation.\n",
+ " validation_targets: A `DataFrame` containing exactly one column from\n",
+ " `california_housing_dataframe` to use as target for validation.\n",
+ " \n",
+ " Returns:\n",
+ " A `LinearRegressor` object trained on the training data.\n",
+ " \"\"\"\n",
+ "\n",
+ " periods = 10\n",
+ " steps_per_period = steps / periods\n",
+ "\n",
+ " # Create a linear regressor object.\n",
+ " my_optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)\n",
+ " my_optimizer = tf.contrib.estimator.clip_gradients_by_norm(my_optimizer, 5.0)\n",
+ " linear_regressor = tf.estimator.LinearRegressor(\n",
+ " feature_columns=construct_feature_columns(training_examples),\n",
+ " optimizer=my_optimizer\n",
+ " )\n",
+ " \n",
+ " # Create input functions.\n",
+ " training_input_fn = lambda: my_input_fn(training_examples, \n",
+ " training_targets[\"median_house_value\"], \n",
+ " batch_size=batch_size)\n",
+ " predict_training_input_fn = lambda: my_input_fn(training_examples, \n",
+ " training_targets[\"median_house_value\"], \n",
+ " num_epochs=1, \n",
+ " shuffle=False)\n",
+ " predict_validation_input_fn = lambda: my_input_fn(validation_examples, \n",
+ " validation_targets[\"median_house_value\"], \n",
+ " num_epochs=1, \n",
+ " shuffle=False)\n",
+ "\n",
+ " # Train the model, but do so inside a loop so that we can periodically assess\n",
+ " # loss metrics.\n",
+ " print(\"Training model...\")\n",
+ " print(\"RMSE (on training data):\")\n",
+ " training_rmse = []\n",
+ " validation_rmse = []\n",
+ " for period in range (0, periods):\n",
+ " # Train the model, starting from the prior state.\n",
+ " linear_regressor.train(\n",
+ " input_fn=training_input_fn,\n",
+ " steps=steps_per_period,\n",
+ " )\n",
+ " # Take a break and compute predictions.\n",
+ " training_predictions = linear_regressor.predict(input_fn=predict_training_input_fn)\n",
+ " training_predictions = np.array([item['predictions'][0] for item in training_predictions])\n",
+ " \n",
+ " validation_predictions = linear_regressor.predict(input_fn=predict_validation_input_fn)\n",
+ " validation_predictions = np.array([item['predictions'][0] for item in validation_predictions])\n",
+ " \n",
+ " # Compute training and validation loss.\n",
+ " training_root_mean_squared_error = math.sqrt(\n",
+ " metrics.mean_squared_error(training_predictions, training_targets))\n",
+ " validation_root_mean_squared_error = math.sqrt(\n",
+ " metrics.mean_squared_error(validation_predictions, validation_targets))\n",
+ " # Occasionally print the current loss.\n",
+ " print(\" period %02d : %0.2f\" % (period, training_root_mean_squared_error))\n",
+ " # Add the loss metrics from this period to our list.\n",
+ " training_rmse.append(training_root_mean_squared_error)\n",
+ " validation_rmse.append(validation_root_mean_squared_error)\n",
+ " print(\"Model training finished.\")\n",
+ "\n",
+ " \n",
+ " # Output a graph of loss metrics over periods.\n",
+ " plt.ylabel(\"RMSE\")\n",
+ " plt.xlabel(\"Periods\")\n",
+ " plt.title(\"Root Mean Squared Error vs. Periods\")\n",
+ " plt.tight_layout()\n",
+ " plt.plot(training_rmse, label=\"training\")\n",
+ " plt.plot(validation_rmse, label=\"validation\")\n",
+ " plt.legend()\n",
+ "\n",
+ " return linear_regressor"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "varLu7RNH3pf",
+ "colab_type": "text"
+ },
+ "source": [
+ "Spend 5 minutes searching for a good set of features and training parameters. Then check the solution to see what we chose. Don't forget that different features may require different learning parameters."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "DSgUxRIlH3pg",
+ "colab_type": "code",
+ "colab": {}
+ },
+ "source": [
+ "#\n",
+ "# Your code here: add your features of choice as a list of quoted strings.\n",
+ "#\n",
+ "minimal_features = [\n",
+ "]\n",
+ "\n",
+ "assert minimal_features, \"You must select at least one feature!\"\n",
+ "\n",
+ "minimal_training_examples = training_examples[minimal_features]\n",
+ "minimal_validation_examples = validation_examples[minimal_features]\n",
+ "\n",
+ "#\n",
+ "# Don't forget to adjust these parameters.\n",
+ "#\n",
+ "train_model(\n",
+ " learning_rate=0.001,\n",
+ " steps=500,\n",
+ " batch_size=5,\n",
+ " training_examples=minimal_training_examples,\n",
+ " training_targets=training_targets,\n",
+ " validation_examples=minimal_validation_examples,\n",
+ " validation_targets=validation_targets)"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "IGINhMIJ5Wyt",
+ "colab_type": "text"
+ },
+ "source": [
+ "### Solution\n",
+ "\n",
+ "Click below for a solution."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "BAGoXFPZ5ZE3",
+ "colab_type": "code",
+ "colab": {}
+ },
+ "source": [
+ "minimal_features = [\n",
+ " \"median_income\",\n",
+ " \"latitude\",\n",
+ "]\n",
+ "\n",
+ "minimal_training_examples = training_examples[minimal_features]\n",
+ "minimal_validation_examples = validation_examples[minimal_features]\n",
+ "\n",
+ "_ = train_model(\n",
+ " learning_rate=0.01,\n",
+ " steps=500,\n",
+ " batch_size=5,\n",
+ " training_examples=minimal_training_examples,\n",
+ " training_targets=training_targets,\n",
+ " validation_examples=minimal_validation_examples,\n",
+ " validation_targets=validation_targets)"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "RidI9YhKOiY2",
+ "colab_type": "text"
+ },
+ "source": [
+ "## Task 2: Make Better Use of Latitude\n",
+ "\n",
+ "Plotting `latitude` vs. `median_house_value` shows that there really isn't a linear relationship there.\n",
+ "\n",
+ "Instead, there are a couple of peaks, which roughly correspond to Los Angeles and San Francisco."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "hfGUKj2IR_F1",
+ "colab_type": "code",
+ "cellView": "both",
+ "colab": {
+ "test": {
+ "output": "ignore",
+ "timeout": 600
+ }
+ }
+ },
+ "source": [
+ "plt.scatter(training_examples[\"latitude\"], training_targets[\"median_house_value\"])"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "6N0p91k2iFCP",
+ "colab_type": "text"
+ },
+ "source": [
+ "**Try creating some synthetic features that do a better job with latitude.**\n",
+ "\n",
+ "For example, you could have a feature that maps `latitude` to a value of `|latitude - 38|`, and call this `distance_from_san_francisco`.\n",
+ "\n",
+ "Or you could break the space into 10 different buckets. `latitude_32_to_33`, `latitude_33_to_34`, etc., each showing a value of `1.0` if `latitude` is within that bucket range and a value of `0.0` otherwise.\n",
+ "\n",
+ "Use the correlation matrix to help guide development, and then add them to your model if you find something that looks good.\n",
+ "\n",
+ "What's the best validation performance you can get?"
+ ]
+ },
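+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text"
+ },
+ "source": [
+ "As a quick sketch of the first idea (the solution below works out the bucket approach instead), the `distance_from_san_francisco` feature suggested above could be computed like this:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "colab_type": "code",
+ "colab": {}
+ },
+ "source": [
+ "# Sketch: the synthetic feature |latitude - 38| as a rough proxy for\n",
+ "# distance from San Francisco (illustration only).\n",
+ "sketch_examples = training_examples.copy()\n",
+ "sketch_examples[\"distance_from_san_francisco\"] = (\n",
+ "    (training_examples[\"latitude\"] - 38.0).abs())\n",
+ "sketch_examples[\"distance_from_san_francisco\"].describe()"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },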
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "wduJ2B28yMFl",
+ "colab_type": "code",
+ "cellView": "form",
+ "colab": {}
+ },
+ "source": [
+ "#\n",
+ "# YOUR CODE HERE: Train on a new data set that includes synthetic features based on latitude.\n",
+ "#"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "pZa8miwu6_tQ",
+ "colab_type": "text"
+ },
+ "source": [
+ "### Solution\n",
+ "\n",
+ "Click below for a solution."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "PzABdyjq7IZU",
+ "colab_type": "text"
+ },
+ "source": [
+ "Aside from `latitude`, we'll also keep `median_income`, to compare with the previous results.\n",
+ "\n",
+ "We decided to bucketize the latitude. This is fairly straightforward in Pandas using `Series.apply`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "xdVF8siZ7Lup",
+ "colab_type": "code",
+ "colab": {}
+ },
+ "source": [
+ "def select_and_transform_features(source_df):\n",
+ " LATITUDE_RANGES = zip(range(32, 44), range(33, 45))\n",
+ " selected_examples = pd.DataFrame()\n",
+ " selected_examples[\"median_income\"] = source_df[\"median_income\"]\n",
+ " for r in LATITUDE_RANGES:\n",
+ " selected_examples[\"latitude_%d_to_%d\" % r] = source_df[\"latitude\"].apply(\n",
+ " lambda l: 1.0 if l >= r[0] and l < r[1] else 0.0)\n",
+ " return selected_examples\n",
+ "\n",
+ "selected_training_examples = select_and_transform_features(training_examples)\n",
+ "selected_validation_examples = select_and_transform_features(validation_examples)"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "U4iAdY6t7Pkh",
+ "colab_type": "code",
+ "colab": {}
+ },
+ "source": [
+ "_ = train_model(\n",
+ " learning_rate=0.01,\n",
+ " steps=500,\n",
+ " batch_size=5,\n",
+ " training_examples=selected_training_examples,\n",
+ " training_targets=training_targets,\n",
+ " validation_examples=selected_validation_examples,\n",
+ " validation_targets=validation_targets)"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ }
+ ]
+}
\ No newline at end of file
diff --git a/Notebooks/first_steps_with_tensor_flow.ipynb b/Notebooks/first_steps_with_tensor_flow.ipynb
new file mode 100644
index 0000000..a949e3d
--- /dev/null
+++ b/Notebooks/first_steps_with_tensor_flow.ipynb
@@ -0,0 +1,973 @@
+{
+ "nbformat": 4,
+ "nbformat_minor": 0,
+ "metadata": {
+ "colab": {
+ "name": "first_steps_with_tensor_flow.ipynb",
+ "provenance": [],
+ "collapsed_sections": [
+ "JndnmDMp66FL",
+ "ajVM7rkoYXeL",
+ "ci1ISxxrZ7v0"
+ ]
+ },
+ "kernelspec": {
+ "name": "python3",
+ "display_name": "Python 3"
+ }
+ },
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "JndnmDMp66FL",
+ "colab_type": "text"
+ },
+ "source": [
+ "#### Copyright 2017 Google LLC."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "hMqWDc_m6rUC",
+ "colab_type": "code",
+ "cellView": "both",
+ "colab": {}
+ },
+ "source": [
+ "# Licensed under the Apache License, Version 2.0 (the \"License\");\n",
+ "# you may not use this file except in compliance with the License.\n",
+ "# You may obtain a copy of the License at\n",
+ "#\n",
+ "# https://www.apache.org/licenses/LICENSE-2.0\n",
+ "#\n",
+ "# Unless required by applicable law or agreed to in writing, software\n",
+ "# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
+ "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
+ "# See the License for the specific language governing permissions and\n",
+ "# limitations under the License."
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "4f3CKqFUqL2-",
+ "colab_type": "text"
+ },
+ "source": [
+ "# First Steps with TensorFlow"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "Bd2Zkk1LE2Zr",
+ "colab_type": "text"
+ },
+ "source": [
+ "**Learning Objectives:**\n",
+ " * Learn fundamental TensorFlow concepts\n",
+ " * Use the `LinearRegressor` class in TensorFlow to predict median housing price, at the granularity of city blocks, based on one input feature\n",
+ " * Evaluate the accuracy of a model's predictions using Root Mean Squared Error (RMSE)\n",
+ " * Improve the accuracy of a model by tuning its hyperparameters"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "MxiIKhP4E2Zr",
+ "colab_type": "text"
+ },
+ "source": [
+ "The [data](https://developers.google.com/machine-learning/crash-course/california-housing-data-description) is based on 1990 census data from California."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "6TjLjL9IU80G",
+ "colab_type": "text"
+ },
+ "source": [
+ "## Setup\n",
+ "In this first cell, we'll load the necessary libraries."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "rVFf5asKE2Zt",
+ "colab_type": "code",
+ "colab": {}
+ },
+ "source": [
+ "from __future__ import print_function\n",
+ "\n",
+ "import math\n",
+ "\n",
+ "from IPython import display\n",
+ "from matplotlib import cm\n",
+ "from matplotlib import gridspec\n",
+ "from matplotlib import pyplot as plt\n",
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "from sklearn import metrics\n",
+ "import tensorflow as tf\n",
+ "from tensorflow.python.data import Dataset\n",
+ "\n",
+ "tf.logging.set_verbosity(tf.logging.ERROR)\n",
+ "pd.options.display.max_rows = 10\n",
+ "pd.options.display.float_format = '{:.1f}'.format"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "ipRyUHjhU80Q",
+ "colab_type": "text"
+ },
+ "source": [
+ "Next, we'll load our data set."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "9ivCDWnwE2Zx",
+ "colab_type": "code",
+ "colab": {}
+ },
+ "source": [
+ "california_housing_dataframe = pd.read_csv(\"https://download.mlcc.google.com/mledu-datasets/california_housing_train.csv\", sep=\",\")"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "vVk_qlG6U80j",
+ "colab_type": "text"
+ },
+ "source": [
+ "We'll randomize the data, just to be sure not to get any pathological ordering effects that might harm the performance of Stochastic Gradient Descent. Additionally, we'll scale `median_house_value` to be in units of thousands, so it can be learned a little more easily with learning rates in a range that we usually use."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "r0eVyguIU80m",
+ "colab_type": "code",
+ "colab": {}
+ },
+ "source": [
+ "california_housing_dataframe = california_housing_dataframe.reindex(\n",
+ " np.random.permutation(california_housing_dataframe.index))\n",
+ "california_housing_dataframe[\"median_house_value\"] /= 1000.0\n",
+ "california_housing_dataframe"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "HzzlSs3PtTmt",
+ "colab_type": "text"
+ },
+ "source": [
+ "## Examine the Data\n",
+ "\n",
+ "It's a good idea to get to know your data a little bit before you work with it.\n",
+ "\n",
+ "We'll print out a quick summary of a few useful statistics on each column: count of examples, mean, standard deviation, max, min, and various quantiles."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "gzb10yoVrydW",
+ "colab_type": "code",
+ "cellView": "both",
+ "colab": {
+ "test": {
+ "output": "ignore",
+ "timeout": 600
+ }
+ }
+ },
+ "source": [
+ "california_housing_dataframe.describe()"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "Lr6wYl2bt2Ep",
+ "colab_type": "text"
+ },
+ "source": [
+ "## Build the First Model\n",
+ "\n",
+ "In this exercise, we'll try to predict `median_house_value`, which will be our label (sometimes also called a target). We'll use `total_rooms` as our input feature.\n",
+ "\n",
+ "**NOTE:** Our data is at the city block level, so this feature represents the total number of rooms in that block.\n",
+ "\n",
+ "To train our model, we'll use the [LinearRegressor](https://www.tensorflow.org/api_docs/python/tf/estimator/LinearRegressor) interface provided by the TensorFlow [Estimator](https://www.tensorflow.org/get_started/estimator) API. This API takes care of a lot of the low-level model plumbing, and exposes convenient methods for performing model training, evaluation, and inference."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "0cpcsieFhsNI",
+ "colab_type": "text"
+ },
+ "source": [
+ "### Step 1: Define Features and Configure Feature Columns"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "EL8-9d4ZJNR7",
+ "colab_type": "text"
+ },
+ "source": [
+ "In order to import our training data into TensorFlow, we need to specify what type of data each feature contains. There are two main types of data we'll use in this and future exercises:\n",
+ "\n",
+ "* **Categorical Data**: Data that is textual. In this exercise, our housing data set does not contain any categorical features, but examples you might see would be the home style, the words in a real-estate ad.\n",
+ "\n",
+ "* **Numerical Data**: Data that is a number (integer or float) and that you want to treat as a number. As we will discuss more later sometimes you might want to treat numerical data (e.g., a postal code) as if it were categorical.\n",
+ "\n",
+ "In TensorFlow, we indicate a feature's data type using a construct called a **feature column**. Feature columns store only a description of the feature data; they do not contain the feature data itself.\n",
+ "\n",
+ "To start, we're going to use just one numeric input feature, `total_rooms`. The following code pulls the `total_rooms` data from our `california_housing_dataframe` and defines the feature column using `numeric_column`, which specifies its data is numeric:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "rhEbFCZ86cDZ",
+ "colab_type": "code",
+ "colab": {}
+ },
+ "source": [
+ "# Define the input feature: total_rooms.\n",
+ "my_feature = california_housing_dataframe[[\"total_rooms\"]]\n",
+ "\n",
+ "# Configure a numeric feature column for total_rooms.\n",
+ "feature_columns = [tf.feature_column.numeric_column(\"total_rooms\")]"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "K_3S8teX7Rd2",
+ "colab_type": "text"
+ },
+ "source": [
+ "**NOTE:** The shape of our `total_rooms` data is a one-dimensional array (a list of the total number of rooms for each block). This is the default shape for `numeric_column`, so we don't have to pass it as an argument."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "UMl3qrU5MGV6",
+ "colab_type": "text"
+ },
+ "source": [
+ "### Step 2: Define the Target"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "cw4nrfcB7kyk",
+ "colab_type": "text"
+ },
+ "source": [
+ "Next, we'll define our target, which is `median_house_value`. Again, we can pull it from our `california_housing_dataframe`:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "l1NvvNkH8Kbt",
+ "colab_type": "code",
+ "colab": {}
+ },
+ "source": [
+ "# Define the label.\n",
+ "targets = california_housing_dataframe[\"median_house_value\"]"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "4M-rTFHL2UkA",
+ "colab_type": "text"
+ },
+ "source": [
+ "### Step 3: Configure the LinearRegressor"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "fUfGQUNp7jdL",
+ "colab_type": "text"
+ },
+ "source": [
+ "Next, we'll configure a linear regression model using LinearRegressor. We'll train this model using the `GradientDescentOptimizer`, which implements Mini-Batch Stochastic Gradient Descent (SGD). The `learning_rate` argument controls the size of the gradient step.\n",
+ "\n",
+ "**NOTE:** To be safe, we also apply [gradient clipping](https://developers.google.com/machine-learning/glossary/#gradient_clipping) to our optimizer via `clip_gradients_by_norm`. Gradient clipping ensures the magnitude of the gradients do not become too large during training, which can cause gradient descent to fail. "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "ubhtW-NGU802",
+ "colab_type": "code",
+ "colab": {}
+ },
+ "source": [
+ "# Use gradient descent as the optimizer for training the model.\n",
+ "my_optimizer=tf.train.GradientDescentOptimizer(learning_rate=0.0000001)\n",
+ "my_optimizer = tf.contrib.estimator.clip_gradients_by_norm(my_optimizer, 5.0)\n",
+ "\n",
+ "# Configure the linear regression model with our feature columns and optimizer.\n",
+ "# Set a learning rate of 0.0000001 for Gradient Descent.\n",
+ "linear_regressor = tf.estimator.LinearRegressor(\n",
+ " feature_columns=feature_columns,\n",
+ " optimizer=my_optimizer\n",
+ ")"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "-0IztwdK2f3F",
+ "colab_type": "text"
+ },
+ "source": [
+ "### Step 4: Define the Input Function"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "S5M5j6xSCHxx",
+ "colab_type": "text"
+ },
+ "source": [
+ "To import our California housing data into our `LinearRegressor`, we need to define an input function, which instructs TensorFlow how to preprocess\n",
+ "the data, as well as how to batch, shuffle, and repeat it during model training.\n",
+ "\n",
+ "First, we'll convert our *pandas* feature data into a dict of NumPy arrays. We can then use the TensorFlow [Dataset API](https://www.tensorflow.org/programmers_guide/datasets) to construct a dataset object from our data, and then break\n",
+ "our data into batches of `batch_size`, to be repeated for the specified number of epochs (num_epochs). \n",
+ "\n",
+ "**NOTE:** When the default value of `num_epochs=None` is passed to `repeat()`, the input data will be repeated indefinitely.\n",
+ "\n",
+ "Next, if `shuffle` is set to `True`, we'll shuffle the data so that it's passed to the model randomly during training. The `buffer_size` argument specifies\n",
+ "the size of the dataset from which `shuffle` will randomly sample.\n",
+ "\n",
+ "Finally, our input function constructs an iterator for the dataset and returns the next batch of data to the LinearRegressor."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "RKZ9zNcHJtwc",
+ "colab_type": "code",
+ "colab": {}
+ },
+ "source": [
+ "def my_input_fn(features, targets, batch_size=1, shuffle=True, num_epochs=None):\n",
+ " \"\"\"Trains a linear regression model of one feature.\n",
+ " \n",
+ " Args:\n",
+ " features: pandas DataFrame of features\n",
+ " targets: pandas DataFrame of targets\n",
+ " batch_size: Size of batches to be passed to the model\n",
+ " shuffle: True or False. Whether to shuffle the data.\n",
+ " num_epochs: Number of epochs for which data should be repeated. None = repeat indefinitely\n",
+ " Returns:\n",
+ " Tuple of (features, labels) for next data batch\n",
+ " \"\"\"\n",
+ " \n",
+ " # Convert pandas data into a dict of np arrays.\n",
+ " features = {key:np.array(value) for key,value in dict(features).items()} \n",
+ " \n",
+ " # Construct a dataset, and configure batching/repeating.\n",
+ " ds = Dataset.from_tensor_slices((features,targets)) # warning: 2GB limit\n",
+ " ds = ds.batch(batch_size).repeat(num_epochs)\n",
+ " \n",
+ " # Shuffle the data, if specified.\n",
+ " if shuffle:\n",
+ " ds = ds.shuffle(buffer_size=10000)\n",
+ " \n",
+ " # Return the next batch of data.\n",
+ " features, labels = ds.make_one_shot_iterator().get_next()\n",
+ " return features, labels"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "wwa6UeA1V5F_",
+ "colab_type": "text"
+ },
+ "source": [
+ "**NOTE:** We'll continue to use this same input function in later exercises. For more\n",
+ "detailed documentation of input functions and the `Dataset` API, see the [TensorFlow Programmer's Guide](https://www.tensorflow.org/programmers_guide/datasets)."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "4YS50CQb2ooO",
+ "colab_type": "text"
+ },
+ "source": [
+ "### Step 5: Train the Model"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "yP92XkzhU803",
+ "colab_type": "text"
+ },
+ "source": [
+ "We can now call `train()` on our `linear_regressor` to train the model. We'll wrap `my_input_fn` in a `lambda`\n",
+ "so we can pass in `my_feature` and `targets` as arguments (see this [TensorFlow input function tutorial](https://www.tensorflow.org/get_started/input_fn#passing_input_fn_data_to_your_model) for more details), and to start, we'll\n",
+ "train for 100 steps."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "5M-Kt6w8U803",
+ "colab_type": "code",
+ "colab": {}
+ },
+ "source": [
+ "_ = linear_regressor.train(\n",
+ " input_fn = lambda:my_input_fn(my_feature, targets),\n",
+ " steps=100\n",
+ ")"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "7Nwxqxlx2sOv",
+ "colab_type": "text"
+ },
+ "source": [
+ "### Step 6: Evaluate the Model"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "KoDaF2dlJQG5",
+ "colab_type": "text"
+ },
+ "source": [
+ "Let's make predictions on that training data, to see how well our model fit it during training.\n",
+ "\n",
+ "**NOTE:** Training error measures how well your model fits the training data, but it **_does not_** measure how well your model **_generalizes to new data_**. In later exercises, you'll explore how to split your data to evaluate your model's ability to generalize.\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "pDIxp6vcU809",
+ "colab_type": "code",
+ "colab": {}
+ },
+ "source": [
+ "# Create an input function for predictions.\n",
+ "# Note: Since we're making just one prediction for each example, we don't \n",
+ "# need to repeat or shuffle the data here.\n",
+ "prediction_input_fn =lambda: my_input_fn(my_feature, targets, num_epochs=1, shuffle=False)\n",
+ "\n",
+ "# Call predict() on the linear_regressor to make predictions.\n",
+ "predictions = linear_regressor.predict(input_fn=prediction_input_fn)\n",
+ "\n",
+ "# Format predictions as a NumPy array, so we can calculate error metrics.\n",
+ "predictions = np.array([item['predictions'][0] for item in predictions])\n",
+ "\n",
+ "# Print Mean Squared Error and Root Mean Squared Error.\n",
+ "mean_squared_error = metrics.mean_squared_error(predictions, targets)\n",
+ "root_mean_squared_error = math.sqrt(mean_squared_error)\n",
+ "print(\"Mean Squared Error (on training data): %0.3f\" % mean_squared_error)\n",
+ "print(\"Root Mean Squared Error (on training data): %0.3f\" % root_mean_squared_error)"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "AKWstXXPzOVz",
+ "colab_type": "text"
+ },
+ "source": [
+ "Is this a good model? How would you judge how large this error is?\n",
+ "\n",
+ "Mean Squared Error (MSE) can be hard to interpret, so we often look at Root Mean Squared Error (RMSE)\n",
+ "instead. A nice property of RMSE is that it can be interpreted on the same scale as the original targets.\n",
+ "\n",
+ "Let's compare the RMSE to the difference of the min and max of our targets:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "7UwqGbbxP53O",
+ "colab_type": "code",
+ "colab": {}
+ },
+ "source": [
+ "min_house_value = california_housing_dataframe[\"median_house_value\"].min()\n",
+ "max_house_value = california_housing_dataframe[\"median_house_value\"].max()\n",
+ "min_max_difference = max_house_value - min_house_value\n",
+ "\n",
+ "print(\"Min. Median House Value: %0.3f\" % min_house_value)\n",
+ "print(\"Max. Median House Value: %0.3f\" % max_house_value)\n",
+ "print(\"Difference between Min. and Max.: %0.3f\" % min_max_difference)\n",
+ "print(\"Root Mean Squared Error: %0.3f\" % root_mean_squared_error)"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "JigJr0C7Pzit",
+ "colab_type": "text"
+ },
+ "source": [
+ "Our error spans nearly half the range of the target values. Can we do better?\n",
+ "\n",
+ "This is the question that nags at every model developer. Let's develop some basic strategies to reduce model error.\n",
+ "\n",
+ "The first thing we can do is take a look at how well our predictions match our targets, in terms of overall summary statistics."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "941nclxbzqGH",
+ "colab_type": "code",
+ "cellView": "both",
+ "colab": {
+ "test": {
+ "output": "ignore",
+ "timeout": 600
+ }
+ }
+ },
+ "source": [
+ "calibration_data = pd.DataFrame()\n",
+ "calibration_data[\"predictions\"] = pd.Series(predictions)\n",
+ "calibration_data[\"targets\"] = pd.Series(targets)\n",
+ "calibration_data.describe()"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "E2-bf8Hq36y8",
+ "colab_type": "text"
+ },
+ "source": [
+ "Okay, maybe this information is helpful. How does the mean value compare to the model's RMSE? How about the various quantiles?\n",
+ "\n",
+ "We can also visualize the data and the line we've learned. Recall that linear regression on a single feature can be drawn as a line mapping input *x* to output *y*.\n",
+ "\n",
+ "First, we'll get a uniform random sample of the data so we can make a readable scatter plot."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "SGRIi3mAU81H",
+ "colab_type": "code",
+ "colab": {}
+ },
+ "source": [
+ "sample = california_housing_dataframe.sample(n=300)"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "N-JwuJBKU81J",
+ "colab_type": "text"
+ },
+ "source": [
+ "Next, we'll plot the line we've learned, drawing from the model's bias term and feature weight, together with the scatter plot. The line will show up red."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "7G12E76-339G",
+ "colab_type": "code",
+ "cellView": "both",
+ "colab": {
+ "test": {
+ "output": "ignore",
+ "timeout": 600
+ }
+ }
+ },
+ "source": [
+ "# Get the min and max total_rooms values.\n",
+ "x_0 = sample[\"total_rooms\"].min()\n",
+ "x_1 = sample[\"total_rooms\"].max()\n",
+ "\n",
+ "# Retrieve the final weight and bias generated during training.\n",
+ "weight = linear_regressor.get_variable_value('linear/linear_model/total_rooms/weights')[0]\n",
+ "bias = linear_regressor.get_variable_value('linear/linear_model/bias_weights')\n",
+ "\n",
+ "# Get the predicted median_house_values for the min and max total_rooms values.\n",
+ "y_0 = weight * x_0 + bias \n",
+ "y_1 = weight * x_1 + bias\n",
+ "\n",
+ "# Plot our regression line from (x_0, y_0) to (x_1, y_1).\n",
+ "plt.plot([x_0, x_1], [y_0, y_1], c='r')\n",
+ "\n",
+ "# Label the graph axes.\n",
+ "plt.ylabel(\"median_house_value\")\n",
+ "plt.xlabel(\"total_rooms\")\n",
+ "\n",
+ "# Plot a scatter plot from our data sample.\n",
+ "plt.scatter(sample[\"total_rooms\"], sample[\"median_house_value\"])\n",
+ "\n",
+ "# Display graph.\n",
+ "plt.show()"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "t0lRt4USU81L",
+ "colab_type": "text"
+ },
+ "source": [
+ "This initial line looks way off. See if you can look back at the summary stats and see the same information encoded there.\n",
+ "\n",
+ "Together, these initial sanity checks suggest we may be able to find a much better line."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "AZWF67uv0HTG",
+ "colab_type": "text"
+ },
+ "source": [
+ "## Tweak the Model Hyperparameters\n",
+ "For this exercise, we've put all the above code in a single function for convenience. You can call the function with different parameters to see the effect.\n",
+ "\n",
+ "In this function, we'll proceed in 10 evenly divided periods so that we can observe the model improvement at each period.\n",
+ "\n",
+ "For each period, we'll compute and graph training loss. This may help you judge when a model is converged, or if it needs more iterations.\n",
+ "\n",
+ "We'll also plot the feature weight and bias term values learned by the model over time. This is another way to see how things converge."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "wgSMeD5UU81N",
+ "colab_type": "code",
+ "colab": {}
+ },
+ "source": [
+ "def train_model(learning_rate, steps, batch_size, input_feature=\"total_rooms\"):\n",
+ " \"\"\"Trains a linear regression model of one feature.\n",
+ " \n",
+ " Args:\n",
+ " learning_rate: A `float`, the learning rate.\n",
+ " steps: A non-zero `int`, the total number of training steps. A training step\n",
+ " consists of a forward and backward pass using a single batch.\n",
+ " batch_size: A non-zero `int`, the batch size.\n",
+ " input_feature: A `string` specifying a column from `california_housing_dataframe`\n",
+ " to use as input feature.\n",
+ " \"\"\"\n",
+ " \n",
+ " periods = 10\n",
+ " steps_per_period = steps / periods\n",
+ "\n",
+ " my_feature = input_feature\n",
+ " my_feature_data = california_housing_dataframe[[my_feature]]\n",
+ " my_label = \"median_house_value\"\n",
+ " targets = california_housing_dataframe[my_label]\n",
+ "\n",
+ " # Create feature columns.\n",
+ " feature_columns = [tf.feature_column.numeric_column(my_feature)]\n",
+ " \n",
+ " # Create input functions.\n",
+ " training_input_fn = lambda:my_input_fn(my_feature_data, targets, batch_size=batch_size)\n",
+ " prediction_input_fn = lambda: my_input_fn(my_feature_data, targets, num_epochs=1, shuffle=False)\n",
+ " \n",
+ " # Create a linear regressor object.\n",
+ " my_optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)\n",
+ " my_optimizer = tf.contrib.estimator.clip_gradients_by_norm(my_optimizer, 5.0)\n",
+ " linear_regressor = tf.estimator.LinearRegressor(\n",
+ " feature_columns=feature_columns,\n",
+ " optimizer=my_optimizer\n",
+ " )\n",
+ "\n",
+ " # Set up to plot the state of our model's line each period.\n",
+ " plt.figure(figsize=(15, 6))\n",
+ " plt.subplot(1, 2, 1)\n",
+ " plt.title(\"Learned Line by Period\")\n",
+ " plt.ylabel(my_label)\n",
+ " plt.xlabel(my_feature)\n",
+ " sample = california_housing_dataframe.sample(n=300)\n",
+ " plt.scatter(sample[my_feature], sample[my_label])\n",
+ " colors = [cm.coolwarm(x) for x in np.linspace(-1, 1, periods)]\n",
+ "\n",
+ " # Train the model, but do so inside a loop so that we can periodically assess\n",
+ " # loss metrics.\n",
+ " print(\"Training model...\")\n",
+ " print(\"RMSE (on training data):\")\n",
+ " root_mean_squared_errors = []\n",
+ " for period in range (0, periods):\n",
+ " # Train the model, starting from the prior state.\n",
+ " linear_regressor.train(\n",
+ " input_fn=training_input_fn,\n",
+ " steps=steps_per_period\n",
+ " )\n",
+ " # Take a break and compute predictions.\n",
+ " predictions = linear_regressor.predict(input_fn=prediction_input_fn)\n",
+ " predictions = np.array([item['predictions'][0] for item in predictions])\n",
+ " \n",
+ " # Compute loss.\n",
+ " root_mean_squared_error = math.sqrt(\n",
+ " metrics.mean_squared_error(predictions, targets))\n",
+ " # Occasionally print the current loss.\n",
+ " print(\" period %02d : %0.2f\" % (period, root_mean_squared_error))\n",
+ " # Add the loss metrics from this period to our list.\n",
+ " root_mean_squared_errors.append(root_mean_squared_error)\n",
+ " # Finally, track the weights and biases over time.\n",
+ " # Apply some math to ensure that the data and line are plotted neatly.\n",
+ " y_extents = np.array([0, sample[my_label].max()])\n",
+ " \n",
+ " weight = linear_regressor.get_variable_value('linear/linear_model/%s/weights' % input_feature)[0]\n",
+ " bias = linear_regressor.get_variable_value('linear/linear_model/bias_weights')\n",
+ "\n",
+ " x_extents = (y_extents - bias) / weight\n",
+ " x_extents = np.maximum(np.minimum(x_extents,\n",
+ " sample[my_feature].max()),\n",
+ " sample[my_feature].min())\n",
+ " y_extents = weight * x_extents + bias\n",
+ " plt.plot(x_extents, y_extents, color=colors[period]) \n",
+ " print(\"Model training finished.\")\n",
+ "\n",
+ " # Output a graph of loss metrics over periods.\n",
+ " plt.subplot(1, 2, 2)\n",
+ " plt.ylabel('RMSE')\n",
+ " plt.xlabel('Periods')\n",
+ " plt.title(\"Root Mean Squared Error vs. Periods\")\n",
+ " plt.tight_layout()\n",
+ " plt.plot(root_mean_squared_errors)\n",
+ "\n",
+ " # Output a table with calibration data.\n",
+ " calibration_data = pd.DataFrame()\n",
+ " calibration_data[\"predictions\"] = pd.Series(predictions)\n",
+ " calibration_data[\"targets\"] = pd.Series(targets)\n",
+ " display.display(calibration_data.describe())\n",
+ "\n",
+ " print(\"Final RMSE (on training data): %0.2f\" % root_mean_squared_error)"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "kg8A4ArBU81Q",
+ "colab_type": "text"
+ },
+ "source": [
+ "## Task 1: Achieve an RMSE of 180 or Below\n",
+ "\n",
+ "Tweak the model hyperparameters to improve loss and better match the target distribution.\n",
+ "If, after 5 minutes or so, you're having trouble beating a RMSE of 180, check the solution for a possible combination."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "UzoZUSdLIolF",
+ "colab_type": "code",
+ "cellView": "both",
+ "colab": {
+ "test": {
+ "output": "ignore",
+ "timeout": 600
+ }
+ }
+ },
+ "source": [
+ "train_model(\n",
+ " learning_rate=0.00001,\n",
+ " steps=100,\n",
+ " batch_size=1\n",
+ ")"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "ajVM7rkoYXeL",
+ "colab_type": "text"
+ },
+ "source": [
+ "### Solution\n",
+ "\n",
+ "Click below for one possible solution."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "T3zmldDwYy5c",
+ "colab_type": "code",
+ "colab": {}
+ },
+ "source": [
+ "train_model(\n",
+ " learning_rate=0.00002,\n",
+ " steps=500,\n",
+ " batch_size=5\n",
+ ")"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "M8H0_D4vYa49",
+ "colab_type": "text"
+ },
+ "source": [
+ "This is just one possible configuration; there may be other combinations of settings that also give good results. Note that in general, this exercise isn't about finding the *one best* setting, but to help build your intutions about how tweaking the model configuration affects prediction quality."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "QU5sLyYTqzqL",
+ "colab_type": "text"
+ },
+ "source": [
+ "### Is There a Standard Heuristic for Model Tuning?\n",
+ "\n",
+ "This is a commonly asked question. The short answer is that the effects of different hyperparameters are data dependent. So there are no hard-and-fast rules; you'll need to test on your data.\n",
+ "\n",
+ "That said, here are a few rules of thumb that may help guide you:\n",
+ "\n",
+ " * Training error should steadily decrease, steeply at first, and should eventually plateau as training converges.\n",
+ " * If the training has not converged, try running it for longer.\n",
+ " * If the training error decreases too slowly, increasing the learning rate may help it decrease faster.\n",
+ " * But sometimes the exact opposite may happen if the learning rate is too high.\n",
+ " * If the training error varies wildly, try decreasing the learning rate.\n",
+ " * Lower learning rate plus larger number of steps or larger batch size is often a good combination.\n",
+ " * Very small batch sizes can also cause instability. First try larger values like 100 or 1000, and decrease until you see degradation.\n",
+ "\n",
+ "Again, never go strictly by these rules of thumb, because the effects are data dependent. Always experiment and verify."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "GpV-uF_cBCBU",
+ "colab_type": "text"
+ },
+ "source": [
+ "## Task 2: Try a Different Feature\n",
+ "\n",
+ "See if you can do any better by replacing the `total_rooms` feature with the `population` feature.\n",
+ "\n",
+ "Don't take more than 5 minutes on this portion."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "YMyOxzb0ZlAH",
+ "colab_type": "code",
+ "colab": {}
+ },
+ "source": [
+ "# YOUR CODE HERE"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "ci1ISxxrZ7v0",
+ "colab_type": "text"
+ },
+ "source": [
+ "### Solution\n",
+ "\n",
+ "Click below for one possible solution."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "SjdQQCduZ7BV",
+ "colab_type": "code",
+ "colab": {}
+ },
+ "source": [
+ "train_model(\n",
+ " learning_rate=0.00002,\n",
+ " steps=1000,\n",
+ " batch_size=5,\n",
+ " input_feature=\"population\"\n",
+ ")"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ }
+ ]
+}
\ No newline at end of file
diff --git a/Notebooks/intro_to_pandas.ipynb b/Notebooks/intro_to_pandas.ipynb
new file mode 100644
index 0000000..f19576d
--- /dev/null
+++ b/Notebooks/intro_to_pandas.ipynb
@@ -0,0 +1,648 @@
+{
+ "nbformat": 4,
+ "nbformat_minor": 0,
+ "metadata": {
+ "colab": {
+ "name": "intro_to_pandas.ipynb",
+ "provenance": [],
+ "collapsed_sections": [
+ "JndnmDMp66FL",
+ "YHIWvc9Ms-Ll",
+ "TJffr5_Jwqvd"
+ ]
+ },
+ "kernelspec": {
+ "name": "python3",
+ "display_name": "Python 3"
+ }
+ },
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text",
+ "id": "JndnmDMp66FL"
+ },
+ "source": [
+ "#### Copyright 2017 Google LLC."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "colab_type": "code",
+ "id": "hMqWDc_m6rUC",
+ "cellView": "both",
+ "colab": {}
+ },
+ "source": [
+ "# Licensed under the Apache License, Version 2.0 (the \"License\");\n",
+ "# you may not use this file except in compliance with the License.\n",
+ "# You may obtain a copy of the License at\n",
+ "#\n",
+ "# https://www.apache.org/licenses/LICENSE-2.0\n",
+ "#\n",
+ "# Unless required by applicable law or agreed to in writing, software\n",
+ "# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
+ "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
+ "# See the License for the specific language governing permissions and\n",
+ "# limitations under the License."
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text",
+ "id": "rHLcriKWLRe4"
+ },
+ "source": [
+ "# Intro to pandas"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text",
+ "id": "QvJBqX8_Bctk"
+ },
+ "source": [
+ "**Learning Objectives:**\n",
+ " * Gain an introduction to the `DataFrame` and `Series` data structures of the *pandas* library\n",
+ " * Access and manipulate data within a `DataFrame` and `Series`\n",
+ " * Import CSV data into a *pandas* `DataFrame`\n",
+ " * Reindex a `DataFrame` to shuffle data"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text",
+ "id": "TIFJ83ZTBctl"
+ },
+ "source": [
+ "[*pandas*](http://pandas.pydata.org/) is a column-oriented data analysis API. It's a great tool for handling and analyzing input data, and many ML frameworks support *pandas* data structures as inputs.\n",
+ "Although a comprehensive introduction to the *pandas* API would span many pages, the core concepts are fairly straightforward, and we'll present them below. For a more complete reference, the [*pandas* docs site](http://pandas.pydata.org/pandas-docs/stable/index.html) contains extensive documentation and many tutorials."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text",
+ "id": "s_JOISVgmn9v"
+ },
+ "source": [
+ "## Basic Concepts\n",
+ "\n",
+ "The following line imports the *pandas* API and prints the API version:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "colab_type": "code",
+ "id": "aSRYu62xUi3g",
+ "colab": {}
+ },
+ "source": [
+ "from __future__ import print_function\n",
+ "\n",
+ "import pandas as pd\n",
+ "pd.__version__"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text",
+ "id": "daQreKXIUslr"
+ },
+ "source": [
+ "The primary data structures in *pandas* are implemented as two classes:\n",
+ "\n",
+ " * **`DataFrame`**, which you can imagine as a relational data table, with rows and named columns.\n",
+ " * **`Series`**, which is a single column. A `DataFrame` contains one or more `Series` and a name for each `Series`.\n",
+ "\n",
+ "The data frame is a commonly used abstraction for data manipulation. Similar implementations exist in [Spark](https://spark.apache.org/) and [R](https://www.r-project.org/about.html)."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text",
+ "id": "fjnAk1xcU0yc"
+ },
+ "source": [
+ "One way to create a `Series` is to construct a `Series` object. For example:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "colab_type": "code",
+ "id": "DFZ42Uq7UFDj",
+ "colab": {}
+ },
+ "source": [
+ "pd.Series(['San Francisco', 'San Jose', 'Sacramento'])"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text",
+ "id": "U5ouUp1cU6pC"
+ },
+ "source": [
+ "`DataFrame` objects can be created by passing a `dict` mapping `string` column names to their respective `Series`. If the `Series` don't match in length, missing values are filled with special [NA/NaN](http://pandas.pydata.org/pandas-docs/stable/missing_data.html) values. Example:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "colab_type": "code",
+ "id": "avgr6GfiUh8t",
+ "colab": {}
+ },
+ "source": [
+ "city_names = pd.Series(['San Francisco', 'San Jose', 'Sacramento'])\n",
+ "population = pd.Series([852469, 1015785, 485199])\n",
+ "\n",
+ "pd.DataFrame({ 'City name': city_names, 'Population': population })"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text",
+ "id": "oa5wfZT7VHJl"
+ },
+ "source": [
+ "But most of the time, you load an entire file into a `DataFrame`. The following example loads a file with California housing data. Run the following cell to load the data and create feature definitions:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "colab_type": "code",
+ "id": "av6RYOraVG1V",
+ "colab": {}
+ },
+ "source": [
+ "california_housing_dataframe = pd.read_csv(\"https://download.mlcc.google.com/mledu-datasets/california_housing_train.csv\", sep=\",\")\n",
+ "california_housing_dataframe.describe()"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text",
+ "id": "WrkBjfz5kEQu"
+ },
+ "source": [
+ "The example above used `DataFrame.describe` to show interesting statistics about a `DataFrame`. Another useful function is `DataFrame.head`, which displays the first few records of a `DataFrame`:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "colab_type": "code",
+ "id": "s3ND3bgOkB5k",
+ "colab": {}
+ },
+ "source": [
+ "california_housing_dataframe.head()"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text",
+ "id": "w9-Es5Y6laGd"
+ },
+ "source": [
+ "Another powerful feature of *pandas* is graphing. For example, `DataFrame.hist` lets you quickly study the distribution of values in a column:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "colab_type": "code",
+ "id": "nqndFVXVlbPN",
+ "colab": {}
+ },
+ "source": [
+ "california_housing_dataframe.hist('housing_median_age')"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text",
+ "id": "XtYZ7114n3b-"
+ },
+ "source": [
+ "## Accessing Data\n",
+ "\n",
+ "You can access `DataFrame` data using familiar Python dict/list operations:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "colab_type": "code",
+ "id": "_TFm7-looBFF",
+ "colab": {}
+ },
+ "source": [
+ "cities = pd.DataFrame({ 'City name': city_names, 'Population': population })\n",
+ "print(type(cities['City name']))\n",
+ "cities['City name']"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "colab_type": "code",
+ "id": "V5L6xacLoxyv",
+ "colab": {}
+ },
+ "source": [
+ "print(type(cities['City name'][1]))\n",
+ "cities['City name'][1]"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "colab_type": "code",
+ "id": "gcYX1tBPugZl",
+ "colab": {}
+ },
+ "source": [
+ "print(type(cities[0:2]))\n",
+ "cities[0:2]"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text",
+ "id": "65g1ZdGVjXsQ"
+ },
+ "source": [
+ "In addition, *pandas* provides an extremely rich API for advanced [indexing and selection](http://pandas.pydata.org/pandas-docs/stable/indexing.html) that is too extensive to be covered here."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text",
+ "id": "RM1iaD-ka3Y1"
+ },
+ "source": [
+ "## Manipulating Data\n",
+ "\n",
+ "You may apply Python's basic arithmetic operations to `Series`. For example:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "colab_type": "code",
+ "id": "XWmyCFJ5bOv-",
+ "colab": {}
+ },
+ "source": [
+ "population / 1000."
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text",
+ "id": "TQzIVnbnmWGM"
+ },
+ "source": [
+ "[NumPy](http://www.numpy.org/) is a popular toolkit for scientific computing. *pandas* `Series` can be used as arguments to most NumPy functions:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "colab_type": "code",
+ "id": "ko6pLK6JmkYP",
+ "colab": {}
+ },
+ "source": [
+ "import numpy as np\n",
+ "\n",
+ "np.log(population)"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text",
+ "id": "xmxFuQmurr6d"
+ },
+ "source": [
+ "For more complex single-column transformations, you can use `Series.apply`. Like the Python [map function](https://docs.python.org/2/library/functions.html#map), \n",
+ "`Series.apply` accepts as an argument a [lambda function](https://docs.python.org/2/tutorial/controlflow.html#lambda-expressions), which is applied to each value.\n",
+ "\n",
+ "The example below creates a new `Series` that indicates whether `population` is over one million:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "colab_type": "code",
+ "id": "Fc1DvPAbstjI",
+ "colab": {}
+ },
+ "source": [
+ "population.apply(lambda val: val > 1000000)"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text",
+ "id": "ZeYYLoV9b9fB"
+ },
+ "source": [
+ "\n",
+ "Modifying `DataFrames` is also straightforward. For example, the following code adds two `Series` to an existing `DataFrame`:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "colab_type": "code",
+ "id": "0gCEX99Hb8LR",
+ "colab": {}
+ },
+ "source": [
+ "cities['Area square miles'] = pd.Series([46.87, 176.53, 97.92])\n",
+ "cities['Population density'] = cities['Population'] / cities['Area square miles']\n",
+ "cities"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text",
+ "id": "6qh63m-ayb-c"
+ },
+ "source": [
+ "## Exercise #1\n",
+ "\n",
+ "Modify the `cities` table by adding a new boolean column that is True if and only if *both* of the following are True:\n",
+ "\n",
+ " * The city is named after a saint.\n",
+ " * The city has an area greater than 50 square miles.\n",
+ "\n",
+ "**Note:** Boolean `Series` are combined using the bitwise, rather than the traditional boolean, operators. For example, when performing *logical and*, use `&` instead of `and`.\n",
+ "\n",
+ "**Hint:** \"San\" in Spanish means \"saint.\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "colab_type": "code",
+ "id": "zCOn8ftSyddH",
+ "colab": {}
+ },
+ "source": [
+ "# Your code here"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text",
+ "id": "YHIWvc9Ms-Ll"
+ },
+ "source": [
+ "### Solution\n",
+ "\n",
+ "Click below for a solution."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "colab_type": "code",
+ "id": "T5OlrqtdtCIb",
+ "colab": {}
+ },
+ "source": [
+ "cities['Is wide and has saint name'] = (cities['Area square miles'] > 50) & cities['City name'].apply(lambda name: name.startswith('San'))\n",
+ "cities"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text",
+ "id": "f-xAOJeMiXFB"
+ },
+ "source": [
+ "## Indexes\n",
+ "Both `Series` and `DataFrame` objects also define an `index` property that assigns an identifier value to each `Series` item or `DataFrame` row. \n",
+ "\n",
+ "By default, at construction, *pandas* assigns index values that reflect the ordering of the source data. Once created, the index values are stable; that is, they do not change when data is reordered."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "colab_type": "code",
+ "id": "2684gsWNinq9",
+ "colab": {}
+ },
+ "source": [
+ "city_names.index"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "colab_type": "code",
+ "id": "F_qPe2TBjfWd",
+ "colab": {}
+ },
+ "source": [
+ "cities.index"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text",
+ "id": "hp2oWY9Slo_h"
+ },
+ "source": [
+ "Call `DataFrame.reindex` to manually reorder the rows. For example, the following has the same effect as sorting by city name:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "colab_type": "code",
+ "id": "sN0zUzSAj-U1",
+ "colab": {}
+ },
+ "source": [
+ "cities.reindex([2, 0, 1])"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text",
+ "id": "-GQFz8NZuS06"
+ },
+ "source": [
+ "Reindexing is a great way to shuffle (randomize) a `DataFrame`. In the example below, we take the index, which is array-like, and pass it to NumPy's `random.permutation` function, which shuffles its values in place. Calling `reindex` with this shuffled array causes the `DataFrame` rows to be shuffled in the same way.\n",
+ "Try running the following cell multiple times!"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "colab_type": "code",
+ "id": "mF8GC0k8uYhz",
+ "colab": {}
+ },
+ "source": [
+ "cities.reindex(np.random.permutation(cities.index))"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text",
+ "id": "fSso35fQmGKb"
+ },
+ "source": [
+ "For more information, see the [Index documentation](http://pandas.pydata.org/pandas-docs/stable/indexing.html#index-objects)."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text",
+ "id": "8UngIdVhz8C0"
+ },
+ "source": [
+ "## Exercise #2\n",
+ "\n",
+ "The `reindex` method allows index values that are not in the original `DataFrame`'s index values. Try it and see what happens if you use such values! Why do you think this is allowed?"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "colab_type": "code",
+ "id": "PN55GrDX0jzO",
+ "colab": {}
+ },
+ "source": [
+ "# Your code here"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text",
+ "id": "TJffr5_Jwqvd"
+ },
+ "source": [
+ "### Solution\n",
+ "\n",
+ "Click below for the solution."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text",
+ "id": "8oSvi2QWwuDH"
+ },
+ "source": [
+ "If your `reindex` input array includes values not in the original `DataFrame` index values, `reindex` will add new rows for these \"missing\" indices and populate all corresponding columns with `NaN` values:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "colab_type": "code",
+ "id": "yBdkucKCwy4x",
+ "colab": {}
+ },
+ "source": [
+ "cities.reindex([0, 4, 5, 2])"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text",
+ "id": "2l82PhPbwz7g"
+ },
+ "source": [
+ "This behavior is desirable because indexes are often strings pulled from the actual data (see the [*pandas* reindex\n",
+ "documentation](http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.reindex.html) for an example\n",
+ "in which the index values are browser names).\n",
+ "\n",
+ "In this case, allowing \"missing\" indices makes it easy to reindex using an external list, as you don't have to worry about\n",
+ "sanitizing the input."
+ ]
+ }
+ ]
+}
\ No newline at end of file
diff --git a/Notebooks/synthetic_features_and_outliers.ipynb b/Notebooks/synthetic_features_and_outliers.ipynb
new file mode 100644
index 0000000..4daed8e
--- /dev/null
+++ b/Notebooks/synthetic_features_and_outliers.ipynb
@@ -0,0 +1,582 @@
+{
+ "nbformat": 4,
+ "nbformat_minor": 0,
+ "metadata": {
+ "colab": {
+ "name": "synthetic_features_and_outliers.ipynb",
+ "provenance": [],
+ "collapsed_sections": [
+ "JndnmDMp66FL",
+ "i5Ul3zf5QYvW",
+ "jByCP8hDRZmM",
+ "WvgxW0bUSC-c"
+ ]
+ },
+ "kernelspec": {
+ "name": "python3",
+ "display_name": "Python 3"
+ }
+ },
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "JndnmDMp66FL",
+ "colab_type": "text"
+ },
+ "source": [
+ "#### Copyright 2017 Google LLC."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "hMqWDc_m6rUC",
+ "colab_type": "code",
+ "cellView": "both",
+ "colab": {}
+ },
+ "source": [
+ "# Licensed under the Apache License, Version 2.0 (the \"License\");\n",
+ "# you may not use this file except in compliance with the License.\n",
+ "# You may obtain a copy of the License at\n",
+ "#\n",
+ "# https://www.apache.org/licenses/LICENSE-2.0\n",
+ "#\n",
+ "# Unless required by applicable law or agreed to in writing, software\n",
+ "# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
+ "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
+ "# See the License for the specific language governing permissions and\n",
+ "# limitations under the License."
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "4f3CKqFUqL2-",
+ "colab_type": "text"
+ },
+ "source": [
+ "# Synthetic Features and Outliers"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "jnKgkN5fHbGy",
+ "colab_type": "text"
+ },
+ "source": [
+ "**Learning Objectives:**\n",
+ " * Create a synthetic feature that is the ratio of two other features\n",
+ " * Use this new feature as an input to a linear regression model\n",
+ " * Improve the effectiveness of the model by identifying and clipping (removing) outliers out of the input data"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "VOpLo5dcHbG0",
+ "colab_type": "text"
+ },
+ "source": [
+ "Let's revisit our model from the previous First Steps with TensorFlow exercise. \n",
+ "\n",
+ "First, we'll import the California housing data into a *pandas* `DataFrame`:"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "S8gm6BpqRRuh",
+ "colab_type": "text"
+ },
+ "source": [
+ "## Setup"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "9D8GgUovHbG0",
+ "colab_type": "code",
+ "colab": {}
+ },
+ "source": [
+ "from __future__ import print_function\n",
+ "\n",
+ "import math\n",
+ "\n",
+ "from IPython import display\n",
+ "from matplotlib import cm\n",
+ "from matplotlib import gridspec\n",
+ "import matplotlib.pyplot as plt\n",
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "import sklearn.metrics as metrics\n",
+ "import tensorflow as tf\n",
+ "from tensorflow.python.data import Dataset\n",
+ "\n",
+ "tf.logging.set_verbosity(tf.logging.ERROR)\n",
+ "pd.options.display.max_rows = 10\n",
+ "pd.options.display.float_format = '{:.1f}'.format\n",
+ "\n",
+ "california_housing_dataframe = pd.read_csv(\"https://download.mlcc.google.com/mledu-datasets/california_housing_train.csv\", sep=\",\")\n",
+ "\n",
+ "california_housing_dataframe = california_housing_dataframe.reindex(\n",
+ " np.random.permutation(california_housing_dataframe.index))\n",
+ "california_housing_dataframe[\"median_house_value\"] /= 1000.0\n",
+ "california_housing_dataframe"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "I6kNgrwCO_ms",
+ "colab_type": "text"
+ },
+ "source": [
+ "Next, we'll set up our input function, and define the function for model training:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "5RpTJER9XDub",
+ "colab_type": "code",
+ "colab": {}
+ },
+ "source": [
+ "def my_input_fn(features, targets, batch_size=1, shuffle=True, num_epochs=None):\n",
+ " \"\"\"Trains a linear regression model of one feature.\n",
+ " \n",
+ " Args:\n",
+ " features: pandas DataFrame of features\n",
+ " targets: pandas DataFrame of targets\n",
+ " batch_size: Size of batches to be passed to the model\n",
+ " shuffle: True or False. Whether to shuffle the data.\n",
+ " num_epochs: Number of epochs for which data should be repeated. None = repeat indefinitely\n",
+ " Returns:\n",
+ " Tuple of (features, labels) for next data batch\n",
+ " \"\"\"\n",
+ " \n",
+ " # Convert pandas data into a dict of np arrays.\n",
+ " features = {key:np.array(value) for key,value in dict(features).items()} \n",
+ " \n",
+ " # Construct a dataset, and configure batching/repeating.\n",
+ " ds = Dataset.from_tensor_slices((features,targets)) # warning: 2GB limit\n",
+ " ds = ds.batch(batch_size).repeat(num_epochs)\n",
+ " \n",
+ " # Shuffle the data, if specified.\n",
+ " if shuffle:\n",
+ " ds = ds.shuffle(buffer_size=10000)\n",
+ " \n",
+ " # Return the next batch of data.\n",
+ " features, labels = ds.make_one_shot_iterator().get_next()\n",
+ " return features, labels"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "VgQPftrpHbG3",
+ "colab_type": "code",
+ "colab": {}
+ },
+ "source": [
+ "def train_model(learning_rate, steps, batch_size, input_feature):\n",
+ " \"\"\"Trains a linear regression model.\n",
+ " \n",
+ " Args:\n",
+ " learning_rate: A `float`, the learning rate.\n",
+ " steps: A non-zero `int`, the total number of training steps. A training step\n",
+ " consists of a forward and backward pass using a single batch.\n",
+ " batch_size: A non-zero `int`, the batch size.\n",
+ " input_feature: A `string` specifying a column from `california_housing_dataframe`\n",
+ " to use as input feature.\n",
+ " \n",
+ " Returns:\n",
+ " A Pandas `DataFrame` containing targets and the corresponding predictions done\n",
+ " after training the model.\n",
+ " \"\"\"\n",
+ " \n",
+ " periods = 10\n",
+ " steps_per_period = steps / periods\n",
+ "\n",
+ " my_feature = input_feature\n",
+ " my_feature_data = california_housing_dataframe[[my_feature]].astype('float32')\n",
+ " my_label = \"median_house_value\"\n",
+ " targets = california_housing_dataframe[my_label].astype('float32')\n",
+ "\n",
+ " # Create input functions.\n",
+ " training_input_fn = lambda: my_input_fn(my_feature_data, targets, batch_size=batch_size)\n",
+ " predict_training_input_fn = lambda: my_input_fn(my_feature_data, targets, num_epochs=1, shuffle=False)\n",
+ " \n",
+ " # Create feature columns.\n",
+ " feature_columns = [tf.feature_column.numeric_column(my_feature)]\n",
+ " \n",
+ " # Create a linear regressor object.\n",
+ " my_optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)\n",
+ " my_optimizer = tf.contrib.estimator.clip_gradients_by_norm(my_optimizer, 5.0)\n",
+ " linear_regressor = tf.estimator.LinearRegressor(\n",
+ " feature_columns=feature_columns,\n",
+ " optimizer=my_optimizer\n",
+ " )\n",
+ "\n",
+ " # Set up to plot the state of our model's line each period.\n",
+ " plt.figure(figsize=(15, 6))\n",
+ " plt.subplot(1, 2, 1)\n",
+ " plt.title(\"Learned Line by Period\")\n",
+ " plt.ylabel(my_label)\n",
+ " plt.xlabel(my_feature)\n",
+ " sample = california_housing_dataframe.sample(n=300)\n",
+ " plt.scatter(sample[my_feature], sample[my_label])\n",
+ " colors = [cm.coolwarm(x) for x in np.linspace(-1, 1, periods)]\n",
+ "\n",
+ " # Train the model, but do so inside a loop so that we can periodically assess\n",
+ " # loss metrics.\n",
+ " print(\"Training model...\")\n",
+ " print(\"RMSE (on training data):\")\n",
+ " root_mean_squared_errors = []\n",
+ " for period in range (0, periods):\n",
+ " # Train the model, starting from the prior state.\n",
+ " linear_regressor.train(\n",
+ " input_fn=training_input_fn,\n",
+ " steps=steps_per_period,\n",
+ " )\n",
+ " # Take a break and compute predictions.\n",
+ " predictions = linear_regressor.predict(input_fn=predict_training_input_fn)\n",
+ " predictions = np.array([item['predictions'][0] for item in predictions])\n",
+ " \n",
+ " # Compute loss.\n",
+ " root_mean_squared_error = math.sqrt(\n",
+ " metrics.mean_squared_error(predictions, targets))\n",
+ " # Occasionally print the current loss.\n",
+ " print(\" period %02d : %0.2f\" % (period, root_mean_squared_error))\n",
+ " # Add the loss metrics from this period to our list.\n",
+ " root_mean_squared_errors.append(root_mean_squared_error)\n",
+ " # Finally, track the weights and biases over time.\n",
+ " # Apply some math to ensure that the data and line are plotted neatly.\n",
+ " y_extents = np.array([0, sample[my_label].max()])\n",
+ " \n",
+ " weight = linear_regressor.get_variable_value('linear/linear_model/%s/weights' % input_feature)[0]\n",
+ " bias = linear_regressor.get_variable_value('linear/linear_model/bias_weights')\n",
+ " \n",
+ " x_extents = (y_extents - bias) / weight\n",
+ " x_extents = np.maximum(np.minimum(x_extents,\n",
+ " sample[my_feature].max()),\n",
+ " sample[my_feature].min())\n",
+ " y_extents = weight * x_extents + bias\n",
+ " plt.plot(x_extents, y_extents, color=colors[period]) \n",
+ " print(\"Model training finished.\")\n",
+ "\n",
+ " # Output a graph of loss metrics over periods.\n",
+ " plt.subplot(1, 2, 2)\n",
+ " plt.ylabel('RMSE')\n",
+ " plt.xlabel('Periods')\n",
+ " plt.title(\"Root Mean Squared Error vs. Periods\")\n",
+ " plt.tight_layout()\n",
+ " plt.plot(root_mean_squared_errors)\n",
+ "\n",
+ " # Create a table with calibration data.\n",
+ " calibration_data = pd.DataFrame()\n",
+ " calibration_data[\"predictions\"] = pd.Series(predictions)\n",
+ " calibration_data[\"targets\"] = pd.Series(targets)\n",
+ " display.display(calibration_data.describe())\n",
+ "\n",
+ " print(\"Final RMSE (on training data): %0.2f\" % root_mean_squared_error)\n",
+ " \n",
+ " return calibration_data"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "FJ6xUNVRm-do",
+ "colab_type": "text"
+ },
+ "source": [
+ "## Task 1: Try a Synthetic Feature\n",
+ "\n",
+ "Both the `total_rooms` and `population` features count totals for a given city block.\n",
+ "\n",
+ "But what if one city block were more densely populated than another? We can explore how block density relates to median house value by creating a synthetic feature that's a ratio of `total_rooms` and `population`.\n",
+ "\n",
+ "In the cell below, create a feature called `rooms_per_person`, and use that as the `input_feature` to `train_model()`.\n",
+ "\n",
+ "What's the best performance you can get with this single feature by tweaking the learning rate? (The better the performance, the better your regression line should fit the data, and the lower\n",
+ "the final RMSE should be.)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "isONN2XK32Wo",
+ "colab_type": "text"
+ },
+ "source": [
+ "**NOTE**: You may find it helpful to add a few code cells below so you can try out several different learning rates and compare the results. To add a new code cell, hover your cursor directly below the center of this cell, and click **CODE**."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "5ihcVutnnu1D",
+ "colab_type": "code",
+ "cellView": "both",
+ "colab": {
+ "test": {
+ "output": "ignore",
+ "timeout": 600
+ }
+ }
+ },
+ "source": [
+ "#\n",
+ "# YOUR CODE HERE\n",
+ "#\n",
+ "california_housing_dataframe[\"rooms_per_person\"] =\n",
+ "\n",
+ "calibration_data = train_model(\n",
+ " learning_rate=0.00005,\n",
+ " steps=500,\n",
+ " batch_size=5,\n",
+ " input_feature=\"rooms_per_person\"\n",
+ ")"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "i5Ul3zf5QYvW",
+ "colab_type": "text"
+ },
+ "source": [
+ "### Solution\n",
+ "\n",
+ "Click below for a solution."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "Leaz2oYMQcBf",
+ "colab_type": "code",
+ "colab": {}
+ },
+ "source": [
+ "california_housing_dataframe[\"rooms_per_person\"] = (\n",
+ " california_housing_dataframe[\"total_rooms\"] / california_housing_dataframe[\"population\"])\n",
+ "\n",
+ "calibration_data = train_model(\n",
+ " learning_rate=0.05,\n",
+ " steps=500,\n",
+ " batch_size=5,\n",
+ " input_feature=\"rooms_per_person\")"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "ZjQrZ8mcHFiU",
+ "colab_type": "text"
+ },
+ "source": [
+ "## Task 2: Identify Outliers\n",
+ "\n",
+ "We can visualize the performance of our model by creating a scatter plot of predictions vs. target values. Ideally, these would lie on a perfectly correlated diagonal line.\n",
+ "\n",
+ "Use Pyplot's [`scatter()`](https://matplotlib.org/gallery/shapes_and_collections/scatter.html) to create a scatter plot of predictions vs. targets, using the rooms-per-person model you trained in Task 1.\n",
+ "\n",
+ "Do you see any oddities? Trace these back to the source data by looking at the distribution of values in `rooms_per_person`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "P0BDOec4HbG_",
+ "colab_type": "code",
+ "colab": {}
+ },
+ "source": [
+ "# YOUR CODE HERE"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "jByCP8hDRZmM",
+ "colab_type": "text"
+ },
+ "source": [
+ "### Solution\n",
+ "\n",
+ "Click below for the solution."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "s0tiX2gdRe-S",
+ "colab_type": "code",
+ "colab": {}
+ },
+ "source": [
+ "plt.figure(figsize=(15, 6))\n",
+ "plt.subplot(1, 2, 1)\n",
+ "plt.scatter(calibration_data[\"predictions\"], calibration_data[\"targets\"])"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "kMQD0Uq3RqTX",
+ "colab_type": "text"
+ },
+ "source": [
+ "The calibration data shows most scatter points aligned to a line. The line is almost vertical, but we'll come back to that later. Right now let's focus on the ones that deviate from the line. We notice that they are relatively few in number.\n",
+ "\n",
+ "If we plot a histogram of `rooms_per_person`, we find that we have a few outliers in our input data:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "POTM8C_ER1Oc",
+ "colab_type": "code",
+ "colab": {}
+ },
+ "source": [
+ "plt.subplot(1, 2, 2)\n",
+ "_ = california_housing_dataframe[\"rooms_per_person\"].hist()"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "9l0KYpBQu8ed",
+ "colab_type": "text"
+ },
+ "source": [
+ "## Task 3: Clip Outliers\n",
+ "\n",
+ "See if you can further improve the model fit by setting the outlier values of `rooms_per_person` to some reasonable minimum or maximum.\n",
+ "\n",
+ "For reference, here's a quick example of how to apply a function to a Pandas `Series`:\n",
+ "\n",
+ " clipped_feature = my_dataframe[\"my_feature_name\"].apply(lambda x: max(x, 0))\n",
+ "\n",
+ "The above `clipped_feature` will have no values less than `0`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "rGxjRoYlHbHC",
+ "colab_type": "code",
+ "colab": {}
+ },
+ "source": [
+ "# YOUR CODE HERE"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "WvgxW0bUSC-c",
+ "colab_type": "text"
+ },
+ "source": [
+ "### Solution\n",
+ "\n",
+ "Click below for the solution."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "8YGNjXPaSMPV",
+ "colab_type": "text"
+ },
+ "source": [
+ "The histogram we created in Task 2 shows that the majority of values are less than `5`. Let's clip `rooms_per_person` to 5, and plot a histogram to double-check the results."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "9YyARz6gSR7Q",
+ "colab_type": "code",
+ "colab": {}
+ },
+ "source": [
+ "california_housing_dataframe[\"rooms_per_person\"] = (\n",
+ " california_housing_dataframe[\"rooms_per_person\"]).apply(lambda x: min(x, 5))\n",
+ "\n",
+ "_ = california_housing_dataframe[\"rooms_per_person\"].hist()"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "vO0e1p_aSgKA",
+ "colab_type": "text"
+ },
+ "source": [
+ "To verify that clipping worked, let's train again and print the calibration data once more:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "ZgSP2HKfSoOH",
+ "colab_type": "code",
+ "colab": {}
+ },
+ "source": [
+ "calibration_data = train_model(\n",
+ " learning_rate=0.05,\n",
+ " steps=500,\n",
+ " batch_size=5,\n",
+ " input_feature=\"rooms_per_person\")"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "gySE-UgfSony",
+ "colab_type": "code",
+ "colab": {}
+ },
+ "source": [
+ "_ = plt.scatter(calibration_data[\"predictions\"], calibration_data[\"targets\"])"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ }
+ ]
+}
\ No newline at end of file
diff --git a/README.md b/README.md
index 41a8d7c..521a26c 100644
--- a/README.md
+++ b/README.md
@@ -1,53 +1,39 @@
-# 🍓 My Resources for AI, ML, and DNN 🍓
+## tensorflow for deep learning
-## Learning
+
-## Getting the News
+### 👾 my old ml notebooks and tensorflow/numpy small projects:
-* Feedly with [list of blogs to follow](https://raw.githubusercontent.com/bt3gl/Machine-Learning-Resources/master/ml_ai_feedly.opml).
-* Check [my blog](http://bt3gl.github.io/) :).
-* [Deep Learning weekly](http://www.deeplearningweekly.com/).
+
+
+* **[ml notebooks](Notebooks)**: my jupyter notebooks with ml models
+* **[tensorflow examples](TensorFlow)**: tensorflow learning examples
+* **[caffe](Caffee)**: an example using the caffe library in a docker container
+* **[deep art](Deep_Art)**: my deep learning generated art models
+* **[ml numpy](Numpy)**: my code and examples using numpy
+
+
-## Machine Learning in General
+---------
-* [Stanford's Machine Learning Course](http://cs229.stanford.edu/)
-* [A Chart of Neural Networks](http://www.asimovinstitute.org/neural-network-zoo/).
+### cool resources
-### Fun:
-
-* [Machine Learning for Artists](http://ml4a.github.io/index/).
-* [LossFunctions.tumblr](http://lossfunctions.tumblr.com/).
-* [CreativeAI](http://www.creativeai.net/).
+
+* **[machine learning course, by stanford](http://cs229.stanford.edu/)**
+* **[cnn for visual recognition, by stanford](http://cs231n.stanford.edu/)**
+* **[developer ml course, by google](https://developers.google.com/machine-learning)**
+* **[tensorflow courses, by google](https://www.tensorflow.org/)**
+* **[deep learning basics, by mit](https://medium.com/tensorflow/mit-deep-learning-basics-introduction-and-overview-with-tensorflow-355bcd26baf0)**
+* **[a chart of neural networks, by asimov institute](http://www.asimovinstitute.org/neural-network-zoo/)**
+* **[course on reinforcement learning, by ucl](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Teaching.html)**
+* **[playground, by tensorflow](http://playground.tensorflow.org)**
+* **[deep learning course, by nvidia](https://www.youtube.com/playlist?list=PL5B692fm6--tI-ijknnVZWbXU2H4JpSYe)**
+* **[energy-based approaches to representation learning, by y. lecun](https://www.youtube.com/watch?v=m17B-cXcZFI&=&t=524s)**
+* **[deep learning lectures, by lex fridman](https://www.youtube.com/watch?v=O5xeyoRL95U&list=PLrAXtmErZgOeiKm4sgNOknGvNjby9efdf)**
+* **[deeplearning.ai, by andrew ng](https://www.deeplearning.ai/deep-learning-specialization/)**
+* **[deep learning, by i. goodfellow et al.](http://www.deeplearningbook.org/)**
-## Deep Learning
-
-
-### Reinforcement Learning
-
-* [UCL Course on RL](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Teaching.html)
-
-### ConvNets
-
-* [Stanford's Convolutional Neural Networks for Visual Recognition](http://cs231n.stanford.edu/)
-* [The 9 CNN Papers You Need To Know About](https://adeshpande3.github.io/adeshpande3.github.io/The-9-Deep-Learning-Papers-You-Need-To-Know-About.html).
-
-### Hardware
-
-* [NVIDIA Deep Learning Course](https://www.youtube.com/playlist?list=PL5B692fm6--tI-ijknnVZWbXU2H4JpSYe)
-
-
-### Computer Vision
-
-* [Multiple View Geometry in CV](https://www.goodreads.com/book/show/18938711-multiple-view-geometry-in-computer-vision).
-
-
-
-## Working
-
-### Benchmarkers
-
-* [DeepBench](https://github.com/baidu-research/DeepBench).
diff --git a/Talks/.DS_Store b/Talks/.DS_Store
deleted file mode 100644
index 5008ddf..0000000
Binary files a/Talks/.DS_Store and /dev/null differ
diff --git a/Talks/AlphaGo_IJCAI.pdf b/Talks/AlphaGo_IJCAI.pdf
deleted file mode 100644
index 7533889..0000000
Binary files a/Talks/AlphaGo_IJCAI.pdf and /dev/null differ
diff --git a/Talks/DLSummerSchool_Aug2016_compress.pdf b/Talks/DLSummerSchool_Aug2016_compress.pdf
deleted file mode 100644
index 203f9a5..0000000
Binary files a/Talks/DLSummerSchool_Aug2016_compress.pdf and /dev/null differ
diff --git a/Talks/icml2016_tutorial_deep_residual_networks_kaiminghe.pdf b/Talks/icml2016_tutorial_deep_residual_networks_kaiminghe.pdf
deleted file mode 100644
index 438394a..0000000
Binary files a/Talks/icml2016_tutorial_deep_residual_networks_kaiminghe.pdf and /dev/null differ
diff --git a/Talks/intro_RL/intro_RL.pdf b/Talks/intro_RL/intro_RL.pdf
deleted file mode 100644
index 6b8f895..0000000
Binary files a/Talks/intro_RL/intro_RL.pdf and /dev/null differ
diff --git a/ml_ai_feedly.opml b/ml_ai_feedly.opml
deleted file mode 100644
index 289e879..0000000
--- a/ml_ai_feedly.opml
+++ /dev/null
@@ -1,307 +0,0 @@
- Marina subscriptions in feedly Cloud