Merge pull request rasbt#103 from rasbt/sklearn0.22

check ch02 on latest softw.
erkundanec · Dec 5, 2019 · 83420ff · 83420ff
2 parents eeb9a0c + 07cce34
commit 83420ff
Show file tree

Hide file tree

Showing 16 changed files with 342 additions and 266 deletions.
diff --git a/.gitignore b/.gitignore
@@ -18,6 +18,12 @@ ch12/t10k-labels-idx1-ubyte.gz
 ch12/train-images-idx3-ubyte.gz
 ch12/train-labels-idx1-ubyte.gz
 
+# Ch08 data files
+ch08/aclImdb/
+ch08/aclImdb_v1.tar.gz
+ch08/movie_data.csv
+
+# Other 
 *checkpoint
 *tfevents*
 *.data-00000-of-00001

diff --git a/ch02/ch02.ipynb b/ch02/ch02.ipynb
diff --git a/ch03/ch03.ipynb b/ch03/ch03.ipynb
diff --git a/ch03/ch03.py b/ch03/ch03.py
@@ -35,7 +35,15 @@
 
 
 
-# *The use of `watermark` is optional. You can install this IPython extension via "`pip install watermark`". For more information, please see: https://github.com/rasbt/watermark.*
+# *The use of `watermark` is optional. You can install this Jupyter extension via*  
+# 
+#     conda install watermark -c conda-forge  
+# 
+# or  
+# 
+#     pip install watermark   
+# 
+# *For more information, please see: https://github.com/rasbt/watermark.*
 
 # ### Overview
 
@@ -627,7 +635,7 @@ def error(p):
 plt.ylim([0, 1.1])
 plt.xlabel('p(i=1)')
 plt.ylabel('impurity index')
-plt.savefig('images/03_19.png', dpi=300, bbox_inches='tight')
+#plt.savefig('images/03_19.png', dpi=300, bbox_inches='tight')
 plt.show()
 
 

diff --git a/ch04/ch04.ipynb b/ch04/ch04.ipynb
diff --git a/ch04/ch04.py b/ch04/ch04.py
@@ -319,9 +319,10 @@
 c_transf.fit_transform(X).astype(float)
 
 
-# ## Optional: Ordinal Encoding
 
-# If we are unsure about the numerical differences between the categories of ordinal features, we can also encode them using a thresholded one-hot encoded format. For example, we can split the feature "size" with values M, L, and XL into two new features "x > M", "x > L", and . For example, let's consider the original DataFrame:
+# ## Optional: Encoding Ordinal Features
+
+# If we are unsure about the numerical differences between the categories of ordinal features, or the difference between two ordinal values is not defined, we can also encode them using a threshold encoding with 0/1 values. For example, we can split the feature "size" with values M, L, and XL into two new features "x > M" and "x > L". Let's consider the original DataFrame:
 
 
 

diff --git a/ch05/ch05.ipynb b/ch05/ch05.ipynb
diff --git a/ch06/ch06.ipynb b/ch06/ch06.ipynb
diff --git a/ch06/ch06.py b/ch06/ch06.py
@@ -171,8 +171,7 @@
 
 
 
-kfold = StratifiedKFold(n_splits=10,
-                        random_state=1).split(X_train, y_train)
+kfold = StratifiedKFold(n_splits=10).split(X_train, y_train)
 
 scores = []
 for k, (train, test) in enumerate(kfold):
@@ -329,6 +328,7 @@
 gs = GridSearchCV(estimator=pipe_svc, 
                   param_grid=param_grid, 
                   scoring='accuracy', 
+                  refit=True,
                   cv=10,
                   n_jobs=-1)
 gs = gs.fit(X_train, y_train)
@@ -339,7 +339,11 @@
 
 
 clf = gs.best_estimator_
-clf.fit(X_train, y_train)
+
+# clf.fit(X_train, y_train)
+# Note that we do not need to refit the classifier on the training set
+# because GridSearchCV does this automatically when refit=True.
+
 print('Test accuracy: %.3f' % clf.score(X_test, y_test))
 
 
@@ -498,8 +502,7 @@
 X_train2 = X_train[:, [4, 14]]
 
 
-cv = list(StratifiedKFold(n_splits=3, 
-                          random_state=1).split(X_train, y_train))
+cv = list(StratifiedKFold(n_splits=3).split(X_train, y_train))
 
 fig = plt.figure(figsize=(7, 5))
 

diff --git a/ch07/ch07.ipynb b/ch07/ch07.ipynb
diff --git a/ch07/ch07.py b/ch07/ch07.py
@@ -768,8 +768,3 @@ def get_params(self, deep=True):
 
 
 
-
-
-
-
-
Original file line number	Diff line number	Diff line change
Expand Up		@@ -768,8 +768,3 @@ def get_params(self, deep=True):