From 28b1275e15acfaea00bcddc7cb56d9fe74706ea7 Mon Sep 17 00:00:00 2001
From: Sebastian Raschka <mail@sebastianraschka.com>
Date: Thu, 11 Aug 2022 10:26:17 -0500
Subject: [PATCH] Fix tokenizer to match uppercase emoticons such as ":-P"

---
 ch08/ch08.ipynb  |  6 +++---
 ch08/ch08.py     |  2 +-
 errata/README.md | 11 +++++++++++
 3 files changed, 15 insertions(+), 4 deletions(-)

diff --git a/ch08/ch08.ipynb b/ch08/ch08.ipynb
index de8032fb..336b8638 100644
--- a/ch08/ch08.ipynb
+++ b/ch08/ch08.ipynb
@@ -1433,7 +1433,7 @@
     "\n",
     "def tokenizer(text):\n",
     "    text = re.sub('<[^>]*>', '', text)\n",
-    "    emoticons = re.findall('(?::|;|=)(?:-)?(?:\\)|\\(|D|P)', text.lower())\n",
+    "    emoticons = re.findall('(?::|;|=)(?:-)?(?:\\)|\\(|D|P)', text)\n",
     "    text = re.sub('[\\W]+', ' ', text.lower()) +\\\n",
     "        ' '.join(emoticons).replace('-', '')\n",
     "    tokenized = [w for w in text.split() if w not in stop]\n",
@@ -1871,7 +1871,7 @@
  "metadata": {
   "anaconda-cloud": {},
   "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "Python 3 (ipykernel)",
    "language": "python",
    "name": "python3"
   },
@@ -1885,7 +1885,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.7.1"
+   "version": "3.9.7"
   },
   "toc": {
    "nav_menu": {},
diff --git a/ch08/ch08.py b/ch08/ch08.py
index 0e385552..8dd19014 100644
--- a/ch08/ch08.py
+++ b/ch08/ch08.py
@@ -565,7 +565,7 @@ def tokenizer_porter(text):
 
 def tokenizer(text):
     text = re.sub('<[^>]*>', '', text)
-    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text.lower())
+    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
     text = re.sub('[\W]+', ' ', text.lower()) +        ' '.join(emoticons).replace('-', '')
     tokenized = [w for w in text.split() if w not in stop]
     return tokenized
diff --git a/errata/README.md b/errata/README.md
index 0cef0c1a..84d20712 100644
--- a/errata/README.md
+++ b/errata/README.md
@@ -26,6 +26,17 @@
 
 - pg. 162. In `S_i = \sum_{x in D_i} (x - m_i) (x - m_i)^{\top}`, the transpose should be on the first row vector, i.e., `S_i = \sum_{x in D_i} (x - m_i)^{\top} (x - m_i)`.
 
+
+
+## Chapter 8
+
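+- In the `tokenizer` function, change `text.lower()` to `text` in the `re.findall` call so that the emoticon pattern is matched against the original (not lowercased) text, i.e.,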
+
+```python
+emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
+```
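+to catch emoticons that contain uppercase letters, such as ":-P" (lowercasing the text first turned it into ":-p", which the pattern's `D|P` alternatives do not match).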
+
 ## Chapter 13
 
 - pg. 469: Instead of `tf.keras.activations.tanh(z)` it should be `tf.keras.activations.relu(z)`.