tokenizer improvement

jasgnft · Aug 11, 2022 · 28b1275 · 28b1275
1 parent 53bbbd3
commit 28b1275
Show file tree

Hide file tree

Showing 3 changed files with 15 additions and 4 deletions.
diff --git a/ch08/ch08.ipynb b/ch08/ch08.ipynb
@@ -1433,7 +1433,7 @@
     "\n",
     "def tokenizer(text):\n",
     "    text = re.sub('<[^>]*>', '', text)\n",
-    "    emoticons = re.findall('(?::|;|=)(?:-)?(?:\\)|\\(|D|P)', text.lower())\n",
+    "    emoticons = re.findall('(?::|;|=)(?:-)?(?:\\)|\\(|D|P)', text)\n",
     "    text = re.sub('[\\W]+', ' ', text.lower()) +\\\n",
     "        ' '.join(emoticons).replace('-', '')\n",
     "    tokenized = [w for w in text.split() if w not in stop]\n",
@@ -1871,7 +1871,7 @@
  "metadata": {
   "anaconda-cloud": {},
   "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "Python 3 (ipykernel)",
    "language": "python",
    "name": "python3"
   },
@@ -1885,7 +1885,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.7.1"
+   "version": "3.9.7"
   },
   "toc": {
    "nav_menu": {},

diff --git a/ch08/ch08.py b/ch08/ch08.py
@@ -565,7 +565,7 @@ def tokenizer_porter(text):
 
 def tokenizer(text):
     text = re.sub('<[^>]*>', '', text)
-    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text.lower())
+    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
     text = re.sub('[\W]+', ' ', text.lower()) +        ' '.join(emoticons).replace('-', '')
     tokenized = [w for w in text.split() if w not in stop]
     return tokenized

diff --git a/errata/README.md b/errata/README.md
@@ -26,6 +26,17 @@
 
 - pg. 162. In `S_i = \sum_{x in D_i} (x - m_i) (x - m_i)^{\top}`, the transpose should be on the first row vector, i.e., `S_i = \sum_{x in D_i} (x - m_i)^{\top} (x - m_i)`.
 
+
+
+## Chapter 8
+
+- In `tokenizer`, change `text.lower()` to `text` in the `re.findall` call only (the `text.lower()` inside `re.sub` stays as it is), i.e.,
+
+```python
+emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
+```
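+to catch emoticons that contain uppercase letters, such as ":-P" or ":-D". The pattern matches only uppercase `D` and `P`, so lowercasing the text before the search made these emoticons impossible to match.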
+
 ## Chapter 13
 
 - pg. 469: Instead of `tf.keras.activations.tanh(z)` it should be `tf.keras.activations.relu(z)`.