From 28b1275e15acfaea00bcddc7cb56d9fe74706ea7 Mon Sep 17 00:00:00 2001 From: Sebastian Raschka Date: Thu, 11 Aug 2022 10:26:17 -0500 Subject: [PATCH] tokenizer improvement --- ch08/ch08.ipynb | 6 +++--- ch08/ch08.py | 2 +- errata/README.md | 11 +++++++++++ 3 files changed, 15 insertions(+), 4 deletions(-) diff --git a/ch08/ch08.ipynb b/ch08/ch08.ipynb index de8032fb..336b8638 100644 --- a/ch08/ch08.ipynb +++ b/ch08/ch08.ipynb @@ -1433,7 +1433,7 @@ "\n", "def tokenizer(text):\n", " text = re.sub('<[^>]*>', '', text)\n", - " emoticons = re.findall('(?::|;|=)(?:-)?(?:\\)|\\(|D|P)', text.lower())\n", + " emoticons = re.findall('(?::|;|=)(?:-)?(?:\\)|\\(|D|P)', text)\n", " text = re.sub('[\\W]+', ' ', text.lower()) +\\\n", " ' '.join(emoticons).replace('-', '')\n", " tokenized = [w for w in text.split() if w not in stop]\n", @@ -1871,7 +1871,7 @@ "metadata": { "anaconda-cloud": {}, "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -1885,7 +1885,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.1" + "version": "3.9.7" }, "toc": { "nav_menu": {}, diff --git a/ch08/ch08.py b/ch08/ch08.py index 0e385552..8dd19014 100644 --- a/ch08/ch08.py +++ b/ch08/ch08.py @@ -565,7 +565,7 @@ def tokenizer_porter(text): def tokenizer(text): text = re.sub('<[^>]*>', '', text) - emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text.lower()) + emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text) text = re.sub('[\W]+', ' ', text.lower()) + ' '.join(emoticons).replace('-', '') tokenized = [w for w in text.split() if w not in stop] return tokenized diff --git a/errata/README.md b/errata/README.md index 0cef0c1a..84d20712 100644 --- a/errata/README.md +++ b/errata/README.md @@ -26,6 +26,17 @@ - pg. 162. 
In `S_i = \sum_{x in D_i} (x - m_i) (x - m_i)^{\top}`, the transpose should be on the first row vector, i.e., `S_i = \sum_{x in D_i} (x - m_i)^{\top} (x - m_i)`. + + +## Chapter 8 + +- In `tokenizer`, change `text.lower()` to `text` in the `re.findall` call only (keep `text.lower()` in the `re.sub` call), i.e., + +```python +emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text) +``` +to catch emoticons such as ":-P"; the pattern matches only uppercase `D` and `P`, so lowercasing the text first prevented these matches. + ## Chapter 13 - pg. 469: Instead of `tf.keras.activations.tanh(z)` it should be `tf.keras.activations.relu(z)`.