In [17]: import nltk
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
In [18]: nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
[nltk_data] Downloading package punkt to
[nltk_data] C:\Users\student\AppData\Roaming\nltk_data...
[nltk_data] Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data] C:\Users\student\AppData\Roaming\nltk_data...
[nltk_data] Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data] C:\Users\student\AppData\Roaming\nltk_data...
[nltk_data] Package wordnet is already up-to-date!
Out[18]: True
Method 1: lexicon counts normalized by review length
In [19]: df = pd.read_csv('reviews.csv', usecols=['body'])
lemma = WordNetLemmatizer()
stop_words = stopwords.words('english')
In [20]: def text_prep(x):
    # lowercase, strip non-letters, tokenize, drop stopwords, lemmatize
    corp = str(x).lower()
    corp = re.sub('[^a-zA-Z]+', ' ', corp).strip()
    tokens = word_tokenize(corp)
    words = [t for t in tokens if t not in stop_words]
    lemmatized = [lemma.lemmatize(w) for w in words]
    return lemmatized
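A quick sanity check (illustrative, not from the original run) shows the whole pipeline: lowercasing, punctuation stripping, stopword removal, and noun lemmatization.

text_prep("The phones were AMAZING!!!")
# -> ['phone', 'amazing']   ('the'/'were' dropped, 'phones' lemmatized to 'phone')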
In [22]: preprocess_tag = [text_prep(i) for i in df['body']]
df["preprocess_txt"] = preprocess_tag
df['total_len'] = df['preprocess_txt'].map(lambda x: len(x))
In [24]: # load the opinion lexicons (one word per line); context managers close the files
with open('negative-words.txt', 'r') as file:
    neg_words = file.read().split()
with open('positive-words.txt', 'r') as file:
    pos_words = file.read().split()
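Both lexicons are plain Python lists, so the counting cell below scans a list once per token. An optional tweak (not in the original notebook) is to convert them, and stop_words above, to sets for O(1) membership tests:

pos_words = set(pos_words)
neg_words = set(neg_words)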
In [27]: num_pos = df['preprocess_txt'].map(lambda x: len([i for i in x if i in pos_words]))
df['pos_count'] = num_pos
num_neg = df['preprocess_txt'].map(lambda x: len([i for i in x if i in neg_words]))
df['neg_count'] = num_neg
df['sentiment'] = round((df['pos_count'] - df['neg_count']) / df['total_len'], 2)
df.head()
Out[27]:
                                                body                                     preprocess_txt  total_len  pos_count  neg_count  sentiment
0  I had the Samsung A600 for awhile which is abs...  [samsung, awhile, absolute, doo, doo, read, re...        162         18         18       0.00
1  Due to a software issue between Nokia and Spri...  [due, software, issue, nokia, sprint, phone, t...         67          8          3       0.07
2  This is a great, reliable phone. I also purcha...  [great, reliable, phone, also, purchased, phon...         68         10          4       0.09
3  I love the phone and all, because I really did...  [love, phone, really, need, one, expect, price...         41          3          0       0.07
4  The phone has been great for every purpose it ...  [phone, great, every, purpose, offer, except, ...         56          5          3       0.04
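Method 1's score is (pos_count - neg_count) / total_len, rounded to two decimals: a net-positivity rate per token. Row 1, for instance, gives (8 - 3) / 67 ≈ 0.07, while row 0, with equal positive and negative counts, scores exactly 0.00.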
Method 2: ratio of positive to negative counts
In [28]: df['sentiment'] = round(df['pos_count'] / (df['neg_count']+1), 2)
df.head()
Out[28]:
                                                body                                     preprocess_txt  total_len  pos_count  neg_count  sentiment
0  I had the Samsung A600 for awhile which is abs...  [samsung, awhile, absolute, doo, doo, read, re...        162         18         18       0.95
1  Due to a software issue between Nokia and Spri...  [due, software, issue, nokia, sprint, phone, t...         67          8          3       2.00
2  This is a great, reliable phone. I also purcha...  [great, reliable, phone, also, purchased, phon...         68         10          4       2.00
3  I love the phone and all, because I really did...  [love, phone, really, need, one, expect, price...         41          3          0       3.00
4  The phone has been great for every purpose it ...  [phone, great, every, purpose, offer, except, ...         56          5          3       1.25
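Method 2 swaps the difference for a ratio, pos_count / (neg_count + 1); the +1 keeps the division defined when a review has no negative words (row 3: 3 / (0 + 1) = 3.00). On this scale, values above 1 lean positive and values below 1 lean negative, so row 0 (18 / 19 ≈ 0.95) now reads slightly negative rather than neutral.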
In [30]: nltk.download('vader_lexicon')
[nltk_data] Downloading package vader_lexicon to
[nltk_data] C:\Users\student\AppData\Roaming\nltk_data...
Out[30]: True
Method 3: VADER compound score
In [35]: from nltk.sentiment.vader import SentimentIntensityAnalyzer
sent = SentimentIntensityAnalyzer()
df = pd.read_csv('reviews.csv', usecols=['body'])
df['body'] = df['body'].fillna('')
polarity = [round(sent.polarity_scores(str(i))['compound'], 2) for i in df['body']]
df['sentiment_score'] = polarity
print(df.head())
body sentiment_score
0 I had the Samsung A600 for awhile which is abs... 0.86
1 Due to a software issue between Nokia and Spri... 0.89
2 This is a great, reliable phone. I also purcha... 0.80
3 I love the phone and all, because I really did... 0.96
4 The phone has been great for every purpose it ... 0.77
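VADER's compound score is normalized to the range [-1, 1]. A common convention from the VADER authors (not applied in this notebook) treats scores >= 0.05 as positive, <= -0.05 as negative, and anything in between as neutral; a minimal sketch of that thresholding:

def vader_label(compound, eps=0.05):
    # map a compound score to a discrete label using the standard cutoffs
    if compound >= eps:
        return 'positive'
    if compound <= -eps:
        return 'negative'
    return 'neutral'

df['sentiment_label'] = df['sentiment_score'].map(vader_label)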
Extra
In [54]: # Create WordNetLemmatizer object
wnl = WordNetLemmatizer()
# single-word lemmatization examples
list1 = ['kites', 'babies', 'dogs', 'flying', 'smiling',
         'driving', 'tried', 'feet']
for words in list1:
    print(words + " ---> " + wnl.lemmatize(words))
print('better' + " ---> " + wnl.lemmatize('better', pos='a'))
kites ---> kite
babies ---> baby
dogs ---> dog
flying ---> flying
smiling ---> smiling
driving ---> driving
tried ---> tried
feet ---> foot
better ---> good
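Note that 'flying', 'smiling', 'driving', and 'tried' come back unchanged: lemmatize() treats its input as a noun unless told otherwise. Passing a part-of-speech tag, as done above for 'better' (pos='a'), fixes this; an illustrative check:

print(wnl.lemmatize('flying', pos='v'))  # ---> fly
print(wnl.lemmatize('tried', pos='v'))   # ---> try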
In [59]: sentence = 'I am good in cricket, but best in Football.'
# Tokenize the sentence
tokens = nltk.word_tokenize(sentence)
# Get English stopwords
english_stopwords = set(stopwords.words('english'))
# Filter out stopwords
filtered_tokens = [word for word in tokens if word.lower() not in english_stopwords]
print(filtered_tokens)
['good', 'cricket', ',', 'best', 'Football', '.']
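The ',' and '.' tokens survive because NLTK tokenizes punctuation separately and the stopword list contains only words; in Method 1 it was the regex substitution inside text_prep that stripped punctuation.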
In [60]: import nltk
from nltk.stem import PorterStemmer
# Sentence to stem
sentence = 'I am good in cricket, but best in Football.'
# Tokenize the sentence
tokens = nltk.word_tokenize(sentence)
# Initialize PorterStemmer
stemmer = PorterStemmer()
# Perform stemming on each token
stemmed_tokens = [stemmer.stem(word) for word in tokens]
print(stemmed_tokens)
['I', 'am', 'good', 'in', 'cricket', ',', 'but', 'best', 'in', 'footbal', '.']
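Note the contrast with lemmatization: the Porter stemmer chops suffixes by rule, so it happily produces non-words like 'footbal', while WordNetLemmatizer only returns dictionary forms. Stemming is faster; lemmatization is the better fit for lexicon lookups like those in Methods 1 and 2.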