0% found this document useful (0 votes)

47 views5 pages

1 - Sentiment - Analysis - NLP - Ipynb - Codes Only

Uploaded by

racoon97970301

We take content rights seriously. If you suspect this is your content, claim it here.

Available Formats

Download as PDF, TXT or read online on Scribd

0% found this document useful (0 votes)

47 views5 pages

1 - Sentiment - Analysis - NLP - Ipynb - Codes Only

Uploaded by

racoon97970301

We take content rights seriously. If you suspect this is your content, claim it here.

Available Formats

Download as PDF, TXT or read online on Scribd

You are on page 1/ 5

3/26/24, 4:07 PM 1_sentiment_analysis_nlp.ipynb - Colaboratory

keyboard_arrow_down Mounting the drive

from google.colab import drive
drive.mount('/content/drive')

%cd /content/drive/MyDriveb/nlp_project
!ls #checking if files are there or not

keyboard_arrow_down Importing the dataset

import pandas as pd

# Load the tab-separated Alexa review export.
data = pd.read_csv('amazon_alexa.tsv', sep='\t')

data.head(10)  # author's note: the dataset is labelled in a binary format

# Keep only the review text and its feedback flag, under shorter column names.
mydata = data[['verified_reviews','feedback']].rename(
    columns={'verified_reviews': 'review', 'feedback': 'label'}
)

mydata.head()

# Distribution of the label column.
mydata.value_counts('label')

As can be seen above, the dataset is imbalanced, so we will use undersampling to balance it.

# Count the occurrences of each label.
label_counts = mydata["label"].value_counts()

# Take the majority class from the counts instead of assuming it is label 1;
# with the assumption wrong, sample() below would be asked for more rows than
# the (minority) class contains and raise.
majority_label = label_counts.idxmax()

# Number of majority-class rows to drop so both classes end up the same size
# (this arithmetic assumes a binary label column).
rows_to_drop = label_counts.max() - label_counts.min()

if rows_to_drop > 0:
    # Randomly pick the surplus majority rows and drop them by index.
    data_majority = mydata[mydata["label"] == majority_label]
    data_balanced = mydata.drop(data_majority.sample(rows_to_drop).index)
else:
    # Already balanced: work on a copy so later column additions do not
    # touch mydata.
    data_balanced = mydata.copy()

# Check the new class balance.
print(data_balanced["label"].value_counts())

The dataset above is balanced.

keyboard_arrow_down Data preprocessing

#defining a function to clean the dataset
import re

def clean_text(text):
    """Normalize a raw review string before it is sent for classification.

    Steps, in order: strip HTML tags, replace punctuation/special characters
    with spaces, drop stand-alone single letters, lowercase, collapse runs of
    whitespace and trim both ends.

    Args:
        text: Raw review text. Non-string values (e.g. ``None`` or the ``NaN``
            pandas uses for a missing review) are treated as an empty review
            instead of raising inside ``re.sub``.

    Returns:
        The cleaned, lowercased text; ``""`` when nothing is left.
    """
    if not isinstance(text, str):
        return ""

    # HTML tags must be removed first: once punctuation is stripped the angle
    # brackets are gone and the tag pattern can never match, which would leave
    # the tag names behind as words.
    text = re.sub(r"<[^>]*>", " ", text)

    # Replace anything that is not a word character or whitespace.
    text = re.sub(r"[^\w\s]", " ", text)

    # Drop single letters left on their own (e.g. the "t" from "don't").
    text = re.sub(r"\b[a-zA-Z]\b", " ", text)

    text = text.lower()

    # Collapse whitespace runs, then trim.
    text = re.sub(r"\s+", " ", text)
    return text.strip()

https://colab.research.google.com/drive/1v0c7kmDSBApUGFTq0mibaidimGca-PWg?authuser=6#scrollTo=Rx1NfrvIxcyP&printMode=true 1/5
3/26/24, 4:07 PM 1_sentiment_analysis_nlp.ipynb - Colaboratory

import pandas as pd

# Pull the raw review column out as a plain Python list.
reviews = data_balanced['review'].tolist()

# Run every review through clean_text, keeping the original order.
cleaned_reviews = list(map(clean_text, reviews))

# Store the cleaned text next to the raw text in a new column.
data_balanced['clean_reviews'] = cleaned_reviews

data_balanced

keyboard_arrow_down Splitting the dataset into a 5% training set and a 95% test set
import pandas as pd

# Hold out 95% of the balanced rows as the test set; the remaining 5% form the
# training set.
total_rows = len(data_balanced)
test_size = int(0.95 * total_rows)

# Draw the test rows at random (no seed is set, so the split differs per run).
test_set = data_balanced.sample(n=test_size)

# Every row that was not sampled for testing goes to the training set.
train_set = data_balanced.loc[~data_balanced.index.isin(test_set.index)]

keyboard_arrow_down Sentiment analysis using LLM

keyboard_arrow_down Setting up GEMINI API
!pip install -q -U google-generativeai

# Necessary packages
import pathlib
import textwrap

import google.generativeai as genai

from IPython.display import display

from IPython.display import Markdown

def to_markdown(text):
    """Wrap *text* as an IPython ``Markdown`` block quote.

    Bullet characters ('•') are rewritten as Markdown list markers and every
    line, including blank ones, is prefixed with '> '.
    """
    with_list_markers = text.replace('•', ' *')
    quoted = textwrap.indent(with_list_markers, '> ', predicate=lambda _line: True)
    return Markdown(quoted)

# Used to securely store your API key

from google.colab import userdata

# Using `os.getenv('GOOGLE_API_KEY')` to fetch an environment variable.

GOOGLE_API_KEY=userdata.get('GOOGLE_API_KEY')

genai.configure(api_key=GOOGLE_API_KEY)

for m in genai.list_models():
if 'generateContent' in m.supported_generation_methods:
print(m.name)

#we will be using the gemini pro model

model = genai.GenerativeModel('gemini-pro')

%%time
response = model.generate_content("how great is MS Dhoni?")

to_markdown(response.text)

keyboard_arrow_down Integrating the Gemini pro API to our sentiment analysis task
https://colab.research.google.com/drive/1v0c7kmDSBApUGFTq0mibaidimGca-PWg?authuser=6#scrollTo=Rx1NfrvIxcyP&printMode=true 2/5
3/26/24, 4:07 PM 1_sentiment_analysis_nlp.ipynb - Colaboratory

# Draw 20 random rows from the test set and add an empty 'pred_label' column
# that the model is asked to fill in.
test_set_sample = test_set.sample(20).assign(pred_label='')

test_set_sample

# Serialise the cleaned text and the empty label column as a JSON array of
# records (one object per review).
json_data = test_set_sample[['clean_reviews','pred_label']].to_json(orient='records')

print(json_data)

# Prompt asking the model to fill in 'pred_label' (1 = Positive, 0 = Negative)
# for every record and to return the JSON otherwise unchanged. The records from
# json_data are embedded between the triple-backtick fences.
prompt = f"""
You are an expert linguist, who is good at classifying customer review sentiments into Positive/Negative labels.
Help me classify customer reviews into: Positive(label=1), and Negative(label=0).
Customer reviews are provided between three back ticks.
In your output, only return the Json code back as output - which is provided between three backticks.
Your task is to update predicted labels under 'pred_label' in the Json code.
Don't make any changes to Json code format, please.

```
{json_data}
```
"""

print(prompt)

# Send the prompt to the Gemini model and show the raw text of its reply.

response = model.generate_content(prompt)

print(response.text)

import json

# The reply is expected to be the JSON wrapped in a Markdown code fence, which
# may carry a language tag (```json ... ```) and surrounding whitespace.
# Stripping only the backticks would leave the tag or a trailing newline in
# place and make json.loads() fail, so remove those as well.
json_data = response.text.strip().strip("`").strip()
if json_data.lower().startswith("json"):
    json_data = json_data[len("json"):]

# Parse the records into a DataFrame. A separate name is used so the raw
# dataset held in `data` is not overwritten.
records = json.loads(json_data)
df_sample = pd.DataFrame(records)

df_sample

# Copy the predicted labels from df_sample into test_set_sample.
# Assumes the model returned one record per input row, in the same order; a
# length mismatch raises here. Labels are cast to int so they are comparable
# with the integer 'label' column even if the model returned them as strings.
test_set_sample['pred_label'] = df_sample['pred_label'].astype(int).values
test_set_sample

# Plotting confusion matrix on the predictions

from sklearn.metrics import confusion_matrix

# True vs. predicted labels for the 20-row sample; the matrix is computed and
# shown as an array (nothing is plotted).
y_true, y_pred = test_set_sample["label"], test_set_sample["pred_label"]

confusion_matrix(y_true, y_pred)

keyboard_arrow_down Batching GEMINI API calls

test_set.shape

# Evaluate on 100 random test rows, with an empty 'pred_label' column for the
# model to fill in.
test_set_total = test_set.sample(100).assign(pred_label='')

test_set_total

# Cut the sample into consecutive positional slices of batch_size rows each.
batch_size = 25
batches = []

for i in range(0, len(test_set_total), batch_size):
    batches.append(test_set_total.iloc[i : i + batch_size])

https://colab.research.google.com/drive/1v0c7kmDSBApUGFTq0mibaidimGca-PWg?authuser=6#scrollTo=Rx1NfrvIxcyP&printMode=true 3/5
3/26/24, 4:07 PM 1_sentiment_analysis_nlp.ipynb - Colaboratory

import time

def gemini_completion_function(batch,current_batch,total_batch):
    """Send one batch of reviews to Gemini and return the raw response.

    The batch's 'clean_reviews' and 'pred_label' columns are serialised to
    JSON records, embedded in the classification prompt, and submitted through
    the module-level ``model``.

    Args:
        batch: DataFrame slice containing 'clean_reviews' and 'pred_label'.
        current_batch: Zero-based index of this batch (printed as 1-based).
        total_batch: Total number of batches, used in the progress message.

    Returns:
        Whatever ``model.generate_content`` returns for the prompt.
    """
    print(f"Now processing batch#: {current_batch+1} of {total_batch}")

    columns_for_model = ['clean_reviews','pred_label']
    json_data = batch[columns_for_model].to_json(orient='records')

    prompt = f"""You are an expert linguist, who is good at classifying customer review sentiments into Positive/Negative labels.
Help me classify customer reviews into: Positive(label=1), and Negative(label=0).
Customer reviews are provided between three backticks below.
In your output, only return the Json code back as output - which is provided between three backticks.
Your task is to update predicted labels under 'pred_label' in the Json code.
Don't make any changes to Json code format, please.
Error handling instruction: In case a Customer Review violates API policy, please assign it default sentiment as Negative (label=0).

```
{json_data}
```
"""

    print(prompt)
    response = model.generate_content(prompt)

    # Pause after every request - presumably to stay under the API rate
    # limit; TODO confirm the required delay.
    time.sleep(5)

    return response

# Submit the batches one after another, keeping the responses in batch order.
batch_count = len(batches)
responses = [
    gemini_completion_function(batch, i, batch_count)
    for i, batch in enumerate(batches)
]

import json

# Parse every batch response into a DataFrame and stack them in batch order.
# DataFrame.append() was removed in pandas 2.0, so the per-batch frames are
# collected in a list and combined with a single pd.concat() call.
frames = []

for response in responses:
    # The reply is expected to be the JSON wrapped in a Markdown code fence,
    # which may carry a language tag (```json ... ```) and surrounding
    # whitespace; remove all of it before parsing.
    json_data = response.text.strip().strip("`").strip()
    if json_data.lower().startswith("json"):
        json_data = json_data[len("json"):]

    # A separate name is used so the raw dataset held in `data` is not
    # overwritten.
    records = json.loads(json_data)
    frames.append(pd.DataFrame(records))

# pd.concat() rejects an empty list, so fall back to an empty frame.
df_total = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

print(df_total)  # Display the final DataFrame

# Copy the predicted labels from df_total into test_set_total.
# Assumes the model returned one record per input row, in the same order; a
# length mismatch raises here. Labels are cast to int so they are comparable
# with the integer 'label' column even if the model returned them as strings.
test_set_total['pred_label'] = df_total['pred_label'].astype(int).values
test_set_total

# Plotting confusion matrix on the predictions

from sklearn.metrics import confusion_matrix, accuracy_score

# Compare true and predicted labels for the 100-row evaluation sample.
y_true, y_pred = test_set_total["label"], test_set_total["pred_label"]

print(confusion_matrix(y_true, y_pred))
print(f"\nAccuracy: {accuracy_score(y_true, y_pred)}")

https://colab.research.google.com/drive/1v0c7kmDSBApUGFTq0mibaidimGca-PWg?authuser=6#scrollTo=Rx1NfrvIxcyP&printMode=true 4/5
3/26/24, 4:07 PM 1_sentiment_analysis_nlp.ipynb - Colaboratory

https://colab.research.google.com/drive/1v0c7kmDSBApUGFTq0mibaidimGca-PWg?authuser=6#scrollTo=Rx1NfrvIxcyP&printMode=true 5/5

Amna Bagh Ali
No ratings yet
Amna Bagh Ali
6 pages
The Same Column Names
No ratings yet
The Same Column Names
9 pages
Q 3
No ratings yet
Q 3
2 pages
Alexa Sentiment Analysis
No ratings yet
Alexa Sentiment Analysis
34 pages
ML Week10.1
No ratings yet
ML Week10.1
5 pages
Sentiment Analysis With NLP Deep Learning
No ratings yet
Sentiment Analysis With NLP Deep Learning
8 pages
NLP Transformer-Based Models Used For Sentiment Analysis: 1. BERT
No ratings yet
NLP Transformer-Based Models Used For Sentiment Analysis: 1. BERT
98 pages
Miniproject 14
No ratings yet
Miniproject 14
4 pages
RajSingh WIexp7
No ratings yet
RajSingh WIexp7
8 pages
Naive Bayes for Sentiment Analysis Guide
No ratings yet
Naive Bayes for Sentiment Analysis Guide
10 pages
Code
No ratings yet
Code
18 pages
AIML IA3 Loki & SG
No ratings yet
AIML IA3 Loki & SG
31 pages
Few-Shot Learning Tutorial - Medium
No ratings yet
Few-Shot Learning Tutorial - Medium
16 pages
Text Preprocessing and Sentiment Analysis
No ratings yet
Text Preprocessing and Sentiment Analysis
13 pages
Toxic Comment Classification
No ratings yet
Toxic Comment Classification
11 pages
NLP Sentimental Analysis 1736351356
No ratings yet
NLP Sentimental Analysis 1736351356
32 pages
Machine Learning Lab Manual
No ratings yet
Machine Learning Lab Manual
12 pages
R002 KrishAhuja BDA Lab9.Ipynb - Colab
No ratings yet
R002 KrishAhuja BDA Lab9.Ipynb - Colab
3 pages
2023 Aug How To Prepare Data For A Neural Network A Step-by-Step Guide
No ratings yet
2023 Aug How To Prepare Data For A Neural Network A Step-by-Step Guide
7 pages
Ad3461 ML Lab Manual
No ratings yet
Ad3461 ML Lab Manual
48 pages
Sentiment Analysis with NLTK
No ratings yet
Sentiment Analysis with NLTK
4 pages
Detailed Report
No ratings yet
Detailed Report
6 pages
2023 Aug How To Produce Data For A Neural networkORG
No ratings yet
2023 Aug How To Produce Data For A Neural networkORG
6 pages
Imbalanced Classes in ML: 10 Techniques
No ratings yet
Imbalanced Classes in ML: 10 Techniques
10 pages
NLP Labsheet-2 Sentiment Analysis Using Naive Bayes Classifier
No ratings yet
NLP Labsheet-2 Sentiment Analysis Using Naive Bayes Classifier
15 pages
Bert Sentiment
No ratings yet
Bert Sentiment
7 pages
CTI Record
No ratings yet
CTI Record
49 pages
Transformer Models for Sentiment Analysis
No ratings yet
Transformer Models for Sentiment Analysis
45 pages
WDM - Week - I
No ratings yet
WDM - Week - I
24 pages
SocrAI Day 3
No ratings yet
SocrAI Day 3
43 pages
Dataset Description: Amazon Reviews of Unlocked Phone
No ratings yet
Dataset Description: Amazon Reviews of Unlocked Phone
4 pages
CNN for Sentiment Analysis Implementation
No ratings yet
CNN for Sentiment Analysis Implementation
7 pages
Solution T1
No ratings yet
Solution T1
9 pages
Document Retrieval Techniques Overview
No ratings yet
Document Retrieval Techniques Overview
43 pages
DS - Lab Report.
No ratings yet
DS - Lab Report.
25 pages
Lab Report 3 - Colab
No ratings yet
Lab Report 3 - Colab
6 pages
Twitter Sentiment Analysis Dss
No ratings yet
Twitter Sentiment Analysis Dss
14 pages
Part C - Assignment No. 2 Mini-Project On Twitter
No ratings yet
Part C - Assignment No. 2 Mini-Project On Twitter
7 pages
Ai Assign 3.ipynb - Colab
No ratings yet
Ai Assign 3.ipynb - Colab
6 pages
LabAssignment 03ai
No ratings yet
LabAssignment 03ai
7 pages
Ie ML Project (Getting Started)
No ratings yet
Ie ML Project (Getting Started)
3 pages
Sentimental Analysis
No ratings yet
Sentimental Analysis
3 pages
Maneesha Nidigonda Verzeo Major Project
No ratings yet
Maneesha Nidigonda Verzeo Major Project
11 pages
05 ML PDF
No ratings yet
05 ML PDF
1 page
Detection of Inline Code Comment Smells
No ratings yet
Detection of Inline Code Comment Smells
8 pages
Python CA 4
No ratings yet
Python CA 4
9 pages
Malignant Comment Classifier Guide
No ratings yet
Malignant Comment Classifier Guide
30 pages
ML - LAB Record
No ratings yet
ML - LAB Record
36 pages
Sentiment Analysis On Tweets
No ratings yet
Sentiment Analysis On Tweets
2 pages
How To Handle Imbalanced Datasets - by Subha - Medium
No ratings yet
How To Handle Imbalanced Datasets - by Subha - Medium
18 pages
CSE4062S21 Group3 Project Delivery7 FinalReport
No ratings yet
CSE4062S21 Group3 Project Delivery7 FinalReport
9 pages
IR Practical
100% (1)
IR Practical
24 pages
Project Ali Huzaifa
No ratings yet
Project Ali Huzaifa
6 pages
Vamshi ml-4
No ratings yet
Vamshi ml-4
3 pages
Machine Learning Lab
No ratings yet
Machine Learning Lab
33 pages
MiniProject - ML - Ipynb - Colaboratory
No ratings yet
MiniProject - ML - Ipynb - Colaboratory
26 pages
AD3461 Machine Learning Lab Manual
No ratings yet
AD3461 Machine Learning Lab Manual
26 pages
Suzuki GSX1300RL1 Parts Catalogue
No ratings yet
Suzuki GSX1300RL1 Parts Catalogue
128 pages
NCERT 100 Days Challenge
No ratings yet
NCERT 100 Days Challenge
8 pages
Edapt Prompt Engg Syllabus
100% (1)
Edapt Prompt Engg Syllabus
5 pages
Cementing Materials Overview
No ratings yet
Cementing Materials Overview
19 pages
Copia de Collage Social Realism in Art Lesson
No ratings yet
Copia de Collage Social Realism in Art Lesson
19 pages
UltraTech Cement: Global Leader Overview
No ratings yet
UltraTech Cement: Global Leader Overview
32 pages
ModelDPR HoneyProcessing
No ratings yet
ModelDPR HoneyProcessing
29 pages
Architectural Design Services Contract: November
No ratings yet
Architectural Design Services Contract: November
2 pages
CEMO - Pump Catalogue
No ratings yet
CEMO - Pump Catalogue
28 pages
Ball and Beam Dynamics - Full Model
No ratings yet
Ball and Beam Dynamics - Full Model
9 pages
Technical Specifications Mysql
No ratings yet
Technical Specifications Mysql
2 pages
Still Feel Unwell After Helicobacter Pylori Treatment Page 8 Duodenal Ulcer Forums
No ratings yet
Still Feel Unwell After Helicobacter Pylori Treatment Page 8 Duodenal Ulcer Forums
3 pages
Lecture 1 B
No ratings yet
Lecture 1 B
49 pages
RFP Tender Uploading and Opening
No ratings yet
RFP Tender Uploading and Opening
31 pages
Engineering Vol 72 1901-08-02
No ratings yet
Engineering Vol 72 1901-08-02
35 pages
Settlement in Suchodolski v. Poland
No ratings yet
Settlement in Suchodolski v. Poland
3 pages
All Inquiry
No ratings yet
All Inquiry
18 pages
GSF35-2 PU: Part. No.: 3410.0345
No ratings yet
GSF35-2 PU: Part. No.: 3410.0345
2 pages
Counters and Registers
No ratings yet
Counters and Registers
76 pages
2018 East End
No ratings yet
2018 East End
7 pages
Engineering Measurements - Methods and Intrinsic Errors - WILLEY PDF
No ratings yet
Engineering Measurements - Methods and Intrinsic Errors - WILLEY PDF
195 pages
T.Rex - Request To Exit Detector: Data Sheet
No ratings yet
T.Rex - Request To Exit Detector: Data Sheet
4 pages
Discharge Nozzle 360
No ratings yet
Discharge Nozzle 360
2 pages
MSDS AmmoniumThiosulfate
No ratings yet
MSDS AmmoniumThiosulfate
6 pages
Visio for Enterprise Architecture at TU/e
No ratings yet
Visio for Enterprise Architecture at TU/e
9 pages
Anchor & Mooring
No ratings yet
Anchor & Mooring
3 pages
Splines Toolbox - Version 2
No ratings yet
Splines Toolbox - Version 2
112 pages
Model Answers for Judicial Translator Exam
No ratings yet
Model Answers for Judicial Translator Exam
8 pages
Residential Property Lease Deed Draft
No ratings yet
Residential Property Lease Deed Draft
6 pages

1 - Sentiment - Analysis - NLP - Ipynb - Codes Only

Uploaded by

1 - Sentiment - Analysis - NLP - Ipynb - Codes Only

Uploaded by

3/26/24, 4:07 PM 1_sentiment_analysis_nlp.

keyboard_arrow_down Mounting the drive

keyboard_arrow_down Importing the dataset

data = pd.read_csv('amazon_alexa.tsv', sep='\t')

mydata = data[['verified_reviews','feedback']] #relevant columns

#checking the distribution of label columnn

# Count the occurrences of each label

# Get the number of rows to drop from the majority class

# Drop rows from the majority class randomly

# Check the new class balance

The dataset above is balanced.

keyboard_arrow_down Data preprocessing

# to remove single characters

# to remove HTML tags

# to lowercase the text

# to remove extra whitespace

# to trim leading and trailing spaces

# extracting the review colum as a list

# Cleaning the text in the list made

# Adding the cleaned reviews as a new column to the datafrae

# Assuming your DataFrame is called "df"

# Randomly sample train_size rows for the training set

# Get the remaining rows for the test set

keyboard_arrow_down Sentiment analysis using LLM

import google.generativeai as genai

from IPython.display import display

# Used to securely store your API key

# Using `os.getenv('GOOGLE_API_KEY')` to fetch an environment variable.

#we will be using the gemini pro model

test_set_sample = test_set.sample(20) #taking random 20 dataset from test dataset

test_set_sample['pred_label'] = '' #creating a column pred_label for our predicted feedback

# Convert the DataFrame to JSON using the to_json() method

# Print the JSON data

#feeding the prompt to gemini model to generate the content

# Clean the data by stripping the backticks

# Load the cleaned data and convert to DataFrame

#Overwrite pred_label from 'df' into pred_label in 'train_set_sample'

# Plotting confusion matrix on the predictions

from sklearn.metrics import confusion_matrix

keyboard_arrow_down Batching GEMINI API calls

for i in range(0, len(test_set_total), batch_size):

print(f"Now processing batch#: {current_batch+1} of {total_batch}")

df_total = pd.DataFrame() # Initialize an empty DataFrame

for response in responses:

# Load the cleaned data and convert to DataFrame

# Append the DataFrame to the final DataFrame

print(df_total) # Display the final DataFrame

#Overwriting the pred_label from 'df' into pred_label in 'train_set_sample'

# Plotting confusion matrix on the predictions

from sklearn.metrics import confusion_matrix, accuracy_score

You might also like