0% found this document useful (0 votes)

11 views

1 - Sentiment - Analysis - NLP - Ipynb - Codes Only

Uploaded by

racoon97970301

Available Formats

Download as PDF, TXT or read online on Scribd

0% found this document useful (0 votes)

11 views

1 - Sentiment - Analysis - NLP - Ipynb - Codes Only

Uploaded by

racoon97970301

Available Formats

Download as PDF, TXT or read online on Scribd

You are on page 1/ 5

3/26/24, 4:07 PM 1_sentiment_analysis_nlp.ipynb - Colaboratory

keyboard_arrow_down Mounting the drive

# Mount Google Drive at /content/drive (uses the Colab-only `google.colab` module).
from google.colab import drive
drive.mount('/content/drive')

%cd /content/drive/MyDriveb/nlp_project
!ls #checking if files are there or not

keyboard_arrow_down Importing the dataset

import pandas as pd

# Load the tab-separated review file into a DataFrame.
data = pd.read_csv('amazon_alexa.tsv', delimiter='\t')

data.head(10)  # preview; per the original note the dataset carries a binary label

# Keep only the review text and the feedback flag, under shorter column names.
mydata = data[['verified_reviews', 'feedback']].rename(
    columns={'verified_reviews': 'review', 'feedback': 'label'}
)

mydata.head()

# Check how the rows are distributed over the label column.
mydata.value_counts('label')

As can be seen above, the dataset is imbalanced, so we will use random undersampling of the majority class to balance it.

# Balance the label classes by randomly undersampling the majority class.

# Count the occurrences of each label.
label_counts = mydata["label"].value_counts()

# Number of surplus rows the majority class has over the minority class.
rows_to_drop = label_counts.max() - label_counts.min()

if rows_to_drop > 0:
    # Look the majority label up instead of assuming it is 1: with a
    # hard-coded label the code would try to sample more rows than exist
    # whenever the other class happens to be the larger one.
    majority_label = label_counts.idxmax()
    data_majority = mydata[mydata["label"] == majority_label]
    # Drop a random subset of majority rows, selected by index label.
    data_balanced = mydata.drop(data_majority.sample(rows_to_drop).index)
else:
    # Already balanced: work on a copy so later column additions do not
    # touch `mydata`.
    data_balanced = mydata.copy()

# Check the new class balance.
print(data_balanced["label"].value_counts())

The dataset above is balanced.

keyboard_arrow_down Data preprocessing

#defining a function to clean the dataset
import re

def clean_text(text):
    """Normalise a raw review into lowercase, punctuation-free text.

    Steps, in order: strip HTML tags, replace punctuation/special characters
    with spaces, drop stand-alone single ASCII letters, lowercase, collapse
    runs of whitespace, and trim both ends.

    Args:
        text: The raw review. Anything that is not a ``str`` (for example
            ``None`` or a float NaN from a missing cell) is treated as an
            empty review.

    Returns:
        The cleaned string; ``""`` for empty or non-string input.
    """
    # re.sub raises TypeError on non-strings, so treat missing values as empty.
    if not isinstance(text, str):
        return ""

    # HTML tags must go first: once punctuation is removed the angle brackets
    # are gone and the tag names would be left behind as ordinary words.
    # The pattern requires a letter after "<" (or "</") so that comparisons
    # such as "5 < 10 and 10 > 5" are not mistaken for a tag.
    text = re.sub(r"</?[A-Za-z][^<>]*>", " ", text)

    # Replace special characters and punctuation with spaces.
    text = re.sub(r"[^\w\s]", " ", text)

    # Remove stand-alone single letters (e.g. "I", "a", or a stray "s").
    text = re.sub(r"\b[a-zA-Z]\b", " ", text)

    text = text.lower()

    # Collapse whitespace runs, then trim leading/trailing spaces.
    text = re.sub(r"\s+", " ", text)
    return text.strip()

https://colab.research.google.com/drive/1v0c7kmDSBApUGFTq0mibaidimGca-PWg?authuser=6#scrollTo=Rx1NfrvIxcyP&printMode=true 1/5
3/26/24, 4:07 PM 1_sentiment_analysis_nlp.ipynb - Colaboratory

import pandas as pd

# Pull the raw review text out of the balanced frame as a plain Python list.
reviews = data_balanced['review'].to_list()

# Run every review through clean_text.
cleaned_reviews = list(map(clean_text, reviews))

# Store the cleaned text next to the original in a new column of the DataFrame.
data_balanced['clean_reviews'] = cleaned_reviews

data_balanced

keyboard_arrow_down Splitting the dataset into 5% training and 95% test dataset
import pandas as pd

# Hold out 95% of the balanced rows as the test set; the rest form the training set.
total_rows = data_balanced.shape[0]
test_size = int(total_rows * 0.95)

# Randomly draw `test_size` rows for the test set.
test_set = data_balanced.sample(n=test_size)

# Every row that was not drawn for the test set goes to the training set.
train_set = data_balanced.drop(index=test_set.index)

keyboard_arrow_down Sentiment analysis using LLM

keyboard_arrow_down Setting up GEMINI API
!pip install -q -U google-generativeai

# Necessary packages
import pathlib
import textwrap

import google.generativeai as genai

from IPython.display import display

from IPython.display import Markdown

def to_markdown(text):
    """Wrap model output in a Markdown block quote for notebook display.

    Bullet characters ('•') are rewritten as ' *', then every line,
    including blank ones, is prefixed with '> '.
    """
    bulleted = text.replace('•', ' *')
    # The always-true predicate makes textwrap.indent prefix empty lines too.
    quoted = textwrap.indent(bulleted, '> ', predicate=lambda _line: True)
    return Markdown(quoted)

# Used to securely store your API key

from google.colab import userdata

# Read the API key from Colab's user-data store instead of hard-coding it.
GOOGLE_API_KEY = userdata.get('GOOGLE_API_KEY')

genai.configure(api_key=GOOGLE_API_KEY)

# Print the name of every model that lists 'generateContent' among its
# supported generation methods.
for model_info in genai.list_models():
    if 'generateContent' not in model_info.supported_generation_methods:
        continue
    print(model_info.name)

# The rest of the notebook uses the Gemini Pro model.
model = genai.GenerativeModel('gemini-pro')

%%time
# Smoke-test the configured model with a free-form question.
response = model.generate_content("how great is MS Dhoni?")

# Display the reply text as a Markdown block quote.
to_markdown(response.text)

keyboard_arrow_down Integrating the Gemini pro API to our sentiment analysis task
https://colab.research.google.com/drive/1v0c7kmDSBApUGFTq0mibaidimGca-PWg?authuser=6#scrollTo=Rx1NfrvIxcyP&printMode=true 2/5
3/26/24, 4:07 PM 1_sentiment_analysis_nlp.ipynb - Colaboratory

# Draw 20 random rows from the test set and add an empty 'pred_label'
# column that the model is later asked to fill in.
test_set_sample = test_set.sample(n=20).assign(pred_label='')

test_set_sample

# Serialise only the cleaned text and the empty prediction slot as a JSON
# array of records.
payload_columns = ['clean_reviews', 'pred_label']
json_data = test_set_sample[payload_columns].to_json(orient='records')

# Show the JSON payload.
print(json_data)

# Build the classification prompt: the instructions, followed by the JSON
# records (`json_data`) wrapped in a triple-backtick fence. The model is asked
# to return the same JSON with 'pred_label' filled in (1 = Positive, 0 = Negative).
prompt = f"""
You are an expert linguist, who is good at classifying customer review sentiments into Positive/Negative labels.
Help me classify customer reviews into: Positive(label=1), and Negative(label=0).
Customer reviews are provided between three back ticks.
In your output, only return the Json code back as output - which is provided between three backticks.
Your task is to update predicted labels under 'pred_label' in the Json code.
Don't make any changes to Json code format, please.

```
{json_data}
```
"""

print(prompt)

# Send the prompt to the Gemini model and show the raw reply.
response = model.generate_content(prompt)

print(response.text)

import json

# The reply is expected inside a Markdown code fence, which may carry a
# language tag (```json ... ```) and surrounding whitespace. Stripping only
# the backticks would leave "json\n[...]" behind and make json.loads fail,
# so remove whitespace, the fence, and an optional leading "json" tag.
json_data = response.text.strip().strip("`").strip()
if json_data[:4].lower() == "json":
    json_data = json_data[4:]

# Parse the cleaned payload and convert the records to a DataFrame.
data = json.loads(json_data)
df_sample = pd.DataFrame(data)

df_sample

# Copy the model's predictions from `df_sample` onto `test_set_sample`.
# `.values` assigns by position, so this relies on the model returning the
# rows in the order they were sent. The predictions travelled through JSON
# and may come back as strings ("0"/"1"); cast them to int so they are
# comparable with the integer `label` column (scikit-learn rejects a mix of
# string and numeric labels).
test_set_sample['pred_label'] = df_sample['pred_label'].astype(int).values
test_set_sample

# Confusion matrix of true labels against predicted labels.

from sklearn.metrics import confusion_matrix

y_true = test_set_sample["label"]
y_pred = test_set_sample["pred_label"]

confusion_matrix(y_true, y_pred)

keyboard_arrow_down Batching GEMINI API calls

test_set.shape

# Draw 100 random test rows and give them an empty 'pred_label' column to be
# filled from the model's replies.
test_set_total = test_set.sample(n=100).assign(pred_label='')

test_set_total

# Split the sampled rows into consecutive slices of at most `batch_size` rows.
batch_size = 25
batches = [
    test_set_total[start : start + batch_size]
    for start in range(0, len(test_set_total), batch_size)
]

https://colab.research.google.com/drive/1v0c7kmDSBApUGFTq0mibaidimGca-PWg?authuser=6#scrollTo=Rx1NfrvIxcyP&printMode=true 3/5
3/26/24, 4:07 PM 1_sentiment_analysis_nlp.ipynb - Colaboratory

import time

def gemini_completion_function(batch,current_batch,total_batch):
    """Ask the Gemini model to label one batch of cleaned reviews.

    The batch's 'clean_reviews' and 'pred_label' columns are serialised to a
    JSON array of records, embedded in the classification prompt, and sent
    to the module-level `model`.

    Args:
        batch: DataFrame slice holding 'clean_reviews' and 'pred_label'.
        current_batch: Zero-based position of this batch (printed as 1-based).
        total_batch: Total number of batches, used only in the progress line.

    Returns:
        The response object returned by `model.generate_content`.
    """
    print(f"Now processing batch#: {current_batch+1} of {total_batch}")

    records_json = batch[['clean_reviews','pred_label']].to_json(orient='records')

    prompt = f"""You are an expert linguist, who is good at classifying customer review sentiments into Positive/Negative labels.
Help me classify customer reviews into: Positive(label=1), and Negative(label=0).
Customer reviews are provided between three backticks below.
In your output, only return the Json code back as output - which is provided between three backticks.
Your task is to update predicted labels under 'pred_label' in the Json code.
Don't make any changes to Json code format, please.
Error handling instruction: In case a Customer Review violates API policy, please assign it default sentiment as Negative (label=0).

```
{records_json}
```
"""

    print(prompt)
    reply = model.generate_content(prompt)
    # Pause 5 seconds after each call -- presumably to stay under an API
    # rate limit; confirm against the quota in use.
    time.sleep(5)

    return reply

# Send each batch to the model in order and keep the raw response objects.
batch_count = len(batches)
responses = []

for i, batch in enumerate(batches):
    responses.append(gemini_completion_function(batch, i, batch_count))

import json

# Parse every batch reply into its own DataFrame, then join them once.
# DataFrame.append was removed in pandas 2.0 (and copied the whole frame on
# each call); collecting the pieces and calling pd.concat replaces it.
frames = []

for response in responses:

    # The reply is expected inside a Markdown code fence that may carry a
    # language tag (```json ... ```); strip whitespace, the fence and the
    # optional tag so that json.loads sees only the JSON array.
    json_data = response.text.strip().strip("`").strip()
    if json_data[:4].lower() == "json":
        json_data = json_data[4:]

    # Load the cleaned payload and convert it to a DataFrame.
    data = json.loads(json_data)
    df_temp = pd.DataFrame(data)

    frames.append(df_temp)

# pd.concat raises on an empty list, so fall back to an empty frame.
df_total = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

print(df_total)  # Display the final DataFrame

# Copy the model's predictions from `df_total` onto `test_set_total`.
# `.values` assigns by position, so this relies on the batches coming back
# complete and in the order they were sent. Predictions may arrive as
# strings ("0"/"1") after the JSON round trip; cast to int so they match the
# integer `label` column (scikit-learn rejects mixed string/number labels).
test_set_total['pred_label'] = df_total['pred_label'].astype(int).values
test_set_total

# Confusion matrix and accuracy of the predictions.

from sklearn.metrics import confusion_matrix, accuracy_score

y_true = test_set_total["label"]
y_pred = test_set_total["pred_label"]

print(confusion_matrix(y_true, y_pred))
print(f"\nAccuracy: {accuracy_score(y_true, y_pred)}")

https://colab.research.google.com/drive/1v0c7kmDSBApUGFTq0mibaidimGca-PWg?authuser=6#scrollTo=Rx1NfrvIxcyP&printMode=true 4/5
3/26/24, 4:07 PM 1_sentiment_analysis_nlp.ipynb - Colaboratory

https://colab.research.google.com/drive/1v0c7kmDSBApUGFTq0mibaidimGca-PWg?authuser=6#scrollTo=Rx1NfrvIxcyP&printMode=true 5/5

Parallel Concurrent Processing Failover and Load Balancing of E-Business Suite Release 11i and Release 12 Mike Swing, TruTek PDF
100% (1)
Parallel Concurrent Processing Failover and Load Balancing of E-Business Suite Release 11i and Release 12 Mike Swing, TruTek PDF
52 pages
Writing Field and Technical Reports
No ratings yet
Writing Field and Technical Reports
125 pages
code
No ratings yet
code
13 pages
Email Spam Classifier
No ratings yet
Email Spam Classifier
22 pages
Data analytics assignment solutions
No ratings yet
Data analytics assignment solutions
20 pages
SHASHANK ML.docx
No ratings yet
SHASHANK ML.docx
23 pages
Abhiml ML File
No ratings yet
Abhiml ML File
74 pages
Institute of Management Technology, Ghaziabad End Term Exam (Term - VII) Take Home Exam (Time Duration: 2.30 HRS) Batch 2019 - 21 Answer-Sheet
No ratings yet
Institute of Management Technology, Ghaziabad End Term Exam (Term - VII) Take Home Exam (Time Duration: 2.30 HRS) Batch 2019 - 21 Answer-Sheet
18 pages
House Pricing
No ratings yet
House Pricing
15 pages
Emp at Tricode
No ratings yet
Emp at Tricode
6 pages
17 Ensemble Techniques Problem Statement
No ratings yet
17 Ensemble Techniques Problem Statement
28 pages
Coe Projects
No ratings yet
Coe Projects
7 pages
Cabico Tan
No ratings yet
Cabico Tan
11 pages
ML Remaining
No ratings yet
ML Remaining
17 pages
Part I
No ratings yet
Part I
12 pages
ML1
No ratings yet
ML1
6 pages
Data Analytics Program
No ratings yet
Data Analytics Program
11 pages
a
No ratings yet
a
2 pages
Correction
No ratings yet
Correction
3 pages
My_own_cheatsheet
No ratings yet
My_own_cheatsheet
13 pages
SMA EXP 10 CODE PRINT
No ratings yet
SMA EXP 10 CODE PRINT
7 pages
Lab 1
No ratings yet
Lab 1
3 pages
New Chat: 1. Predicting Uber Ride Prices
No ratings yet
New Chat: 1. Predicting Uber Ride Prices
16 pages
ML 1-10
No ratings yet
ML 1-10
53 pages
TP - JEUX - ML - Corrigé
No ratings yet
TP - JEUX - ML - Corrigé
5 pages
ML MANUAL WITH OUTPUTS (2)
No ratings yet
ML MANUAL WITH OUTPUTS (2)
30 pages
Adaboost
No ratings yet
Adaboost
2 pages
Import Pandas As PD DF PD - Read - CSV ("Titanic - Train - CSV") DF - Head
No ratings yet
Import Pandas As PD DF PD - Read - CSV ("Titanic - Train - CSV") DF - Head
20 pages
ML Internal questions
No ratings yet
ML Internal questions
15 pages
Lab Report 8
No ratings yet
Lab Report 8
11 pages
BI 8
No ratings yet
BI 8
3 pages
AIL303 M
No ratings yet
AIL303 M
22 pages
unit 3 5
No ratings yet
unit 3 5
4 pages
MACHINE LEARNING manual
No ratings yet
MACHINE LEARNING manual
36 pages
Additional Program
No ratings yet
Additional Program
573 pages
ML Practical 205160694034
No ratings yet
ML Practical 205160694034
33 pages
Activity 4 CGPA Vs Placement Package Program
No ratings yet
Activity 4 CGPA Vs Placement Package Program
4 pages
healthcare-project-simplilearn- Week3
No ratings yet
healthcare-project-simplilearn- Week3
7 pages
AI ML - Cycle 2 Programs (1)
No ratings yet
AI ML - Cycle 2 Programs (1)
15 pages
Data Science Lab Manual
No ratings yet
Data Science Lab Manual
32 pages
sentiment analysis using LSTM (1)
No ratings yet
sentiment analysis using LSTM (1)
5 pages
21b-200-SE_LW04
No ratings yet
21b-200-SE_LW04
4 pages
Tips_for_Testing_in_Python_1646539645
No ratings yet
Tips_for_Testing_in_Python_1646539645
23 pages
dl lab prog 2
No ratings yet
dl lab prog 2
2 pages
main.py (1)
No ratings yet
main.py (1)
10 pages
NLP Lab
No ratings yet
NLP Lab
18 pages
Ml Solution
No ratings yet
Ml Solution
60 pages
AML_code_for_m2
No ratings yet
AML_code_for_m2
7 pages
Machine Learning Code Explanation
No ratings yet
Machine Learning Code Explanation
33 pages
ML Foram
No ratings yet
ML Foram
17 pages
ml record
No ratings yet
ml record
21 pages
ANLY 502 Final Report
No ratings yet
ANLY 502 Final Report
7 pages
Edx Course Lab Programs
No ratings yet
Edx Course Lab Programs
19 pages
A2 Vishal Borra
No ratings yet
A2 Vishal Borra
2 pages
DMT Cia2
No ratings yet
DMT Cia2
11 pages
bot
No ratings yet
bot
1 page
DL Lab Manual
100% (1)
DL Lab Manual
35 pages
Report
No ratings yet
Report
24 pages
Exp4(Linear Regression)
No ratings yet
Exp4(Linear Regression)
2 pages
Multi Classification.py(for 1 Class Tp,Tn,Fp,Fn)
No ratings yet
Multi Classification.py(for 1 Class Tp,Tn,Fp,Fn)
25 pages
Code
No ratings yet
Code
4 pages
Angular Generative AI: Building an intelligent CV enhancer with Google Gemini
From Everand
Angular Generative AI: Building an intelligent CV enhancer with Google Gemini
Abdelfattah Ragab
No ratings yet
Comparativestudy
No ratings yet
Comparativestudy
12 pages
DLP 1
No ratings yet
DLP 1
8 pages
MS 3
No ratings yet
MS 3
16 pages
Project Report CB
No ratings yet
Project Report CB
34 pages
Python Tkinter Programs For Students
No ratings yet
Python Tkinter Programs For Students
11 pages
Bealajar Menghafal Irregular Verb
No ratings yet
Bealajar Menghafal Irregular Verb
3 pages
PDF Levi Strauss Anthropology and Aesthetics 1st Edition Boris Wiseman download
100% (10)
PDF Levi Strauss Anthropology and Aesthetics 1st Edition Boris Wiseman download
71 pages
Spectrum Analyzer PDF
No ratings yet
Spectrum Analyzer PDF
18 pages
Chartering 3789
No ratings yet
Chartering 3789
2 pages
IDC - Syllabus - Fundamentals of Comp SC
No ratings yet
IDC - Syllabus - Fundamentals of Comp SC
2 pages
AI Systems and Definitions
No ratings yet
AI Systems and Definitions
82 pages
Bengali 128 Updated Team 24x7offshoring)
No ratings yet
Bengali 128 Updated Team 24x7offshoring)
4 pages
R - 04 X 1.PGM - Oxi Programming Example
No ratings yet
R - 04 X 1.PGM - Oxi Programming Example
7 pages
Kpop Dance Competition Smartfren Proposal
No ratings yet
Kpop Dance Competition Smartfren Proposal
6 pages
MPU 3273/ LANG 2128/ BLC 221: Professional Communication
No ratings yet
MPU 3273/ LANG 2128/ BLC 221: Professional Communication
17 pages
Gesture Singing
100% (1)
Gesture Singing
14 pages
Landmarks
No ratings yet
Landmarks
71 pages
12th Physics Unit Test 2 Question Bank Science 2024 25
No ratings yet
12th Physics Unit Test 2 Question Bank Science 2024 25
7 pages
Curriculum Map Part 2-1
No ratings yet
Curriculum Map Part 2-1
10 pages
Voyage Manager User Guide Ver1.0
No ratings yet
Voyage Manager User Guide Ver1.0
19 pages
Bus Organization of 8085 Microprocessor
No ratings yet
Bus Organization of 8085 Microprocessor
6 pages
Organizational Behavior: Chapter No 11 Communication
No ratings yet
Organizational Behavior: Chapter No 11 Communication
27 pages
An Introduction by Kamala Das
No ratings yet
An Introduction by Kamala Das
4 pages
STP Permission Form Axler
No ratings yet
STP Permission Form Axler
1 page
Fork Dash
No ratings yet
Fork Dash
8 pages
A Radical Approach to Real Analysis 2nd Edition David Bressoud all chapter instant download
100% (6)
A Radical Approach to Real Analysis 2nd Edition David Bressoud all chapter instant download
58 pages
CHP 3 Micro Structure
No ratings yet
CHP 3 Micro Structure
6 pages
AMANDA
No ratings yet
AMANDA
6 pages