0% found this document useful (0 votes)
11 views

1 - Sentiment - Analysis - NLP - Ipynb - Codes Only

Uploaded by

racoon97970301
Copyright
© © All Rights Reserved
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
11 views

1 - Sentiment - Analysis - NLP - Ipynb - Codes Only

Uploaded by

racoon97970301
Copyright
© © All Rights Reserved
Available Formats
Download as PDF, TXT or read online on Scribd
You are on page 1/ 5

3/26/24, 4:07 PM 1_sentiment_analysis_nlp.ipynb - Colaboratory

keyboard_arrow_down Mounting the drive


from google.colab import drive
drive.mount('/content/drive')

%cd /content/drive/MyDriveb/nlp_project
!ls #checking if files are there or not

keyboard_arrow_down Importing the dataset


import pandas as pd

# Read the tab-separated amazon_alexa.tsv file into a DataFrame
data = pd.read_csv('amazon_alexa.tsv', sep='\t')

data.head(10) #dataset is labelled in a binary format

# Keep only the two relevant columns and give them shorter names in one step
column_names = {'verified_reviews': 'review', 'feedback': 'label'}
mydata = data[list(column_names)].rename(columns=column_names)

mydata.head()

# Check the distribution of the label column
mydata.value_counts('label')

As can be seen above, the dataset is imbalanced, so we will undersample the majority class to balance it.

# Count the occurrences of each label
label_counts = mydata["label"].value_counts()

# Number of rows to remove so the largest class shrinks to the size of the smallest
rows_to_drop = label_counts.max() - label_counts.min()

if rows_to_drop > 0:
    # Take the majority class from the counts instead of assuming it is
    # label 1, so balancing still works if the class ratio is reversed.
    # NOTE: this balances exactly two classes; with more classes only the
    # largest one is reduced.
    majority_label = label_counts.idxmax()
    data_majority = mydata[mydata["label"] == majority_label]

    # Fixed seed so the same rows are dropped on every run (reproducible results)
    rows_removed = data_majority.sample(rows_to_drop, random_state=42)
    data_balanced = mydata.drop(rows_removed.index)
else:
    # Already balanced: work on a copy so later column additions do not touch mydata
    data_balanced = mydata.copy()

# Check the new class balance
print(data_balanced["label"].value_counts())

The dataset above is balanced.

keyboard_arrow_down Data preprocessing


# Text-cleaning helper applied to every review before it is sent to the model
import re

# Patterns are compiled once because clean_text is called for every review.
_HTML_TAG_RE = re.compile(r"</?[a-zA-Z][^>]*>")   # <tag ...>, </tag>, <tag/>
_PUNCTUATION_RE = re.compile(r"[^\w\s]")          # anything that is not a word char or whitespace
_SINGLE_LETTER_RE = re.compile(r"\b[a-zA-Z]\b")   # stand-alone single letters
_WHITESPACE_RE = re.compile(r"\s+")


def clean_text(text):
    """Normalise a raw review string for sentiment classification.

    Steps, in order: remove HTML tags, replace punctuation/special characters
    with spaces, drop stand-alone single letters, lowercase, collapse runs of
    whitespace and trim the ends.

    Args:
        text: The raw review. Non-string values (e.g. ``None`` or the NaN that
            pandas uses for a missing review) are treated as an empty review.

    Returns:
        The cleaned, lowercased text; ``""`` when there is nothing left.
    """
    if not isinstance(text, str):
        return ""

    # HTML tags must be removed BEFORE punctuation: once "<" and ">" have been
    # stripped the tag pattern can no longer match and the tag names would
    # survive as ordinary words.
    text = _HTML_TAG_RE.sub(" ", text)

    # to remove special characters and punctuation
    text = _PUNCTUATION_RE.sub(" ", text)

    # to remove single characters
    text = _SINGLE_LETTER_RE.sub(" ", text)

    # to lowercase the text
    text = text.lower()

    # to remove extra whitespace, then trim leading and trailing spaces
    text = _WHITESPACE_RE.sub(" ", text)
    return text.strip()

https://colab.research.google.com/drive/1v0c7kmDSBApUGFTq0mibaidimGca-PWg?authuser=6#scrollTo=Rx1NfrvIxcyP&printMode=true 1/5
3/26/24, 4:07 PM 1_sentiment_analysis_nlp.ipynb - Colaboratory

import pandas as pd

# Pull the review column out as a plain Python list
reviews = data_balanced['review'].tolist()

# Run every review through clean_text, preserving the original order
cleaned_reviews = list(map(clean_text, reviews))

# Store the cleaned reviews as a new column of the dataframe
data_balanced['clean_reviews'] = cleaned_reviews

data_balanced

keyboard_arrow_down Splitting the dataset into 5% training and 95% test dataset
import pandas as pd

# 95% of the balanced data is held out for testing
total_rows = len(data_balanced)
test_size = int(total_rows * 0.95)

# Randomly sample test_size rows for the TEST set.
# A fixed seed makes the split reproducible across runs.
test_set = data_balanced.sample(test_size, random_state=42)

# The remaining rows (about 5%) form the TRAINING set
train_set = data_balanced.drop(test_set.index)

keyboard_arrow_down Sentiment analysis using LLM


keyboard_arrow_down Setting up GEMINI API
!pip install -q -U google-generativeai

# Necessary packages
import pathlib
import textwrap

import google.generativeai as genai

from IPython.display import display


from IPython.display import Markdown

def to_markdown(text):
    """Wrap model output in an IPython ``Markdown`` object rendered as a block quote.

    Bullet characters are rewritten to Markdown list markers and every line
    (including blank ones) is prefixed with ``> ``.
    """
    bulleted = text.replace('•', ' *')
    # The predicate always returns True so that empty lines get the prefix too
    quoted = textwrap.indent(bulleted, '> ', predicate=lambda _line: True)
    return Markdown(quoted)

# Colab's userdata module is used to read the securely stored API key
from google.colab import userdata

# Fetch the value saved under the name 'GOOGLE_API_KEY' via userdata.get()
# (this does NOT read an environment variable) ...
GOOGLE_API_KEY=userdata.get('GOOGLE_API_KEY')

# ... and hand it to the google.generativeai client
genai.configure(api_key=GOOGLE_API_KEY)

# Print the name of every available model that supports generateContent
for m in genai.list_models():
    if 'generateContent' not in m.supported_generation_methods:
        continue
    print(m.name)

# We will be using the gemini pro model
model = genai.GenerativeModel('gemini-pro')

%%time
# Smoke test: send one free-form question to the model before using it for classification
response = model.generate_content("how great is MS Dhoni?")

# Render the reply text as block-quoted Markdown
to_markdown(response.text)

keyboard_arrow_down Integrating the Gemini pro API to our sentiment analysis task
https://colab.research.google.com/drive/1v0c7kmDSBApUGFTq0mibaidimGca-PWg?authuser=6#scrollTo=Rx1NfrvIxcyP&printMode=true 2/5
3/26/24, 4:07 PM 1_sentiment_analysis_nlp.ipynb - Colaboratory

# Draw 20 random rows from the test set for a first trial run
test_set_sample = test_set.sample(20)

# Empty placeholder column that will receive the predicted feedback
test_set_sample['pred_label'] = ''

test_set_sample

# Serialise only the columns the model needs, as a JSON array of records
payload_columns = ['clean_reviews', 'pred_label']
json_data = test_set_sample[payload_columns].to_json(orient='records')

# Show the JSON payload
print(json_data)

# Build the classification prompt. The JSON payload in json_data is
# interpolated between the triple-backtick fences at the end of the prompt.
prompt = f"""
You are an expert linguist, who is good at classifying customer review sentiments into Positive/Negative labels.
Help me classify customer reviews into: Positive(label=1), and Negative(label=0).
Customer reviews are provided between three back ticks.
In your output, only return the Json code back as output - which is provided between three backticks.
Your task is to update predicted labels under 'pred_label' in the Json code.
Don't make any changes to Json code format, please.

```
{json_data}
```
"""

# Show the final prompt exactly as it will be sent
print(prompt)

# Feed the prompt to the Gemini model to generate the content
response = model.generate_content(prompt)

# Raw model reply; it is parsed as JSON in the next step
print(response.text)

import json

# The model is asked to return the JSON between triple backticks, but replies
# often carry a language tag (```json) or whitespace after the closing fence.
# Stripping only backticks leaves that residue behind and json.loads then
# fails, so cut out the outermost JSON array instead.
raw_text = response.text
array_start = raw_text.find("[")
array_end = raw_text.rfind("]")
if array_start == -1 or array_end < array_start:
    raise ValueError(f"No JSON array found in model response: {raw_text!r}")
json_data = raw_text[array_start : array_end + 1]

# Load the cleaned data and convert to DataFrame
# NOTE: this rebinds `data`, which previously held the raw dataset.
data = json.loads(json_data)
df_sample = pd.DataFrame(data)

df_sample

# Copy the predicted labels from df_sample into test_set_sample.
# .values assigns by position, so this assumes the model returned the records
# in the order they were sent (verify if results look off).
# The model may return labels as strings ("1"/"0"); cast to int so they are
# comparable with the integer ground-truth labels. A non-numeric label raises
# here instead of silently corrupting the metrics.
test_set_sample['pred_label'] = df_sample['pred_label'].astype(int).values
test_set_sample

# Confusion matrix on the predictions
from sklearn.metrics import confusion_matrix

y_true = test_set_sample["label"]
y_pred = test_set_sample["pred_label"]

confusion_matrix(y_true, y_pred)

keyboard_arrow_down Batching GEMINI API calls


test_set.shape

# Work on a random sample of 100 test rows
test_set_total = test_set.sample(100)

# Empty placeholder column for the predicted labels
test_set_total['pred_label'] = ''

test_set_total

# Cut the sample into consecutive row slices of batch_size rows each
batch_size = 25
batches = [
    test_set_total[start : start + batch_size]
    for start in range(0, len(test_set_total), batch_size)
]

https://colab.research.google.com/drive/1v0c7kmDSBApUGFTq0mibaidimGca-PWg?authuser=6#scrollTo=Rx1NfrvIxcyP&printMode=true 3/5
3/26/24, 4:07 PM 1_sentiment_analysis_nlp.ipynb - Colaboratory

import time

def gemini_completion_function(batch, current_batch, total_batch, delay_seconds=5):
    """Send one batch of reviews to the Gemini model for sentiment labelling.

    Works in three steps:
      1. Convert the batch's 'clean_reviews' and 'pred_label' columns to JSON.
      2. Build the classification prompt around that JSON.
      3. Call the module-level ``model`` with the prompt.

    Args:
        batch: DataFrame slice containing 'clean_reviews' and 'pred_label' columns.
        current_batch: Zero-based index of this batch (used for progress output only).
        total_batch: Total number of batches (used for progress output only).
        delay_seconds: Seconds to wait after the API call before returning.
            Defaults to 5, the pause that was previously hard-coded.

    Returns:
        The response object returned by ``model.generate_content``.
    """
    print(f"Now processing batch#: {current_batch+1} of {total_batch}")

    json_data = batch[['clean_reviews','pred_label']].to_json(orient='records')

    # The prompt lines stay flush-left: indenting them would change the text sent to the model.
    prompt = f"""You are an expert linguist, who is good at classifying customer review sentiments into Positive/Negative labels.
Help me classify customer reviews into: Positive(label=1), and Negative(label=0).
Customer reviews are provided between three backticks below.
In your output, only return the Json code back as output - which is provided between three backticks.
Your task is to update predicted labels under 'pred_label' in the Json code.
Don't make any changes to Json code format, please.
Error handling instruction: In case a Customer Review violates API policy, please assign it default sentiment as Negative (label=0).

```
{json_data}
```
"""

    print(prompt)
    response = model.generate_content(prompt)
    # Pause between consecutive calls (presumably to stay under the API rate limit)
    time.sleep(delay_seconds)

    return response

batch_count = len(batches)
responses = []

# Process the batches in order; responses collected so far are kept if a later call fails
for batch_index, batch in enumerate(batches):
    responses.append(gemini_completion_function(batch, batch_index, batch_count))

import json


def _extract_json_array(text):
    """Return the outermost JSON array embedded in a model reply.

    Replies are expected to be fenced with triple backticks, but may also
    carry a language tag (```json) or surrounding whitespace; slicing from the
    first "[" to the last "]" tolerates all of these.

    Raises:
        ValueError: if the reply contains no JSON array.
    """
    start = text.find("[")
    end = text.rfind("]")
    if start == -1 or end < start:
        raise ValueError(f"No JSON array found in model response: {text!r}")
    return text[start : end + 1]


# Parse every batch response into its own DataFrame
batch_frames = []

for response in responses:
    json_data = _extract_json_array(response.text)

    # Load the cleaned data and convert to DataFrame
    data = json.loads(json_data)
    df_temp = pd.DataFrame(data)
    batch_frames.append(df_temp)

# DataFrame.append was removed in pandas 2.0; concatenate once at the end instead.
# pd.concat raises on an empty list, so fall back to an empty frame when
# there were no responses.
df_total = pd.concat(batch_frames, ignore_index=True) if batch_frames else pd.DataFrame()

print(df_total) # Display the final DataFrame

# Copy the predicted labels from df_total into test_set_total.
# .values assigns by position, so this assumes every batch came back with its
# records in the order they were sent (verify if results look off).
# The model may return labels as strings ("1"/"0"); cast to int so they are
# comparable with the integer ground-truth labels. A non-numeric label raises
# here instead of silently corrupting the metrics.
test_set_total['pred_label'] = df_total['pred_label'].astype(int).values
test_set_total

# Confusion matrix and accuracy on the predictions
from sklearn.metrics import confusion_matrix, accuracy_score

y_true = test_set_total["label"]
y_pred = test_set_total["pred_label"]

print(confusion_matrix(y_true, y_pred))
print(f"\nAccuracy: {accuracy_score(y_true, y_pred)}")

https://colab.research.google.com/drive/1v0c7kmDSBApUGFTq0mibaidimGca-PWg?authuser=6#scrollTo=Rx1NfrvIxcyP&printMode=true 4/5
3/26/24, 4:07 PM 1_sentiment_analysis_nlp.ipynb - Colaboratory

https://colab.research.google.com/drive/1v0c7kmDSBApUGFTq0mibaidimGca-PWg?authuser=6#scrollTo=Rx1NfrvIxcyP&printMode=true 5/5

You might also like