3/26/24, 4:07 PM 1_sentiment_analysis_nlp.
ipynb - Colaboratory
# Mounting the drive
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDriveb/nlp_project
!ls #checking if files are there or not
# Importing the dataset
import pandas as pd

# Read the tab-separated reviews file from the current working directory
data = pd.read_csv('amazon_alexa.tsv', sep='\t')
data.head(10) #dataset is labelled in a binary format

# Keep only the review text and its feedback flag, under shorter column names
mydata = data[['verified_reviews', 'feedback']].rename(
    columns={'verified_reviews': 'review', 'feedback': 'label'}
)
mydata.head()

# Check how the rows are distributed across the label column
mydata.value_counts('label')
As can be seen above, the dataset is imbalanced, so we will use random undersampling of the majority class to balance it.
# Count the occurrences of each label
label_counts = mydata["label"].value_counts()

# Surplus of the largest class over the smallest class
rows_to_drop = label_counts.max() - label_counts.min()

if rows_to_drop > 0:
    # Derive the majority label from the counts instead of assuming it is 1,
    # so the cell still works if the class proportions are reversed.
    majority_label = label_counts.idxmax()
    data_majority = mydata[mydata["label"] == majority_label]
    # Randomly discard the surplus rows of the majority class
    data_balanced = mydata.drop(data_majority.sample(rows_to_drop).index)
else:
    # Already balanced: work on a copy so later column additions
    # do not touch mydata
    data_balanced = mydata.copy()

# Check the new class balance
print(data_balanced["label"].value_counts())
The dataset is now balanced.
# Data preprocessing
#defining a function to clean the dataset
import re
def clean_text(text):
    """Normalise a raw review string for downstream classification.

    Steps, in order: strip HTML tags, replace punctuation/special characters
    with spaces, drop stand-alone single letters, lowercase, collapse runs of
    whitespace and trim the ends.

    Args:
        text: The raw review. Non-string values (e.g. ``None`` or the float
            NaN that pandas uses for missing cells) are treated as empty.

    Returns:
        The cleaned review as a string; ``""`` for missing input.
    """
    if not isinstance(text, str):
        return ""
    # HTML tags must go first: once punctuation is removed the angle
    # brackets are gone and the tag names would survive as ordinary words.
    # A tag has to start with a letter right after '<' (or '</'), so a bare
    # comparison such as "3 < 5" is not mistaken for markup.
    text = re.sub(r"</?[a-zA-Z][^<>]*>", " ", text)
    # to remove special characters and punctuation
    text = re.sub(r"[^\w\s]", " ", text)
    # to remove single characters
    text = re.sub(r"\b[a-zA-Z]\b", " ", text)
    # to lowercase the text
    text = text.lower()
    # to remove extra whitespace
    text = re.sub(r"\s+", " ", text)
    # to trim leading and trailing spaces
    return text.strip()
https://colab.research.google.com/drive/1v0c7kmDSBApUGFTq0mibaidimGca-PWg?authuser=6#scrollTo=Rx1NfrvIxcyP&printMode=true 1/5
3/26/24, 4:07 PM 1_sentiment_analysis_nlp.ipynb - Colaboratory
import pandas as pd

# Pull the raw review column out as a plain Python list
reviews = data_balanced['review'].to_list()

# Run clean_text over every review in that list
cleaned_reviews = list(map(clean_text, reviews))

# Store the cleaned text next to the original in a new column
data_balanced['clean_reviews'] = cleaned_reviews
data_balanced
# Splitting the dataset into a 5% training set and a 95% test set
import pandas as pd

# Size of the test set: 95% of the balanced rows, truncated to a whole number
total_rows = data_balanced.shape[0]
test_size = int(0.95 * total_rows)

# Randomly sample test_size rows for the test set
test_set = data_balanced.sample(n=test_size)

# Whatever was not sampled becomes the training set
train_set = data_balanced.drop(index=test_set.index)
# Sentiment analysis using LLM
# Setting up GEMINI API
!pip install -q -U google-generativeai
# Necessary packages
import pathlib
import textwrap
import google.generativeai as genai
from IPython.display import display
from IPython.display import Markdown
def to_markdown(text):
    """Wrap *text* in an IPython ``Markdown`` object rendered as a block quote.

    Bullet characters are swapped for Markdown asterisks and every line,
    including blank ones, is prefixed with the quote marker.
    """
    bulleted = text.replace('•', ' *')
    # The always-true predicate makes textwrap.indent prefix empty lines too
    quoted = textwrap.indent(bulleted, '> ', predicate=lambda _line: True)
    return Markdown(quoted)
# Used to securely store your API key
from google.colab import userdata

# Fetch the 'GOOGLE_API_KEY' entry from Colab userdata and pass it to the SDK
GOOGLE_API_KEY = userdata.get('GOOGLE_API_KEY')
genai.configure(api_key=GOOGLE_API_KEY)

# Print the name of every model that supports 'generateContent'
for candidate in genai.list_models():
    if 'generateContent' not in candidate.supported_generation_methods:
        continue
    print(candidate.name)

#we will be using the gemini pro model
model = genai.GenerativeModel('gemini-pro')
%%time
# Send a free-form question to the model and render the reply via to_markdown;
# the %%time cell magic above reports how long the cell took.
response = model.generate_content("how great is MS Dhoni?")
to_markdown(response.text)
# Integrating the Gemini Pro API into our sentiment analysis task
https://colab.research.google.com/drive/1v0c7kmDSBApUGFTq0mibaidimGca-PWg?authuser=6#scrollTo=Rx1NfrvIxcyP&printMode=true 2/5
3/26/24, 4:07 PM 1_sentiment_analysis_nlp.ipynb - Colaboratory
# Draw 20 random rows from the test set and add an empty 'pred_label'
# column that will later hold the predicted feedback
test_set_sample = test_set.sample(20).assign(pred_label='')
test_set_sample

# Serialise the cleaned review text and the blank label column as a JSON
# array with one record per row
prompt_columns = ['clean_reviews', 'pred_label']
json_data = test_set_sample[prompt_columns].to_json(orient='records')

# Show the JSON that will be embedded in the prompt
print(json_data)
# Build the classification prompt: the instructions, followed by the JSON
# records held in json_data fenced between triple backticks. The model is asked
# to return the same JSON with 'pred_label' filled in (1 = Positive,
# 0 = Negative). The prompt is printed afterwards for inspection.
prompt = f"""
You are an expert linguist, who is good at classifying customer review sentiments into Positive/Negative labels.
Help me classify customer reviews into: Positive(label=1), and Negative(label=0).
Customer reviews are provided between three back ticks.
In your output, only return the Json code back as output - which is provided between three backticks.
Your task is to update predicted labels under 'pred_label' in the Json code.
Don't make any changes to Json code format, please.
```
{json_data}
```
"""
print(prompt)
#feeding the prompt to gemini model to generate the content
response = model.generate_content(prompt)
print(response.text)

import json

# The reply may come back wrapped in a Markdown fence, possibly tagged
# "json" and padded with newlines; stripping backticks alone would leave that
# residue and make json.loads fail. Keep only the outermost JSON array.
reply_text = response.text
array_start = reply_text.find("[")
array_end = reply_text.rfind("]")
if array_start == -1 or array_end < array_start:
    raise ValueError("Gemini reply does not contain a JSON array")
json_data = reply_text[array_start : array_end + 1]

# Parse the records into a DataFrame. The parsed list gets its own name so the
# 'data' DataFrame loaded from the TSV file is not overwritten.
predicted_records = json.loads(json_data)
df_sample = pd.DataFrame(predicted_records)
df_sample
# Overwrite pred_label in 'test_set_sample' with the predictions from 'df_sample'.
# Rows are matched by position, so this relies on the model returning the
# records in the order they were sent.
# The labels are cast to int because the model may return them as strings
# ("1"/"0"), which would not compare equal to numeric values in 'label'
# (presumably 0/1 integers - confirm against the dataset).
test_set_sample['pred_label'] = df_sample['pred_label'].astype(int).values
test_set_sample
# Compute the confusion matrix of true vs. predicted labels for the sample
from sklearn.metrics import confusion_matrix

y_true, y_pred = test_set_sample["label"], test_set_sample["pred_label"]
confusion_matrix(y_true, y_pred)
# Batching GEMINI API calls
test_set.shape

# Draw 100 random test rows and add an empty 'pred_label' column to them
test_set_total = test_set.sample(100).assign(pred_label='')
test_set_total
# Cut test_set_total into consecutive row slices of at most batch_size rows
batch_size = 25
batches = [
    test_set_total.iloc[start : start + batch_size]
    for start in range(0, len(test_set_total), batch_size)
]
https://colab.research.google.com/drive/1v0c7kmDSBApUGFTq0mibaidimGca-PWg?authuser=6#scrollTo=Rx1NfrvIxcyP&printMode=true 3/5
3/26/24, 4:07 PM 1_sentiment_analysis_nlp.ipynb - Colaboratory
import time
def gemini_completion_function(batch,current_batch,total_batch):
    """Send one batch of reviews to Gemini and return the raw response.

    The batch's 'clean_reviews' and 'pred_label' columns are serialised to
    JSON records, embedded in the classification prompt and passed to the
    module-level ``model``. The function then pauses for five seconds before
    returning.

    Args:
        batch: DataFrame slice containing 'clean_reviews' and 'pred_label'.
        current_batch: Zero-based position of this batch; shown one-based in
            the progress message.
        total_batch: Total number of batches, used only in the progress message.

    Returns:
        The object returned by ``model.generate_content``.
    """
    print(f"Now processing batch#: {current_batch+1} of {total_batch}")

    # Step 1: serialise the relevant columns as JSON records
    prompt_columns = ['clean_reviews','pred_label']
    json_data = batch[prompt_columns].to_json(orient='records')

    # Step 2: build the prompt around the JSON payload
    prompt = f"""You are an expert linguist, who is good at classifying customer review sentiments into Positive/Negative labels.
Help me classify customer reviews into: Positive(label=1), and Negative(label=0).
Customer reviews are provided between three backticks below.
In your output, only return the Json code back as output - which is provided between three backticks.
Your task is to update predicted labels under 'pred_label' in the Json code.
Don't make any changes to Json code format, please.
Error handling instruction: In case a Customer Review violates API policy, please assign it default sentiment as Negative (label=0).
```
{json_data}
```
"""
    print(prompt)

    # Step 3: call the Gemini API, then wait before the caller sends the next batch
    reply = model.generate_content(prompt)
    time.sleep(5)
    return reply
# Run every batch through Gemini in order, collecting the raw responses
batch_count = len(batches)
responses = [
    gemini_completion_function(batch, position, batch_count)
    for position, batch in enumerate(batches)
]
import json

batch_frames = []  # one DataFrame per Gemini response
for response in responses:
    # The reply may come back wrapped in a Markdown fence, possibly tagged
    # "json" and padded with newlines; stripping backticks alone would leave
    # that residue and make json.loads fail. Keep only the outermost JSON array.
    reply_text = response.text
    array_start = reply_text.find("[")
    array_end = reply_text.rfind("]")
    if array_start == -1 or array_end < array_start:
        raise ValueError("Gemini reply does not contain a JSON array")
    json_data = reply_text[array_start : array_end + 1]
    # Load the cleaned data and convert to DataFrame
    batch_frames.append(pd.DataFrame(json.loads(json_data)))

# DataFrame.append was removed in pandas 2.0, so the per-batch frames are
# concatenated once instead; with no responses the result is an empty frame.
df_total = pd.concat(batch_frames, ignore_index=True) if batch_frames else pd.DataFrame()
print(df_total) # Display the final DataFrame
# Overwrite pred_label in 'test_set_total' with the predictions from 'df_total'.
# Rows are matched by position, so this relies on every batch coming back with
# its records in the order they were sent.
# The labels are cast to int because the model may return them as strings
# ("1"/"0"), which would not compare equal to numeric values in 'label'
# (presumably 0/1 integers - confirm against the dataset).
test_set_total['pred_label'] = df_total['pred_label'].astype(int).values
test_set_total
# Report the confusion matrix and the accuracy of the batched predictions
from sklearn.metrics import confusion_matrix, accuracy_score

y_true, y_pred = test_set_total["label"], test_set_total["pred_label"]

batch_confusion = confusion_matrix(y_true, y_pred)
print(batch_confusion)
print(f"\nAccuracy: {accuracy_score(y_true, y_pred)}")
https://colab.research.google.com/drive/1v0c7kmDSBApUGFTq0mibaidimGca-PWg?authuser=6#scrollTo=Rx1NfrvIxcyP&printMode=true 4/5
3/26/24, 4:07 PM 1_sentiment_analysis_nlp.ipynb - Colaboratory
https://colab.research.google.com/drive/1v0c7kmDSBApUGFTq0mibaidimGca-PWg?authuser=6#scrollTo=Rx1NfrvIxcyP&printMode=true 5/5