forked from Cyanide2119/AgriReviewsSentimentalAnalysis
-
Notifications
You must be signed in to change notification settings - Fork 0
/
project1_2.py
192 lines (156 loc) · 6.24 KB
/
project1_2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
# Import necessary libraries
import platform
import re
import string
import time
import nltk
# Import libraries from previous sections (for sentiment analysis)
import pandas as pd
from langdetect import detect
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
# Fetch the NLTK corpora needed for stop-word filtering and lemmatization
for resource in ("stopwords", "wordnet"):
    nltk.download(resource)
# Text preprocessing function
def preprocess_text(text):
    """Normalize a review into a space-joined string of lemmatized tokens.

    The text is lowercased, ASCII punctuation and runs of digits are removed,
    the result is split on whitespace, English stop words are dropped, and
    every remaining word is passed through the WordNet lemmatizer.

    Args:
        text: Raw review text.

    Returns:
        The surviving tokens joined by single spaces; an empty string when
        no token survives the filtering.
    """
    text = text.lower()
    # Delete every character listed in string.punctuation. A regex character
    # class built from the raw constant leaves backslashes in place, because
    # the "\]" pair inside it is parsed as an escaped closing bracket.
    text = text.translate(str.maketrans("", "", string.punctuation))
    text = re.sub(r"\d+", "", text)
    # Load the stop-word list once per call (not once per word) and keep it in
    # a set so that each membership test is a constant-time lookup.
    stop_words = set(stopwords.words("english"))
    lemmatizer = WordNetLemmatizer()
    return " ".join(
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in stop_words
    )
# Function to filter reviews that are in English
def filter_english_reviews(reviews):
    """Return the reviews that language detection classifies as English.

    Args:
        reviews: Iterable of review strings.

    Returns:
        A list, in the original order, of the reviews for which ``detect``
        returned ``"en"``. Reviews for which detection raises are skipped.
    """
    english_reviews = []
    for review in reviews:
        try:
            language = detect(review)
        except Exception:
            # Best effort: a review that cannot be classified is dropped.
            # Catching Exception (not a bare except) lets KeyboardInterrupt
            # and SystemExit propagate instead of being silently swallowed.
            continue
        if language == "en":
            english_reviews.append(review)
    return english_reviews
# Scraping reviews from Amazon using Selenium
def extract_reviews_amazon(product_url):
    """Collect review texts from a review listing page and the pages after it.

    A headless Chrome session opens ``product_url``, gathers the text of all
    elements matching the review-body selector, then keeps clicking the
    "Next" pagination link until it can no longer be found or clicked.

    Args:
        product_url: URL of the first review page to load.

    Returns:
        A list of review text strings. An empty list is returned (after
        printing the error) when loading or scraping raises an exception.
    """
    # Configure Selenium WebDriver
    options = Options()
    options.add_argument("--headless")  # Run without opening a browser window
    if platform.system() != "Linux":
        # Replace with the path to your ChromeDriver
        service = Service(r"C:\Drivers\chromedriver.exe")
    else:
        service = Service("/usr/bin/chromedriver")
    driver = webdriver.Chrome(service=service, options=options)
    try:
        driver.get(product_url)
        time.sleep(3)  # Wait for the page to load
        reviews = []
        # Load multiple pages of reviews if available
        while True:
            review_elements = driver.find_elements(
                By.CSS_SELECTOR,
                "span.a-size-base.review-text.review-text-content span",
            )
            reviews.extend(element.text for element in review_elements)
            # Move to the next page; any failure to find or click the link is
            # treated as "no more pages".
            try:
                next_button = driver.find_element(By.CSS_SELECTOR, "li.a-last a")
                next_button.click()
            except Exception:
                break
            time.sleep(3)  # Wait for the next page to load
        return reviews
    except Exception as e:
        print(f"Error occurred: {e}")
        return []
    finally:
        # Always shut the browser down, including on KeyboardInterrupt, so no
        # chromedriver process is left running.
        driver.quit()
# Sentiment analysis helper: classify every review and take a majority vote
def determine_overall_sentiment(reviews, model, tfidf):
    """Predict a label per review and summarize the result.

    Each review is cleaned with ``preprocess_text``, vectorized with
    ``tfidf.transform`` and classified with ``model.predict``. The positive
    count is the sum of the predicted labels (the model trained in this file
    uses 1 for positive and 0 for negative).

    Args:
        reviews: Review strings to classify.
        model: Fitted classifier exposing ``predict``.
        tfidf: Fitted vectorizer exposing ``transform``.

    Returns:
        A tuple ``(overall, positive_count, negative_count)`` where
        ``overall`` is ``"Positive"`` only when strictly more than half of
        the predictions are positive, otherwise ``"Negative"``.
    """
    features = tfidf.transform([preprocess_text(item) for item in reviews])
    predictions = model.predict(features)
    positives = sum(predictions)
    total = len(predictions)
    negatives = total - positives
    if positives > total / 2:
        verdict = "Positive"
    else:
        verdict = "Negative"
    return verdict, positives, negatives
# Train sentiment analysis model
def train_sentiment_model():
    """Fit a TF-IDF + logistic-regression sentiment classifier on sample data.

    The built-in sample reviews are cleaned with ``preprocess_text`` before
    vectorization, so the vocabulary learned here is built from the same kind
    of text that is produced for reviews at prediction time.

    Returns:
        A tuple ``(model, tfidf)``: the fitted ``LogisticRegression`` and the
        fitted ``TfidfVectorizer`` needed to transform new text for it.
    """
    # Sample product reviews dataset (You can replace it with your actual dataset)
    data = {
        "review": [
            "This product is great! I love it.",
            "I hate this product, it's the worst.",
            "Amazing quality, will buy again.",
            "Terrible product. Do not recommend.",
            "Very good value for the price.",
            "Awful. Broke after one use.",
            "Satisfied with the purchase.",
            "Completely disappointed. Waste of money.",
            "Excellent quality, highly recommended!",
            "Not worth it, very cheap material.",
        ],
        "sentiment": [1, 0, 1, 0, 1, 0, 1, 0, 1, 0],  # 1: Positive, 0: Negative
    }
    df = pd.DataFrame(data)
    # Apply the same cleaning that is applied to reviews before prediction;
    # training on raw text would fit the vectorizer to tokens (stop words,
    # unlemmatized forms) that never appear in the cleaned input.
    cleaned_reviews = df["review"].apply(preprocess_text)
    # The split is kept so the training subset stays the same 80% of rows;
    # the held-out rows are not evaluated in this function.
    X_train, _X_test, y_train, _y_test = train_test_split(
        cleaned_reviews, df["sentiment"], test_size=0.2, random_state=42
    )
    # Vectorizing the text data using TF-IDF
    tfidf = TfidfVectorizer(max_features=5000)
    X_train_tfidf = tfidf.fit_transform(X_train)
    # Training a Logistic Regression classifier
    model = LogisticRegression()
    model.fit(X_train_tfidf, y_train)
    return model, tfidf
# Entry point: scrape the reviews, keep the English ones, report the sentiment
def main(product_url):
    """Run the full pipeline for one review page URL and print the outcome.

    The classifier is trained first, then reviews are scraped from
    ``product_url`` and filtered to English. The filtered list is printed,
    followed by the overall sentiment and the positive/negative counts. A
    short message is printed instead when no reviews, or no English reviews,
    are available.

    Args:
        product_url: URL of the review page to analyze.
    """
    classifier, vectorizer = train_sentiment_model()
    scraped = extract_reviews_amazon(product_url)
    if scraped:
        kept = filter_english_reviews(scraped)
        print(kept)
        if kept:
            summary = determine_overall_sentiment(kept, classifier, vectorizer)
            overall_sentiment, positive_count, negative_count = summary
            print(f"Overall Sentiment: {overall_sentiment}")
            print(f"Positive Reviews: {positive_count}")
            print(f"Negative Reviews: {negative_count}")
        else:
            print("No English reviews found.")
    else:
        print("No reviews found.")
# Example usage
# NOTE(review): the host in this URL looks like a proxy in front of
# www.amazon.in rather than Amazon itself - confirm the intended address.
product_url = "https://www.amazon.in/TADSO-LID-Stainless-Gardening-Agriculture-8-5X11X35/product-reviews/B08672MYQ6/ref=cm_cr_getr_d_paging_btm_prev_1?ie=UTF8&reviewerType=all_reviews&pageNumber=1"
if __name__ == "__main__":
    # Run the pipeline only when executed as a script, so that importing this
    # module does not start a browser session.
    main(product_url)