TwitterHate_NLP.ipynb - Colaboratory
Importing Packages
import pandas as pd
import regex as re
import nltk
from nltk.corpus import stopwords
from wordcloud import WordCloud, STOPWORDS
from nltk.tokenize.treebank import TreebankWordDetokenizer
from sklearn.model_selection import train_test_split
from nltk.tokenize import TweetTokenizer
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
import seaborn as sns
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
nltk.download('stopwords')
sentiment_data = pd.read_csv('/content/TwitterHate.csv')
print(len(sentiment_data))
sentiment_data.head()
31962
[head(): first five rows with columns id, label, tweet]
# Imbalanced dataset: roughly 13 non-hate tweets for every hate tweet
sentiment_data['label'].value_counts()
0 29720
1 2242
Name: label, dtype: int64
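The skew can be visualised directly with the already-imported seaborn; a minimal sketch, not one of the original cells:
# Plot the label distribution (illustrative)
sns.countplot(x='label', data=sentiment_data)
plt.title('Label distribution')
plt.show()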
def textcleanup(data):
    tk = TweetTokenizer()
    stop_words = set(stopwords.words('english'))
    tweet_list = []
    word_list = []
    for tweet in list(data['tweet']):
        tweet = tweet.encode('ascii', 'ignore').decode('ascii')  # drop non-ASCII characters
        tweet = re.sub(r'[^ ]+\.[^ ]+', '', tweet)  # remove URLs (tokens containing a dot)
        tweet = re.sub(r"[#']", '', tweet)  # remove '#' and apostrophes
        tweet = re.sub(r'@\w+', '', tweet)  # remove user handles
        tweet = re.sub(r'^RT\b', '', tweet)  # remove RT tags
        tweet = re.sub(r'\W+\+[A-Za-z0-9]+\d+\D|\+[A-Za-z0-9]+\d+\D+\w', '', tweet)  # remove redundant character sequences
        tweet = re.sub(r'\b[a]+[m]+[p]\b', '', tweet)  # drop 'amp' left over from HTML '&amp;'
        tweet = tweet.lower().strip()
        tweet = tk.tokenize(tweet)  # tokenise with TweetTokenizer
        tweet = [word for word in tweet if word not in stop_words]  # drop English stop words
        tweet = list(filter(lambda token: len(token) > 1, tweet))  # drop single-character tokens
        tweet_list.append(tweet)
        word_list.extend(tweet)
    return tweet_list, word_list, stop_words
cleantext,wordlist,stop_words = textcleanup(sentiment_data)
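As a quick sanity check of the cleanup, the raw and cleaned forms of a tweet can be compared; a minimal sketch, not one of the original cells:
# Compare a raw tweet with its cleaned, tokenised form (illustrative check)
print(sentiment_data['tweet'].iloc[0])
print(cleantext[0])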
word_count = Counter(wordlist)
word_count.most_common(10)
[('love', 2725),
('day', 2247),
('happy', 1673),
('im', 1155),
('time', 1115),
('life', 1114),
('like', 1089),
('today', 993),
('new', 989),
('positive', 934)]
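A word cloud can be drawn from the cleaned vocabulary with the imported WordCloud; a minimal sketch, assuming it is built from wordlist with default styling, feeding into the plt.show() below:
# Word cloud of the cleaned vocabulary (sketch)
wc = WordCloud(width=800, height=400, stopwords=STOPWORDS, background_color='white')
wc.generate(' '.join(wordlist))
plt.figure(figsize=(10, 5))
plt.imshow(wc, interpolation='bilinear')
plt.axis('off')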
plt.show()
detokenizer = TreebankWordDetokenizer()
clean_sentiments = [detokenizer.detokenize(tweet) for tweet in cleantext]  # token lists back to sentences
clean_sentiments[0]
sentiments_frame = pd.DataFrame({'labels': sentiment_data['label'], 'clean_sentiments': clean_sentiments})
sentiments_frame.head()
   labels  clean_sentiments
2       0  bihday majesty
tfidf_vectorizer = TfidfVectorizer(
max_df=0.5,
min_df=10,
strip_accents='unicode',
max_features=5000
)
tfidf_data = tfidf_vectorizer.fit_transform(sentiments_frame['clean_sentiments'])
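A quick way to see what the vectoriser produced is to inspect the matrix shape and a few feature names; a minimal sketch, not one of the original cells:
# Inspect the TF-IDF output (illustrative check)
print(tfidf_data.shape)                               # (31962, n_features), capped at max_features=5000
print(tfidf_vectorizer.get_feature_names_out()[:10])  # first vocabulary terms (get_feature_names() on older scikit-learn)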
#Splitting Data
X_train, X_test, y_train, y_test = train_test_split(
    tfidf_data, sentiments_frame['labels'],
    test_size=0.2, random_state=42)  # split size and random_state assumed; original arguments not shown
#Creating Model
model = LogisticRegression()
model.fit(X_train,y_train)
train_score = model.score(X_train, y_train)  # mean accuracy on the training split
test_score = model.score(X_test, y_test)     # mean accuracy on the held-out split
print(train_score)
print(test_score)
0.9557276389377762
0.9510402002189895
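With roughly 93% of tweets in the majority class, accuracy alone overstates performance, which motivates the recall-oriented, class-weighted grid search below. A confusion matrix makes the minority-class errors visible; a minimal sketch, assuming a seaborn heatmap of the test-set predictions:
# Confusion matrix heatmap for the baseline model (sketch)
cm = confusion_matrix(y_test, model.predict(X_test))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')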
<matplotlib.axes._subplots.AxesSubplot at 0x7f2f86c36320>
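The grid search below needs a parameter grid; a minimal sketch, assuming the search tunes the regularisation strength C:
# Hypothetical parameter grid; the actual values searched may differ
parameters = {'C': [0.01, 0.1, 1, 10, 100]}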
grid_sr = GridSearchCV(
    LogisticRegression(class_weight="balanced"), parameters, scoring='recall'
)
grid_sr.fit(X_train, y_train)
grid_sr.best_params_
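model_test below refers to the tuned classifier; a minimal sketch, assuming it is the best estimator refit by the grid search:
# Assumed: evaluate the best estimator found by the grid search
model_test = grid_sr.best_estimator_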
train_score1 = model_test.score(X_train, y_train)
test_score1 = model_test.score(X_test, y_test)
print(classification_report(y_test, model_test.predict(X_test)))
print(classification_report(y_train, model_test.predict(X_train)))
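StratifiedKFold and cross_val_score are imported above but not used so far; a minimal sketch of how the tuned model could be cross-validated on the full TF-IDF matrix, with fold count and seed assumed:
# Stratified 5-fold cross-validation of the tuned model (sketch)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_recall = cross_val_score(model_test, tfidf_data, sentiments_frame['labels'], cv=skf, scoring='recall')
print(cv_recall.mean(), cv_recall.std())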