diff --git a/main.py b/main.py
new file mode 100644
index 0000000..d84d787
--- /dev/null
+++ b/main.py
@@ -0,0 +1,593 @@
+# import dependencies
+import re
+import time
+import requests
+import tweepy
+import json
+
+
+# math / data
+import numpy as np
+import math
+import pandas as pd
+import seaborn as sns
+import matplotlib.pyplot as plt
+from seaborn import heatmap
+from wordcloud import WordCloud
+import collections
+
+import wikipedia as wp
+from pygeocoder import Geocoder
+from googletrans import Translator
+
+# Import and initialize the VADER sentiment analyzer
+from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
+analyzer = SentimentIntensityAnalyzer()
+
+from bs4 import BeautifulSoup
+from datetime import datetime
+from datetime import date, timedelta
+
+
+#from config import consumer_key, consumer_secret, access_token, access_token_secret
+from apikeys import twitterAccessToken as access_token
+from apikeys import twitterAccessTokenSecret as access_token_secret
+from apikeys import twitterConsumerKey as consumer_key
+from apikeys import twitterConsumerSecretKey as consumer_secret
+newsKey = '3327bb97745b4f24b875a9bbc9154270'
+
+
+# Setup Tweepy API authentication
+auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
+auth.set_access_token(access_token, access_token_secret)
+api = tweepy.API(auth, parser=tweepy.parsers.JSONParser())
+
+
+def parse_url(url):
+    response = requests.get(url)
+    soup = BeautifulSoup(response.text, 'lxml')
+    listylist = []
+    for table in soup.find_all('table'):
+        listylist.append(parse_html_table(table))
+    return listylist
+
+def parse_html_table(table):
+    n_columns = 0; n_rows = 0; column_names = []
+
+    # Find the number of rows and columns,
+    # and the column titles if we can
+    for row in table.find_all('tr'):
+
+        # Determine the number of rows in the table
+        td_tags = row.find_all('td')
+        if len(td_tags) > 0:
+            n_rows += 1
+            if n_columns == 0:
+                # Set the number of columns for our table
+                n_columns = len(td_tags)
+
+        # Handle column names if we find them
+        th_tags = row.find_all('th')
+        if len(th_tags) > 0 and len(column_names) == 0:
+            for th in th_tags:
+                column_names.append(th.get_text())
+
+    # Safeguard on column titles
+    if len(column_names) > 0 and len(column_names) != n_columns:
+        raise Exception("Column titles do not match the number of columns")
+
+    columns = column_names if len(column_names) > 0 else range(0, n_columns)
+    df = pd.DataFrame(columns=columns,
+                      index=range(0, n_rows))
+    row_marker = 0
+    for row in table.find_all('tr'):
+        column_marker = 0
+        columns = row.find_all('td')
+        for column in columns:
+            df.iat[row_marker, column_marker] = column.get_text()
+            column_marker += 1
+        if len(columns) > 0:
+            row_marker += 1
+
+    # Convert to float if possible
+    for col in df:
+        try:
+            df[col] = df[col].astype(float)
+        except ValueError:
+            pass
+
+    return df
+
+def getCountryLanguages():
+    #TODO: Use .apply to reduce the table to one dialect per country. Improve language scope later.
+    df = parse_url('https://www.infoplease.com/world/countries-world/languages-spoken-each-country-world')
+    countryLanguages = df[0].rename(columns={0:'country',1:'language'}).set_index('country')
+    countryLanguages['language'] = [re.sub('\d+|%|\(.*\)|\s','',i).split(',')[0].split(';')[0] for i in countryLanguages['language']]
+    return countryLanguages
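+
+# Hedged, illustrative sketch (never called): the cleanup getCountryLanguages applies to each raw
+# language cell. A hypothetical cell such as "Dari Persian 50% (official), Pashtu 35%" loses its
+# digits, '%' signs, parentheticals and whitespace, and only the first ','/';'-separated entry is
+# kept. Note that stripping \s also removes the spaces inside multi-word language names.
+def _demo_language_cleanup(cell="Dari Persian 50% (official), Pashtu 35%"):
+    return re.sub('\d+|%|\(.*\)|\s', '', cell).split(',')[0].split(';')[0]   # -> 'DariPersian'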
+
+
+#returns hashtag, followers, following, text, geo, date
+#cityCountry example: 'paris,france'
+def SearchForData(search_term, nTweets, cityCountry='', radius=100, numDays=1):
+
+    # Setup Tweepy API Authentication
+    #auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
+    #auth.set_access_token(access_token, access_token_secret)
+    #api = tweepy.API(auth, parser=tweepy.parsers.JSONParser())
+
+    #--- Calculate geocoordinates from cityCountry --- --- ---- ---- --- --- --- --
+    coords = ''  # default so the empty-result DataFrame below works when no cityCountry is given
+    if cityCountry:
+        geoConvertTries = 0
+        while True:
+            try:
+                result = Geocoder.geocode(cityCountry)
+            except Exception as error:
+                #print('error: ', error.message)
+                if 'OVER_QUERY_LIMIT' in str(error):
+                    print('Encountered an error:{0}\nWaiting 20 seconds and trying again.'.format(error))
+                    time.sleep(20)
+                    if geoConvertTries > 10:
+                        print("Could not convert geo. Returning empty list.")
+                        return []
+                else:
+                    print("Could not convert geo. Returning empty list.")
+                    return []
+            else:
+                break
+            geoConvertTries += 1
+        # e.g. 34.0934,56.134,50mi
+        coords = str(result[0].coordinates).replace('(','').replace(')','') + f',{radius}mi'
+        coords = coords.replace(' ','')
+        print(cityCountry, ": ", coords)
+    #--- ---- ----- ---- ---- ---- ---- ---- --- ---- ---- --- ---- --- ---- --- --
+
+    #--- grab tweets --- ---- ---- ---- ---- ---- ---- ---- --- --- ---- ---- ----
+    maxTweets = 10000; oldest_tweet = None; unique_ids = []; desiredTweets = []; nTweetsPerDay = nTweets/numDays
+    for day, num in zip([str(date.today() - timedelta(i)).split()[0] for i in range(numDays)], range(1, numDays+1)):
+        tweetsPerDay = []; tries = 1
+        while len(desiredTweets) < min(nTweetsPerDay*num, maxTweets/numDays):
+
+            #--- determine whether to grab tweets by geo or not --- ---- --- ----- --
+            while True:
+                try:
+                    if cityCountry:
+                        tweetsPerDay = api.search(search_term, count=nTweetsPerDay, result_type="recent", max_id=oldest_tweet, geocode=coords, until=day)
+                    else:
+                        tweetsPerDay = api.search(search_term, count=nTweetsPerDay, result_type="recent", max_id=oldest_tweet, until=day)
+                    time.sleep(2)
+                except Exception as error:
+                    print(error, 'Trying again after 1 minute.')
+                    time.sleep(60)
+                else:
+                    break
+
+            #---- ----- ----- ---- ----- ---- ----- ---- ----- ---- ---- ---- ---- --
+
+            #--- Don't go through an infinite loop trying to fill tweets that don't exist -----
+            if len(tweetsPerDay['statuses']) == 0:
+                print(f'No tweets returned while searching for \'{search_term}\'', len(desiredTweets), ' ', day)
+                if desiredTweets == []:
+                    return pd.DataFrame([{'text':'', 'vader':0,
+                                          'location':cityCountry, 'geo':coords,
+                                          'hashtags':0, 'followers':0,
+                                          'friends_count':0,'statuses_count':0,
+                                          'created_at':datetime.strptime(str(date.today()), '%Y-%m-%d')}])
+                return pd.DataFrame(desiredTweets)
+
+            #--- Append relevant tweets to the output list ---- --- ---- ---- ---- --- ---
+            for tweet in tweetsPerDay['statuses']:
+                # Append the tweet id to the ids list if it isn't there already; this lets us skip duplicate tweets
+                if tweet["id"] not in unique_ids:
+                    unique_ids.append(tweet['id'])
+                    if cityCountry:
+                        desiredTweets.append({'text':tweet['text'], 'vader':analyzer.polarity_scores(tweet['text'])['compound'],
+                                              'location':cityCountry, 'geo':coords,
+                                              'hashtags':tweet['entities']['hashtags'], 'followers':tweet['user']['followers_count'],
+                                              'friends_count':tweet['user']['friends_count'],'statuses_count':tweet['user']['statuses_count'],
+                                              'created_at':datetime.strptime(tweet['created_at'],'%a %b %d %H:%M:%S %z %Y')})
+                    else:
+                        desiredTweets.append({'text':tweet['text'], 'vader':analyzer.polarity_scores(tweet['text'])['compound'],
+                                              'hashtags':tweet['entities']['hashtags'], 'followers':tweet['user']['followers_count'],
+                                              'friends_count':tweet['user']['friends_count'],'statuses_count':tweet['user']['statuses_count'],
+                                              'created_at':datetime.strptime(tweet['created_at'],'%a %b %d %H:%M:%S %z %Y')})
+
+                # Reassign the oldest tweet (i.e. the max_id); subtract 1 so the previous oldest isn't included
+                oldest_tweet = tweet["id"] - 1
+            print(f'{day}, {cityCountry}: attempt {tries}, {len(desiredTweets)} tweets gathered.'); tries += 1
+
+        #--- Print sample tweet --- --- ---- ---- --- ---- ---- --- ---- ---- ---
+        translator = Translator()
+        try:
+            print('Sample Tweet:', translator.translate(desiredTweets[0]['text'], dest='en').text)
+        except:
+            print('There was an error translating the sample tweet: ', desiredTweets[0]['text'])
+    return pd.DataFrame(desiredTweets) if cityCountry else desiredTweets
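+
+# Hedged, illustrative sketch (never called): the geocode string SearchForData builds for
+# api.search. The coordinates below are made up; the format Twitter expects is 'lat,lng,<radius>mi'.
+def _demo_geocode_string(coordinates=(48.8566, 2.3522), radius=100):
+    coords = str(coordinates).replace('(', '').replace(')', '') + f',{radius}mi'
+    return coords.replace(' ', '')   # -> '48.8566,2.3522,100mi'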
+
+
+def worldTweets(search_term, numTweets, numDays, translateToLocalLanguage=True):
+    #-- Get the most populated cities from wikipedia (Thank you wikipedia library!) --
+    html = wp.page("List_of_cities_by_population_density").html().encode("UTF-8")
+    worldCities = pd.read_html(html)[1]
+    worldCities = worldCities.drop([2,3,4], axis=1)
+    worldCities = worldCities.rename(columns={0:'city',1:'population',5:'density',6:'country'})
+    worldCities = worldCities.iloc[1:]
+    worldCities['population'] = [int(city.split('\xa0')[-1].split('[')[0].replace(',','')) for city in worldCities['population']]
+    worldCities['density'] = [int(city.split('\xa0')[-1].split('[')[0].replace(',','')) for city in worldCities['density']]
+    #--- ---- ----- ----- ---- ---- ----- ---- ---- ----- --- ---- ---- ----- ---- ---
+
+    #--- Get tweets for the world's most densely populated cities ---- --- ---- ---- --- ---
+    translator = Translator()
+    comparisons = pd.DataFrame(columns=['time density','sentiment'])
+    cityCount = 10
+    for index, row in worldCities.iterrows():
+        #-- location --- ----- --- ----
+        city, pop, density, country = row
+        cityCountry = city+' , '+country
+
+        #-- language conversion --- ---- --
+        languagesDf = getCountryLanguages()
+        if translateToLocalLanguage:
+            try:
+                translatedSearch = translator.translate(search_term, src='en', dest=languagesDf.loc[country,'language']).text
+            except ValueError:
+                print("could not translate ", languagesDf.loc[country,'language'])
+                translatedSearch = search_term
+            print('translated word: ', translatedSearch)
+        else:
+            translatedSearch = search_term
+        #--- --- --- ---- ---- --- --- ---
+
+        try:
+            tweetsWorld = pd.concat([tweetsWorld, SearchForData(translatedSearch, numTweets, cityCountry, 100)], axis=0)
+        except:
+            tweetsWorld = SearchForData(translatedSearch, numTweets, cityCountry, 100)
+        print('\n')
+        time.sleep(4)
+        #if cityCount==0:
+        #    break
+        #else:
+        #    cityCount-=1
+    return tweetsWorld
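+
+# Hedged, illustrative sketch (never called): the numeric cleanup used on the Wikipedia tables above
+# and below. A hypothetical cell such as '\xa018,000[4]' (non-breaking space, thousands separator,
+# footnote marker) is reduced to a plain integer.
+def _demo_wiki_number(cell='\xa018,000[4]'):
+    return int(cell.split('\xa0')[-1].split('[')[0].replace(',', ''))   # -> 18000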
+
+def usTweets(search_term, numTweets, numDays):
+    #--- population per city in the United States --- ---- ---- --- ---- --- --- --- ---
+    html = wp.page("List_of_United_States_cities_by_population_density").html().encode("UTF-8")
+    UScities = pd.read_html(html)[1]
+    UScities = UScities.drop([0,2,4,6,8], axis=1)
+    UScities = UScities.rename(columns={1:'city',3:'state',5:'land area (mi^2)',7:'density'})
+    UScities = UScities.iloc[1:]
+    #df['population']=[int(city.split('\xa0')[-1].split('[')[0].replace(',','')) for city in df['population']]
+    UScities['density'] = [float(city.split('\xa0')[-1].split('[')[0].replace(',','')) for city in UScities['density']]
+    UScities['land area (mi^2)'] = [float(area.split('\xa0')[-1]) for area in UScities['land area (mi^2)']]
+    #--- ---- ----- ----- ---- ---- ----- ---- ---- ----- --- ---- ---- ----- ---- ---
+
+
+    #--- Add US Cities --- ---- ---- ---- ---- ---
+    cityCount = 100; nottheseones = 0
+    for index, row in UScities.iterrows():
+        if nottheseones < 0:
+            nottheseones += 1
+            continue
+        #-- location --- ----- --- ----
+        city, state, area, density = row
+        cityState = state+' , '+city
+
+        try:
+            tweetsUS = pd.concat([tweetsUS, SearchForData(search_term=search_term, nTweets=numTweets, cityCountry=cityState, radius=max(area,5), numDays=numDays)], axis=0)
+        except:
+            tweetsUS = SearchForData(search_term=search_term, nTweets=numTweets, cityCountry=cityState, radius=max(area,5), numDays=numDays)
+
+        print('\n')
+        time.sleep(4)
+        if cityCount == 0:
+            break
+        else:
+            cityCount -= 1
+    return tweetsUS, UScities
+
+
+def GetTweetsByPopularCities(search_term, numTweets, translateToLocalLanguage = True, numDays = 1):
+    #tweetsWorld = worldTweets()
+    tweetsUS, UScities = usTweets(search_term, numTweets, numDays)
+    #return tweetsUS
+    tweetsUS['day'] = [ts.date() if ts != 0 else 0 for ts in tweetsUS['created_at']]
+
+
+    # Calculate the time density of tweets per city
+    cityData = tweetsUS.groupby(['location','geo','day']).mean()
+    aveTimes = {}
+    for group in tweetsUS.groupby(['location','geo','day'])['created_at']:
+        cityTime, timeseries = group
+        if timeseries is None:
+            aveTimes[cityTime] = None
+        (cityState, geo, timeobj) = cityTime
+        #print(cityTime)
+        [state, city] = cityState.split(',')
+        city = city.strip(); state = state.strip()
+        cityDensity = UScities.loc[UScities['city']==city].loc[UScities['state']==state].loc[:,'density'].values[0]
+        #return timeseries
+        try:
+            aveTimes[cityTime] = np.mean([(b-a).seconds*100/cityDensity for a,b in zip(timeseries.tolist()[1:], timeseries.tolist()[0:-1])])
+            if not aveTimes[cityTime]:
+                aveTimes[cityTime] = 0
+        except:
+            if len(timeseries.tolist()) < 2:
+                aveTimes[cityTime] = None
+        #print(f'{city}, {cityDensity}, {aveTimes[cityState]}')
+        #KeyError: ('New Jersey , Guttenberg', datetime.date(2018, 6, 30))
+    cityData['time between tweets'] = [aveTimes[citytime] for citytime in cityData.index]
+
+    #c_matrix = pd.pivot_table(news_data_new, "score", index=["name"], columns=["year_month"], aggfunc=np.mean)
+    matrix = pd.pivot_table(cityData.loc[:,['location','day','vader']].drop(columns=['location', 'day']), 'vader', index=['location'], columns=['day'], aggfunc=np.mean)
+    plt.figure(figsize=(12,12))
+    hm = heatmap(matrix, cmap='RdYlGn', annot=True)
+    hm.set_title('City Tweet Sentiment Over Time')
+    plt.savefig('/plots/cityTweetSentimentOverTime.png', dpi=100)
+    plt.show()
+    return cityData, tweetsUS
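+
+# Hedged, illustrative sketch (never called): a simplified version of the inter-tweet gap statistic
+# computed above. Consecutive timestamps are differenced and scaled by the city's population density;
+# the timestamps and density here are made up.
+def _demo_tweet_gap(density=10000.0):
+    stamps = [datetime(2018, 6, 30, 12, 0, 0), datetime(2018, 6, 30, 12, 0, 40), datetime(2018, 6, 30, 12, 2, 0)]
+    return np.mean([(b - a).seconds * 100 / density for a, b in zip(stamps[:-1], stamps[1:])])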
+
+
+def getSentimentsByPopularUsers(target_term):
+    # Get 100 recent tweets and create lists of tweets, sentiments, followers and retweets
+    public_tweets = api.search(target_term, count=100, lang="en", result_type="recent")
+    tweets = []
+    sentiments = []
+    followers = []
+    retweets = []
+    for tweet in public_tweets['statuses']:
+        tweets.append(tweet['text'])
+        followers.append(tweet['user']['followers_count'])
+        retweets.append(tweet['retweet_count'])
+        result = analyzer.polarity_scores(tweet["text"])
+        sentiments.append(result['compound'])
+    # Create a dataframe by combining all the lists
+    combined_df = pd.DataFrame({'Followers':followers, 'Sentiments':sentiments, 'Retweets':retweets, 'Tweets':tweets})
+
+    # Calculate the average sentiment of the top 10 users with the most followers
+    top_10_followers = combined_df.nlargest(10, 'Followers')
+    top_10_followers_average = round(top_10_followers['Sentiments'].mean(), 3)
+
+    top_10_retweets = combined_df.nlargest(10, 'Retweets')
+    top_10_retweets_average = round(top_10_retweets['Sentiments'].mean(), 3)
+
+    overall_average = round(combined_df['Sentiments'].mean(), 3)
+
+    # Print analysis
+    print('Average overall sentiment is ' + str(overall_average))
+    print('Sentiment of top 10 influential people is ' + str(top_10_followers_average))
+    print('Sentiment of top 10 retweeted tweets is ' + str(top_10_retweets_average))
+
+    if top_10_followers_average > overall_average:
+        print('Top 10 influential people have more positive sentiments than the general public about ' + target_term)
+    else:
+        print('The general public has more positive sentiments than the top 10 influential people about ' + target_term)
+
+    # Draw bar chart for sentiments
+    plt.figure(figsize=(15,10))
+    titles = ['Influential Users','Retweeted Most','Overall average']
+    sent = [top_10_followers_average, top_10_retweets_average, overall_average]
+    plt.bar(titles, sent)
+
+    plt.title(f"Sentiment Analysis {time.strftime('%x %X')}")
+    plt.ylabel("Tweet Polarity")
+    plt.axhline(0, color='black')
+    plt.savefig("/plots/output_9_1.png")
+    #plt.ylim(-1.0,1.0)
+    plt.show()
+
+    # Draw scatter chart for sentiments, number of followers and number of retweets
+    # Remove the 20 users with the highest follower counts to allow a more meaningful visualization
+    combined_df1 = combined_df.nsmallest(80, 'Followers')
+    plt.figure(figsize=(15,10))
+    plt.scatter(combined_df1['Followers'], combined_df1['Sentiments'], color="lightskyblue", edgecolor="black",
+                label="Number of retweets", alpha=0.5, s=combined_df1['Retweets'], linewidths=1)
+    plt.title(f"Sentiment Analysis based on Number of Followers {time.strftime('%x %X')}")
+    plt.xlabel("Number of Followers")
+    plt.ylabel("Tweet Polarity")
+    plt.ylim(-1.0, 1.0)
+
+    lgnd = plt.legend(loc="upper right", title="Sentiment Analysis based on Number of Followers", scatterpoints=1)
+    lgnd.legendHandles[0]._sizes = [50]
+    plt.savefig("/plots/output_8_1.png")
+    plt.show()
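+
+# Hedged, illustrative sketch (never called): the ranking and averaging used above, on a toy frame.
+# Only DataFrame.nlargest and .mean are involved; the numbers are made up.
+def _demo_top_follower_sentiment():
+    sample = pd.DataFrame({'Followers': [10, 5000, 120], 'Sentiments': [0.4, -0.2, 0.1]})
+    top_two = sample.nlargest(2, 'Followers')
+    return round(top_two['Sentiments'].mean(), 3), round(sample['Sentiments'].mean(), 3)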
+
+def bar_news(news_data):
+    '''takes a DataFrame as an argument,
+    creates a bar chart "Overall Quantity of Articles per News Source"
+    '''
+    # group data by news source names
+    group_news = news_data.groupby("name")
+    group_count = group_news['published_at'].count().rename("count")
+
+    df_group_count = pd.DataFrame(group_count)
+    df_group_count.sort_values(by=['count'], inplace=True, ascending=False)
+    df_group_count.reset_index(inplace=True)
+
+    # create a bar chart to show the quantity of articles for each news source
+    group_name = df_group_count['name']
+    y_pos = np.arange(len(df_group_count['count']))
+    plt.figure(figsize=(10, 7))
+    plt.barh(y_pos, df_group_count['count'], tick_label=group_name, alpha=0.6, color="green")
+    plt.title(f"Overall Quantity of Articles per News Source, 09/01/17-{time.strftime('%x')}")
+    plt.xlabel("Number of Articles")
+    plt.savefig("/plots/news_bar_chart.png", dpi=100, bbox_inches='tight')
+    plt.show()
+
+    # add analysis for the bar chart
+    q = len(df_group_count['count'])-1
+    name_min = df_group_count.iloc[q,0]
+    min_q = df_group_count.iloc[q,1]
+
+    name_max = df_group_count.iloc[0,0]
+    max_q = df_group_count.iloc[0,1]
+    print(f'Based on the "Overall Quantity of Articles per News Source" chart, {name_max} published the most articles ({max_q}).')
+    print(f'{name_min} published the fewest articles ({min_q}) related to the given keywords.')
+
+def compound_scores_heat_map(news_data_new):
+    '''takes a DataFrame with compound scores as an argument,
+    creates a heatmap chart of the average compound scores.
+    Since some news sources didn't publish articles in particular months,
+    pd.pivot_table gives us a 2D table where missing values are filled with NaN.
+    '''
+    c_matrix = pd.pivot_table(news_data_new, "score", index=["name"], columns=["year_month"], aggfunc=np.mean)
+    c_matrix = c_matrix.round(2)
+
+    # create a heatmap chart of the average compound scores
+    fig = plt.figure(figsize=(12,12))
+    hm = sns.heatmap(c_matrix, cmap='RdYlGn', annot=True)
+    hm.set_title(f"Heatmap of Avg. Compound Scores, 09/01/17-{time.strftime('%x')}")
+    plt.savefig("/plots/compound_scores_heat_map.png", dpi=100)
+    plt.show()
+
+
+def compound_score_data(news_data):
+    '''takes a DataFrame as an argument and calculates compound scores for article descriptions,
+    returns a new DataFrame
+    '''
+    news_data_new = news_data.loc[:,['year_month', 'name', 'description']]
+    for index, row in news_data_new.iterrows():
+        text = news_data_new.loc[index,"description"]
+        if text:
+            results = analyzer.polarity_scores(text)
+            news_data_new.loc[index,"score"] = round(results["compound"], 2)
+        else:
+            news_data_new.drop(index, inplace=True)
+    print(news_data_new.head())
+    compound_scores_heat_map(news_data_new)
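+
+# Hedged, illustrative sketch (never called): the pivot both heatmap helpers rely on. A long table
+# of (name, year_month, score) rows becomes a source-by-month matrix, with NaN wherever a source
+# published nothing in a month. The sample values are made up.
+def _demo_score_pivot():
+    demo = pd.DataFrame({'name': ['BBC News', 'BBC News', 'Reuters'],
+                         'year_month': ['2018-05', '2018-06', '2018-05'],
+                         'score': [0.10, -0.30, 0.50]})
+    return pd.pivot_table(demo, 'score', index=['name'], columns=['year_month'], aggfunc=np.mean)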
+
+
+def count_articles_heat_map(news_data):
+    '''takes a DataFrame as an argument,
+    creates a heatmap chart "Number of Articles per Month".
+    Since some news sources didn't publish articles in particular months,
+    pd.pivot_table gives us a 2D table where missing values are filled with NaN.
+    '''
+    d_matrix = pd.pivot_table(news_data, "url", index=["name"], columns=["year_month"], aggfunc='count')
+    d_matrix.head(20)
+    fig = plt.figure(figsize=(12,12))
+    cm = sns.heatmap(d_matrix, cmap='YlGnBu', annot=True, fmt='g')
+    cm.set_title(f"Number of Articles per Month, 09/01/17-{time.strftime('%x')}")
+    plt.savefig("/plots/count_values_heat_map.png", dpi=100)
+    plt.show()
+
+
+def news_api_data(user_input):
+    '''takes an argument "user_input" as keywords to perform API calls to newsapi.org,
+    returns a DataFrame with news data
+    '''
+
+    # if the user's input has several words, mark words or phrases that must appear with a '+' symbol
+    user_input_list = user_input.split(" ")
+    user_input_plus = "".join(f'+{k}' for k in user_input_list)
+    print(user_input_plus)
+
+    url = "https://newsapi.org/v2/everything"
+
+    # specify request parameters, i.e. results come only from the listed news sources
+    sparams = {
+        'apiKey': newsKey,
+        'q': user_input_plus,
+        'language': 'en',
+        'pageSize': 100,
+        'from': '2017-06-01',
+        'sortBy': 'publishedAt',
+        'sources': 'abc-news, associated-press, bbc-news, breitbart-news, cbc-news, cbs-news, cnn, fox-news, daily-mail, independent, nbc-news, newsweek, new-york-magazine, reuters, time, the-globe-and-mail, the-telegraph, the-washington-times, usa-today'
+    }
+
+    # make the API request and retrieve the totalResults value
+    responses = requests.get(url, params=sparams)
+
+    st_code = responses.status_code
+    if st_code != 200:
+        print(f'API problem, code {responses.status_code}')
+
+    news_info = responses.json()
+    #print(responses.json())
+
+    sample_total = news_info['totalResults']
+
+    # if the response contains more than 10000 articles, retrieve information for only 10000;
+    # otherwise the API returns error 429 - too many requests
+    if sample_total > 10000:
+        sample_total = 10000
+    print(f'The total number of results is {sample_total}')
+
+    # calculate the number of pages to iterate through (the total number of results divided by 100 records per page)
+    x = min(80, math.ceil(sample_total/100))
+    print(f'The number of pages is {x}')
+
+    # create a DataFrame for the retrieved results
+    news_data = pd.DataFrame(columns=["published_at","name", "author", "description", "url","year_month"])
+    index = 0
+
+    # iterate through each page and store the results in the DataFrame
+    for page in range(1, x+1):
+        sparams['page'] = page
+        responses = requests.get(url, params=sparams)
+        news_info = responses.json()
+        #print(f'processing page {page} of {x}')
+        if 'articles' not in news_info:
+            #print(f'skipping page {page} of {x}')
+            continue
+        for article in news_info['articles']:
+            news_data.loc[index,"published_at"] = article["publishedAt"]
+            news_data.loc[index,"name"] = article["source"]["name"]
+            news_data.loc[index,"author"] = article["author"]
+            news_data.loc[index,"description"] = article["description"]
+            news_data.loc[index,"url"] = article["url"]
+            news_data.loc[index,"year_month"] = article["publishedAt"][:7]
+            index += 1
+
+    # remove duplicate records
+    news_data.drop_duplicates(subset=['published_at','name','description'], keep='first', inplace=True)
+
+    news_data.to_csv("report.csv")
+    #news_data.info()
+    bar_news(news_data)
+    compound_score_data(news_data)
+    count_articles_heat_map(news_data)
+    #except:
+    #    print('Query limit reached. Skipping news analysis')
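+
+# Hedged, illustrative sketch (never called): the '+' keyword marking used above. newsapi.org treats
+# '+'-prefixed terms as words that must all appear; the sample input is hypothetical.
+def _demo_plus_query(user_input='climate change'):
+    return "".join(f'+{k}' for k in user_input.split(" "))   # -> '+climate+change'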
+
+
+def getWordCloud(search_term):
+    # SearchForData inputs: search_term, nTweets, cityCountry='', radius=100, numDays=1
+    tweet = SearchForData(search_term, 100)
+    # Gather all hashtags into a list
+    hashtags = []
+    text = ''
+    for t in range(len(tweet)):
+        for u in (tweet[t]['hashtags']):
+            hashtags.append(u['text'])
+
+    #print(hashtags)
+    # Join the hashtags into a single string, since WordCloud needs a string and can't use a list
+    text = ' '.join(hashtags)
+    #print(text)
+    # Use the WordCloud library to create a word cloud chart
+    wordcloud = WordCloud(background_color='white', max_words=len(text), max_font_size=40, relative_scaling=.15).generate(text)
+
+    plt.figure(figsize=(7,10))
+    plt.imshow(wordcloud)
+    plt.axis('off')
+    plt.savefig('/plots/WordCloud.png', dpi=500)
+    plt.show()
+
+
+def main(search_term):
+    timeStart = time.time()
+
+    getSentimentsByPopularUsers(search_term)
+    news_api_data(search_term)
+    getWordCloud(search_term)
+    # GetTweetsByPopularCities inputs: search_term, numTweets, translateToLocalLanguage = True, numDays = 1
+    cityDataMain = GetTweetsByPopularCities(search_term, numTweets=100, translateToLocalLanguage=False, numDays=6)
+
+    print(f'Run took {(time.time()-timeStart)//60} minutes')
+    return cityDataMain
diff --git a/plots/WordCloud.png b/plots/WordCloud.png
new file mode 100644
index 0000000..ebf6a1b
Binary files /dev/null and b/plots/WordCloud.png differ
diff --git a/plots/compound_scores_heat_map.png b/plots/compound_scores_heat_map.png
new file mode 100644
index 0000000..8eadecd
Binary files /dev/null and b/plots/compound_scores_heat_map.png differ
diff --git a/plots/count_values_heat_map.png b/plots/count_values_heat_map.png
new file mode 100644
index 0000000..23b6132
Binary files /dev/null and b/plots/count_values_heat_map.png differ
diff --git a/plots/fig.png b/plots/fig.png
new file mode 100644
index 0000000..6b03a4c
Binary files /dev/null and b/plots/fig.png differ
diff --git a/plots/news_bar_chart.png b/plots/news_bar_chart.png
new file mode 100644
index 0000000..468c239
Binary files /dev/null and b/plots/news_bar_chart.png differ
diff --git a/plots/output_8_1.png b/plots/output_8_1.png
new file mode 100644
index 0000000..4d39cb3
Binary files /dev/null and b/plots/output_8_1.png differ
diff --git a/plots/output_9_1.png b/plots/output_9_1.png
new file mode 100644
index 0000000..a7ee1d7
Binary files /dev/null and b/plots/output_9_1.png differ