# Import and Initialize Sentiment Analyzer from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer analyzer = SentimentIntensityAnalyzer() import requests import pandas as pd from bs4 import BeautifulSoup import re from datetime import datetime from datetime import date, timedelta import tweepy; import json from apikeys import twitterAccessToken as access_token from apikeys import twitterAccessTokenSecret as access_token_secret from apikeys import twitterConsumerKey as consumer_key from apikeys import twitterConsumerSecretKey as consumer_secret def parse_url( url): response = requests.get(url) soup = BeautifulSoup(response.text, 'lxml') listylist=[] for table in soup.find_all('table'): listylist.append(parse_html_table(table)) return listylist def parse_html_table( table): n_columns = 0; n_rows=0; column_names = [] # Find number of rows and columns # we also find the column titles if we can for row in table.find_all('tr'): # Determine the number of rows in the table td_tags = row.find_all('td') if len(td_tags) > 0: n_rows+=1 if n_columns == 0: # Set the number of columns for our table n_columns = len(td_tags) # Handle column names if we find them th_tags = row.find_all('th') if len(th_tags) > 0 and len(column_names) == 0: for th in th_tags: column_names.append(th.get_text()) # Safeguard on Column Titles if len(column_names) > 0 and len(column_names) != n_columns: raise Exception("Column titles do not match the number of columns") columns = column_names if len(column_names) > 0 else range(0,n_columns) df = pd.DataFrame(columns = columns, index= range(0,n_rows)) row_marker = 0 for row in table.find_all('tr'): column_marker = 0 columns = row.find_all('td') for column in columns: df.iat[row_marker,column_marker] = column.get_text() column_marker += 1 if len(columns) > 0: row_marker += 1 # Convert to float if possible for col in df: try: df[col] = df[col].astype(float) except ValueError: pass return df def getCountryLanguages(): #TODO: Use the .apply to just change the table to one dialect. Imrpove language scope later. df = parse_url('https://www.infoplease.com/world/countries-world/languages-spoken-each-country-world') countryLanguages = df[0].rename(columns={0:'country',1:'language'}).set_index('country') countryLanguages['language'] = [re.sub('\d+|%|\(.*\)|\s','',i).split(',')[0].split(';')[0] for i in countryLanguages['language']] return countryLanguages #returns hashtag, followers, following, text, geo, date #cityCountry example: 'paris,france' def SearchForData(search_term, nTweets, cityCountry='',radius=100): # Setup Tweepy API Authentication auth = tweepy.OAuthHandler(consumer_key, consumer_secret) auth.set_access_token(access_token, access_token_secret) api = tweepy.API(auth, parser=tweepy.parsers.JSONParser()) #--- Calculate geocordinates from cityCountry --- --- ---- ---- --- --- --- -- geoConvertTries = 0 while True: try: result = Geocoder.geocode(cityCountry) except Exception as error: #print('errrooooorrrrr: ',error.message) if 'OVER_QUERY_LIMIT' in str(error): print('Encountered an error:{0}\nWaiting 30 seconds and trying again.'.format(error)) time.sleep(30) if geoConvertTries>10: print("Could not convert geo. returning empty list") return [] elif not (re.search('^\w+,\w+$',cityCountry)): print("cityCountry input format is incorrect. It should be \'city,Country\' like \'paris,france\'") return [] else: print("Could not convert geo. returning empty list") return [] else: break geoConvertTries+=1 # 34.0934,56.134,50mi coords = str(result[0].coordinates).replace('(','').replace(')','') + f',{radius}mi' coords=coords.replace(' ','') print(cityCountry, ": ", coords) #--- ---- ----- ---- ---- ---- ---- ---- --- ---- ---- --- ---- --- ---- --- -- #--- grab tweets --- ---- ---- ---- ---- ---- ---- ---- --- --- ---- ---- ---- maxTweets = 10000; oldest_tweet = None; unique_ids = []; desiredTweets = [];nTweetsPerDay=nTweets/8 for day,num in zip([str(date.today() - timedelta(i)).split()[0] for i in range(8)], range(1,9)): tweetsPerDay=[] while len(tweetsPerDay) < min(nTweetsPerDay,maxTweets/8): #--- determine whether to grab tweets by geo or not --- ---- --- ----- -- while True: try: if cityCountry: tweetsPerDay = api.search(search_term, count=nTweetsPerDay, result_type="recent", max_id=oldest_tweet, geocode=coords, until=day) else: tweetsPerDay = api.search(search_term, count=nTweetsPerDay, result_type="recent", max_id=oldest_tweet, until=day) except Exception as error: print(error,'Trying again after 1 minute.') time.sleep(60) else: break #---- ----- ----- ---- ----- ---- ----- ---- ----- ---- ---- ---- ---- -- #--- Dont go through an infinite loop trying to fill tweets that don't exist ----- if len(tweetsPerDay['statuses'])==0: print(f'No tweets returned while searching for \'{search_term}\'\n',len(desiredTweets)\ ,'\n',day) return pd.DataFrame(desiredTweets) #--- Append relevent tweets to output listy list ---- --- ---- ---- ---- --- --- for tweet in tweetsPerDay['statuses']: # Append tweet_id to ids list if it doesn't already exist. This allows checking for duplicate tweets if tweet["id"] not in unique_ids : unique_ids.append(tweet['id']) desiredTweets.append({'text':tweet['text'], 'vader':analyzer.polarity_scores(tweet['text'])['compound'], 'location':cityCountry, 'hashtags':tweet['entities']['hashtags'], 'followers':tweet['user']['followers_count'], 'friends_count':tweet['user']['friends_count'],'statuses_count':tweet['user']['statuses_count'], 'created_at':datetime.strptime(tweet['created_at'],'%a %b %d %H:%M:%S %z %Y')}) # Reassign the the oldest tweet (i.e. the max_id) subtract 1 so the previous oldest isn't included oldest_tweet = tweet["id"] - 1 #--- Print sample tweet --- --- ---- ---- --- ---- ---- --- ---- ---- --- translator = Translator() try: print ('Sample Tweet:',translator.translate(desiredTweets[0]['text'], dest='en').text) except: print('there was an error translating sample tweet: ',desiredTweets[0]['text']) return pd.DataFrame(desiredTweets) def GetTweetsByPopularCities(search_term, numTweets, translateToLocalLanguage = True): #-- Get the most populated cities from wikipedia (Thank you wikipedia library!) -- html = wp.page("List_of_cities_by_population_density").html().encode("UTF-8") worldCities = pd.read_html(html)[1] worldCities = worldCities.drop([2,3,4],axis=1) worldCities = worldCities.rename(columns={0:'city',1:'population',5:'density',6:'country'}) worldCities = worldCities.iloc[1:] worldCities['population'] = [int(city.split('\xa0')[-1].split('[')[0].replace(',','')) for city in worldCities['population']] worldCities['density'] = [int(city.split('\xa0')[-1].split('[')[0].replace(',','')) for city in worldCities['density']] #--- ---- ----- ----- ---- ---- ----- ---- ---- ----- --- ---- ---- ----- ---- --- #--- population per cities in United States --- ---- ---- --- ---- --- --- --- --- html = wp.page("List_of_United_States_cities_by_population_density").html().encode("UTF-8") UScities = pd.read_html(html)[1] UScities = UScities.drop([0,2,4,6,8],axis=1) UScities = UScities.rename(columns={1:'city',3:'state',5: 'land area (mi^2)',7:'density'}) UScities = UScities.iloc[1:] #df['population']=[int(city.split('\xa0')[-1].split('[')[0].replace(',','')) for city in df['population']] UScities['density'] = [float(city.split('\xa0')[-1].split('[')[0].replace(',','')) for city in UScities['density']] UScities['land area (mi^2)']=[float(area.split('\xa0')[-1]) for area in UScities['land area (mi^2)']] #--- ---- ----- ----- ---- ---- ----- ---- ---- ----- --- ---- ---- ----- ---- --- #--- Get tweets by Worlds most densily populated cities ---- --- ---- ---- --- --- translator = Translator() comparisons=pd.DataFrame(columns=['time density','sentiment']) cityCount = 3 for index,row in worldCities.iterrows(): #-- location --- ----- --- ---- city,pop,density,country = row cityCountry = city+' , '+country #-- language conversion --- ---- -- languagesDf = getCountryLanguages() if translateToLocalLanguage: try: translatedSearch = translator.translate(search_term, src='en', dest=languagesDf.loc[country,'language']).text except ValueError: print("could not translate ", languagesDf.loc[country,'language']) translatedSearch=search_term print('translated word: ',translatedSearch) else: translatedSearch=search_term #--- --- --- ---- ---- --- --- --- try: tweetsWorld = pd.concat([tweetsWorld, SearchForData(translatedSearch, numTweets, cityCountry, 100)], axis=0) except: tweetsWorld = SearchForData(translatedSearch, numTweets, cityCountry, 100) print('\n') time.sleep(4) #if cityCount==0: # break #else: # cityCount-=1 #--- Add US Cities --- ---- ---- ---- ---- --- cityCount = 5 for index,row in UScities.iterrows(): #-- location --- ----- --- ---- city,state,area,density = row cityCountry = state+' , '+city try: tweetsUS = pd.concat([tweetsUS, SearchForData(translatedSearch, numTweets, cityCountry, max(area,5))], axis=0) except: tweetsUS = SearchForData(translatedSearch, numTweets, cityCountry, max(area,5)) print('\n') time.sleep(4) #if cityCount==0: # break #else: # cityCount-=1 return tweetsUS, tweetsWorld tweetsUS, tweetsWorld = GetTweetsByPopularCities('trump', 200, False) print(tweetsUS.groupby('location').mean()) print(tweetsWorld.groupby('location').mean()) #locations are not required inputs #tweets = SearchForData(search_term='baguettes', nTweets=100, cityCountry='paris,france',radius=100) #tweets