# Import and Initialize Sentiment Analyzer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()
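# polarity_scores returns neg/neu/pos fractions plus a normalized 'compound' score in [-1, 1];
# e.g. analyzer.polarity_scores('I love this')['compound'] comes out positive (roughly 0.64).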
import re
import time
from datetime import datetime, date, timedelta

import requests
import pandas as pd
from bs4 import BeautifulSoup
import tweepy
import wikipedia as wp               # wp.page(...) fetches the Wikipedia city tables below
from pygeocoder import Geocoder      # Geocoder.geocode(...) converts 'city,country' to coordinates below
from googletrans import Translator   # Translator() handles search-term and sample-tweet translation below
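# apikeys is a local module holding this account's Twitter API credentials; supply your own keys there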
from apikeys import twitterAccessToken as access_token
from apikeys import twitterAccessTokenSecret as access_token_secret
from apikeys import twitterConsumerKey as consumer_key
from apikeys import twitterConsumerSecretKey as consumer_secret
def parse_url(url):
    """Fetch a page and return a list of DataFrames, one per HTML <table> found."""
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'lxml')
    listylist = []
    for table in soup.find_all('table'):
        listylist.append(parse_html_table(table))
    return listylist
def parse_html_table(table):
    """Convert a BeautifulSoup <table> element into a pandas DataFrame."""
    n_columns = 0
    n_rows = 0
    column_names = []
    # Find the number of rows and columns,
    # and pick up the column titles if we can
    for row in table.find_all('tr'):
        # Count the data rows in the table
        td_tags = row.find_all('td')
        if len(td_tags) > 0:
            n_rows += 1
            if n_columns == 0:
                # Set the number of columns for our table
                n_columns = len(td_tags)
        # Handle column names if we find them
        th_tags = row.find_all('th')
        if len(th_tags) > 0 and len(column_names) == 0:
            for th in th_tags:
                column_names.append(th.get_text())
    # Safeguard on column titles
    if len(column_names) > 0 and len(column_names) != n_columns:
        raise Exception("Column titles do not match the number of columns")
    columns = column_names if len(column_names) > 0 else range(0, n_columns)
    df = pd.DataFrame(columns=columns, index=range(0, n_rows))
    row_marker = 0
    for row in table.find_all('tr'):
        column_marker = 0
        columns = row.find_all('td')
        for column in columns:
            df.iat[row_marker, column_marker] = column.get_text()
            column_marker += 1
        if len(columns) > 0:
            row_marker += 1
    # Convert columns to float where possible
    for col in df:
        try:
            df[col] = df[col].astype(float)
        except ValueError:
            pass
    return df
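# Example usage (illustrative; any page with an HTML <table> works):
# dfs = parse_url('https://www.infoplease.com/world/countries-world/languages-spoken-each-country-world')
# dfs[0].head()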
def getCountryLanguages():
    """Scrape a country -> primary-language table from infoplease.com."""
    # TODO: Use .apply to reduce the table to one dialect. Improve language scope later.
    df = parse_url('https://www.infoplease.com/world/countries-world/languages-spoken-each-country-world')
    countryLanguages = df[0].rename(columns={0: 'country', 1: 'language'}).set_index('country')
    # Keep only the first listed language; strip percentages, parentheticals, and whitespace
    countryLanguages['language'] = [re.sub(r'\d+|%|\(.*\)|\s', '', i).split(',')[0].split(';')[0]
                                    for i in countryLanguages['language']]
    return countryLanguages
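# Example lookup (illustrative; exact index labels depend on the scraped table):
# languages = getCountryLanguages()
# languages.loc['France', 'language']   # -> 'French'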
# Returns a DataFrame with text, vader score, location, hashtags, follower/friend/status counts, and date
# cityCountry example: 'paris,france'
def SearchForData(search_term, nTweets, cityCountry='', radius=100):
    # Set up Tweepy API authentication
    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_token_secret)
    api = tweepy.API(auth, parser=tweepy.parsers.JSONParser())
    #--- Calculate geocoordinates from cityCountry (skipped when no location is given) --- ---
    coords = ''
    if cityCountry:
        geoConvertTries = 0
        while True:
            try:
                result = Geocoder.geocode(cityCountry)
            except Exception as error:
                if 'OVER_QUERY_LIMIT' in str(error):
                    print('Encountered an error: {0}\nWaiting 30 seconds and trying again.'.format(error))
                    time.sleep(30)
                    if geoConvertTries > 10:
                        print("Could not convert geo. Returning empty list.")
                        return []
                elif not re.search(r'^\w+,\w+$', cityCountry):
                    print("cityCountry input format is incorrect. It should be 'city,country' like 'paris,france'")
                    return []
                else:
                    print("Could not convert geo. Returning empty list.")
                    return []
            else:
                break
            geoConvertTries += 1
        # e.g. 34.0934,56.134,50mi
        coords = str(result[0].coordinates).replace('(', '').replace(')', '') + f',{radius}mi'
        coords = coords.replace(' ', '')
        print(cityCountry, ": ", coords)
    #--- Grab tweets --- ---- ---- ---- ---- ---- ---- ---- --- --- ---- ---- ----
    maxTweets = 10000
    oldest_tweet = None
    unique_ids = []
    desiredTweets = []
    nTweetsPerDay = max(1, nTweets // 8)  # integer count per request; the search API caps count at 100
    for day in [str(date.today() - timedelta(i)) for i in range(8)]:
        tweetsPerDay = []
        while len(tweetsPerDay) < min(nTweetsPerDay, maxTweets / 8):
            #--- Grab tweets by geo when a location was given --- ---- --- ----- --
            while True:
                try:
                    if cityCountry:
                        response = api.search(search_term, count=nTweetsPerDay, result_type="recent",
                                              max_id=oldest_tweet, geocode=coords, until=day)
                    else:
                        response = api.search(search_term, count=nTweetsPerDay, result_type="recent",
                                              max_id=oldest_tweet, until=day)
                except Exception as error:
                    print(error, 'Trying again after 1 minute.')
                    time.sleep(60)
                else:
                    break
            #--- Don't loop forever trying to fill tweets that don't exist -----
            if len(response['statuses']) == 0:
                print(f"No tweets returned while searching for '{search_term}'\n",
                      len(desiredTweets), '\n', day)
                return pd.DataFrame(desiredTweets)
            tweetsPerDay.extend(response['statuses'])
            #--- Append relevant tweets to the output list ---- --- ---- ---- ---
            for tweet in response['statuses']:
                # Record each tweet id so duplicates are skipped
                if tweet["id"] not in unique_ids:
                    unique_ids.append(tweet['id'])
                    desiredTweets.append({'text': tweet['text'],
                                          'vader': analyzer.polarity_scores(tweet['text'])['compound'],
                                          'location': cityCountry,
                                          'hashtags': tweet['entities']['hashtags'],
                                          'followers': tweet['user']['followers_count'],
                                          'friends_count': tweet['user']['friends_count'],
                                          'statuses_count': tweet['user']['statuses_count'],
                                          'created_at': datetime.strptime(tweet['created_at'], '%a %b %d %H:%M:%S %z %Y')})
                # Reassign the oldest tweet (the max_id); subtract 1 so the previous oldest isn't re-fetched
                oldest_tweet = tweet["id"] - 1
    #--- Print a sample tweet --- --- ---- ---- --- ---- ---- ---
    translator = Translator()
    try:
        print('Sample Tweet:', translator.translate(desiredTweets[0]['text'], dest='en').text)
    except Exception:
        print('There was an error translating the sample tweet:', desiredTweets[0]['text'])
    return pd.DataFrame(desiredTweets)
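# Each row of the returned DataFrame holds: text, vader (the compound score in [-1, 1]),
# location, hashtags, followers, friends_count, statuses_count, created_at.
# Example (illustrative):
# df = SearchForData('baguettes', 100, 'paris,france', 100)
# df[['text', 'vader']].head()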
def GetTweetsByPopularCities(search_term, numTweets, translateToLocalLanguage=True):
    """Pull tweets for the most densely populated world and US cities; returns (tweetsUS, tweetsWorld)."""
    #-- Get the most densely populated cities from Wikipedia (thank you, wikipedia library!) --
    html = wp.page("List_of_cities_by_population_density").html().encode("UTF-8")
    worldCities = pd.read_html(html)[1]
    worldCities = worldCities.drop([2, 3, 4], axis=1)
    worldCities = worldCities.rename(columns={0: 'city', 1: 'population', 5: 'density', 6: 'country'})
    worldCities = worldCities.iloc[1:]
    # Strip footnote markers and thousands separators, e.g. '1,234[5]' -> 1234
    worldCities['population'] = [int(city.split('\xa0')[-1].split('[')[0].replace(',', ''))
                                 for city in worldCities['population']]
    worldCities['density'] = [int(city.split('\xa0')[-1].split('[')[0].replace(',', ''))
                              for city in worldCities['density']]
    #--- Population density for cities in the United States --- ---- --- --- --- ---
    html = wp.page("List_of_United_States_cities_by_population_density").html().encode("UTF-8")
    UScities = pd.read_html(html)[1]
    UScities = UScities.drop([0, 2, 4, 6, 8], axis=1)
    UScities = UScities.rename(columns={1: 'city', 3: 'state', 5: 'land area (mi^2)', 7: 'density'})
    UScities = UScities.iloc[1:]
    UScities['density'] = [float(city.split('\xa0')[-1].split('[')[0].replace(',', ''))
                           for city in UScities['density']]
    UScities['land area (mi^2)'] = [float(area.split('\xa0')[-1]) for area in UScities['land area (mi^2)']]
    #--- Get tweets for the world's most densely populated cities ---- --- ---- ---
    translator = Translator()
    languagesDf = getCountryLanguages()  # scrape the language table once, outside the loop
    tweetsWorld = pd.DataFrame()
    cityCount = 3  # debug cap; see the commented-out break below
    for index, row in worldCities.iterrows():
        #-- location --- ----- --- ----
        city, pop, density, country = row
        cityCountry = city + ',' + country
        #-- language conversion --- ---- --
        if translateToLocalLanguage:
            try:
                translatedSearch = translator.translate(search_term, src='en',
                                                        dest=languagesDf.loc[country, 'language']).text
            except ValueError:
                print("could not translate ", languagesDf.loc[country, 'language'])
                translatedSearch = search_term
            print('translated word: ', translatedSearch)
        else:
            translatedSearch = search_term
        #--- collect --- ---- ---- --- --- ---
        newTweets = SearchForData(translatedSearch, numTweets, cityCountry, 100)
        if len(newTweets) > 0:
            tweetsWorld = pd.concat([tweetsWorld, newTweets], axis=0)
        print('\n')
        time.sleep(4)
        #if cityCount == 0:
        #    break
        #else:
        #    cityCount -= 1
    #--- Add US cities --- ---- ---- ---- ---- ---
    tweetsUS = pd.DataFrame()
    cityCount = 5
    for index, row in UScities.iterrows():
        #-- location --- ----- --- ----
        city, state, area, density = row
        cityCountry = city + ',' + state
        # US cities are searched in English; use the land area as a rough search radius (at least 5 mi)
        newTweets = SearchForData(search_term, numTweets, cityCountry, max(area, 5))
        if len(newTweets) > 0:
            tweetsUS = pd.concat([tweetsUS, newTweets], axis=0)
        print('\n')
        time.sleep(4)
        #if cityCount == 0:
        #    break
        #else:
        #    cityCount -= 1
    return tweetsUS, tweetsWorld
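# Run: pull tweets for 'trump' across both city lists (no translation), then compare
# the mean numeric columns (including the VADER score) by location.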
tweetsUS, tweetsWorld = GetTweetsByPopularCities('trump', 200, False)
print(tweetsUS.groupby('location').mean())
print(tweetsWorld.groupby('location').mean())
# cityCountry and radius are optional inputs:
# tweets = SearchForData(search_term='baguettes', nTweets=100, cityCountry='paris,france', radius=100)
# tweets