   "source": [
    "import pandas as pd\n",
    "import wikipedia as wp\n",
    "from pygeocoder import Geocoder\n",
    "import time\n",
    "from googletrans import Translator\n",
    "# Import and Initialize Sentiment Analyzer\n",
    "from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer\n",
    "analyzer = SentimentIntensityAnalyzer()\n",
    "import requests\n",
    "import pandas as pd\n",
    "from bs4 import BeautifulSoup\n",
    "import re\n",
    "from datetime import datetime\n",
    "from datetime import date, timedelta\n",
    "import tweepy; import json\n",
    "from apikeys import twitterAccessToken as access_token\n",
    "from apikeys import twitterAccessTokenSecret as access_token_secret\n",
    "from apikeys import twitterConsumerKey as consumer_key\n",
    "from apikeys import twitterConsumerSecretKey as consumer_secret\n",
    "def parse_url(url, timeout=30):\n",
    "    \"\"\"Download a web page and parse every <table> element found in it.\n",
    "\n",
    "    Args:\n",
    "        url: Address of the page to fetch.\n",
    "        timeout: Seconds to wait for the server before giving up, so a stalled\n",
    "            connection cannot hang the notebook indefinitely.\n",
    "\n",
    "    Returns:\n",
    "        A list holding one DataFrame per <table>, in document order\n",
    "        (an empty list when the page contains no tables).\n",
    "\n",
    "    Raises:\n",
    "        requests.HTTPError: If the server answers with a 4xx/5xx status.\n",
    "    \"\"\"\n",
    "    response = requests.get(url, timeout=timeout)\n",
    "    # Fail loudly on an error status instead of silently parsing an error page.\n",
    "    response.raise_for_status()\n",
    "    soup = BeautifulSoup(response.text, 'lxml')\n",
    "    return [parse_html_table(table) for table in soup.find_all('table')]\n",
    "def parse_html_table( table):\n",
    "    n_columns = 0; n_rows=0; column_names = []\n",
    "    # Find number of rows and columns\n",
    "    # we also find the column titles if we can\n",
    "    for row in table.find_all('tr'):\n",
    "        # Determine the number of rows in the table\n",
    "        td_tags = row.find_all('td')\n",
    "        if len(td_tags) > 0:\n",
    "            n_rows+=1\n",
    "            if n_columns == 0:\n",
    "                # Set the number of columns for our table\n",
    "                n_columns = len(td_tags)\n",
    "        # Handle column names if we find them\n",
    "        th_tags = row.find_all('th') \n",
    "        if len(th_tags) > 0 and len(column_names) == 0:\n",
    "            for th in th_tags:\n",
    "                column_names.append(th.get_text())\n",
    "    # Safeguard on Column Titles\n",
    "    if len(column_names) > 0 and len(column_names) != n_columns:\n",
    "        raise Exception(\"Column titles do not match the number of columns\")\n",
    "    columns = column_names if len(column_names) > 0 else range(0,n_columns)\n",
    "    df = pd.DataFrame(columns = columns,\n",
    "                      index= range(0,n_rows))\n",
    "    row_marker = 0\n",
    "    for row in table.find_all('tr'):\n",
    "        column_marker = 0\n",
    "        columns = row.find_all('td')\n",
    "        for column in columns:\n",
    "            df.iat[row_marker,column_marker] = column.get_text()\n",
    "            column_marker += 1\n",
    "        if len(columns) > 0:\n",
    "            row_marker += 1\n",
    "    # Convert to float if possible\n",
    "    for col in df:\n",
    "        try:\n",
    "            df[col] = df[col].astype(float)\n",
    "        except ValueError:\n",
    "            pass\n",
    "    return df\n",
    "def getCountryLanguages():\n",
    "    #TODO: Use the .apply to just change the table to one dialect. Imrpove language scope later.\n",
    "    df = parse_url('https://www.infoplease.com/world/countries-world/languages-spoken-each-country-world')\n",
    "    countryLanguages = df[0].rename(columns={0:'country',1:'language'}).set_index('country')\n",
    "    countryLanguages['language'] = [re.sub('\\d+|%|\\(.*\\)|\\s','',i).split(',')[0].split(';')[0] for i in countryLanguages['language']]\n",
    "    return countryLanguages\n",
    "def SearchForData(search_term, nTweets, cityCountry='',radius=100):\n",
    "    \"\"\"Collect recent tweets matching a search term, optionally near a place.\n",
    "\n",
    "    The search walks back over today and the 7 previous days (one `until`\n",
    "    value per day), requesting roughly nTweets/8 tweets for each day and\n",
    "    paging towards older tweets with `max_id`.\n",
    "\n",
    "    Args:\n",
    "        search_term: Query string handed to the Twitter search call.\n",
    "        nTweets: Total number of tweets wanted, split evenly over the 8 days.\n",
    "        cityCountry: Optional place such as 'paris,france'. When empty, no\n",
    "            geocoding is done and the search is not restricted by location.\n",
    "        radius: Radius in miles around the geocoded place.\n",
    "\n",
    "    Returns:\n",
    "        DataFrame with one row per unique tweet and the columns text, vader,\n",
    "        location, hashtags, followers, friends_count, statuses_count and\n",
    "        created_at. An empty DataFrame is returned when the place cannot be\n",
    "        geocoded or no tweet matches, so callers can always pd.concat the result.\n",
    "    \"\"\"\n",
    "    maxTweets = 10000     # ceiling on the number of tweets gathered per call\n",
    "    nDays = 8             # today plus the 7 previous days\n",
    "    maxGeoRetries = 10    # retries while the geocoder reports OVER_QUERY_LIMIT\n",
    "    maxApiRetries = 15    # one-minute retries per Twitter request before giving up\n",
    "\n",
    "    # Setup Tweepy API Authentication\n",
    "    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)\n",
    "    auth.set_access_token(access_token, access_token_secret)\n",
    "    api = tweepy.API(auth, parser=tweepy.parsers.JSONParser())\n",
    "\n",
    "    #--- Calculate geocoordinates from cityCountry (only when a place was given) ---\n",
    "    geoArgs = {}\n",
    "    if cityCountry:\n",
    "        geoConvertTries = 0\n",
    "        while True:\n",
    "            try:\n",
    "                result = Geocoder.geocode(cityCountry)\n",
    "            except Exception as error:\n",
    "                if 'OVER_QUERY_LIMIT' in str(error):\n",
    "                    if geoConvertTries > maxGeoRetries:\n",
    "                        print(\"Could not convert geo. returning empty DataFrame\")\n",
    "                        return pd.DataFrame()\n",
    "                    print('Encountered an error:{0}\\nWaiting 30 seconds and trying again.'.format(error))\n",
    "                    time.sleep(30)\n",
    "                    geoConvertTries += 1\n",
    "                elif not re.search(r'^[^,]+,[^,]+$', cityCountry):\n",
    "                    # Exactly one comma is required; spaces and multi-word names are fine.\n",
    "                    print(\"cityCountry input format is incorrect. It should be \\'city,Country\\' like \\'paris,france\\'\")\n",
    "                    return pd.DataFrame()\n",
    "                else:\n",
    "                    print(\"Could not convert geo. returning empty DataFrame\")\n",
    "                    return pd.DataFrame()\n",
    "            else:\n",
    "                break\n",
    "        # Twitter expects 'lat,lng,<radius>mi', e.g. 34.0934,56.134,50mi\n",
    "        coords = str(result[0].coordinates).replace('(','').replace(')','') + f',{radius}mi'\n",
    "        coords = coords.replace(' ','')\n",
    "        print(cityCountry, \": \", coords)\n",
    "        geoArgs['geocode'] = coords\n",
    "\n",
    "    #--- grab tweets ---\n",
    "    # Whole number of tweets per day (rounded up); the search call needs an integer count.\n",
    "    perDayTarget = min(-(-int(nTweets) // nDays), maxTweets // nDays)\n",
    "    oldest_tweet = None\n",
    "    unique_ids = set()\n",
    "    desiredTweets = []\n",
    "    for day in [(date.today() - timedelta(i)).isoformat() for i in range(nDays)]:\n",
    "        collectedToday = 0\n",
    "        # Count the tweets actually stored for this day; the response itself is a\n",
    "        # dict, so its len() says nothing about how many tweets came back.\n",
    "        while collectedToday < perDayTarget:\n",
    "            for attempt in range(maxApiRetries):\n",
    "                try:\n",
    "                    statuses = api.search(search_term, count=perDayTarget, result_type=\"recent\",\n",
    "                                          max_id=oldest_tweet, until=day, **geoArgs)['statuses']\n",
    "                except Exception as error:\n",
    "                    print(error,'Trying again after 1 minute.')\n",
    "                    time.sleep(60)\n",
    "                else:\n",
    "                    break\n",
    "            else:\n",
    "                # Persistent failure (e.g. bad credentials): hand back what we have\n",
    "                # instead of retrying forever.\n",
    "                print('Twitter search kept failing; returning the tweets collected so far.')\n",
    "                return pd.DataFrame(desiredTweets)\n",
    "\n",
    "            #--- Dont go through an infinite loop trying to fill tweets that don't exist ---\n",
    "            # Earlier days only narrow the same query further, so nothing more can be found.\n",
    "            if not statuses:\n",
    "                print(f'No tweets returned while searching for \\'{search_term}\\'\\n', len(desiredTweets), '\\n', day)\n",
    "                return pd.DataFrame(desiredTweets)\n",
    "\n",
    "            #--- Append relevant tweets to the output list, skipping duplicates ---\n",
    "            for tweet in statuses:\n",
    "                if tweet['id'] in unique_ids:\n",
    "                    continue\n",
    "                unique_ids.add(tweet['id'])\n",
    "                desiredTweets.append({'text':tweet['text'], 'vader':analyzer.polarity_scores(tweet['text'])['compound'],\n",
    "                                      'location':cityCountry,\n",
    "                                      'hashtags':tweet['entities']['hashtags'], 'followers':tweet['user']['followers_count'],\n",
    "                                      'friends_count':tweet['user']['friends_count'],'statuses_count':tweet['user']['statuses_count'],\n",
    "                                      'created_at':datetime.strptime(tweet['created_at'],'%a %b %d %H:%M:%S %z %Y')})\n",
    "                collectedToday += 1\n",
    "            # Next page starts just below the oldest id seen so it is not fetched again.\n",
    "            oldest_tweet = min(tweet['id'] for tweet in statuses) - 1\n",
    "\n",
    "    #--- Print sample tweet (only when there is one) ---\n",
    "    if desiredTweets:\n",
    "        sampleText = desiredTweets[0]['text']\n",
    "        try:\n",
    "            print ('Sample Tweet:',Translator().translate(sampleText, dest='en').text)\n",
    "        except Exception:\n",
    "            # Translation is only cosmetic; fall back to the untranslated text.\n",
    "            print('there was an error translating sample tweet: ',sampleText)\n",
    "    return pd.DataFrame(desiredTweets)\n",
    "def GetTweetsByPopularCities(search_term, numTweets, translateToLocalLanguage = True):\n",
    "    \"\"\"Gather tweets about a term for densely populated world and US cities.\n",
    "\n",
    "    City lists are read from two Wikipedia pages. For every city SearchForData\n",
    "    is called, with a 4 second pause between cities.\n",
    "\n",
    "    Args:\n",
    "        search_term: English search term.\n",
    "        numTweets: Number of tweets requested per city.\n",
    "        translateToLocalLanguage: When True the term is translated into each\n",
    "            world city's language first; if that fails the English term is\n",
    "            used. US cities are always searched with the English term.\n",
    "\n",
    "    Returns:\n",
    "        Tuple (tweetsUS, tweetsWorld) of DataFrames; either one is an empty\n",
    "        DataFrame when no tweets were collected.\n",
    "    \"\"\"\n",
    "    #-- Get the most populated cities from wikipedia (Thank you wikipedia library!) --\n",
    "    html = wp.page(\"List_of_cities_by_population_density\").html().encode(\"UTF-8\")\n",
    "    worldCities = pd.read_html(html)[1]\n",
    "    worldCities = worldCities.drop([2,3,4],axis=1)\n",
    "    worldCities = worldCities.rename(columns={0:'city',1:'population',5:'density',6:'country'})\n",
    "    # .copy() so the column assignments below act on an independent frame, not a slice.\n",
    "    worldCities = worldCities.iloc[1:].copy()\n",
    "    worldCities['population'] = [int(city.split('\\xa0')[-1].split('[')[0].replace(',','')) for city in worldCities['population']]\n",
    "    worldCities['density'] = [int(city.split('\\xa0')[-1].split('[')[0].replace(',','')) for city in worldCities['density']]\n",
    "\n",
    "    #--- population per cities in United States ---\n",
    "    html = wp.page(\"List_of_United_States_cities_by_population_density\").html().encode(\"UTF-8\")\n",
    "    UScities = pd.read_html(html)[1]\n",
    "    UScities = UScities.drop([0,2,4,6,8],axis=1)\n",
    "    UScities = UScities.rename(columns={1:'city',3:'state',5: 'land area (mi^2)',7:'density'})\n",
    "    UScities = UScities.iloc[1:].copy()\n",
    "    UScities['density'] = [float(city.split('\\xa0')[-1].split('[')[0].replace(',','')) for city in UScities['density']]\n",
    "    UScities['land area (mi^2)']=[float(area.split('\\xa0')[-1]) for area in UScities['land area (mi^2)']]\n",
    "\n",
    "    #--- Get tweets by Worlds most densely populated cities ---\n",
    "    translator = Translator()\n",
    "    # The language table is the same for every city: download it once, and only if needed.\n",
    "    languagesDf = getCountryLanguages() if translateToLocalLanguage else None\n",
    "    worldFrames = []\n",
    "    for index,row in worldCities.iterrows():\n",
    "        city,pop,density,country = row\n",
    "        cityCountry = city+' , '+country\n",
    "        #-- language conversion: fall back to the English term on any failure --\n",
    "        translatedSearch = search_term\n",
    "        if translateToLocalLanguage:\n",
    "            language = None\n",
    "            try:\n",
    "                language = languagesDf.loc[country,'language']\n",
    "                translatedSearch = translator.translate(search_term, src='en', dest=language).text\n",
    "            except Exception as error:\n",
    "                # Covers a country missing from the table as well as translator errors.\n",
    "                print(\"could not translate \", language if language is not None else country, error)\n",
    "                print('translated word: ',translatedSearch)\n",
    "\n",
    "        cityTweets = SearchForData(translatedSearch, numTweets, cityCountry, 100)\n",
    "        # Keep only real, non-empty results so one failed city cannot wipe out or\n",
    "        # break the tweets gathered for the others.\n",
    "        if isinstance(cityTweets, pd.DataFrame) and not cityTweets.empty:\n",
    "            worldFrames.append(cityTweets)\n",
    "        print('\\n')\n",
    "        time.sleep(4)\n",
    "\n",
    "    #--- Add US Cities ---\n",
    "    usFrames = []\n",
    "    for index,row in UScities.iterrows():\n",
    "        city,state,area,density = row\n",
    "        # NOTE(review): this builds 'state , city', the reverse of the 'city,Country'\n",
    "        # order used for world cities above - confirm that is intended.\n",
    "        cityCountry = state+' , '+city\n",
    "        # US searches use the original English term, not whatever translation the\n",
    "        # last world city happened to leave behind.\n",
    "        cityTweets = SearchForData(search_term, numTweets, cityCountry, max(area,5))\n",
    "        if isinstance(cityTweets, pd.DataFrame) and not cityTweets.empty:\n",
    "            usFrames.append(cityTweets)\n",
    "        print('\\n')\n",
    "        time.sleep(4)\n",
    "\n",
    "    # Concatenate once at the end; pd.concat rejects an empty list, hence the guards.\n",
    "    tweetsWorld = pd.concat(worldFrames, axis=0) if worldFrames else pd.DataFrame()\n",
    "    tweetsUS = pd.concat(usFrames, axis=0) if usFrames else pd.DataFrame()\n",
    "    return tweetsUS, tweetsWorld\n",
    "# Request 200 tweets about 'trump' for every listed city; False leaves the search term untranslated.\n",
    "tweetsUS, tweetsWorld = GetTweetsByPopularCities('trump', 200, False)\n",
    "# Single-location example (cityCountry and radius are optional arguments of SearchForData):\n",
    "#tweets = SearchForData(search_term='baguettes', nTweets=100, cityCountry='paris,france',radius=100)\n",
   "cell_type": "code",
   "execution_count": 124,
   "metadata": {},
   "source": [
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from datetime import datetime\n",
    "import matplotlib.pyplot as plt\n",
    "# NOTE(review): this cell indexes `tweets` like a list of raw status dicts whose\n",
    "# 'created_at' is still a Twitter-formatted string; no cell shown here defines it - confirm.\n",
    "TWITTER_TIME_FORMAT = '%a %b %d %H:%M:%S %z %Y'\n",
    "# Parse every timestamp exactly once instead of re-parsing it in each expression.\n",
    "createdAt = [datetime.strptime(tweet['created_at'], TWITTER_TIME_FORMAT) for tweet in tweets]\n",
    "# Minutes between each tweet and the one listed after it. Built fresh on every run,\n",
    "# so the cell no longer needs a `timeBetween` list left over from an earlier cell.\n",
    "timeBetween = [(previous - current).total_seconds()/60 for previous, current in zip(createdAt, createdAt[1:])]\n",
    "# One y value per tweet, whatever the number of tweets is.\n",
    "plt.plot(createdAt[::-1], range(len(createdAt)))\n",
    "if createdAt:\n",
    "    # Whole hours between the first and the last tweet in the list.\n",
    "    print((createdAt[-1] - createdAt[0]).total_seconds()//60//60)"
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#This code came from the following link:\n",
    "import requests\n",
    "import pandas as pd\n",
    "from bs4 import BeautifulSoup\n",
    "import re\n",
    "class HTMLTableParser():\n",
    "    def parse_url(self, url):\n",
    "        response = requests.get(url)\n",
    "        soup = BeautifulSoup(response.text, 'lxml')\n",
    "        listylist=[]\n",
    "        for table in soup.find_all('table'):\n",
    "            listylist.append(self.parse_html_table(table))\n",
    "        return listylist\n",
    "    def parse_html_table(self, table):\n",
    "        n_columns = 0; n_rows=0; column_names = []\n",
    "        # Find number of rows and columns\n",
    "        # we also find the column titles if we can\n",
    "        for row in table.find_all('tr'):\n",
    "            # Determine the number of rows in the table\n",
    "            td_tags = row.find_all('td')\n",
    "            if len(td_tags) > 0:\n",
    "                n_rows+=1\n",
    "                if n_columns == 0:\n",
    "                    # Set the number of columns for our table\n",
    "                    n_columns = len(td_tags)\n",
    "            # Handle column names if we find them\n",
    "            th_tags = row.find_all('th') \n",
    "            if len(th_tags) > 0 and len(column_names) == 0:\n",
    "                for th in th_tags:\n",
    "                    column_names.append(th.get_text())\n",
    "        # Safeguard on Column Titles\n",
    "        if len(column_names) > 0 and len(column_names) != n_columns:\n",
    "            raise Exception(\"Column titles do not match the number of columns\")\n",
    "        columns = column_names if len(column_names) > 0 else range(0,n_columns)\n",
    "        df = pd.DataFrame(columns = columns,\n",
    "                          index= range(0,n_rows))\n",
    "        row_marker = 0\n",
    "        for row in table.find_all('tr'):\n",
    "            column_marker = 0\n",
    "            columns = row.find_all('td')\n",
    "            for column in columns:\n",
    "                df.iat[row_marker,column_marker] = column.get_text()\n",
    "                column_marker += 1\n",
    "            if len(columns) > 0:\n",
    "                row_marker += 1\n",
    "        # Convert to float if possible\n",
    "        for col in df:\n",
    "            try:\n",
    "                df[col] = df[col].astype(float)\n",
    "            except ValueError:\n",
    "                pass\n",
    "        return df\n",
    "#TODO: Use the .apply to just change the table to one dialect. Improve language scope later.\n",
    "obj = HTMLTableParser()\n",
    "# NOTE(review): this URL sends the request through a third-party proxy host rather\n",
    "# than to www.infoplease.com directly - confirm that is intentional.\n",
    "df = obj.parse_url('https://www.infoplease.com/world/countries-world/languages-spoken-each-country-world')\n",
    "countryLanguages = df[0].rename(columns={0:'country',1:'language'}).set_index('country')\n",
    "# Strip percentages (decimals included), each parenthesised note and all whitespace,\n",
    "# then keep the text before the first ',' or ';'. Removing every note on its own\n",
    "# ([^)]* instead of a greedy .*) stops text between two notes from being glued onto\n",
    "# the name ('Frenchlanguages'), and the optional fraction avoids a stray '.' ('Uzbek.').\n",
    "# Pairing the index with the first column positionally also works when a country\n",
    "# label occurs more than once.\n",
    "newDict = [[country, re.split(r'[,;]', re.sub(r'\\d+(?:\\.\\d+)?|%|\\([^)]*\\)|\\s', '', description))[0]]\n",
    "           for country, description in zip(countryLanguages.index, countryLanguages.iloc[:, 0])]\n",
    "newDf = pd.DataFrame(newDict, columns=['country', 'language']).set_index('country')\n",
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
     "data": {
      "text/html": [
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>language</th>\n",
       "      <th>languages</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>country</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>Afghanistan</th>\n",
       "      <td>Dari Persian, Pashtu (both official), other Tu...</td>\n",
       "      <td>DariPersian</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Albania</th>\n",
       "      <td>Albanian (Tosk is the official dialect), Greek</td>\n",
       "      <td>Albanian</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Algeria</th>\n",
       "      <td>Arabic (official), French, Berber dialects</td>\n",
       "      <td>Arabic</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Andorra</th>\n",
       "      <td>Catalán (official), French, Castilian, Portuguese</td>\n",
       "      <td>Catalán</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Angola</th>\n",
       "      <td>Portuguese (official), Bantu and other African...</td>\n",
       "      <td>Portuguese</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Antigua and Barbuda</th>\n",
       "      <td>English (official), local dialects</td>\n",
       "      <td>English</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Argentina</th>\n",
       "      <td>Spanish (official), English, Italian, German, ...</td>\n",
       "      <td>Spanish</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Armenia</th>\n",
       "      <td>Armenian 98%, Yezidi, Russian</td>\n",
       "      <td>Armenian</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Australia</th>\n",
       "      <td>English 79%, native and other languages</td>\n",
       "      <td>English</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Austria</th>\n",
       "      <td>German (official nationwide); Slovene, Croatia...</td>\n",
       "      <td>German</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Azerbaijan</th>\n",
       "      <td>Azerbaijani Turkic 89%, Russian 3%, Armenian 2...</td>\n",
       "      <td>AzerbaijaniTurkic</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Bahamas</th>\n",
       "      <td>English (official), Creole (among Haitian immi...</td>\n",
       "      <td>English</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Bahrain</th>\n",
       "      <td>Arabic, English, Farsi, Urdu</td>\n",
       "      <td>Arabic</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Bangladesh</th>\n",
       "      <td>Bangla (official), English</td>\n",
       "      <td>Bangla</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Barbados</th>\n",
       "      <td>English</td>\n",
       "      <td>English</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Belarus</th>\n",
       "      <td>Belorussian (White Russian), Russian, other</td>\n",
       "      <td>Belorussian</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Belgium</th>\n",
       "      <td>Dutch (Flemish) 60%, French 40%, German less t...</td>\n",
       "      <td>Dutch</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Belize</th>\n",
       "      <td>English (official), Spanish, Mayan, Garifuna (...</td>\n",
       "      <td>English</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Benin</th>\n",
       "      <td>French (official), Fon, Yoruba, tribal languages</td>\n",
       "      <td>French</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Bhutan</th>\n",
       "      <td>Dzongkha (official), Tibetan dialects (among B...</td>\n",
       "      <td>Dzongkha</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Bolivia</th>\n",
       "      <td>Spanish, Quechua, Aymara (all official)</td>\n",
       "      <td>Spanish</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Bosnia and Herzegovina</th>\n",
       "      <td>Bosnian, Croatian, Serbian</td>\n",
       "      <td>Bosnian</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Botswana</th>\n",
       "      <td>English 2% (official), Setswana 78%, Kalanga 8...</td>\n",
       "      <td>English</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Brazil</th>\n",
       "      <td>Portuguese (official), Spanish, English, French</td>\n",
       "      <td>Portuguese</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Brunei</th>\n",
       "      <td>Malay (official), English, Chinese</td>\n",
       "      <td>Malay</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Bulgaria</th>\n",
       "      <td>Bulgarian 85%, Turkish 10%, Roma 4%</td>\n",
       "      <td>Bulgarian</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Burkina Faso</th>\n",
       "      <td>French (official); native African (Sudanic) la...</td>\n",
       "      <td>Frenchlanguages</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Burundi</th>\n",
       "      <td>Kirundi and French (official), Swahili</td>\n",
       "      <td>KirundiandFrench</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Cambodia</th>\n",
       "      <td>Khmer 95% (official), French, English</td>\n",
       "      <td>Khmer</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Cameroon</th>\n",
       "      <td>French, English (both official); 24 major Afri...</td>\n",
       "      <td>French</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Swaziland</th>\n",
       "      <td>English, siSwati (both official)</td>\n",
       "      <td>English</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Sweden</th>\n",
       "      <td>Swedish, small Sami- and Finnish-speaking mino...</td>\n",
       "      <td>Swedish</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Switzerland</th>\n",
       "      <td>German 64%, French 20%, Italian 7% (all offici...</td>\n",
       "      <td>German</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Syria</th>\n",
       "      <td>Arabic (official); Kurdish, Armenian, Aramaic,...</td>\n",
       "      <td>Arabic</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Taiwan</th>\n",
       "      <td>Chinese (Mandarin, official), Taiwanese (Min),...</td>\n",
       "      <td>Chinese</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Tajikistan</th>\n",
       "      <td>Tajik (official), Russian widely used in gover...</td>\n",
       "      <td>Tajik</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Tanzania</th>\n",
       "      <td>Swahili, English (both official); Arabic; many...</td>\n",
       "      <td>Swahili</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Thailand</th>\n",
       "      <td>Thai (Siamese), English (secondary language of...</td>\n",
       "      <td>Thai</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Togo</th>\n",
       "      <td>French (official, commerce); Ewé, Mina (south)...</td>\n",
       "      <td>French</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Tonga</th>\n",
       "      <td>Tongan (an Austronesian language), English</td>\n",
       "      <td>Tongan</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Trinidad and Tobago</th>\n",
       "      <td>English (official), Hindi, French, Spanish, Ch...</td>\n",
       "      <td>English</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Tunisia</th>\n",
       "      <td>Arabic (official, commerce), French (commerce)</td>\n",
       "      <td>Arabic</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Turkey</th>\n",
       "      <td>Turkish (official), Kurdish, Dimli, Azeri, Kab...</td>\n",
       "      <td>Turkish</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Turkmenistan</th>\n",
       "      <td>Turkmen 72%; Russian 12%; Uzbek 9%, other 7%</td>\n",
       "      <td>Turkmen</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Tuvalu</th>\n",
       "      <td>Tuvaluan, English, Samoan, Kiribati (on the is...</td>\n",
       "      <td>Tuvaluan</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Uganda</th>\n",
       "      <td>English (official), Ganda or Luganda, other Ni...</td>\n",
       "      <td>English</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Ukraine</th>\n",
       "      <td>Ukrainian 67%, Russian 24%, Romanian, Polish, ...</td>\n",
       "      <td>Ukrainian</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>United Arab Emirates</th>\n",
       "      <td>Arabic (official), Persian, English, Hindi, Urdu</td>\n",
       "      <td>Arabic</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>United Kingdom</th>\n",
       "      <td>English, Welsh, Scots Gaelic</td>\n",
       "      <td>English</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>United States</th>\n",
       "      <td>English 82%, Spanish 11% (2000)</td>\n",
       "      <td>English</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Uruguay</th>\n",
       "      <td>Spanish, Portunol, or Brazilero</td>\n",
       "      <td>Spanish</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Uzbekistan</th>\n",
       "      <td>Uzbek 74.3%, Russian 14.2%, Tajik 4.4%, other ...</td>\n",
       "      <td>Uzbek.</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Vanuatu</th>\n",
       "      <td>Bislama 23% (a Melanesian pidgin English), Eng...</td>\n",
       "      <td>Bislama</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Vatican City (Holy See)</th>\n",
       "      <td>Italian, Latin, French, various other languages</td>\n",
       "      <td>Italian</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Venezuela</th>\n",
       "      <td>Spanish (official), numerous indigenous dialects</td>\n",
       "      <td>Spanish</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Vietnam</th>\n",
       "      <td>Vietnamese (official); English (increasingly f...</td>\n",
       "      <td>Vietnamese</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Western Sahara (proposed state)</th>\n",
       "      <td>Hassaniya Arabic, Moroccan Arabic</td>\n",
       "      <td>HassaniyaArabic</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Yemen</th>\n",
       "      <td>Arabic</td>\n",
       "      <td>Arabic</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Zambia</th>\n",
       "      <td>English (official); major vernaculars: Bemba, ...</td>\n",
       "      <td>English</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Zimbabwe</th>\n",
       "      <td>English (official), Shona, Ndebele (Sindebele)...</td>\n",
       "      <td>English</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "<p>198 rows × 2 columns</p>\n",
      "text/plain": [
       "                                                                          language  \\\n",
       "country                                                                              \n",
       "Afghanistan                      Dari Persian, Pashtu (both official), other Tu...   \n",
       "Albania                             Albanian (Tosk is the official dialect), Greek   \n",
       "Algeria                                 Arabic (official), French, Berber dialects   \n",
       "Andorra                          Catalán (official), French, Castilian, Portuguese   \n",
       "Angola                           Portuguese (official), Bantu and other African...   \n",
       "Antigua and Barbuda                             English (official), local dialects   \n",
       "Argentina                        Spanish (official), English, Italian, German, ...   \n",
       "Armenia                                              Armenian 98%, Yezidi, Russian   \n",
       "Australia                                  English 79%, native and other languages   \n",
       "Austria                          German (official nationwide); Slovene, Croatia...   \n",
       "Azerbaijan                       Azerbaijani Turkic 89%, Russian 3%, Armenian 2...   \n",
       "Bahamas                          English (official), Creole (among Haitian immi...   \n",
       "Bahrain                                               Arabic, English, Farsi, Urdu   \n",
       "Bangladesh                                              Bangla (official), English   \n",
       "Barbados                                                                   English   \n",
       "Belarus                                Belorussian (White Russian), Russian, other   \n",
       "Belgium                          Dutch (Flemish) 60%, French 40%, German less t...   \n",
       "Belize                           English (official), Spanish, Mayan, Garifuna (...   \n",
       "Benin                             French (official), Fon, Yoruba, tribal languages   \n",
       "Bhutan                           Dzongkha (official), Tibetan dialects (among B...   \n",
       "Bolivia                                    Spanish, Quechua, Aymara (all official)   \n",
       "Bosnia and Herzegovina                                  Bosnian, Croatian, Serbian   \n",
       "Botswana                         English 2% (official), Setswana 78%, Kalanga 8...   \n",
       "Brazil                             Portuguese (official), Spanish, English, French   \n",
       "Brunei                                          Malay (official), English, Chinese   \n",
       "Bulgaria                                       Bulgarian 85%, Turkish 10%, Roma 4%   \n",
       "Burkina Faso                     French (official); native African (Sudanic) la...   \n",
       "Burundi                                     Kirundi and French (official), Swahili   \n",
       "Cambodia                                     Khmer 95% (official), French, English   \n",
       "Cameroon                         French, English (both official); 24 major Afri...   \n",
       "...                                                                            ...   \n",
       "Swaziland                                         English, siSwati (both official)   \n",
       "Sweden                           Swedish, small Sami- and Finnish-speaking mino...   \n",
       "Switzerland                      German 64%, French 20%, Italian 7% (all offici...   \n",
       "Syria                            Arabic (official); Kurdish, Armenian, Aramaic,...   \n",
       "Taiwan                           Chinese (Mandarin, official), Taiwanese (Min),...   \n",
       "Tajikistan                       Tajik (official), Russian widely used in gover...   \n",
       "Tanzania                         Swahili, English (both official); Arabic; many...   \n",
       "Thailand                         Thai (Siamese), English (secondary language of...   \n",
       "Togo                             French (official, commerce); Ewé, Mina (south)...   \n",
       "Tonga                                   Tongan (an Austronesian language), English   \n",
       "Trinidad and Tobago              English (official), Hindi, French, Spanish, Ch...   \n",
       "Tunisia                             Arabic (official, commerce), French (commerce)   \n",
       "Turkey                           Turkish (official), Kurdish, Dimli, Azeri, Kab...   \n",
       "Turkmenistan                          Turkmen 72%; Russian 12%; Uzbek 9%, other 7%   \n",
       "Tuvalu                           Tuvaluan, English, Samoan, Kiribati (on the is...   \n",
       "Uganda                           English (official), Ganda or Luganda, other Ni...   \n",
       "Ukraine                          Ukrainian 67%, Russian 24%, Romanian, Polish, ...   \n",
       "United Arab Emirates              Arabic (official), Persian, English, Hindi, Urdu   \n",
       "United Kingdom                                        English, Welsh, Scots Gaelic   \n",
       "United States                                      English 82%, Spanish 11% (2000)   \n",
       "Uruguay                                            Spanish, Portunol, or Brazilero   \n",
       "Uzbekistan                       Uzbek 74.3%, Russian 14.2%, Tajik 4.4%, other ...   \n",
       "Vanuatu                          Bislama 23% (a Melanesian pidgin English), Eng...   \n",
       "Vatican City (Holy See)            Italian, Latin, French, various other languages   \n",
       "Venezuela                         Spanish (official), numerous indigenous dialects   \n",
       "Vietnam                          Vietnamese (official); English (increasingly f...   \n",
       "Western Sahara (proposed state)                  Hassaniya Arabic, Moroccan Arabic   \n",
       "Yemen                                                                       Arabic   \n",
       "Zambia                           English (official); major vernaculars: Bemba, ...   \n",
       "Zimbabwe                         English (official), Shona, Ndebele (Sindebele)...   \n",
       "                                         languages  \n",
       "country                                             \n",
       "Afghanistan                            DariPersian  \n",
       "Albania                                   Albanian  \n",
       "Algeria                                     Arabic  \n",
       "Andorra                                    Catalán  \n",
       "Angola                                  Portuguese  \n",
       "Antigua and Barbuda                        English  \n",
       "Argentina                                  Spanish  \n",
       "Armenia                                   Armenian  \n",
       "Australia                                  English  \n",
       "Austria                                     German  \n",
       "Azerbaijan                       AzerbaijaniTurkic  \n",
       "Bahamas                                    English  \n",
       "Bahrain                                     Arabic  \n",
       "Bangladesh                                  Bangla  \n",
       "Barbados                                   English  \n",
       "Belarus                                Belorussian  \n",
       "Belgium                                      Dutch  \n",
       "Belize                                     English  \n",
       "Benin                                       French  \n",
       "Bhutan                                    Dzongkha  \n",
       "Bolivia                                    Spanish  \n",
       "Bosnia and Herzegovina                     Bosnian  \n",
       "Botswana                                   English  \n",
       "Brazil                                  Portuguese  \n",
       "Brunei                                       Malay  \n",
       "Bulgaria                                 Bulgarian  \n",
       "Burkina Faso                       Frenchlanguages  \n",
       "Burundi                           KirundiandFrench  \n",
       "Cambodia                                     Khmer  \n",
       "Cameroon                                    French  \n",
       "...                                            ...  \n",
       "Swaziland                                  English  \n",
       "Sweden                                     Swedish  \n",
       "Switzerland                                 German  \n",
       "Syria                                       Arabic  \n",
       "Taiwan                                     Chinese  \n",
       "Tajikistan                                   Tajik  \n",
       "Tanzania                                   Swahili  \n",
       "Thailand                                      Thai  \n",
       "Togo                                        French  \n",
       "Tonga                                       Tongan  \n",
       "Trinidad and Tobago                        English  \n",
       "Tunisia                                     Arabic  \n",
       "Turkey                                     Turkish  \n",
       "Turkmenistan                               Turkmen  \n",
       "Tuvalu                                    Tuvaluan  \n",
       "Uganda                                     English  \n",
       "Ukraine                                  Ukrainian  \n",
       "United Arab Emirates                        Arabic  \n",
       "United Kingdom                             English  \n",
       "United States                              English  \n",
       "Uruguay                                    Spanish  \n",
       "Uzbekistan                                  Uzbek.  \n",
       "Vanuatu                                    Bislama  \n",
       "Vatican City (Holy See)                    Italian  \n",
       "Venezuela                                  Spanish  \n",
       "Vietnam                                 Vietnamese  \n",
       "Western Sahara (proposed state)    HassaniyaArabic  \n",
       "Yemen                                       Arabic  \n",
       "Zambia                                     English  \n",
       "Zimbabwe                                   English  \n",
       "[198 rows x 2 columns]"
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
   "source": [
    "# Scrape the infoplease 'languages spoken in each country' page. The first table\n",
    "# returned by parse_url has the country in column 0 and a free-text description\n",
    "# of its languages in column 1.\n",
    "df = parse_url('https://www.infoplease.com/world/countries-world/languages-spoken-each-country-world')\n",
    "countryLanguages = df[0].rename(columns={0:'country',1:'language'}).set_index('country')\n",
    "\n",
    "def _primary_language(description):\n",
    "    \"\"\"Return the first language named in a free-text language description.\n",
    "\n",
    "    Parenthetical notes and percentages (including decimals such as '74.3%')\n",
    "    are dropped, and only the text before the first ',' or ';' is kept.\n",
    "    Spaces inside a multi-word language name are preserved.\n",
    "    \"\"\"\n",
    "    # Remove each parenthetical on its own; a greedy match would also eat the\n",
    "    # text lying between the first '(' and the last ')' of the description.\n",
    "    without_notes = re.sub(r'\\([^)]*\\)', '', description)\n",
    "    # Entries are separated by ',' or ';' -- keep only the first one.\n",
    "    first_entry = re.split(r'[,;]', without_notes, maxsplit=1)[0]\n",
    "    # Strip whole percentages so that '74.3%' does not leave a stray '.'.\n",
    "    without_numbers = re.sub(r'\\d+(?:\\.\\d+)?\\s*%?', '', first_entry)\n",
    "    # Collapse leftover whitespace instead of deleting every space, so that\n",
    "    # names such as 'Dari Persian' are not fused into 'DariPersian'.\n",
    "    return ' '.join(without_numbers.split())\n",
    "\n",
    "countryLanguages['languages'] = [_primary_language(i) for i in countryLanguages['language']]\n",
    "countryLanguages"
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Quick check: translate a Spanish sentence containing emoji into English\n",
    "# and display the translated text.\n",
    "translator.translate(\n",
    "    'Hola me llamo Jennifer 😜😜',\n",
    "    dest='en',\n",
    ").text"
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "source": [
   "cell_type": "code",
   "source": [
    "# Authenticate against the Twitter API with the imported OAuth credentials.\n",
    "auth = tweepy.OAuthHandler(consumer_key, consumer_secret)\n",
    "auth.set_access_token(access_token, access_token_secret)\n",
    "\n",
    "# Build the API client, using tweepy's JSONParser for responses.\n",
    "api = tweepy.API(\n",
    "    auth,\n",
    "    parser=tweepy.parsers.JSONParser(),\n",
    ")\n",
    "\n",
    "# Search for 'trump', restricted by the geocode filter below.\n",
    "tweetsa = api.search(\n",
    "    q='trump',\n",
    "    geocode='52.132633,5.29126,100km',\n",
    ")\n",
   "cell_type": "code",
   "source": [
   "cell_type": "code",
   "source": [
    "# Load the Wikipedia page 'List of United States cities by population density'\n",
    "# (via the wikipedia library) and take the second table pandas finds in its HTML.\n",
    "html = wp.page(\"List_of_United_States_cities_by_population_density\").html().encode(\"UTF-8\")\n",
    "df = pd.read_html(html)[1]\n",
    "\n",
    "# Turn each 'density' cell into an int: keep the text after the last non-breaking\n",
    "# space, drop any '[...' suffix, remove thousands separators, and truncate the\n",
    "# decimals. (A 'population' column could be parsed the same way, without the\n",
    "# final decimal split, if it is ever needed.)\n",
    "df['density'] = [\n",
    "    int(cell.split('\\xa0')[-1].split('[')[0].replace(',', '').split('.')[0])\n",
    "    for cell in df['density']\n",
    "]"
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
