{ "cells": [ { "cell_type": "code", "execution_count": 140, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Manila , Philippines : 14.5995124,120.9842195,100mi\n", "[{'message': 'Rate limit exceeded', 'code': 88}] Trying again after 1 minute.\n", "[{'message': 'Rate limit exceeded', 'code': 88}] Trying again after 1 minute.\n", "[{'message': 'Rate limit exceeded', 'code': 88}] Trying again after 1 minute.\n" ] }, { "ename": "KeyboardInterrupt", "evalue": "", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mUnboundLocalError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m<ipython-input-140-020a8135f50b>\u001b[0m in \u001b[0;36mGetTweetsByPopularCities\u001b[0;34m(search_term, numTweets, translateToLocalLanguage)\u001b[0m\n\u001b[1;32m 214\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 215\u001b[0;31m \u001b[0mtweetsWorld\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconcat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mtweetsWorld\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mSearchForData\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtranslatedSearch\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnumTweets\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcityCountry\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m100\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 216\u001b[0m \u001b[0;32mexcept\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mUnboundLocalError\u001b[0m: local variable 'tweetsWorld' referenced before assignment", "\nDuring handling of the above exception, another exception occurred:\n", "\u001b[0;31mRateLimitError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m<ipython-input-140-020a8135f50b>\u001b[0m in \u001b[0;36mSearchForData\u001b[0;34m(search_term, nTweets, cityCountry, radius)\u001b[0m\n\u001b[1;32m 130\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mcityCountry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 131\u001b[0;31m \u001b[0mtweetsPerDay\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mapi\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msearch\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msearch_term\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcount\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mnTweetsPerDay\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mresult_type\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"recent\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmax_id\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0moldest_tweet\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mgeocode\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcoords\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0muntil\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mday\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 132\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/anaconda3/lib/python3.6/site-packages/tweepy/binder.py\u001b[0m in \u001b[0;36m_call\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 249\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 250\u001b[0;31m \u001b[0;32mreturn\u001b[0m 
\u001b[0mmethod\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mexecute\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 251\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/anaconda3/lib/python3.6/site-packages/tweepy/binder.py\u001b[0m in \u001b[0;36mexecute\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 231\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mis_rate_limit_error_message\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0merror_msg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 232\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mRateLimitError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0merror_msg\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mresp\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 233\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mRateLimitError\u001b[0m: [{'message': 'Rate limit exceeded', 'code': 88}]", "\nDuring handling of the above exception, another exception occurred:\n", "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", "\u001b[0;32m<ipython-input-140-020a8135f50b>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 245\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 246\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 247\u001b[0;31m \u001b[0mtweetsUS\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtweetsWorld\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mGetTweetsByPopularCities\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'trump'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m200\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 248\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtweetsUS\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgroupby\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'location'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmean\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 249\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtweetsWorld\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgroupby\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'location'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmean\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m<ipython-input-140-020a8135f50b>\u001b[0m in \u001b[0;36mGetTweetsByPopularCities\u001b[0;34m(search_term, numTweets, translateToLocalLanguage)\u001b[0m\n\u001b[1;32m 215\u001b[0m \u001b[0mtweetsWorld\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconcat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mtweetsWorld\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mSearchForData\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtranslatedSearch\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnumTweets\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcityCountry\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m100\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 216\u001b[0m \u001b[0;32mexcept\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 217\u001b[0;31m \u001b[0mtweetsWorld\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mSearchForData\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtranslatedSearch\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnumTweets\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcityCountry\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m100\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 218\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'\\n'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 219\u001b[0m \u001b[0mtime\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msleep\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m4\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m<ipython-input-140-020a8135f50b>\u001b[0m in \u001b[0;36mSearchForData\u001b[0;34m(search_term, nTweets, cityCountry, radius)\u001b[0m\n\u001b[1;32m 134\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mException\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0merror\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 135\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0merror\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m'Trying again after 1 minute.'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 136\u001b[0;31m \u001b[0mtime\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msleep\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m60\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 137\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 138\u001b[0m \u001b[0;32mbreak\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mKeyboardInterrupt\u001b[0m: " ] } ], "source": [ "import pandas as pd\n", "import wikipedia as wp\n", "from pygeocoder import Geocoder\n", "import time\n", "from googletrans import Translator\n", "# Import and Initialize Sentiment Analyzer\n", "from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer\n", "analyzer = SentimentIntensityAnalyzer()\n", "import requests\n", "import pandas as pd\n", "from bs4 import BeautifulSoup\n", "import re\n", "from datetime import datetime\n", "from datetime import date, timedelta\n", "\n", "import tweepy; import json\n", "from apikeys import twitterAccessToken as access_token\n", "from apikeys import twitterAccessTokenSecret as access_token_secret\n", "from apikeys import twitterConsumerKey as consumer_key\n", "from apikeys import twitterConsumerSecretKey as consumer_secret\n", "\n", "def parse_url( url):\n", " response = requests.get(url)\n", " soup = BeautifulSoup(response.text, 'lxml')\n", " listylist=[]\n", " for table in soup.find_all('table'):\n", " listylist.append(parse_html_table(table))\n", " return listylist\n", "\n", "def parse_html_table( table):\n", " n_columns = 0; n_rows=0; column_names = []\n", "\n", " # Find number of rows and columns\n", " # we also find the column titles if we can\n", " for row in table.find_all('tr'):\n", "\n", " # Determine the number of rows in the table\n", " td_tags = row.find_all('td')\n", " if len(td_tags) > 0:\n", " n_rows+=1\n", " if n_columns == 0:\n", " # Set the number of columns for our table\n", " n_columns = len(td_tags)\n", "\n", " # Handle column names if we find them\n", " th_tags = row.find_all('th') \n", " if len(th_tags) > 0 and len(column_names) == 0:\n", " for th in th_tags:\n", " column_names.append(th.get_text())\n", "\n", " # Safeguard on Column Titles\n", " if len(column_names) > 0 and len(column_names) != n_columns:\n", " raise Exception(\"Column titles do not match the number of columns\")\n", "\n", " 
columns = column_names if len(column_names) > 0 else range(0,n_columns)\n", " df = pd.DataFrame(columns = columns,\n", " index= range(0,n_rows))\n", " row_marker = 0\n", " for row in table.find_all('tr'):\n", " column_marker = 0\n", " columns = row.find_all('td')\n", " for column in columns:\n", " df.iat[row_marker,column_marker] = column.get_text()\n", " column_marker += 1\n", " if len(columns) > 0:\n", " row_marker += 1\n", "\n", " # Convert to float if possible\n", " for col in df:\n", " try:\n", " df[col] = df[col].astype(float)\n", " except ValueError:\n", " pass\n", "\n", " return df\n", "\n", "def getCountryLanguages():\n", " #TODO: Use the .apply to just change the table to one dialect. Imrpove language scope later.\n", " df = parse_url('https://www.infoplease.com/world/countries-world/languages-spoken-each-country-world')\n", " countryLanguages = df[0].rename(columns={0:'country',1:'language'}).set_index('country')\n", " countryLanguages['language'] = [re.sub('\\d+|%|\\(.*\\)|\\s','',i).split(',')[0].split(';')[0] for i in countryLanguages['language']]\n", " return countryLanguages\n", "\n", "\n", "#returns hashtag, followers, following, text, geo, date\n", "#cityCountry example: 'paris,france'\n", "def SearchForData(search_term, nTweets, cityCountry='',radius=100):\n", " \n", " # Setup Tweepy API Authentication\n", " auth = tweepy.OAuthHandler(consumer_key, consumer_secret)\n", " auth.set_access_token(access_token, access_token_secret)\n", " api = tweepy.API(auth, parser=tweepy.parsers.JSONParser())\n", " \n", " #--- Calculate geocordinates from cityCountry --- --- ---- ---- --- --- --- --\n", " geoConvertTries = 0\n", " while True:\n", " try:\n", " result = Geocoder.geocode(cityCountry)\n", " except Exception as error:\n", " #print('errrooooorrrrr: ',error.message)\n", " if 'OVER_QUERY_LIMIT' in str(error):\n", " print('Encountered an error:{0}\\nWaiting 30 seconds and trying again.'.format(error))\n", " time.sleep(30)\n", " if geoConvertTries>10:\n", " print(\"Could not convert geo. returning empty list\")\n", " return []\n", " elif not (re.search('^\\w+,\\w+$',cityCountry)):\n", " print(\"cityCountry input format is incorrect. It should be \\'city,Country\\' like \\'paris,france\\'\")\n", " return []\n", " else:\n", " print(\"Could not convert geo. 
returning empty list\")\n", " return []\n", " else:\n", " break\n", " geoConvertTries+=1\n", " # 34.0934,56.134,50mi\n", " coords = str(result[0].coordinates).replace('(','').replace(')','') + f',{radius}mi'\n", " coords=coords.replace(' ','')\n", " print(cityCountry, \": \", coords)\n", " #--- ---- ----- ---- ---- ---- ---- ---- --- ---- ---- --- ---- --- ---- --- --\n", "\n", " #--- grab tweets --- ---- ---- ---- ---- ---- ---- ---- --- --- ---- ---- ----\n", " maxTweets = 10000; oldest_tweet = None; unique_ids = []; desiredTweets = [];nTweetsPerDay=nTweets/8\n", " for day,num in zip([str(date.today() - timedelta(i)).split()[0] for i in range(8)], range(1,9)):\n", " tweetsPerDay=[]\n", " while len(tweetsPerDay) < min(nTweetsPerDay,maxTweets/8):\n", " #--- determine whether to grab tweets by geo or not --- ---- --- ----- --\n", " while True:\n", " try:\n", " if cityCountry:\n", " tweetsPerDay = api.search(search_term, count=nTweetsPerDay, result_type=\"recent\", max_id=oldest_tweet, geocode=coords, until=day)\n", " else:\n", " tweetsPerDay = api.search(search_term, count=nTweetsPerDay, result_type=\"recent\", max_id=oldest_tweet, until=day)\n", " except Exception as error:\n", " print(error,'Trying again after 1 minute.')\n", " time.sleep(60)\n", " else:\n", " break\n", " #---- ----- ----- ---- ----- ---- ----- ---- ----- ---- ---- ---- ---- --\n", "\n", " #--- Dont go through an infinite loop trying to fill tweets that don't exist -----\n", " if len(tweetsPerDay['statuses'])==0:\n", " print(f'No tweets returned while searching for \\'{search_term}\\'\\n',len(desiredTweets)\\\n", " ,'\\n',day)\n", " return pd.DataFrame(desiredTweets)\n", "\n", " #--- Append relevent tweets to output listy list ---- --- ---- ---- ---- --- ---\n", " for tweet in tweetsPerDay['statuses']:\n", " # Append tweet_id to ids list if it doesn't already exist. This allows checking for duplicate tweets\n", " if tweet[\"id\"] not in unique_ids :\n", " unique_ids.append(tweet['id'])\n", " desiredTweets.append({'text':tweet['text'], 'vader':analyzer.polarity_scores(tweet['text'])['compound'],\n", " 'location':cityCountry,\n", " 'hashtags':tweet['entities']['hashtags'], 'followers':tweet['user']['followers_count'],\n", " 'friends_count':tweet['user']['friends_count'],'statuses_count':tweet['user']['statuses_count'],\n", " 'created_at':datetime.strptime(tweet['created_at'],'%a %b %d %H:%M:%S %z %Y')})\n", " \n", " # Reassign the the oldest tweet (i.e. the max_id) subtract 1 so the previous oldest isn't included\n", " oldest_tweet = tweet[\"id\"] - 1\n", " \n", "\n", " #--- Print sample tweet --- --- ---- ---- --- ---- ---- --- ---- ---- ---\n", " translator = Translator()\n", " try:\n", " print ('Sample Tweet:',translator.translate(desiredTweets[0]['text'], dest='en').text)\n", " except:\n", " print('there was an error translating sample tweet: ',desiredTweets[0]['text'])\n", " return pd.DataFrame(desiredTweets)\n", "\n", "\n", "def GetTweetsByPopularCities(search_term, numTweets, translateToLocalLanguage = True):\n", " #-- Get the most populated cities from wikipedia (Thank you wikipedia library!) 
--\n", " html = wp.page(\"List_of_cities_by_population_density\").html().encode(\"UTF-8\")\n", " worldCities = pd.read_html(html)[1]\n", " worldCities = worldCities.drop([2,3,4],axis=1)\n", " worldCities = worldCities.rename(columns={0:'city',1:'population',5:'density',6:'country'})\n", " worldCities = worldCities.iloc[1:]\n", " worldCities['population'] = [int(city.split('\\xa0')[-1].split('[')[0].replace(',','')) for city in worldCities['population']]\n", " worldCities['density'] = [int(city.split('\\xa0')[-1].split('[')[0].replace(',','')) for city in worldCities['density']]\n", " #--- ---- ----- ----- ---- ---- ----- ---- ---- ----- --- ---- ---- ----- ---- ---\n", " \n", " #--- population per cities in United States --- ---- ---- --- ---- --- --- --- ---\n", " html = wp.page(\"List_of_United_States_cities_by_population_density\").html().encode(\"UTF-8\")\n", " UScities = pd.read_html(html)[1]\n", " UScities = UScities.drop([0,2,4,6,8],axis=1)\n", " UScities = UScities.rename(columns={1:'city',3:'state',5: 'land area (mi^2)',7:'density'})\n", " UScities = UScities.iloc[1:]\n", " #df['population']=[int(city.split('\\xa0')[-1].split('[')[0].replace(',','')) for city in df['population']]\n", " UScities['density'] = [float(city.split('\\xa0')[-1].split('[')[0].replace(',','')) for city in UScities['density']]\n", " UScities['land area (mi^2)']=[float(area.split('\\xa0')[-1]) for area in UScities['land area (mi^2)']]\n", " #--- ---- ----- ----- ---- ---- ----- ---- ---- ----- --- ---- ---- ----- ---- ---\n", " \n", " #--- Get tweets by Worlds most densily populated cities ---- --- ---- ---- --- ---\n", " translator = Translator()\n", " comparisons=pd.DataFrame(columns=['time density','sentiment'])\n", " cityCount = 3\n", " for index,row in worldCities.iterrows():\n", " #-- location --- ----- --- ----\n", " city,pop,density,country = row\n", " cityCountry = city+' , '+country\n", "\n", " #-- language conversion --- ---- --\n", " languagesDf = getCountryLanguages()\n", " if translateToLocalLanguage:\n", " try:\n", " translatedSearch = translator.translate(search_term, src='en', dest=languagesDf.loc[country,'language']).text\n", " except ValueError:\n", " print(\"could not translate \", languagesDf.loc[country,'language'])\n", " translatedSearch=search_term\n", " print('translated word: ',translatedSearch)\n", " else:\n", " translatedSearch=search_term\n", " #--- --- --- ---- ---- --- --- ---\n", " \n", " try:\n", " tweetsWorld = pd.concat([tweetsWorld, SearchForData(translatedSearch, numTweets, cityCountry, 100)], axis=0)\n", " except:\n", " tweetsWorld = SearchForData(translatedSearch, numTweets, cityCountry, 100)\n", " print('\\n')\n", " time.sleep(4)\n", " #if cityCount==0:\n", " # break\n", " #else:\n", " # cityCount-=1\n", " \n", " #--- Add US Cities --- ---- ---- ---- ---- ---\n", " cityCount = 5\n", " for index,row in UScities.iterrows():\n", " #-- location --- ----- --- ----\n", " city,state,area,density = row\n", " cityCountry = state+' , '+city\n", "\n", " try:\n", " tweetsUS = pd.concat([tweetsUS, SearchForData(translatedSearch, numTweets, cityCountry, max(area,5))], axis=0)\n", " except:\n", " tweetsUS = SearchForData(translatedSearch, numTweets, cityCountry, max(area,5))\n", " \n", " print('\\n')\n", " time.sleep(4)\n", " #if cityCount==0:\n", " # break\n", " #else:\n", " # cityCount-=1\n", " return tweetsUS, tweetsWorld\n", "\n", "\n", "\n", "tweetsUS, tweetsWorld = GetTweetsByPopularCities('trump', 200, False)\n", "print(tweetsUS.groupby('location').mean())\n", 
"print(tweetsWorld.groupby('location').mean())\n", "\n", "\n", " #locations are not required inputs\n", "#tweets = SearchForData(search_term='baguettes', nTweets=100, cityCountry='paris,france',radius=100)\n", "#tweets" ] }, { "cell_type": "code", "execution_count": 124, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>created_at</th>\n", " <th>followers</th>\n", " <th>friends_count</th>\n", " <th>hashtags</th>\n", " <th>location</th>\n", " <th>statuses_count</th>\n", " <th>text</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>2018-06-29 23:17:03+00:00</td>\n", " <td>683</td>\n", " <td>2054</td>\n", " <td>[]</td>\n", " <td>Manila , Philippines</td>\n", " <td>33583</td>\n", " <td>I have a strong feeling ....... that due to t...</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>2018-06-29 22:47:07+00:00</td>\n", " <td>196</td>\n", " <td>1740</td>\n", " <td>[]</td>\n", " <td>Manila , Philippines</td>\n", " <td>5359</td>\n", " <td>@tedcruz It must be a cold day in hell, becaus...</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>2018-06-29 22:17:22+00:00</td>\n", " <td>679</td>\n", " <td>111</td>\n", " <td>[]</td>\n", " <td>Manila , Philippines</td>\n", " <td>22461</td>\n", " <td>Loads of respect for @AndrewCMcCarthy - but if...</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>2018-06-29 21:49:06+00:00</td>\n", " <td>1291</td>\n", " <td>2499</td>\n", " <td>[{'text': 'TuckFrump', 'indices': [66, 76]}]</td>\n", " <td>Manila , Philippines</td>\n", " <td>62316</td>\n", " <td>@BishTrumpsCray @zeitgeistbabe @IvankaTrump @S...</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>2018-06-29 21:41:01+00:00</td>\n", " <td>76</td>\n", " <td>108</td>\n", " <td>[]</td>\n", " <td>Manila , Philippines</td>\n", " <td>5491</td>\n", " <td>The media no longer has any credibility with p...</td>\n", " </tr>\n", " <tr>\n", " <th>5</th>\n", " <td>2018-06-29 21:38:04+00:00</td>\n", " <td>1407</td>\n", " <td>789</td>\n", " <td>[]</td>\n", " <td>Manila , Philippines</td>\n", " <td>14301</td>\n", " <td>Would you rather have Donald Trump or Kanye We...</td>\n", " </tr>\n", " <tr>\n", " <th>6</th>\n", " <td>2018-06-29 21:21:40+00:00</td>\n", " <td>1371</td>\n", " <td>1069</td>\n", " <td>[]</td>\n", " <td>Manila , Philippines</td>\n", " <td>100819</td>\n", " <td>Did Poland eventually lose World War II becaus...</td>\n", " </tr>\n", " <tr>\n", " <th>7</th>\n", " <td>2018-06-29 21:16:25+00:00</td>\n", " <td>1464</td>\n", " <td>2907</td>\n", " <td>[]</td>\n", " <td>Manila , Philippines</td>\n", " <td>5879</td>\n", " <td>RT @Starshadow: @dimobey @ananavarro \"He was j...</td>\n", " </tr>\n", " <tr>\n", " <th>8</th>\n", " <td>2018-06-29 21:11:44+00:00</td>\n", " <td>49</td>\n", " <td>135</td>\n", " <td>[{'text': 'StutteringJohn', 'indices': [88, 10...</td>\n", " <td>Manila , Philippines</td>\n", " <td>583</td>\n", " <td>Hopefully the Donald has a sense of humor abou...</td>\n", " </tr>\n", " <tr>\n", " <th>9</th>\n", " <td>2018-06-29 20:56:39+00:00</td>\n", " <td>1578</td>\n", " <td>1004</td>\n", " <td>[]</td>\n", " <td>Manila , Philippines</td>\n", " <td>35862</td>\n", " <td>@flotus 
isn't it true when Melania Trump first...</td>\n", " </tr>\n", " <tr>\n", " <th>10</th>\n", " <td>2018-06-28 23:43:53+00:00</td>\n", " <td>134</td>\n", " <td>356</td>\n", " <td>[]</td>\n", " <td>Manila , Philippines</td>\n", " <td>26806</td>\n", " <td>Oprah’s SICK S*x Vid Out And Completely Demoli...</td>\n", " </tr>\n", " <tr>\n", " <th>11</th>\n", " <td>2018-06-28 23:33:56+00:00</td>\n", " <td>72</td>\n", " <td>128</td>\n", " <td>[]</td>\n", " <td>Manila , Philippines</td>\n", " <td>4945</td>\n", " <td>@rob1cox More from our unbiased MSM. Need a...</td>\n", " </tr>\n", " <tr>\n", " <th>12</th>\n", " <td>2018-06-28 23:20:09+00:00</td>\n", " <td>151</td>\n", " <td>1104</td>\n", " <td>[]</td>\n", " <td>Manila , Philippines</td>\n", " <td>1073</td>\n", " <td>RT @RinChupeco: White people, journalists who ...</td>\n", " </tr>\n", " <tr>\n", " <th>13</th>\n", " <td>2018-06-28 23:12:38+00:00</td>\n", " <td>248</td>\n", " <td>1989</td>\n", " <td>[]</td>\n", " <td>Manila , Philippines</td>\n", " <td>4702</td>\n", " <td>@ESSsubreddit @SenSanders No they didn't. Do y...</td>\n", " </tr>\n", " <tr>\n", " <th>14</th>\n", " <td>2018-06-28 23:06:50+00:00</td>\n", " <td>366677</td>\n", " <td>201</td>\n", " <td>[]</td>\n", " <td>Manila , Philippines</td>\n", " <td>201963</td>\n", " <td>Trump-Putin meeting to follow NATO gathering a...</td>\n", " </tr>\n", " <tr>\n", " <th>15</th>\n", " <td>2018-06-28 23:01:51+00:00</td>\n", " <td>293</td>\n", " <td>1382</td>\n", " <td>[]</td>\n", " <td>Manila , Philippines</td>\n", " <td>8802</td>\n", " <td>RT @dzIQ990: US President Donald Trump at Russ...</td>\n", " </tr>\n", " <tr>\n", " <th>16</th>\n", " <td>2018-06-28 22:55:40+00:00</td>\n", " <td>1757</td>\n", " <td>1685</td>\n", " <td>[]</td>\n", " <td>Manila , Philippines</td>\n", " <td>37555</td>\n", " <td>RT @RinChupeco: White people, journalists who ...</td>\n", " </tr>\n", " <tr>\n", " <th>17</th>\n", " <td>2018-06-28 22:54:25+00:00</td>\n", " <td>3789</td>\n", " <td>4907</td>\n", " <td>[]</td>\n", " <td>Manila , Philippines</td>\n", " <td>26129</td>\n", " <td>RT @RinChupeco: White people, journalists who ...</td>\n", " </tr>\n", " <tr>\n", " <th>18</th>\n", " <td>2018-06-27 23:57:35+00:00</td>\n", " <td>21</td>\n", " <td>208</td>\n", " <td>[]</td>\n", " <td>Manila , Philippines</td>\n", " <td>917</td>\n", " <td>RT @RinChupeco: White people, journalists who ...</td>\n", " </tr>\n", " <tr>\n", " <th>19</th>\n", " <td>2018-06-27 23:49:44+00:00</td>\n", " <td>42</td>\n", " <td>66</td>\n", " <td>[]</td>\n", " <td>Manila , Philippines</td>\n", " <td>5268</td>\n", " <td>@charliekirk11 @realDonaldTrump Trump has quit...</td>\n", " </tr>\n", " <tr>\n", " <th>20</th>\n", " <td>2018-06-27 23:45:00+00:00</td>\n", " <td>5863642</td>\n", " <td>1304</td>\n", " <td>[]</td>\n", " <td>Manila , Philippines</td>\n", " <td>602468</td>\n", " <td>Trump trade uncertainty weighs on US stocks as...</td>\n", " </tr>\n", " <tr>\n", " <th>21</th>\n", " <td>2018-06-27 23:38:20+00:00</td>\n", " <td>207</td>\n", " <td>1055</td>\n", " <td>[]</td>\n", " <td>Manila , Philippines</td>\n", " <td>4507</td>\n", " <td>RT @RinChupeco: White people, journalists who ...</td>\n", " </tr>\n", " <tr>\n", " <th>22</th>\n", " <td>2018-06-27 23:36:29+00:00</td>\n", " <td>510</td>\n", " <td>434</td>\n", " <td>[]</td>\n", " <td>Manila , Philippines</td>\n", " <td>32256</td>\n", " <td>What do you think of Trump's presidential run ...</td>\n", " </tr>\n", " <tr>\n", " <th>23</th>\n", " <td>2018-06-27 23:35:27+00:00</td>\n", " <td>32</td>\n", " <td>155</td>\n", " 
<td>[]</td>\n", " <td>Manila , Philippines</td>\n", " <td>1744</td>\n", " <td>RT @marieAnne0915: Im a bit disappointed sa mg...</td>\n", " </tr>\n", " <tr>\n", " <th>24</th>\n", " <td>2018-06-27 23:34:16+00:00</td>\n", " <td>141</td>\n", " <td>528</td>\n", " <td>[]</td>\n", " <td>Manila , Philippines</td>\n", " <td>3211</td>\n", " <td>RT @RinChupeco: White people, journalists who ...</td>\n", " </tr>\n", " <tr>\n", " <th>25</th>\n", " <td>2018-06-27 23:10:01+00:00</td>\n", " <td>544428</td>\n", " <td>212</td>\n", " <td>[]</td>\n", " <td>Manila , Philippines</td>\n", " <td>499788</td>\n", " <td>EU chief says Europe must prepare for the wors...</td>\n", " </tr>\n", " <tr>\n", " <th>26</th>\n", " <td>2018-06-26 23:59:58+00:00</td>\n", " <td>7452</td>\n", " <td>879</td>\n", " <td>[]</td>\n", " <td>Manila , Philippines</td>\n", " <td>19731</td>\n", " <td>RT @RinChupeco: White people, journalists who ...</td>\n", " </tr>\n", " <tr>\n", " <th>27</th>\n", " <td>2018-06-26 23:55:02+00:00</td>\n", " <td>291</td>\n", " <td>162</td>\n", " <td>[]</td>\n", " <td>Manila , Philippines</td>\n", " <td>7863</td>\n", " <td>RT @RinChupeco: White people, journalists who ...</td>\n", " </tr>\n", " <tr>\n", " <th>28</th>\n", " <td>2018-06-26 23:54:54+00:00</td>\n", " <td>37</td>\n", " <td>30</td>\n", " <td>[]</td>\n", " <td>Manila , Philippines</td>\n", " <td>58215</td>\n", " <td>Missouri Nail Manufacturer Loses Half Its Busi...</td>\n", " </tr>\n", " <tr>\n", " <th>29</th>\n", " <td>2018-06-26 23:53:21+00:00</td>\n", " <td>41</td>\n", " <td>451</td>\n", " <td>[]</td>\n", " <td>Manila , Philippines</td>\n", " <td>14044</td>\n", " <td>RT @RinChupeco: White people, journalists who ...</td>\n", " </tr>\n", " <tr>\n", " <th>...</th>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " </tr>\n", " <tr>\n", " <th>26</th>\n", " <td>2018-06-26 23:59:58+00:00</td>\n", " <td>906</td>\n", " <td>721</td>\n", " <td>[]</td>\n", " <td>New York City , New York</td>\n", " <td>107703</td>\n", " <td>RT @kimguilfoyle: A HUGE win for President Tru...</td>\n", " </tr>\n", " <tr>\n", " <th>27</th>\n", " <td>2018-06-26 23:59:58+00:00</td>\n", " <td>1269</td>\n", " <td>365</td>\n", " <td>[]</td>\n", " <td>New York City , New York</td>\n", " <td>34076</td>\n", " <td>RT @faiza_n_ali: This just happened. 
The highe...</td>\n", " </tr>\n", " <tr>\n", " <th>28</th>\n", " <td>2018-06-26 23:59:58+00:00</td>\n", " <td>6250</td>\n", " <td>5658</td>\n", " <td>[]</td>\n", " <td>New York City , New York</td>\n", " <td>22409</td>\n", " <td>DOJ watchdog report sheds light on love lives ...</td>\n", " </tr>\n", " <tr>\n", " <th>29</th>\n", " <td>2018-06-26 23:59:58+00:00</td>\n", " <td>2233</td>\n", " <td>494</td>\n", " <td>[]</td>\n", " <td>New York City , New York</td>\n", " <td>311601</td>\n", " <td>RT @true_pundit: Barack Obama Concerned About ...</td>\n", " </tr>\n", " <tr>\n", " <th>30</th>\n", " <td>2018-06-26 23:59:58+00:00</td>\n", " <td>4297</td>\n", " <td>4376</td>\n", " <td>[]</td>\n", " <td>New York City , New York</td>\n", " <td>42347</td>\n", " <td>One NBA Legend Attended A Trump Rally And The ...</td>\n", " </tr>\n", " <tr>\n", " <th>31</th>\n", " <td>2018-06-26 23:59:58+00:00</td>\n", " <td>290</td>\n", " <td>1210</td>\n", " <td>[]</td>\n", " <td>New York City , New York</td>\n", " <td>144325</td>\n", " <td>RT @MrDane1982: Give me a fucking break, Berni...</td>\n", " </tr>\n", " <tr>\n", " <th>32</th>\n", " <td>2018-06-25 23:59:59+00:00</td>\n", " <td>145</td>\n", " <td>205</td>\n", " <td>[]</td>\n", " <td>New York City , New York</td>\n", " <td>35478</td>\n", " <td>RT @maddow: NEW: MSNBC has obtained the first...</td>\n", " </tr>\n", " <tr>\n", " <th>33</th>\n", " <td>2018-06-25 23:59:58+00:00</td>\n", " <td>2723</td>\n", " <td>1986</td>\n", " <td>[]</td>\n", " <td>New York City , New York</td>\n", " <td>42025</td>\n", " <td>Related: for some reason I blocked @Nickelodeo...</td>\n", " </tr>\n", " <tr>\n", " <th>34</th>\n", " <td>2018-06-25 23:59:58+00:00</td>\n", " <td>4489</td>\n", " <td>4998</td>\n", " <td>[]</td>\n", " <td>New York City , New York</td>\n", " <td>27990</td>\n", " <td>RT @Kipnis4Congress: .@PhilMurphyNJ @RepBonnie...</td>\n", " </tr>\n", " <tr>\n", " <th>35</th>\n", " <td>2018-06-25 23:59:58+00:00</td>\n", " <td>169</td>\n", " <td>607</td>\n", " <td>[]</td>\n", " <td>New York City , New York</td>\n", " <td>1583</td>\n", " <td>RT @JoeNBC: Trump Administration report shows ...</td>\n", " </tr>\n", " <tr>\n", " <th>36</th>\n", " <td>2018-06-25 23:59:57+00:00</td>\n", " <td>91</td>\n", " <td>186</td>\n", " <td>[]</td>\n", " <td>New York City , New York</td>\n", " <td>26056</td>\n", " <td>RT @kylegriffin1: The Family Case Management P...</td>\n", " </tr>\n", " <tr>\n", " <th>37</th>\n", " <td>2018-06-25 23:59:57+00:00</td>\n", " <td>5145</td>\n", " <td>3198</td>\n", " <td>[]</td>\n", " <td>New York City , New York</td>\n", " <td>425496</td>\n", " <td>RT @NBCNews: A 20-foot-tall inflatable orange ...</td>\n", " </tr>\n", " <tr>\n", " <th>38</th>\n", " <td>2018-06-25 23:59:57+00:00</td>\n", " <td>805</td>\n", " <td>1150</td>\n", " <td>[]</td>\n", " <td>New York City , New York</td>\n", " <td>60897</td>\n", " <td>RT @maddow: NEW: MSNBC has obtained the first...</td>\n", " </tr>\n", " <tr>\n", " <th>39</th>\n", " <td>2018-06-25 23:59:57+00:00</td>\n", " <td>38</td>\n", " <td>126</td>\n", " <td>[]</td>\n", " <td>New York City , New York</td>\n", " <td>1362</td>\n", " <td>RT @kylegriffin1: The Red Hen passed its most ...</td>\n", " </tr>\n", " <tr>\n", " <th>40</th>\n", " <td>2018-06-24 23:59:59+00:00</td>\n", " <td>5824</td>\n", " <td>5835</td>\n", " <td>[{'text': 'TrumpConcentrationCamps', 'indices'...</td>\n", " <td>New York City , New York</td>\n", " <td>478</td>\n", " <td>RT @leecaly: Nice message Melania Trump wore o...</td>\n", " </tr>\n", " <tr>\n", " <th>41</th>\n", " 
<td>2018-06-24 23:59:59+00:00</td>\n", " <td>941</td>\n", " <td>208</td>\n", " <td>[]</td>\n", " <td>New York City , New York</td>\n", " <td>341641</td>\n", " <td>RT @NBCNews: This Obama-era pilot program kept...</td>\n", " </tr>\n", " <tr>\n", " <th>42</th>\n", " <td>2018-06-24 23:59:59+00:00</td>\n", " <td>304</td>\n", " <td>313</td>\n", " <td>[]</td>\n", " <td>New York City , New York</td>\n", " <td>14166</td>\n", " <td>RT @Jamierodr10: Thank you @RealJamesWoods for...</td>\n", " </tr>\n", " <tr>\n", " <th>43</th>\n", " <td>2018-06-24 23:59:58+00:00</td>\n", " <td>750</td>\n", " <td>248</td>\n", " <td>[]</td>\n", " <td>New York City , New York</td>\n", " <td>26266</td>\n", " <td>RT @gr8tjude: Virginia Lawmakers Rebuke Anti-T...</td>\n", " </tr>\n", " <tr>\n", " <th>44</th>\n", " <td>2018-06-24 23:59:58+00:00</td>\n", " <td>2765</td>\n", " <td>4921</td>\n", " <td>[]</td>\n", " <td>New York City , New York</td>\n", " <td>69653</td>\n", " <td>RT @Amy_Siskind: Some things you need to notic...</td>\n", " </tr>\n", " <tr>\n", " <th>45</th>\n", " <td>2018-06-24 23:59:58+00:00</td>\n", " <td>1744</td>\n", " <td>228</td>\n", " <td>[]</td>\n", " <td>New York City , New York</td>\n", " <td>22462</td>\n", " <td>RT @TwitterMoments: California @RepMaxineWater...</td>\n", " </tr>\n", " <tr>\n", " <th>46</th>\n", " <td>2018-06-24 23:59:58+00:00</td>\n", " <td>216</td>\n", " <td>361</td>\n", " <td>[]</td>\n", " <td>New York City , New York</td>\n", " <td>12461</td>\n", " <td>RT @JoeNBC: I cannot wait to hear Trump apolog...</td>\n", " </tr>\n", " <tr>\n", " <th>47</th>\n", " <td>2018-06-24 23:59:58+00:00</td>\n", " <td>33</td>\n", " <td>176</td>\n", " <td>[]</td>\n", " <td>New York City , New York</td>\n", " <td>94</td>\n", " <td>RT @JoeNBC: Trump Administration report shows ...</td>\n", " </tr>\n", " <tr>\n", " <th>48</th>\n", " <td>2018-06-23 23:59:59+00:00</td>\n", " <td>2143</td>\n", " <td>2597</td>\n", " <td>[]</td>\n", " <td>New York City , New York</td>\n", " <td>7256</td>\n", " <td>RT @SenSchumer: The Special Counsel’s investig...</td>\n", " </tr>\n", " <tr>\n", " <th>49</th>\n", " <td>2018-06-23 23:59:59+00:00</td>\n", " <td>37177</td>\n", " <td>6292</td>\n", " <td>[]</td>\n", " <td>New York City , New York</td>\n", " <td>220250</td>\n", " <td>RT @dicktofel: This story makes a big deal of ...</td>\n", " </tr>\n", " <tr>\n", " <th>50</th>\n", " <td>2018-06-23 23:59:59+00:00</td>\n", " <td>43</td>\n", " <td>110</td>\n", " <td>[]</td>\n", " <td>New York City , New York</td>\n", " <td>3208</td>\n", " <td>RT @DanRather: When it comes to the Trump Admi...</td>\n", " </tr>\n", " <tr>\n", " <th>51</th>\n", " <td>2018-06-23 23:59:58+00:00</td>\n", " <td>228</td>\n", " <td>507</td>\n", " <td>[]</td>\n", " <td>New York City , New York</td>\n", " <td>15361</td>\n", " <td>RT @SenSchumer: The Special Counsel’s investig...</td>\n", " </tr>\n", " <tr>\n", " <th>52</th>\n", " <td>2018-06-23 23:59:58+00:00</td>\n", " <td>40</td>\n", " <td>151</td>\n", " <td>[]</td>\n", " <td>New York City , New York</td>\n", " <td>290</td>\n", " <td>RT @JoeNBC: “This is beyond narcissism. 
I used...</td>\n", " </tr>\n", " <tr>\n", " <th>53</th>\n", " <td>2018-06-23 23:59:58+00:00</td>\n", " <td>39</td>\n", " <td>191</td>\n", " <td>[]</td>\n", " <td>New York City , New York</td>\n", " <td>483</td>\n", " <td>RT @RubenBaezJr: @PressSec @POTUS Today has be...</td>\n", " </tr>\n", " <tr>\n", " <th>54</th>\n", " <td>2018-06-23 23:59:57+00:00</td>\n", " <td>288</td>\n", " <td>432</td>\n", " <td>[]</td>\n", " <td>New York City , New York</td>\n", " <td>2079</td>\n", " <td>RT @GrassrootsSpeak: Dear Kim Kardashian\\n\\nLa...</td>\n", " </tr>\n", " <tr>\n", " <th>55</th>\n", " <td>2018-06-23 23:59:56+00:00</td>\n", " <td>451</td>\n", " <td>444</td>\n", " <td>[]</td>\n", " <td>New York City , New York</td>\n", " <td>3615</td>\n", " <td>@cswany2 @Lawrence I know you're a Trump perso...</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "<p>520 rows × 7 columns</p>\n", "</div>" ], "text/plain": [ " created_at followers friends_count \\\n", "0 2018-06-29 23:17:03+00:00 683 2054 \n", "1 2018-06-29 22:47:07+00:00 196 1740 \n", "2 2018-06-29 22:17:22+00:00 679 111 \n", "3 2018-06-29 21:49:06+00:00 1291 2499 \n", "4 2018-06-29 21:41:01+00:00 76 108 \n", "5 2018-06-29 21:38:04+00:00 1407 789 \n", "6 2018-06-29 21:21:40+00:00 1371 1069 \n", "7 2018-06-29 21:16:25+00:00 1464 2907 \n", "8 2018-06-29 21:11:44+00:00 49 135 \n", "9 2018-06-29 20:56:39+00:00 1578 1004 \n", "10 2018-06-28 23:43:53+00:00 134 356 \n", "11 2018-06-28 23:33:56+00:00 72 128 \n", "12 2018-06-28 23:20:09+00:00 151 1104 \n", "13 2018-06-28 23:12:38+00:00 248 1989 \n", "14 2018-06-28 23:06:50+00:00 366677 201 \n", "15 2018-06-28 23:01:51+00:00 293 1382 \n", "16 2018-06-28 22:55:40+00:00 1757 1685 \n", "17 2018-06-28 22:54:25+00:00 3789 4907 \n", "18 2018-06-27 23:57:35+00:00 21 208 \n", "19 2018-06-27 23:49:44+00:00 42 66 \n", "20 2018-06-27 23:45:00+00:00 5863642 1304 \n", "21 2018-06-27 23:38:20+00:00 207 1055 \n", "22 2018-06-27 23:36:29+00:00 510 434 \n", "23 2018-06-27 23:35:27+00:00 32 155 \n", "24 2018-06-27 23:34:16+00:00 141 528 \n", "25 2018-06-27 23:10:01+00:00 544428 212 \n", "26 2018-06-26 23:59:58+00:00 7452 879 \n", "27 2018-06-26 23:55:02+00:00 291 162 \n", "28 2018-06-26 23:54:54+00:00 37 30 \n", "29 2018-06-26 23:53:21+00:00 41 451 \n", ".. ... ... ... 
\n", "26 2018-06-26 23:59:58+00:00 906 721 \n", "27 2018-06-26 23:59:58+00:00 1269 365 \n", "28 2018-06-26 23:59:58+00:00 6250 5658 \n", "29 2018-06-26 23:59:58+00:00 2233 494 \n", "30 2018-06-26 23:59:58+00:00 4297 4376 \n", "31 2018-06-26 23:59:58+00:00 290 1210 \n", "32 2018-06-25 23:59:59+00:00 145 205 \n", "33 2018-06-25 23:59:58+00:00 2723 1986 \n", "34 2018-06-25 23:59:58+00:00 4489 4998 \n", "35 2018-06-25 23:59:58+00:00 169 607 \n", "36 2018-06-25 23:59:57+00:00 91 186 \n", "37 2018-06-25 23:59:57+00:00 5145 3198 \n", "38 2018-06-25 23:59:57+00:00 805 1150 \n", "39 2018-06-25 23:59:57+00:00 38 126 \n", "40 2018-06-24 23:59:59+00:00 5824 5835 \n", "41 2018-06-24 23:59:59+00:00 941 208 \n", "42 2018-06-24 23:59:59+00:00 304 313 \n", "43 2018-06-24 23:59:58+00:00 750 248 \n", "44 2018-06-24 23:59:58+00:00 2765 4921 \n", "45 2018-06-24 23:59:58+00:00 1744 228 \n", "46 2018-06-24 23:59:58+00:00 216 361 \n", "47 2018-06-24 23:59:58+00:00 33 176 \n", "48 2018-06-23 23:59:59+00:00 2143 2597 \n", "49 2018-06-23 23:59:59+00:00 37177 6292 \n", "50 2018-06-23 23:59:59+00:00 43 110 \n", "51 2018-06-23 23:59:58+00:00 228 507 \n", "52 2018-06-23 23:59:58+00:00 40 151 \n", "53 2018-06-23 23:59:58+00:00 39 191 \n", "54 2018-06-23 23:59:57+00:00 288 432 \n", "55 2018-06-23 23:59:56+00:00 451 444 \n", "\n", " hashtags \\\n", "0 [] \n", "1 [] \n", "2 [] \n", "3 [{'text': 'TuckFrump', 'indices': [66, 76]}] \n", "4 [] \n", "5 [] \n", "6 [] \n", "7 [] \n", "8 [{'text': 'StutteringJohn', 'indices': [88, 10... \n", "9 [] \n", "10 [] \n", "11 [] \n", "12 [] \n", "13 [] \n", "14 [] \n", "15 [] \n", "16 [] \n", "17 [] \n", "18 [] \n", "19 [] \n", "20 [] \n", "21 [] \n", "22 [] \n", "23 [] \n", "24 [] \n", "25 [] \n", "26 [] \n", "27 [] \n", "28 [] \n", "29 [] \n", ".. ... \n", "26 [] \n", "27 [] \n", "28 [] \n", "29 [] \n", "30 [] \n", "31 [] \n", "32 [] \n", "33 [] \n", "34 [] \n", "35 [] \n", "36 [] \n", "37 [] \n", "38 [] \n", "39 [] \n", "40 [{'text': 'TrumpConcentrationCamps', 'indices'... \n", "41 [] \n", "42 [] \n", "43 [] \n", "44 [] \n", "45 [] \n", "46 [] \n", "47 [] \n", "48 [] \n", "49 [] \n", "50 [] \n", "51 [] \n", "52 [] \n", "53 [] \n", "54 [] \n", "55 [] \n", "\n", " location statuses_count \\\n", "0 Manila , Philippines 33583 \n", "1 Manila , Philippines 5359 \n", "2 Manila , Philippines 22461 \n", "3 Manila , Philippines 62316 \n", "4 Manila , Philippines 5491 \n", "5 Manila , Philippines 14301 \n", "6 Manila , Philippines 100819 \n", "7 Manila , Philippines 5879 \n", "8 Manila , Philippines 583 \n", "9 Manila , Philippines 35862 \n", "10 Manila , Philippines 26806 \n", "11 Manila , Philippines 4945 \n", "12 Manila , Philippines 1073 \n", "13 Manila , Philippines 4702 \n", "14 Manila , Philippines 201963 \n", "15 Manila , Philippines 8802 \n", "16 Manila , Philippines 37555 \n", "17 Manila , Philippines 26129 \n", "18 Manila , Philippines 917 \n", "19 Manila , Philippines 5268 \n", "20 Manila , Philippines 602468 \n", "21 Manila , Philippines 4507 \n", "22 Manila , Philippines 32256 \n", "23 Manila , Philippines 1744 \n", "24 Manila , Philippines 3211 \n", "25 Manila , Philippines 499788 \n", "26 Manila , Philippines 19731 \n", "27 Manila , Philippines 7863 \n", "28 Manila , Philippines 58215 \n", "29 Manila , Philippines 14044 \n", ".. ... ... 
\n", "26 New York City , New York 107703 \n", "27 New York City , New York 34076 \n", "28 New York City , New York 22409 \n", "29 New York City , New York 311601 \n", "30 New York City , New York 42347 \n", "31 New York City , New York 144325 \n", "32 New York City , New York 35478 \n", "33 New York City , New York 42025 \n", "34 New York City , New York 27990 \n", "35 New York City , New York 1583 \n", "36 New York City , New York 26056 \n", "37 New York City , New York 425496 \n", "38 New York City , New York 60897 \n", "39 New York City , New York 1362 \n", "40 New York City , New York 478 \n", "41 New York City , New York 341641 \n", "42 New York City , New York 14166 \n", "43 New York City , New York 26266 \n", "44 New York City , New York 69653 \n", "45 New York City , New York 22462 \n", "46 New York City , New York 12461 \n", "47 New York City , New York 94 \n", "48 New York City , New York 7256 \n", "49 New York City , New York 220250 \n", "50 New York City , New York 3208 \n", "51 New York City , New York 15361 \n", "52 New York City , New York 290 \n", "53 New York City , New York 483 \n", "54 New York City , New York 2079 \n", "55 New York City , New York 3615 \n", "\n", " text \n", "0 I have a strong feeling ....... that due to t... \n", "1 @tedcruz It must be a cold day in hell, becaus... \n", "2 Loads of respect for @AndrewCMcCarthy - but if... \n", "3 @BishTrumpsCray @zeitgeistbabe @IvankaTrump @S... \n", "4 The media no longer has any credibility with p... \n", "5 Would you rather have Donald Trump or Kanye We... \n", "6 Did Poland eventually lose World War II becaus... \n", "7 RT @Starshadow: @dimobey @ananavarro \"He was j... \n", "8 Hopefully the Donald has a sense of humor abou... \n", "9 @flotus isn't it true when Melania Trump first... \n", "10 Oprah’s SICK S*x Vid Out And Completely Demoli... \n", "11 @rob1cox More from our unbiased MSM. Need a... \n", "12 RT @RinChupeco: White people, journalists who ... \n", "13 @ESSsubreddit @SenSanders No they didn't. Do y... \n", "14 Trump-Putin meeting to follow NATO gathering a... \n", "15 RT @dzIQ990: US President Donald Trump at Russ... \n", "16 RT @RinChupeco: White people, journalists who ... \n", "17 RT @RinChupeco: White people, journalists who ... \n", "18 RT @RinChupeco: White people, journalists who ... \n", "19 @charliekirk11 @realDonaldTrump Trump has quit... \n", "20 Trump trade uncertainty weighs on US stocks as... \n", "21 RT @RinChupeco: White people, journalists who ... \n", "22 What do you think of Trump's presidential run ... \n", "23 RT @marieAnne0915: Im a bit disappointed sa mg... \n", "24 RT @RinChupeco: White people, journalists who ... \n", "25 EU chief says Europe must prepare for the wors... \n", "26 RT @RinChupeco: White people, journalists who ... \n", "27 RT @RinChupeco: White people, journalists who ... \n", "28 Missouri Nail Manufacturer Loses Half Its Busi... \n", "29 RT @RinChupeco: White people, journalists who ... \n", ".. ... \n", "26 RT @kimguilfoyle: A HUGE win for President Tru... \n", "27 RT @faiza_n_ali: This just happened. The highe... \n", "28 DOJ watchdog report sheds light on love lives ... \n", "29 RT @true_pundit: Barack Obama Concerned About ... \n", "30 One NBA Legend Attended A Trump Rally And The ... \n", "31 RT @MrDane1982: Give me a fucking break, Berni... \n", "32 RT @maddow: NEW: MSNBC has obtained the first... \n", "33 Related: for some reason I blocked @Nickelodeo... \n", "34 RT @Kipnis4Congress: .@PhilMurphyNJ @RepBonnie... 
\n", "35 RT @JoeNBC: Trump Administration report shows ... \n", "36 RT @kylegriffin1: The Family Case Management P... \n", "37 RT @NBCNews: A 20-foot-tall inflatable orange ... \n", "38 RT @maddow: NEW: MSNBC has obtained the first... \n", "39 RT @kylegriffin1: The Red Hen passed its most ... \n", "40 RT @leecaly: Nice message Melania Trump wore o... \n", "41 RT @NBCNews: This Obama-era pilot program kept... \n", "42 RT @Jamierodr10: Thank you @RealJamesWoods for... \n", "43 RT @gr8tjude: Virginia Lawmakers Rebuke Anti-T... \n", "44 RT @Amy_Siskind: Some things you need to notic... \n", "45 RT @TwitterMoments: California @RepMaxineWater... \n", "46 RT @JoeNBC: I cannot wait to hear Trump apolog... \n", "47 RT @JoeNBC: Trump Administration report shows ... \n", "48 RT @SenSchumer: The Special Counsel’s investig... \n", "49 RT @dicktofel: This story makes a big deal of ... \n", "50 RT @DanRather: When it comes to the Trump Admi... \n", "51 RT @SenSchumer: The Special Counsel’s investig... \n", "52 RT @JoeNBC: “This is beyond narcissism. I used... \n", "53 RT @RubenBaezJr: @PressSec @POTUS Today has be... \n", "54 RT @GrassrootsSpeak: Dear Kim Kardashian\\n\\nLa... \n", "55 @cswany2 @Lawrence I know you're a Trump perso... \n", "\n", "[520 rows x 7 columns]" ] }, "execution_count": 124, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tweets" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from datetime import datetime\n", "import matplotlib.pyplot as plt\n", "timeBetween=[]\n", "prevCreatedAt = datetime.strptime(tweets[0]['created_at'],'%a %b %d %H:%M:%S %z %Y')\n", "for tweet in tweets[1:]:\n", " #print(tweet['created_at'])\n", " timeBetween.append((prevCreatedAt - datetime.strptime(tweet['created_at'],'%a %b %d %H:%M:%S %z %Y')).total_seconds()/60)\n", " prevCreatedAt = datetime.strptime(tweet['created_at'],'%a %b %d %H:%M:%S %z %Y')\n", "plt.plot(range(999),timeBetween)\n", "plt.show()\n", "plt.plot([datetime.strptime(tweet['created_at'],'%a %b %d %H:%M:%S %z %Y') for tweet in tweets][::-1],range(1000))\n", "#plt.x('date',rotation='vertical')\n", "plt.show()\n", "print((datetime.strptime(tweets[-1]['created_at'],'%a %b %d %H:%M:%S %z %Y')-datetime.strptime(tweets[0]['created_at'],'%a %b %d %H:%M:%S %z %Y')).total_seconds()//60//60)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#This code came from the following link:\n", "#https://srome.github.io/Parsing-HTML-Tables-in-Python-with-BeautifulSoup-and-pandas/\n", "import requests\n", "import pandas as pd\n", "from bs4 import BeautifulSoup\n", "import re\n", "\n", "class HTMLTableParser():\n", "\n", " def parse_url(self, url):\n", " response = requests.get(url)\n", " soup = BeautifulSoup(response.text, 'lxml')\n", " listylist=[]\n", " for table in soup.find_all('table'):\n", " listylist.append(self.parse_html_table(table))\n", " return listylist\n", "\n", " def parse_html_table(self, table):\n", " n_columns = 0; n_rows=0; column_names = []\n", "\n", " # Find number of rows and columns\n", " # we also find the column titles if we can\n", " for row in table.find_all('tr'):\n", "\n", " # Determine the number of rows in the table\n", " td_tags = row.find_all('td')\n", " if len(td_tags) > 0:\n", " n_rows+=1\n", " if n_columns == 0:\n", " # Set the number of columns for our table\n", " n_columns = len(td_tags)\n", "\n", " # Handle column names if we find them\n", " th_tags = 
row.find_all('th') \n", " if len(th_tags) > 0 and len(column_names) == 0:\n", " for th in th_tags:\n", " column_names.append(th.get_text())\n", "\n", " # Safeguard on Column Titles\n", " if len(column_names) > 0 and len(column_names) != n_columns:\n", " raise Exception(\"Column titles do not match the number of columns\")\n", "\n", " columns = column_names if len(column_names) > 0 else range(0,n_columns)\n", " df = pd.DataFrame(columns = columns,\n", " index= range(0,n_rows))\n", " row_marker = 0\n", " for row in table.find_all('tr'):\n", " column_marker = 0\n", " columns = row.find_all('td')\n", " for column in columns:\n", " df.iat[row_marker,column_marker] = column.get_text()\n", " column_marker += 1\n", " if len(columns) > 0:\n", " row_marker += 1\n", "\n", " # Convert to float if possible\n", " for col in df:\n", " try:\n", " df[col] = df[col].astype(float)\n", " except ValueError:\n", " pass\n", "\n", " return df\n", "\n", "#TODO: Use the .apply to just change the table to one dialect. Imrpove language scope later.\n", "obj = HTMLTableParser()\n", "df = obj.parse_url('https://www.infoplease.com/world/countries-world/languages-spoken-each-country-world')\n", "countryLanguages = df[0].rename(columns={0:'country',1:'language'}).set_index('country')\n", "newDict = []\n", "for index, row in countryLanguages.iterrows():\n", " language = re.sub('\\d+|%|\\(.*\\)|\\s','',countryLanguages.loc[index].values[0]).split(',')[0].split(';')[0]\n", " newDict.append([index, language]) #print(index,\": \",language)\n", "newDict\n", "\n", "newDf = pd.DataFrame(newDict)\n", "newDf = newDf.rename(columns={0:'country',1:'language'}).set_index('country')\n", "newDf" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>language</th>\n", " <th>languages</th>\n", " </tr>\n", " <tr>\n", " <th>country</th>\n", " <th></th>\n", " <th></th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>Afghanistan</th>\n", " <td>Dari Persian, Pashtu (both official), other Tu...</td>\n", " <td>DariPersian</td>\n", " </tr>\n", " <tr>\n", " <th>Albania</th>\n", " <td>Albanian (Tosk is the official dialect), Greek</td>\n", " <td>Albanian</td>\n", " </tr>\n", " <tr>\n", " <th>Algeria</th>\n", " <td>Arabic (official), French, Berber dialects</td>\n", " <td>Arabic</td>\n", " </tr>\n", " <tr>\n", " <th>Andorra</th>\n", " <td>Catalán (official), French, Castilian, Portuguese</td>\n", " <td>Catalán</td>\n", " </tr>\n", " <tr>\n", " <th>Angola</th>\n", " <td>Portuguese (official), Bantu and other African...</td>\n", " <td>Portuguese</td>\n", " </tr>\n", " <tr>\n", " <th>Antigua and Barbuda</th>\n", " <td>English (official), local dialects</td>\n", " <td>English</td>\n", " </tr>\n", " <tr>\n", " <th>Argentina</th>\n", " <td>Spanish (official), English, Italian, German, ...</td>\n", " <td>Spanish</td>\n", " </tr>\n", " <tr>\n", " <th>Armenia</th>\n", " <td>Armenian 98%, Yezidi, Russian</td>\n", " <td>Armenian</td>\n", " </tr>\n", " <tr>\n", " <th>Australia</th>\n", " <td>English 79%, native and other languages</td>\n", " <td>English</td>\n", 
" </tr>\n", " <tr>\n", " <th>Austria</th>\n", " <td>German (official nationwide); Slovene, Croatia...</td>\n", " <td>German</td>\n", " </tr>\n", " <tr>\n", " <th>Azerbaijan</th>\n", " <td>Azerbaijani Turkic 89%, Russian 3%, Armenian 2...</td>\n", " <td>AzerbaijaniTurkic</td>\n", " </tr>\n", " <tr>\n", " <th>Bahamas</th>\n", " <td>English (official), Creole (among Haitian immi...</td>\n", " <td>English</td>\n", " </tr>\n", " <tr>\n", " <th>Bahrain</th>\n", " <td>Arabic, English, Farsi, Urdu</td>\n", " <td>Arabic</td>\n", " </tr>\n", " <tr>\n", " <th>Bangladesh</th>\n", " <td>Bangla (official), English</td>\n", " <td>Bangla</td>\n", " </tr>\n", " <tr>\n", " <th>Barbados</th>\n", " <td>English</td>\n", " <td>English</td>\n", " </tr>\n", " <tr>\n", " <th>Belarus</th>\n", " <td>Belorussian (White Russian), Russian, other</td>\n", " <td>Belorussian</td>\n", " </tr>\n", " <tr>\n", " <th>Belgium</th>\n", " <td>Dutch (Flemish) 60%, French 40%, German less t...</td>\n", " <td>Dutch</td>\n", " </tr>\n", " <tr>\n", " <th>Belize</th>\n", " <td>English (official), Spanish, Mayan, Garifuna (...</td>\n", " <td>English</td>\n", " </tr>\n", " <tr>\n", " <th>Benin</th>\n", " <td>French (official), Fon, Yoruba, tribal languages</td>\n", " <td>French</td>\n", " </tr>\n", " <tr>\n", " <th>Bhutan</th>\n", " <td>Dzongkha (official), Tibetan dialects (among B...</td>\n", " <td>Dzongkha</td>\n", " </tr>\n", " <tr>\n", " <th>Bolivia</th>\n", " <td>Spanish, Quechua, Aymara (all official)</td>\n", " <td>Spanish</td>\n", " </tr>\n", " <tr>\n", " <th>Bosnia and Herzegovina</th>\n", " <td>Bosnian, Croatian, Serbian</td>\n", " <td>Bosnian</td>\n", " </tr>\n", " <tr>\n", " <th>Botswana</th>\n", " <td>English 2% (official), Setswana 78%, Kalanga 8...</td>\n", " <td>English</td>\n", " </tr>\n", " <tr>\n", " <th>Brazil</th>\n", " <td>Portuguese (official), Spanish, English, French</td>\n", " <td>Portuguese</td>\n", " </tr>\n", " <tr>\n", " <th>Brunei</th>\n", " <td>Malay (official), English, Chinese</td>\n", " <td>Malay</td>\n", " </tr>\n", " <tr>\n", " <th>Bulgaria</th>\n", " <td>Bulgarian 85%, Turkish 10%, Roma 4%</td>\n", " <td>Bulgarian</td>\n", " </tr>\n", " <tr>\n", " <th>Burkina Faso</th>\n", " <td>French (official); native African (Sudanic) la...</td>\n", " <td>Frenchlanguages</td>\n", " </tr>\n", " <tr>\n", " <th>Burundi</th>\n", " <td>Kirundi and French (official), Swahili</td>\n", " <td>KirundiandFrench</td>\n", " </tr>\n", " <tr>\n", " <th>Cambodia</th>\n", " <td>Khmer 95% (official), French, English</td>\n", " <td>Khmer</td>\n", " </tr>\n", " <tr>\n", " <th>Cameroon</th>\n", " <td>French, English (both official); 24 major Afri...</td>\n", " <td>French</td>\n", " </tr>\n", " <tr>\n", " <th>...</th>\n", " <td>...</td>\n", " <td>...</td>\n", " </tr>\n", " <tr>\n", " <th>Swaziland</th>\n", " <td>English, siSwati (both official)</td>\n", " <td>English</td>\n", " </tr>\n", " <tr>\n", " <th>Sweden</th>\n", " <td>Swedish, small Sami- and Finnish-speaking mino...</td>\n", " <td>Swedish</td>\n", " </tr>\n", " <tr>\n", " <th>Switzerland</th>\n", " <td>German 64%, French 20%, Italian 7% (all offici...</td>\n", " <td>German</td>\n", " </tr>\n", " <tr>\n", " <th>Syria</th>\n", " <td>Arabic (official); Kurdish, Armenian, Aramaic,...</td>\n", " <td>Arabic</td>\n", " </tr>\n", " <tr>\n", " <th>Taiwan</th>\n", " <td>Chinese (Mandarin, official), Taiwanese (Min),...</td>\n", " <td>Chinese</td>\n", " </tr>\n", " <tr>\n", " <th>Tajikistan</th>\n", " <td>Tajik (official), Russian widely used in gover...</td>\n", " <td>Tajik</td>\n", " 
</tr>\n", " <tr>\n", " <th>Tanzania</th>\n", " <td>Swahili, English (both official); Arabic; many...</td>\n", " <td>Swahili</td>\n", " </tr>\n", " <tr>\n", " <th>Thailand</th>\n", " <td>Thai (Siamese), English (secondary language of...</td>\n", " <td>Thai</td>\n", " </tr>\n", " <tr>\n", " <th>Togo</th>\n", " <td>French (official, commerce); Ewé, Mina (south)...</td>\n", " <td>French</td>\n", " </tr>\n", " <tr>\n", " <th>Tonga</th>\n", " <td>Tongan (an Austronesian language), English</td>\n", " <td>Tongan</td>\n", " </tr>\n", " <tr>\n", " <th>Trinidad and Tobago</th>\n", " <td>English (official), Hindi, French, Spanish, Ch...</td>\n", " <td>English</td>\n", " </tr>\n", " <tr>\n", " <th>Tunisia</th>\n", " <td>Arabic (official, commerce), French (commerce)</td>\n", " <td>Arabic</td>\n", " </tr>\n", " <tr>\n", " <th>Turkey</th>\n", " <td>Turkish (official), Kurdish, Dimli, Azeri, Kab...</td>\n", " <td>Turkish</td>\n", " </tr>\n", " <tr>\n", " <th>Turkmenistan</th>\n", " <td>Turkmen 72%; Russian 12%; Uzbek 9%, other 7%</td>\n", " <td>Turkmen</td>\n", " </tr>\n", " <tr>\n", " <th>Tuvalu</th>\n", " <td>Tuvaluan, English, Samoan, Kiribati (on the is...</td>\n", " <td>Tuvaluan</td>\n", " </tr>\n", " <tr>\n", " <th>Uganda</th>\n", " <td>English (official), Ganda or Luganda, other Ni...</td>\n", " <td>English</td>\n", " </tr>\n", " <tr>\n", " <th>Ukraine</th>\n", " <td>Ukrainian 67%, Russian 24%, Romanian, Polish, ...</td>\n", " <td>Ukrainian</td>\n", " </tr>\n", " <tr>\n", " <th>United Arab Emirates</th>\n", " <td>Arabic (official), Persian, English, Hindi, Urdu</td>\n", " <td>Arabic</td>\n", " </tr>\n", " <tr>\n", " <th>United Kingdom</th>\n", " <td>English, Welsh, Scots Gaelic</td>\n", " <td>English</td>\n", " </tr>\n", " <tr>\n", " <th>United States</th>\n", " <td>English 82%, Spanish 11% (2000)</td>\n", " <td>English</td>\n", " </tr>\n", " <tr>\n", " <th>Uruguay</th>\n", " <td>Spanish, Portunol, or Brazilero</td>\n", " <td>Spanish</td>\n", " </tr>\n", " <tr>\n", " <th>Uzbekistan</th>\n", " <td>Uzbek 74.3%, Russian 14.2%, Tajik 4.4%, other ...</td>\n", " <td>Uzbek.</td>\n", " </tr>\n", " <tr>\n", " <th>Vanuatu</th>\n", " <td>Bislama 23% (a Melanesian pidgin English), Eng...</td>\n", " <td>Bislama</td>\n", " </tr>\n", " <tr>\n", " <th>Vatican City (Holy See)</th>\n", " <td>Italian, Latin, French, various other languages</td>\n", " <td>Italian</td>\n", " </tr>\n", " <tr>\n", " <th>Venezuela</th>\n", " <td>Spanish (official), numerous indigenous dialects</td>\n", " <td>Spanish</td>\n", " </tr>\n", " <tr>\n", " <th>Vietnam</th>\n", " <td>Vietnamese (official); English (increasingly f...</td>\n", " <td>Vietnamese</td>\n", " </tr>\n", " <tr>\n", " <th>Western Sahara (proposed state)</th>\n", " <td>Hassaniya Arabic, Moroccan Arabic</td>\n", " <td>HassaniyaArabic</td>\n", " </tr>\n", " <tr>\n", " <th>Yemen</th>\n", " <td>Arabic</td>\n", " <td>Arabic</td>\n", " </tr>\n", " <tr>\n", " <th>Zambia</th>\n", " <td>English (official); major vernaculars: Bemba, ...</td>\n", " <td>English</td>\n", " </tr>\n", " <tr>\n", " <th>Zimbabwe</th>\n", " <td>English (official), Shona, Ndebele (Sindebele)...</td>\n", " <td>English</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "<p>198 rows × 2 columns</p>\n", "</div>" ], "text/plain": [ " language \\\n", "country \n", "Afghanistan Dari Persian, Pashtu (both official), other Tu... 
\n", "Albania Albanian (Tosk is the official dialect), Greek \n", "Algeria Arabic (official), French, Berber dialects \n", "Andorra Catalán (official), French, Castilian, Portuguese \n", "Angola Portuguese (official), Bantu and other African... \n", "Antigua and Barbuda English (official), local dialects \n", "Argentina Spanish (official), English, Italian, German, ... \n", "Armenia Armenian 98%, Yezidi, Russian \n", "Australia English 79%, native and other languages \n", "Austria German (official nationwide); Slovene, Croatia... \n", "Azerbaijan Azerbaijani Turkic 89%, Russian 3%, Armenian 2... \n", "Bahamas English (official), Creole (among Haitian immi... \n", "Bahrain Arabic, English, Farsi, Urdu \n", "Bangladesh Bangla (official), English \n", "Barbados English \n", "Belarus Belorussian (White Russian), Russian, other \n", "Belgium Dutch (Flemish) 60%, French 40%, German less t... \n", "Belize English (official), Spanish, Mayan, Garifuna (... \n", "Benin French (official), Fon, Yoruba, tribal languages \n", "Bhutan Dzongkha (official), Tibetan dialects (among B... \n", "Bolivia Spanish, Quechua, Aymara (all official) \n", "Bosnia and Herzegovina Bosnian, Croatian, Serbian \n", "Botswana English 2% (official), Setswana 78%, Kalanga 8... \n", "Brazil Portuguese (official), Spanish, English, French \n", "Brunei Malay (official), English, Chinese \n", "Bulgaria Bulgarian 85%, Turkish 10%, Roma 4% \n", "Burkina Faso French (official); native African (Sudanic) la... \n", "Burundi Kirundi and French (official), Swahili \n", "Cambodia Khmer 95% (official), French, English \n", "Cameroon French, English (both official); 24 major Afri... \n", "... ... \n", "Swaziland English, siSwati (both official) \n", "Sweden Swedish, small Sami- and Finnish-speaking mino... \n", "Switzerland German 64%, French 20%, Italian 7% (all offici... \n", "Syria Arabic (official); Kurdish, Armenian, Aramaic,... \n", "Taiwan Chinese (Mandarin, official), Taiwanese (Min),... \n", "Tajikistan Tajik (official), Russian widely used in gover... \n", "Tanzania Swahili, English (both official); Arabic; many... \n", "Thailand Thai (Siamese), English (secondary language of... \n", "Togo French (official, commerce); Ewé, Mina (south)... \n", "Tonga Tongan (an Austronesian language), English \n", "Trinidad and Tobago English (official), Hindi, French, Spanish, Ch... \n", "Tunisia Arabic (official, commerce), French (commerce) \n", "Turkey Turkish (official), Kurdish, Dimli, Azeri, Kab... \n", "Turkmenistan Turkmen 72%; Russian 12%; Uzbek 9%, other 7% \n", "Tuvalu Tuvaluan, English, Samoan, Kiribati (on the is... \n", "Uganda English (official), Ganda or Luganda, other Ni... \n", "Ukraine Ukrainian 67%, Russian 24%, Romanian, Polish, ... \n", "United Arab Emirates Arabic (official), Persian, English, Hindi, Urdu \n", "United Kingdom English, Welsh, Scots Gaelic \n", "United States English 82%, Spanish 11% (2000) \n", "Uruguay Spanish, Portunol, or Brazilero \n", "Uzbekistan Uzbek 74.3%, Russian 14.2%, Tajik 4.4%, other ... \n", "Vanuatu Bislama 23% (a Melanesian pidgin English), Eng... \n", "Vatican City (Holy See) Italian, Latin, French, various other languages \n", "Venezuela Spanish (official), numerous indigenous dialects \n", "Vietnam Vietnamese (official); English (increasingly f... \n", "Western Sahara (proposed state) Hassaniya Arabic, Moroccan Arabic \n", "Yemen Arabic \n", "Zambia English (official); major vernaculars: Bemba, ... \n", "Zimbabwe English (official), Shona, Ndebele (Sindebele)... 
\n", "\n", " languages \n", "country \n", "Afghanistan DariPersian \n", "Albania Albanian \n", "Algeria Arabic \n", "Andorra Catalán \n", "Angola Portuguese \n", "Antigua and Barbuda English \n", "Argentina Spanish \n", "Armenia Armenian \n", "Australia English \n", "Austria German \n", "Azerbaijan AzerbaijaniTurkic \n", "Bahamas English \n", "Bahrain Arabic \n", "Bangladesh Bangla \n", "Barbados English \n", "Belarus Belorussian \n", "Belgium Dutch \n", "Belize English \n", "Benin French \n", "Bhutan Dzongkha \n", "Bolivia Spanish \n", "Bosnia and Herzegovina Bosnian \n", "Botswana English \n", "Brazil Portuguese \n", "Brunei Malay \n", "Bulgaria Bulgarian \n", "Burkina Faso Frenchlanguages \n", "Burundi KirundiandFrench \n", "Cambodia Khmer \n", "Cameroon French \n", "... ... \n", "Swaziland English \n", "Sweden Swedish \n", "Switzerland German \n", "Syria Arabic \n", "Taiwan Chinese \n", "Tajikistan Tajik \n", "Tanzania Swahili \n", "Thailand Thai \n", "Togo French \n", "Tonga Tongan \n", "Trinidad and Tobago English \n", "Tunisia Arabic \n", "Turkey Turkish \n", "Turkmenistan Turkmen \n", "Tuvalu Tuvaluan \n", "Uganda English \n", "Ukraine Ukrainian \n", "United Arab Emirates Arabic \n", "United Kingdom English \n", "United States English \n", "Uruguay Spanish \n", "Uzbekistan Uzbek. \n", "Vanuatu Bislama \n", "Vatican City (Holy See) Italian \n", "Venezuela Spanish \n", "Vietnam Vietnamese \n", "Western Sahara (proposed state) HassaniyaArabic \n", "Yemen Arabic \n", "Zambia English \n", "Zimbabwe English \n", "\n", "[198 rows x 2 columns]" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = parse_url('https://www.infoplease.com/world/countries-world/languages-spoken-each-country-world')\n", "countryLanguages = df[0].rename(columns={0:'country',1:'language'}).set_index('country')\n", "countryLanguages['languages'] = [re.sub('\\d+|%|\\(.*\\)|\\s','',i).split(',')[0].split(';')[0] for i in countryLanguages['language']]\n", "countryLanguages " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "translator=Translator()\n", "translator.translate('Hola me llamo Jennifer 😜😜', dest='en').text" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'contributors': None,\n", " 'coordinates': None,\n", " 'created_at': 'Sat Jun 30 19:33:55 +0000 2018',\n", " 'entities': {'hashtags': [{'indices': [87, 93], 'text': 'Tibet'},\n", " {'indices': [94, 103], 'text': 'Tibetans'},\n", " {'indices': [104, 114], 'text': 'FreeTibet'}],\n", " 'symbols': [],\n", " 'urls': [{'display_url': 'shar.es/anBniE',\n", " 'expanded_url': 'https://shar.es/anBniE',\n", " 'indices': [47, 70],\n", " 'url': 'https://t.co/zAKFBK5Mna'},\n", " {'display_url': 'twitter.com/i/web/status/1…',\n", " 'expanded_url': 'https://twitter.com/i/web/status/1013143614623440896',\n", " 'indices': [116, 139],\n", " 'url': 'https://t.co/9GU9LWHwGp'}],\n", " 'user_mentions': [{'id': 23065876,\n", " 'id_str': '23065876',\n", " 'indices': [75, 85],\n", " 'name': 'Tsem Tulku Rinpoche',\n", " 'screen_name': 'tsemtulku'}]},\n", " 'favorite_count': 0,\n", " 'favorited': False,\n", " 'geo': None,\n", " 'id': 1013143614623440896,\n", " 
'id_str': '1013143614623440896',\n", " 'in_reply_to_screen_name': None,\n", " 'in_reply_to_status_id': None,\n", " 'in_reply_to_status_id_str': None,\n", " 'in_reply_to_user_id': None,\n", " 'in_reply_to_user_id_str': None,\n", " 'is_quote_status': False,\n", " 'lang': 'en',\n", " 'metadata': {'iso_language_code': 'en', 'result_type': 'recent'},\n", " 'place': None,\n", " 'possibly_sensitive': False,\n", " 'retweet_count': 0,\n", " 'retweeted': False,\n", " 'source': '<a href=\"http://twitter.com\" rel=\"nofollow\">Twitter Web Client</a>',\n", " 'text': 'The Sixty-Four Yogini Temple of Hirapur, India https://t.co/zAKFBK5Mna via @tsemtulku\\n\\n#Tibet\\n#Tibetans\\n#FreeTibet… https://t.co/9GU9LWHwGp',\n", " 'truncated': True,\n", " 'user': {'contributors_enabled': False,\n", " 'created_at': 'Wed Nov 18 10:24:35 +0000 2015',\n", " 'default_profile': False,\n", " 'default_profile_image': False,\n", " 'description': '',\n", " 'entities': {'description': {'urls': []}},\n", " 'favourites_count': 7051,\n", " 'follow_request_sent': False,\n", " 'followers_count': 288,\n", " 'following': False,\n", " 'friends_count': 181,\n", " 'geo_enabled': True,\n", " 'has_extended_profile': True,\n", " 'id': 4277739734,\n", " 'id_str': '4277739734',\n", " 'is_translation_enabled': False,\n", " 'is_translator': False,\n", " 'lang': 'en',\n", " 'listed_count': 74,\n", " 'location': 'Bentung, Pahang',\n", " 'name': 'Chris Chong',\n", " 'notifications': False,\n", " 'profile_background_color': '000000',\n", " 'profile_background_image_url': 'http://abs.twimg.com/images/themes/theme1/bg.png',\n", " 'profile_background_image_url_https': 'https://abs.twimg.com/images/themes/theme1/bg.png',\n", " 'profile_background_tile': False,\n", " 'profile_banner_url': 'https://pbs.twimg.com/profile_banners/4277739734/1524960980',\n", " 'profile_image_url': 'http://pbs.twimg.com/profile_images/780584831935586304/GLHvkq-Q_normal.jpg',\n", " 'profile_image_url_https': 'https://pbs.twimg.com/profile_images/780584831935586304/GLHvkq-Q_normal.jpg',\n", " 'profile_link_color': '1B95E0',\n", " 'profile_sidebar_border_color': '000000',\n", " 'profile_sidebar_fill_color': '000000',\n", " 'profile_text_color': '000000',\n", " 'profile_use_background_image': False,\n", " 'protected': False,\n", " 'screen_name': 'chrischong90',\n", " 'statuses_count': 9983,\n", " 'time_zone': None,\n", " 'translator_type': 'none',\n", " 'url': None,\n", " 'utc_offset': None,\n", " 'verified': False}}" ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Setup Tweepy API Authentication\n", "auth = tweepy.OAuthHandler(consumer_key, consumer_secret)\n", "auth.set_access_token(access_token, access_token_secret)\n", "api = tweepy.API(auth, parser=tweepy.parsers.JSONParser())\n", "tweets = api.search('dalailama')\n", "tweets['statuses'][0]" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " 
vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>city</th>\n", " <th>population</th>\n", " <th>density</th>\n", " <th>country</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>1</th>\n", " <td>Manila</td>\n", " <td>1780148</td>\n", " <td>107561</td>\n", " <td>Philippines</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>Ebeye</td>\n", " <td>15000</td>\n", " <td>107143</td>\n", " <td>Marshall Islands</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>Pateros (Municipality)</td>\n", " <td>64147</td>\n", " <td>79114</td>\n", " <td>Philippines</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>Mumbai</td>\n", " <td>12478447</td>\n", " <td>73837</td>\n", " <td>India</td>\n", " </tr>\n", " <tr>\n", " <th>5</th>\n", " <td>Dhaka</td>\n", " <td>8523137</td>\n", " <td>73583</td>\n", " <td>Bangladesh</td>\n", " </tr>\n", " <tr>\n", " <th>6</th>\n", " <td>Bnei Brak</td>\n", " <td>200162</td>\n", " <td>73159</td>\n", " <td>Israel</td>\n", " </tr>\n", " <tr>\n", " <th>7</th>\n", " <td>Caloocan</td>\n", " <td>1489040</td>\n", " <td>72302</td>\n", " <td>Philippines</td>\n", " </tr>\n", " <tr>\n", " <th>8</th>\n", " <td>Levallois-Perret</td>\n", " <td>63436</td>\n", " <td>68458</td>\n", " <td>France</td>\n", " </tr>\n", " <tr>\n", " <th>9</th>\n", " <td>Le Pré-Saint-Gervais</td>\n", " <td>18121</td>\n", " <td>67047</td>\n", " <td>France</td>\n", " </tr>\n", " <tr>\n", " <th>10</th>\n", " <td>Chennai</td>\n", " <td>4681087</td>\n", " <td>66961</td>\n", " <td>India</td>\n", " </tr>\n", " <tr>\n", " <th>11</th>\n", " <td>Vincennes</td>\n", " <td>48689</td>\n", " <td>66371</td>\n", " <td>France</td>\n", " </tr>\n", " <tr>\n", " <th>12</th>\n", " <td>Saint-Mandé</td>\n", " <td>22627</td>\n", " <td>65115</td>\n", " <td>France</td>\n", " </tr>\n", " <tr>\n", " <th>13</th>\n", " <td>Bally</td>\n", " <td>291972</td>\n", " <td>64031</td>\n", " <td>India</td>\n", " </tr>\n", " <tr>\n", " <th>14</th>\n", " <td>Kolkata</td>\n", " <td>4486679</td>\n", " <td>62813</td>\n", " <td>India</td>\n", " </tr>\n", " <tr>\n", " <th>15</th>\n", " <td>Saint-Josse-ten-Noode</td>\n", " <td>27548</td>\n", " <td>62404</td>\n", " <td>Belgium</td>\n", " </tr>\n", " <tr>\n", " <th>16</th>\n", " <td>Kathmandu</td>\n", " <td>1183000</td>\n", " <td>61972</td>\n", " <td>Nepal</td>\n", " </tr>\n", " <tr>\n", " <th>17</th>\n", " <td>Subang Jaya</td>\n", " <td>1683589</td>\n", " <td>38482</td>\n", " <td>Malaysia</td>\n", " </tr>\n", " <tr>\n", " <th>18</th>\n", " <td>Neapoli</td>\n", " <td>27084</td>\n", " <td>60186</td>\n", " <td>Greece</td>\n", " </tr>\n", " <tr>\n", " <th>19</th>\n", " <td>Montrouge</td>\n", " <td>48410</td>\n", " <td>59705</td>\n", " <td>France</td>\n", " </tr>\n", " <tr>\n", " <th>20</th>\n", " <td>Malé</td>\n", " <td>133412</td>\n", " <td>59559</td>\n", " <td>Maldives</td>\n", " </tr>\n", " <tr>\n", " <th>21</th>\n", " <td>Malabon</td>\n", " <td>353337</td>\n", " <td>58607</td>\n", " <td>Philippines</td>\n", " </tr>\n", " <tr>\n", " <th>22</th>\n", " <td>Guttenberg, New Jersey</td>\n", " <td>11481</td>\n", " <td>58577</td>\n", " <td>United States</td>\n", " </tr>\n", " <tr>\n", " <th>23</th>\n", " <td>Pasig</td>\n", " <td>669773</td>\n", " <td>55958</td>\n", " <td>Philippines</td>\n", " </tr>\n", " <tr>\n", " <th>24</th>\n", " <td>Paris</td>\n", " <td>2265886</td>\n", " <td>55673</td>\n", " <td>France</td>\n", 
" </tr>\n", " <tr>\n", " <th>25</th>\n", " <td>Mislata</td>\n", " <td>43756</td>\n", " <td>54695</td>\n", " <td>Spain</td>\n", " </tr>\n", " <tr>\n", " <th>26</th>\n", " <td>Macau</td>\n", " <td>643100</td>\n", " <td>54790</td>\n", " <td>China</td>\n", " </tr>\n", " <tr>\n", " <th>27</th>\n", " <td>Kallithea</td>\n", " <td>100050</td>\n", " <td>54733</td>\n", " <td>Greece</td>\n", " </tr>\n", " <tr>\n", " <th>28</th>\n", " <td>Nea Smyrni</td>\n", " <td>73090</td>\n", " <td>53717</td>\n", " <td>Greece</td>\n", " </tr>\n", " <tr>\n", " <th>29</th>\n", " <td>Howrah</td>\n", " <td>1072161</td>\n", " <td>53670</td>\n", " <td>India</td>\n", " </tr>\n", " <tr>\n", " <th>30</th>\n", " <td>Pasay</td>\n", " <td>392869</td>\n", " <td>53554</td>\n", " <td>Philippines</td>\n", " </tr>\n", " <tr>\n", " <th>31</th>\n", " <td>San Juan</td>\n", " <td>121430</td>\n", " <td>52946</td>\n", " <td>Philippines</td>\n", " </tr>\n", " <tr>\n", " <th>32</th>\n", " <td>Colombo</td>\n", " <td>323257</td>\n", " <td>52871</td>\n", " <td>Sri Lanka</td>\n", " </tr>\n", " <tr>\n", " <th>33</th>\n", " <td>L'Hospitalet de Llobregat</td>\n", " <td>252171</td>\n", " <td>52536</td>\n", " <td>Spain</td>\n", " </tr>\n", " <tr>\n", " <th>34</th>\n", " <td>Union City, New Jersey</td>\n", " <td>66455</td>\n", " <td>51810</td>\n", " <td>United States</td>\n", " </tr>\n", " <tr>\n", " <th>35</th>\n", " <td>Cairo</td>\n", " <td>11742120</td>\n", " <td>50180</td>\n", " <td>Egypt</td>\n", " </tr>\n", " <tr>\n", " <th>36</th>\n", " <td>Makati</td>\n", " <td>529039</td>\n", " <td>50080</td>\n", " <td>Philippines</td>\n", " </tr>\n", " <tr>\n", " <th>37</th>\n", " <td>West New York, New Jersey</td>\n", " <td>49708</td>\n", " <td>49362</td>\n", " <td>United States</td>\n", " </tr>\n", " <tr>\n", " <th>38</th>\n", " <td>Saint-Gilles</td>\n", " <td>46931</td>\n", " <td>48234</td>\n", " <td>Belgium</td>\n", " </tr>\n", " <tr>\n", " <th>39</th>\n", " <td>Bandung</td>\n", " <td>3075478</td>\n", " <td>48054</td>\n", " <td>Indonesia</td>\n", " </tr>\n", " <tr>\n", " <th>40</th>\n", " <td>Monaco</td>\n", " <td>36950</td>\n", " <td>47372</td>\n", " <td>Monaco</td>\n", " </tr>\n", " <tr>\n", " <th>41</th>\n", " <td>Boulogne-Billancourt</td>\n", " <td>113085</td>\n", " <td>47240</td>\n", " <td>France</td>\n", " </tr>\n", " <tr>\n", " <th>42</th>\n", " <td>Quezon City</td>\n", " <td>2936116</td>\n", " <td>45999</td>\n", " <td>Philippines</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " city population density country\n", "1 Manila 1780148 107561 Philippines\n", "2 Ebeye 15000 107143 Marshall Islands\n", "3 Pateros (Municipality) 64147 79114 Philippines\n", "4 Mumbai 12478447 73837 India\n", "5 Dhaka 8523137 73583 Bangladesh\n", "6 Bnei Brak 200162 73159 Israel\n", "7 Caloocan 1489040 72302 Philippines\n", "8 Levallois-Perret 63436 68458 France\n", "9 Le Pré-Saint-Gervais 18121 67047 France\n", "10 Chennai 4681087 66961 India\n", "11 Vincennes 48689 66371 France\n", "12 Saint-Mandé 22627 65115 France\n", "13 Bally 291972 64031 India\n", "14 Kolkata 4486679 62813 India\n", "15 Saint-Josse-ten-Noode 27548 62404 Belgium\n", "16 Kathmandu 1183000 61972 Nepal\n", "17 Subang Jaya 1683589 38482 Malaysia\n", "18 Neapoli 27084 60186 Greece\n", "19 Montrouge 48410 59705 France\n", "20 Malé 133412 59559 Maldives\n", "21 Malabon 353337 58607 Philippines\n", "22 Guttenberg, New Jersey 11481 58577 United States\n", "23 Pasig 669773 55958 Philippines\n", "24 Paris 2265886 55673 France\n", "25 Mislata 43756 54695 Spain\n", "26 Macau 
643100 54790 China\n", "27 Kallithea 100050 54733 Greece\n", "28 Nea Smyrni 73090 53717 Greece\n", "29 Howrah 1072161 53670 India\n", "30 Pasay 392869 53554 Philippines\n", "31 San Juan 121430 52946 Philippines\n", "32 Colombo 323257 52871 Sri Lanka\n", "33 L'Hospitalet de Llobregat 252171 52536 Spain\n", "34 Union City, New Jersey 66455 51810 United States\n", "35 Cairo 11742120 50180 Egypt\n", "36 Makati 529039 50080 Philippines\n", "37 West New York, New Jersey 49708 49362 United States\n", "38 Saint-Gilles 46931 48234 Belgium\n", "39 Bandung 3075478 48054 Indonesia\n", "40 Monaco 36950 47372 Monaco\n", "41 Boulogne-Billancourt 113085 47240 France\n", "42 Quezon City 2936116 45999 Philippines" ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import pandas as pd\n", "import wikipedia as wp\n", "from pygeocoder import Geocoder\n", "import time\n", "import re\n", " \n", "#-- Get the most populated cities from wikipedia (Thank you wikipedia library!) --\n", "html = wp.page(\"List_of_cities_by_population_density\").html().encode(\"UTF-8\")\n", "df = pd.read_html(html)[1]\n", "df=df.drop([2,3,4],axis=1)\n", "df=df.rename(columns={0:'city',1:'population',5:'density',6:'country'})\n", "df=df.iloc[1:]\n", "df['population']=[int(city.split('\\xa0')[-1].split('[')[0].replace(',','')) for city in df['population']]\n", "df['density']=[int(city.split('\\xa0')[-1].split('[')[0].replace(',','')) for city in df['density']]\n", "df\n", "#--- ---- ----- ----- ---- ---- ----- ---- ---- ----- --- ---- ---- ----- ---- ---" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[1780148,\n", " 15000,\n", " 64147,\n", " 12478447,\n", " 8523137,\n", " 200162,\n", " 1489040,\n", " 63436,\n", " 18121,\n", " 4681087,\n", " 48689,\n", " 22627,\n", " 291972,\n", " 4486679,\n", " 27548,\n", " 1183000,\n", " 1683589,\n", " 27084,\n", " 48410,\n", " 133412,\n", " 353337,\n", " 11481,\n", " 669773,\n", " 2265886,\n", " 43756,\n", " 643100,\n", " 100050,\n", " 73090,\n", " 1072161,\n", " 392869,\n", " 121430,\n", " 323257,\n", " 252171,\n", " 66455,\n", " 11742120,\n", " 529039,\n", " 49708,\n", " 46931,\n", " 3075478,\n", " 36950,\n", " 113085,\n", " 2936116]" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "[int(city.split('\\xa0')[-1].split('[')[0].replace(',','')) for city in df['population']]" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>followers</th>\n", " <th>following</th>\n", " <th>geo</th>\n", " <th>hashtags</th>\n", " <th>statuses_count</th>\n", " <th>text</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>154060</td>\n", " <td>False</td>\n", " <td>None</td>\n", " <td>[]</td>\n", " <td>22455</td>\n", " <td>@ryapee Hi Rya order ako ulit polyblender mejo...</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>83</td>\n", " <td>False</td>\n", " <td>None</td>\n", " <td>[]</td>\n", " <td>3030</td>\n", " <td>Yung dating saling pusa naging aso bigla // 🎶</td>\n", " </tr>\n", " <tr>\n", " 
<th>2</th>\n", " <td>643</td>\n", " <td>False</td>\n", " <td>None</td>\n", " <td>[{'text': 'AdoptDontShop', 'indices': [38, 52]}]</td>\n", " <td>8529</td>\n", " <td>sana meron din dito sa Pilipinas yung #AdoptDo...</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>2729</td>\n", " <td>False</td>\n", " <td>None</td>\n", " <td>[]</td>\n", " <td>28771</td>\n", " <td>Aso nga kasi ako, bantay ako dito hahaha 😂</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>25</td>\n", " <td>False</td>\n", " <td>None</td>\n", " <td>[]</td>\n", " <td>14</td>\n", " <td>lakas mangahol ng kaklase ko dinaig pa aso nam...</td>\n", " </tr>\n", " <tr>\n", " <th>5</th>\n", " <td>116</td>\n", " <td>False</td>\n", " <td>None</td>\n", " <td>[]</td>\n", " <td>1993</td>\n", " <td>@LampanoElla Dun sa aso oo HAHA</td>\n", " </tr>\n", " <tr>\n", " <th>6</th>\n", " <td>113</td>\n", " <td>False</td>\n", " <td>None</td>\n", " <td>[]</td>\n", " <td>4495</td>\n", " <td>Me: labas mo dila para lumabas dila ng aso \\nS...</td>\n", " </tr>\n", " <tr>\n", " <th>7</th>\n", " <td>130</td>\n", " <td>False</td>\n", " <td>None</td>\n", " <td>[]</td>\n", " <td>1745</td>\n", " <td>cute kong aso https://t.co/twDprm7o5P</td>\n", " </tr>\n", " <tr>\n", " <th>8</th>\n", " <td>54</td>\n", " <td>False</td>\n", " <td>None</td>\n", " <td>[]</td>\n", " <td>727</td>\n", " <td>me: pabili pong dog food\\ntindero: alin? ung p...</td>\n", " </tr>\n", " <tr>\n", " <th>9</th>\n", " <td>24</td>\n", " <td>False</td>\n", " <td>None</td>\n", " <td>[]</td>\n", " <td>296</td>\n", " <td>@dsgalarpez hahahaha aso ka na ba ngayon?</td>\n", " </tr>\n", " <tr>\n", " <th>10</th>\n", " <td>841</td>\n", " <td>False</td>\n", " <td>None</td>\n", " <td>[]</td>\n", " <td>29025</td>\n", " <td>@DenniceRoselle Uy kawawa mga aso. Di naman si...</td>\n", " </tr>\n", " <tr>\n", " <th>11</th>\n", " <td>260</td>\n", " <td>False</td>\n", " <td>None</td>\n", " <td>[]</td>\n", " <td>8486</td>\n", " <td>Nagtanggal tuloy ako nang mga tae nang aso kai...</td>\n", " </tr>\n", " <tr>\n", " <th>12</th>\n", " <td>260</td>\n", " <td>False</td>\n", " <td>None</td>\n", " <td>[]</td>\n", " <td>8486</td>\n", " <td>Nakakapikon ung aso 😭😭</td>\n", " </tr>\n", " <tr>\n", " <th>13</th>\n", " <td>176</td>\n", " <td>False</td>\n", " <td>None</td>\n", " <td>[]</td>\n", " <td>4682</td>\n", " <td>May mga sakit aso namin hanep</td>\n", " </tr>\n", " <tr>\n", " <th>14</th>\n", " <td>1357</td>\n", " <td>False</td>\n", " <td>None</td>\n", " <td>[]</td>\n", " <td>15736</td>\n", " <td>i hate when strangers esp. men look at you str...</td>\n", " </tr>\n", " <tr>\n", " <th>15</th>\n", " <td>422</td>\n", " <td>False</td>\n", " <td>None</td>\n", " <td>[]</td>\n", " <td>21045</td>\n", " <td>@dnnkthryn Ngek malas. Sa rosewood naman okay ...</td>\n", " </tr>\n", " <tr>\n", " <th>16</th>\n", " <td>422</td>\n", " <td>False</td>\n", " <td>None</td>\n", " <td>[]</td>\n", " <td>21045</td>\n", " <td>@dnnkthryn Yup. Iba talaga pag bahay super fre...</td>\n", " </tr>\n", " <tr>\n", " <th>17</th>\n", " <td>365</td>\n", " <td>False</td>\n", " <td>None</td>\n", " <td>[]</td>\n", " <td>2083</td>\n", " <td>Tang ina sobrang iba pala pakiramdam pag namat...</td>\n", " </tr>\n", " <tr>\n", " <th>18</th>\n", " <td>75</td>\n", " <td>False</td>\n", " <td>None</td>\n", " <td>[]</td>\n", " <td>243</td>\n", " <td>aso't pusa ❤\\ngoodmorning hubby babe ! 
https:/...</td>\n", " </tr>\n", " <tr>\n", " <th>19</th>\n", " <td>1021</td>\n", " <td>False</td>\n", " <td>None</td>\n", " <td>[]</td>\n", " <td>11119</td>\n", " <td>RT @akoposimarcelo: Yung buti pa yung mga aso,...</td>\n", " </tr>\n", " <tr>\n", " <th>20</th>\n", " <td>47</td>\n", " <td>False</td>\n", " <td>None</td>\n", " <td>[]</td>\n", " <td>905</td>\n", " <td>napaka clingy ng aso ko, nebeyen hehe</td>\n", " </tr>\n", " <tr>\n", " <th>21</th>\n", " <td>147</td>\n", " <td>False</td>\n", " <td>None</td>\n", " <td>[]</td>\n", " <td>3406</td>\n", " <td>ang laki ng aso!!! panay naka tahol</td>\n", " </tr>\n", " <tr>\n", " <th>22</th>\n", " <td>269</td>\n", " <td>False</td>\n", " <td>None</td>\n", " <td>[]</td>\n", " <td>21310</td>\n", " <td>Alam ata ng aso ko na birthday ko ngayon. Iba ...</td>\n", " </tr>\n", " <tr>\n", " <th>23</th>\n", " <td>548</td>\n", " <td>False</td>\n", " <td>None</td>\n", " <td>[]</td>\n", " <td>3553</td>\n", " <td>Nagduduet nanaman yung dalawang aso hays</td>\n", " </tr>\n", " <tr>\n", " <th>24</th>\n", " <td>88</td>\n", " <td>False</td>\n", " <td>None</td>\n", " <td>[]</td>\n", " <td>1470</td>\n", " <td>Yung aso naming maligalig na palundag lundag pa</td>\n", " </tr>\n", " <tr>\n", " <th>25</th>\n", " <td>394</td>\n", " <td>False</td>\n", " <td>None</td>\n", " <td>[]</td>\n", " <td>67411</td>\n", " <td>Mukha talaga akong tanga kapag nakikipaglaro a...</td>\n", " </tr>\n", " <tr>\n", " <th>26</th>\n", " <td>105</td>\n", " <td>False</td>\n", " <td>None</td>\n", " <td>[]</td>\n", " <td>417</td>\n", " <td>RT @akoposimarcelo: Yung buti pa yung mga aso,...</td>\n", " </tr>\n", " <tr>\n", " <th>27</th>\n", " <td>174</td>\n", " <td>False</td>\n", " <td>None</td>\n", " <td>[]</td>\n", " <td>4033</td>\n", " <td>nagdisitahulan mga aso ang creepy huhu</td>\n", " </tr>\n", " <tr>\n", " <th>28</th>\n", " <td>90</td>\n", " <td>False</td>\n", " <td>None</td>\n", " <td>[]</td>\n", " <td>828</td>\n", " <td>@glbysrcmny aso</td>\n", " </tr>\n", " <tr>\n", " <th>29</th>\n", " <td>850</td>\n", " <td>False</td>\n", " <td>None</td>\n", " <td>[]</td>\n", " <td>11211</td>\n", " <td>Distemper virus. 
may ganyan plang sakit nang m...</td>\n", " </tr>\n", " <tr>\n", " <th>...</th>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " </tr>\n", " <tr>\n", " <th>70</th>\n", " <td>80</td>\n", " <td>False</td>\n", " <td>None</td>\n", " <td>[]</td>\n", " <td>1208</td>\n", " <td>Plus two agad aso namen AAHAHAHAHAH saya</td>\n", " </tr>\n", " <tr>\n", " <th>71</th>\n", " <td>180</td>\n", " <td>False</td>\n", " <td>None</td>\n", " <td>[]</td>\n", " <td>6216</td>\n", " <td>@jhnlstrpgnsn Hahaha hindi ko aso yun sa ate k...</td>\n", " </tr>\n", " <tr>\n", " <th>72</th>\n", " <td>1280</td>\n", " <td>False</td>\n", " <td>None</td>\n", " <td>[]</td>\n", " <td>5483</td>\n", " <td>RT @akoposimarcelo: Yung buti pa yung mga aso,...</td>\n", " </tr>\n", " <tr>\n", " <th>73</th>\n", " <td>74</td>\n", " <td>False</td>\n", " <td>None</td>\n", " <td>[]</td>\n", " <td>2055</td>\n", " <td>Mama: Tanga Di Mo Pa Pinapakain Yung Aso.\\n\\nT...</td>\n", " </tr>\n", " <tr>\n", " <th>74</th>\n", " <td>72</td>\n", " <td>False</td>\n", " <td>None</td>\n", " <td>[]</td>\n", " <td>456</td>\n", " <td>@cescamarii di yan kusa aso yan</td>\n", " </tr>\n", " <tr>\n", " <th>75</th>\n", " <td>365</td>\n", " <td>False</td>\n", " <td>None</td>\n", " <td>[]</td>\n", " <td>2083</td>\n", " <td>Nasagasaan aso ko 😢</td>\n", " </tr>\n", " <tr>\n", " <th>76</th>\n", " <td>301</td>\n", " <td>False</td>\n", " <td>None</td>\n", " <td>[]</td>\n", " <td>4788</td>\n", " <td>Ngayon ko nalang na appreciate ulit yung ganda...</td>\n", " </tr>\n", " <tr>\n", " <th>77</th>\n", " <td>561</td>\n", " <td>False</td>\n", " <td>None</td>\n", " <td>[]</td>\n", " <td>13927</td>\n", " <td>pag gantong nalulungkot ako imbis na maghanap ...</td>\n", " </tr>\n", " <tr>\n", " <th>78</th>\n", " <td>180</td>\n", " <td>False</td>\n", " <td>None</td>\n", " <td>[]</td>\n", " <td>6718</td>\n", " <td>creepy ng aso pero mahal ko kayo HAHAHHHAHAHA</td>\n", " </tr>\n", " <tr>\n", " <th>79</th>\n", " <td>329</td>\n", " <td>False</td>\n", " <td>None</td>\n", " <td>[]</td>\n", " <td>1259</td>\n", " <td>Pagod bebi ko ako inaantok na tagal ni aso HAH...</td>\n", " </tr>\n", " <tr>\n", " <th>80</th>\n", " <td>5</td>\n", " <td>False</td>\n", " <td>None</td>\n", " <td>[]</td>\n", " <td>460</td>\n", " <td>So ayun diba may hamster kami si Luxus saka si...</td>\n", " </tr>\n", " <tr>\n", " <th>81</th>\n", " <td>797</td>\n", " <td>False</td>\n", " <td>None</td>\n", " <td>[]</td>\n", " <td>42407</td>\n", " <td>@Anniefernando6 @aldenAllTheWay Baka kayo ang ...</td>\n", " </tr>\n", " <tr>\n", " <th>82</th>\n", " <td>293</td>\n", " <td>False</td>\n", " <td>None</td>\n", " <td>[]</td>\n", " <td>3881</td>\n", " <td>hindi ako to pramis. 
hahahahahahaha hindi ako ...</td>\n", " </tr>\n", " <tr>\n", " <th>83</th>\n", " <td>293</td>\n", " <td>False</td>\n", " <td>None</td>\n", " <td>[]</td>\n", " <td>3881</td>\n", " <td>HUY ANG WEIRD TALAGA KASI NAGSESAVE AKO NG PIC...</td>\n", " </tr>\n", " <tr>\n", " <th>84</th>\n", " <td>84</td>\n", " <td>False</td>\n", " <td>None</td>\n", " <td>[]</td>\n", " <td>15631</td>\n", " <td>Fun fact about you — Sobrang love ko yung mga ...</td>\n", " </tr>\n", " <tr>\n", " <th>85</th>\n", " <td>293</td>\n", " <td>False</td>\n", " <td>None</td>\n", " <td>[]</td>\n", " <td>3881</td>\n", " <td>diko maintindihan bakit ako nagsesave ng pictu...</td>\n", " </tr>\n", " <tr>\n", " <th>86</th>\n", " <td>440</td>\n", " <td>False</td>\n", " <td>None</td>\n", " <td>[]</td>\n", " <td>1375</td>\n", " <td>Hayaan mo lang na husgahan ka nila.Hindi yung ...</td>\n", " </tr>\n", " <tr>\n", " <th>87</th>\n", " <td>476</td>\n", " <td>False</td>\n", " <td>None</td>\n", " <td>[]</td>\n", " <td>7158</td>\n", " <td>@pauiicosta lumabas na naman pagka aso mo haha...</td>\n", " </tr>\n", " <tr>\n", " <th>88</th>\n", " <td>35</td>\n", " <td>False</td>\n", " <td>None</td>\n", " <td>[]</td>\n", " <td>1902</td>\n", " <td>nagttampo ako kay potchi, tangina aso lang yon...</td>\n", " </tr>\n", " <tr>\n", " <th>89</th>\n", " <td>183</td>\n", " <td>False</td>\n", " <td>None</td>\n", " <td>[]</td>\n", " <td>1154</td>\n", " <td>Ako: pare iiyak ka pag namatay aso mo? \\nRoque...</td>\n", " </tr>\n", " <tr>\n", " <th>90</th>\n", " <td>62</td>\n", " <td>False</td>\n", " <td>None</td>\n", " <td>[]</td>\n", " <td>756</td>\n", " <td>Aso ko e pero di na maghahabol🐶 https://t.co/Y...</td>\n", " </tr>\n", " <tr>\n", " <th>91</th>\n", " <td>44</td>\n", " <td>False</td>\n", " <td>None</td>\n", " <td>[]</td>\n", " <td>64</td>\n", " <td>hi aso</td>\n", " </tr>\n", " <tr>\n", " <th>92</th>\n", " <td>167</td>\n", " <td>False</td>\n", " <td>None</td>\n", " <td>[]</td>\n", " <td>2591</td>\n", " <td>Nangapitbahay nako para sa aso. Happy pill! 
😊 ...</td>\n", " </tr>\n", " <tr>\n", " <th>93</th>\n", " <td>536</td>\n", " <td>False</td>\n", " <td>None</td>\n", " <td>[]</td>\n", " <td>6135</td>\n", " <td>NP: Banal na aso, santong kabayo\\n\\nNatatawa a...</td>\n", " </tr>\n", " <tr>\n", " <th>94</th>\n", " <td>3228</td>\n", " <td>False</td>\n", " <td>{'type': 'Point', 'coordinates': [14.61941886,...</td>\n", " <td>[]</td>\n", " <td>10223</td>\n", " <td>Late post: Buti na lang talaga alert ako..kunc...</td>\n", " </tr>\n", " <tr>\n", " <th>95</th>\n", " <td>242</td>\n", " <td>False</td>\n", " <td>None</td>\n", " <td>[]</td>\n", " <td>4463</td>\n", " <td>tao,ahas at aso.</td>\n", " </tr>\n", " <tr>\n", " <th>96</th>\n", " <td>473</td>\n", " <td>False</td>\n", " <td>None</td>\n", " <td>[]</td>\n", " <td>5234</td>\n", " <td>ALAM NIYO BANG MUNTIKAN NG GAWING PAGKAIN NG A...</td>\n", " </tr>\n", " <tr>\n", " <th>97</th>\n", " <td>220</td>\n", " <td>False</td>\n", " <td>None</td>\n", " <td>[]</td>\n", " <td>10928</td>\n", " <td>Dang kyot ng aso nila Sir huhu i want 😍</td>\n", " </tr>\n", " <tr>\n", " <th>98</th>\n", " <td>141</td>\n", " <td>False</td>\n", " <td>None</td>\n", " <td>[]</td>\n", " <td>12781</td>\n", " <td>Hindi sa dinidepensahan pero may umuulol na na...</td>\n", " </tr>\n", " <tr>\n", " <th>99</th>\n", " <td>200</td>\n", " <td>False</td>\n", " <td>None</td>\n", " <td>[]</td>\n", " <td>8882</td>\n", " <td>RT @Itsmeearlbravo: Di naman siguro ako pinang...</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "<p>100 rows × 6 columns</p>\n", "</div>" ], "text/plain": [ " followers following geo \\\n", "0 154060 False None \n", "1 83 False None \n", "2 643 False None \n", "3 2729 False None \n", "4 25 False None \n", "5 116 False None \n", "6 113 False None \n", "7 130 False None \n", "8 54 False None \n", "9 24 False None \n", "10 841 False None \n", "11 260 False None \n", "12 260 False None \n", "13 176 False None \n", "14 1357 False None \n", "15 422 False None \n", "16 422 False None \n", "17 365 False None \n", "18 75 False None \n", "19 1021 False None \n", "20 47 False None \n", "21 147 False None \n", "22 269 False None \n", "23 548 False None \n", "24 88 False None \n", "25 394 False None \n", "26 105 False None \n", "27 174 False None \n", "28 90 False None \n", "29 850 False None \n", ".. ... ... ... \n", "70 80 False None \n", "71 180 False None \n", "72 1280 False None \n", "73 74 False None \n", "74 72 False None \n", "75 365 False None \n", "76 301 False None \n", "77 561 False None \n", "78 180 False None \n", "79 329 False None \n", "80 5 False None \n", "81 797 False None \n", "82 293 False None \n", "83 293 False None \n", "84 84 False None \n", "85 293 False None \n", "86 440 False None \n", "87 476 False None \n", "88 35 False None \n", "89 183 False None \n", "90 62 False None \n", "91 44 False None \n", "92 167 False None \n", "93 536 False None \n", "94 3228 False {'type': 'Point', 'coordinates': [14.61941886,... 
\n", "95 242 False None \n", "96 473 False None \n", "97 220 False None \n", "98 141 False None \n", "99 200 False None \n", "\n", " hashtags statuses_count \\\n", "0 [] 22455 \n", "1 [] 3030 \n", "2 [{'text': 'AdoptDontShop', 'indices': [38, 52]}] 8529 \n", "3 [] 28771 \n", "4 [] 14 \n", "5 [] 1993 \n", "6 [] 4495 \n", "7 [] 1745 \n", "8 [] 727 \n", "9 [] 296 \n", "10 [] 29025 \n", "11 [] 8486 \n", "12 [] 8486 \n", "13 [] 4682 \n", "14 [] 15736 \n", "15 [] 21045 \n", "16 [] 21045 \n", "17 [] 2083 \n", "18 [] 243 \n", "19 [] 11119 \n", "20 [] 905 \n", "21 [] 3406 \n", "22 [] 21310 \n", "23 [] 3553 \n", "24 [] 1470 \n", "25 [] 67411 \n", "26 [] 417 \n", "27 [] 4033 \n", "28 [] 828 \n", "29 [] 11211 \n", ".. ... ... \n", "70 [] 1208 \n", "71 [] 6216 \n", "72 [] 5483 \n", "73 [] 2055 \n", "74 [] 456 \n", "75 [] 2083 \n", "76 [] 4788 \n", "77 [] 13927 \n", "78 [] 6718 \n", "79 [] 1259 \n", "80 [] 460 \n", "81 [] 42407 \n", "82 [] 3881 \n", "83 [] 3881 \n", "84 [] 15631 \n", "85 [] 3881 \n", "86 [] 1375 \n", "87 [] 7158 \n", "88 [] 1902 \n", "89 [] 1154 \n", "90 [] 756 \n", "91 [] 64 \n", "92 [] 2591 \n", "93 [] 6135 \n", "94 [] 10223 \n", "95 [] 4463 \n", "96 [] 5234 \n", "97 [] 10928 \n", "98 [] 12781 \n", "99 [] 8882 \n", "\n", " text \n", "0 @ryapee Hi Rya order ako ulit polyblender mejo... \n", "1 Yung dating saling pusa naging aso bigla // 🎶 \n", "2 sana meron din dito sa Pilipinas yung #AdoptDo... \n", "3 Aso nga kasi ako, bantay ako dito hahaha 😂 \n", "4 lakas mangahol ng kaklase ko dinaig pa aso nam... \n", "5 @LampanoElla Dun sa aso oo HAHA \n", "6 Me: labas mo dila para lumabas dila ng aso \\nS... \n", "7 cute kong aso https://t.co/twDprm7o5P \n", "8 me: pabili pong dog food\\ntindero: alin? ung p... \n", "9 @dsgalarpez hahahaha aso ka na ba ngayon? \n", "10 @DenniceRoselle Uy kawawa mga aso. Di naman si... \n", "11 Nagtanggal tuloy ako nang mga tae nang aso kai... \n", "12 Nakakapikon ung aso 😭😭 \n", "13 May mga sakit aso namin hanep \n", "14 i hate when strangers esp. men look at you str... \n", "15 @dnnkthryn Ngek malas. Sa rosewood naman okay ... \n", "16 @dnnkthryn Yup. Iba talaga pag bahay super fre... \n", "17 Tang ina sobrang iba pala pakiramdam pag namat... \n", "18 aso't pusa ❤\\ngoodmorning hubby babe ! https:/... \n", "19 RT @akoposimarcelo: Yung buti pa yung mga aso,... \n", "20 napaka clingy ng aso ko, nebeyen hehe \n", "21 ang laki ng aso!!! panay naka tahol \n", "22 Alam ata ng aso ko na birthday ko ngayon. Iba ... \n", "23 Nagduduet nanaman yung dalawang aso hays \n", "24 Yung aso naming maligalig na palundag lundag pa \n", "25 Mukha talaga akong tanga kapag nakikipaglaro a... \n", "26 RT @akoposimarcelo: Yung buti pa yung mga aso,... \n", "27 nagdisitahulan mga aso ang creepy huhu \n", "28 @glbysrcmny aso \n", "29 Distemper virus. may ganyan plang sakit nang m... \n", ".. ... \n", "70 Plus two agad aso namen AAHAHAHAHAH saya \n", "71 @jhnlstrpgnsn Hahaha hindi ko aso yun sa ate k... \n", "72 RT @akoposimarcelo: Yung buti pa yung mga aso,... \n", "73 Mama: Tanga Di Mo Pa Pinapakain Yung Aso.\\n\\nT... \n", "74 @cescamarii di yan kusa aso yan \n", "75 Nasagasaan aso ko 😢 \n", "76 Ngayon ko nalang na appreciate ulit yung ganda... \n", "77 pag gantong nalulungkot ako imbis na maghanap ... \n", "78 creepy ng aso pero mahal ko kayo HAHAHHHAHAHA \n", "79 Pagod bebi ko ako inaantok na tagal ni aso HAH... \n", "80 So ayun diba may hamster kami si Luxus saka si... \n", "81 @Anniefernando6 @aldenAllTheWay Baka kayo ang ... 
\n", "82 hindi ako to pramis. hahahahahahaha hindi ako ... \n", "83 HUY ANG WEIRD TALAGA KASI NAGSESAVE AKO NG PIC... \n", "84 Fun fact about you — Sobrang love ko yung mga ... \n", "85 diko maintindihan bakit ako nagsesave ng pictu... \n", "86 Hayaan mo lang na husgahan ka nila.Hindi yung ... \n", "87 @pauiicosta lumabas na naman pagka aso mo haha... \n", "88 nagttampo ako kay potchi, tangina aso lang yon... \n", "89 Ako: pare iiyak ka pag namatay aso mo? \\nRoque... \n", "90 Aso ko e pero di na maghahabol🐶 https://t.co/Y... \n", "91 hi aso \n", "92 Nangapitbahay nako para sa aso. Happy pill! 😊 ... \n", "93 NP: Banal na aso, santong kabayo\\n\\nNatatawa a... \n", "94 Late post: Buti na lang talaga alert ako..kunc... \n", "95 tao,ahas at aso. \n", "96 ALAM NIYO BANG MUNTIKAN NG GAWING PAGKAIN NG A... \n", "97 Dang kyot ng aso nila Sir huhu i want 😍 \n", "98 Hindi sa dinidepensahan pero may umuulol na na... \n", "99 RT @Itsmeearlbravo: Di naman siguro ako pinang... \n", "\n", "[100 rows x 6 columns]" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tweets['Manila,Philippines']" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'Delft, Holland'" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "\n", "\n", "# Setup Tweepy API Authentication\n", "auth = tweepy.OAuthHandler(consumer_key, consumer_secret)\n", "auth.set_access_token(access_token, access_token_secret)\n", "api = tweepy.API(auth, parser=tweepy.parsers.JSONParser())\n", "tweetsa=api.search(q='trump', geocode='52.132633,5.29126,100km')\n", "tweetsa['statuses'][0]['user']['location']" ] }, { "cell_type": "code", "execution_count": 114, "metadata": {}, "outputs": [ { "ename": "ValueError", "evalue": "If using all scalar values, you must pass an index", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m<ipython-input-114-f7373bd5071b>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mdf\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDataFrame\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m{\u001b[0m\u001b[0;34m'a'\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m'b'\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m'c'\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;36m3\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mdf2\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDataFrame\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m{\u001b[0m\u001b[0;34m'a'\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m'b'\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m'c'\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;36m3\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m+=\u001b[0m\u001b[0mdf2\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/anaconda3/lib/python3.6/site-packages/pandas/core/frame.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, data, index, columns, dtype, 
copy)\u001b[0m\n\u001b[1;32m 328\u001b[0m dtype=dtype, copy=copy)\n\u001b[1;32m 329\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdict\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 330\u001b[0;31m \u001b[0mmgr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_init_dict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mindex\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcolumns\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 331\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mma\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mMaskedArray\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 332\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mnumpy\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mma\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmrecords\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mmrecords\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/anaconda3/lib/python3.6/site-packages/pandas/core/frame.py\u001b[0m in \u001b[0;36m_init_dict\u001b[0;34m(self, data, index, columns, dtype)\u001b[0m\n\u001b[1;32m 459\u001b[0m \u001b[0marrays\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mk\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mk\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mkeys\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 460\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 461\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0m_arrays_to_mgr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marrays\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdata_names\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mindex\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcolumns\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 462\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 463\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_init_ndarray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalues\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mindex\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcolumns\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcopy\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/anaconda3/lib/python3.6/site-packages/pandas/core/frame.py\u001b[0m in \u001b[0;36m_arrays_to_mgr\u001b[0;34m(arrays, arr_names, index, columns, dtype)\u001b[0m\n\u001b[1;32m 6161\u001b[0m \u001b[0;31m# figure out the index, if necessary\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6162\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mindex\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 6163\u001b[0;31m \u001b[0mindex\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mextract_index\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marrays\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 6164\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6165\u001b[0m \u001b[0mindex\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_ensure_index\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/anaconda3/lib/python3.6/site-packages/pandas/core/frame.py\u001b[0m in \u001b[0;36mextract_index\u001b[0;34m(data)\u001b[0m\n\u001b[1;32m 6200\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6201\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mindexes\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mraw_lengths\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 6202\u001b[0;31m raise ValueError('If using all scalar values, you must pass'\n\u001b[0m\u001b[1;32m 6203\u001b[0m ' an index')\n\u001b[1;32m 6204\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mValueError\u001b[0m: If using all scalar values, you must pass an index" ] } ], "source": [ "df=pd.DataFrame({'a':1,'b':2,'c':3})\n", "df2=pd.DataFrame({'a':1,'b':2,'c':3})\n", "df+=df2" ] }, { "cell_type": "code", "execution_count": 82, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Fri Jun 29 20:43:37 +0000 2018\n", "Thu Jun 28 14:21:15 +0000 2018\n", "Wed Jun 27 23:51:55 +0000 2018\n", "Tue Jun 26 23:12:07 +0000 2018\n", "Mon Jun 25 19:23:04 +0000 2018\n", "Sun Jun 24 23:58:25 +0000 2018\n", "Sat Jun 23 23:06:10 +0000 2018\n", "Fri Jun 22 17:51:26 +0000 2018\n", "Thu Jun 21 22:46:17 +0000 2018\n" ] }, { "ename": "IndexError", "evalue": "list index out of range", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mIndexError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m<ipython-input-82-e5c117ad83b9>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mday\u001b[0m \u001b[0;32min\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdate\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtoday\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m-\u001b[0m \u001b[0mtimedelta\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mi\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mrange\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m70\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mtweets\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mapi\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msearch\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'cat'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mgeocode\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m'34.0934,56.134,200mi'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0muntil\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mday\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmax_id\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'1012848185839058950'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m 
\u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtweets\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'statuses'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'created_at'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;31mIndexError\u001b[0m: list index out of range" ] } ], "source": [ "from datetime import date, timedelta\n", "date.today() - timedelta(0)\n", "for day in [str(date.today() - timedelta(i)).split()[0] for i in range(70)]:\n", " tweets=api.search('cat',geocode = '34.0934,56.134,200mi', until=day, max_id='1012848185839058950')\n", " print(tweets['statuses'][0]['created_at'])" ] }, { "cell_type": "code", "execution_count": 95, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1 2\n", "2 3\n" ] } ], "source": [ "for a,b in zip([1,2],[2,3]):\n", " print(a,b)\n" ] }, { "cell_type": "code", "execution_count": 122, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>state</th>\n", " <th>city</th>\n", " <th>density</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>1</th>\n", " <td>New York City</td>\n", " <td>New Jersey</td>\n", " <td>57116</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>New York City</td>\n", " <td>New Jersey</td>\n", " <td>51810</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>New York City</td>\n", " <td>New Jersey</td>\n", " <td>49362</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>New York City</td>\n", " <td>New Jersey</td>\n", " <td>39066</td>\n", " </tr>\n", " <tr>\n", " <th>5</th>\n", " <td>New York City</td>\n", " <td>New York</td>\n", " <td>27788</td>\n", " </tr>\n", " <tr>\n", " <th>6</th>\n", " <td>New York City</td>\n", " <td>New York</td>\n", " <td>27016</td>\n", " </tr>\n", " <tr>\n", " <th>7</th>\n", " <td>New York City</td>\n", " <td>New Jersey</td>\n", " <td>24577</td>\n", " </tr>\n", " <tr>\n", " <th>8</th>\n", " <td>New York City</td>\n", " <td>New Jersey</td>\n", " <td>24060</td>\n", " </tr>\n", " <tr>\n", " <th>9</th>\n", " <td>Los Angeles</td>\n", " <td>California</td>\n", " <td>23216</td>\n", " </tr>\n", " <tr>\n", " <th>10</th>\n", " <td>New York City</td>\n", " <td>New Jersey</td>\n", " <td>22437</td>\n", " </tr>\n", " <tr>\n", " <th>11</th>\n", " <td>New York City</td>\n", " <td>New York</td>\n", " <td>21635</td>\n", " </tr>\n", " <tr>\n", " <th>12</th>\n", " <td>Miami</td>\n", " <td>Florida</td>\n", " <td>21484</td>\n", " </tr>\n", " <tr>\n", " <th>13</th>\n", " <td>Los Angeles</td>\n", " <td>California</td>\n", " <td>21254</td>\n", " </tr>\n", " <tr>\n", " <th>14</th>\n", " <td>Miami</td>\n", " <td>Florida</td>\n", " <td>20518</td>\n", " </tr>\n", " <tr>\n", " <th>15</th>\n", " <td>Los Angeles</td>\n", " <td>California</td>\n", " <td>19179</td>\n", " </tr>\n", " <tr>\n", " <th>16</th>\n", " <td>Boston</td>\n", " <td>Massachusetts</td>\n", " <td>18431</td>\n", " </tr>\n", " <tr>\n", " <th>17</th>\n", " <td>Los Angeles</td>\n", " <td>California</td>\n", " <td>18297</td>\n", " </tr>\n", " <tr>\n", " <th>18</th>\n", " <td>New York City</td>\n", " 
<td>New Jersey</td>\n", " <td>18218</td>\n", " </tr>\n", " <tr>\n", " <th>19</th>\n", " <td>Louisville</td>\n", " <td>Kentucky</td>\n", " <td>18100</td>\n", " </tr>\n", " <tr>\n", " <th>20</th>\n", " <td>New York City</td>\n", " <td>New Jersey</td>\n", " <td>18801</td>\n", " </tr>\n", " <tr>\n", " <th>21</th>\n", " <td>San Francisco</td>\n", " <td>California</td>\n", " <td>18679</td>\n", " </tr>\n", " <tr>\n", " <th>22</th>\n", " <td>Miami</td>\n", " <td>Florida</td>\n", " <td>17023</td>\n", " </tr>\n", " <tr>\n", " <th>23</th>\n", " <td>Los Angeles</td>\n", " <td>California</td>\n", " <td>16896</td>\n", " </tr>\n", " <tr>\n", " <th>24</th>\n", " <td>Philadelphia</td>\n", " <td>Pennsylvania</td>\n", " <td>16557</td>\n", " </tr>\n", " <tr>\n", " <th>25</th>\n", " <td>New York City</td>\n", " <td>New Jersey</td>\n", " <td>16377</td>\n", " </tr>\n", " <tr>\n", " <th>26</th>\n", " <td>Boston</td>\n", " <td>Massachusetts</td>\n", " <td>16354</td>\n", " </tr>\n", " <tr>\n", " <th>27</th>\n", " <td>Providence</td>\n", " <td>Rhode Island</td>\n", " <td>16146</td>\n", " </tr>\n", " <tr>\n", " <th>28</th>\n", " <td>New York City</td>\n", " <td>New Jersey</td>\n", " <td>16093</td>\n", " </tr>\n", " <tr>\n", " <th>29</th>\n", " <td>Boston</td>\n", " <td>Massachusetts</td>\n", " <td>16036</td>\n", " </tr>\n", " <tr>\n", " <th>30</th>\n", " <td>Los Angeles</td>\n", " <td>California</td>\n", " <td>16036</td>\n", " </tr>\n", " <tr>\n", " <th>...</th>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " </tr>\n", " <tr>\n", " <th>104</th>\n", " <td>Detroit</td>\n", " <td>Michigan</td>\n", " <td>10900</td>\n", " </tr>\n", " <tr>\n", " <th>105</th>\n", " <td>Philadelphia</td>\n", " <td>Pennsylvania</td>\n", " <td>10897</td>\n", " </tr>\n", " <tr>\n", " <th>106</th>\n", " <td>Philadelphia</td>\n", " <td>Pennsylvania</td>\n", " <td>10882</td>\n", " </tr>\n", " <tr>\n", " <th>107</th>\n", " <td>New York City</td>\n", " <td>New Jersey</td>\n", " <td>10855</td>\n", " </tr>\n", " <tr>\n", " <th>108</th>\n", " <td>New York City</td>\n", " <td>New York</td>\n", " <td>10847</td>\n", " </tr>\n", " <tr>\n", " <th>109</th>\n", " <td>New York City</td>\n", " <td>New Jersey</td>\n", " <td>10841</td>\n", " </tr>\n", " <tr>\n", " <th>110</th>\n", " <td>San Francisco</td>\n", " <td>California</td>\n", " <td>10752</td>\n", " </tr>\n", " <tr>\n", " <th>111</th>\n", " <td>New York City</td>\n", " <td>New Jersey</td>\n", " <td>10744</td>\n", " </tr>\n", " <tr>\n", " <th>112</th>\n", " <td>Los Angeles</td>\n", " <td>California</td>\n", " <td>10667</td>\n", " </tr>\n", " <tr>\n", " <th>113</th>\n", " <td>New York City</td>\n", " <td>New Jersey</td>\n", " <td>10590</td>\n", " </tr>\n", " <tr>\n", " <th>114</th>\n", " <td>New York City</td>\n", " <td>New Jersey</td>\n", " <td>10582</td>\n", " </tr>\n", " <tr>\n", " <th>115</th>\n", " <td>Los Angeles</td>\n", " <td>California</td>\n", " <td>10572</td>\n", " </tr>\n", " <tr>\n", " <th>116</th>\n", " <td>New York City</td>\n", " <td>New Jersey</td>\n", " <td>10556</td>\n", " </tr>\n", " <tr>\n", " <th>117</th>\n", " <td>Miami</td>\n", " <td>Florida</td>\n", " <td>10474</td>\n", " </tr>\n", " <tr>\n", " <th>118</th>\n", " <td>Los Angeles</td>\n", " <td>California</td>\n", " <td>10398</td>\n", " </tr>\n", " <tr>\n", " <th>119</th>\n", " <td>Philadelphia</td>\n", " <td>Pennsylvania</td>\n", " <td>10397</td>\n", " </tr>\n", " <tr>\n", " <th>120</th>\n", " <td>San Francisco</td>\n", " <td>California</td>\n", " <td>10368</td>\n", " </tr>\n", " <tr>\n", " <th>121</th>\n", " <td>New 
York City</td>\n", " <td>New Jersey</td>\n", " <td>10358</td>\n", " </tr>\n", " <tr>\n", " <th>122</th>\n", " <td>Boston</td>\n", " <td>Massachusetts</td>\n", " <td>10351</td>\n", " </tr>\n", " <tr>\n", " <th>123</th>\n", " <td>New York City</td>\n", " <td>New York</td>\n", " <td>10337</td>\n", " </tr>\n", " <tr>\n", " <th>124</th>\n", " <td>Philadelphia</td>\n", " <td>Pennsylvania</td>\n", " <td>10256</td>\n", " </tr>\n", " <tr>\n", " <th>125</th>\n", " <td>Cleveland</td>\n", " <td>Ohio</td>\n", " <td>10208</td>\n", " </tr>\n", " <tr>\n", " <th>126</th>\n", " <td>New York City</td>\n", " <td>New York</td>\n", " <td>10188</td>\n", " </tr>\n", " <tr>\n", " <th>127</th>\n", " <td>New York City</td>\n", " <td>New York</td>\n", " <td>10187</td>\n", " </tr>\n", " <tr>\n", " <th>128</th>\n", " <td>Los Angeles</td>\n", " <td>California</td>\n", " <td>10178</td>\n", " </tr>\n", " <tr>\n", " <th>129</th>\n", " <td>New York City</td>\n", " <td>New Jersey</td>\n", " <td>10144</td>\n", " </tr>\n", " <tr>\n", " <th>130</th>\n", " <td>Los Angeles</td>\n", " <td>California</td>\n", " <td>10126</td>\n", " </tr>\n", " <tr>\n", " <th>131</th>\n", " <td>Philadelphia</td>\n", " <td>Pennsylvania</td>\n", " <td>10107</td>\n", " </tr>\n", " <tr>\n", " <th>132</th>\n", " <td>Chicago</td>\n", " <td>Illinois</td>\n", " <td>10094</td>\n", " </tr>\n", " <tr>\n", " <th>133</th>\n", " <td>Los Angeles</td>\n", " <td>California</td>\n", " <td>10065</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "<p>133 rows × 3 columns</p>\n", "</div>" ], "text/plain": [ " state city density\n", "1 New York City New Jersey 57116\n", "2 New York City New Jersey 51810\n", "3 New York City New Jersey 49362\n", "4 New York City New Jersey 39066\n", "5 New York City New York 27788\n", "6 New York City New York 27016\n", "7 New York City New Jersey 24577\n", "8 New York City New Jersey 24060\n", "9 Los Angeles California 23216\n", "10 New York City New Jersey 22437\n", "11 New York City New York 21635\n", "12 Miami Florida 21484\n", "13 Los Angeles California 21254\n", "14 Miami Florida 20518\n", "15 Los Angeles California 19179\n", "16 Boston Massachusetts 18431\n", "17 Los Angeles California 18297\n", "18 New York City New Jersey 18218\n", "19 Louisville Kentucky 18100\n", "20 New York City New Jersey 18801\n", "21 San Francisco California 18679\n", "22 Miami Florida 17023\n", "23 Los Angeles California 16896\n", "24 Philadelphia Pennsylvania 16557\n", "25 New York City New Jersey 16377\n", "26 Boston Massachusetts 16354\n", "27 Providence Rhode Island 16146\n", "28 New York City New Jersey 16093\n", "29 Boston Massachusetts 16036\n", "30 Los Angeles California 16036\n", ".. ... ... 
...\n", "104 Detroit Michigan 10900 \n", "105 Philadelphia Pennsylvania 10897 \n", "106 Philadelphia Pennsylvania 10882 \n", "107 New York City New Jersey 10855 \n", "108 New York City New York 10847 \n", "109 New York City New Jersey 10841 \n", "110 San Francisco California 10752 \n", "111 New York City New Jersey 10744 \n", "112 Los Angeles California 10667 \n", "113 New York City New Jersey 10590 \n", "114 New York City New Jersey 10582 \n", "115 Los Angeles California 10572 \n", "116 New York City New Jersey 10556 \n", "117 Miami Florida 10474 \n", "118 Los Angeles California 10398 \n", "119 Philadelphia Pennsylvania 10397 \n", "120 San Francisco California 10368 \n", "121 New York City New Jersey 10358 \n", "122 Boston Massachusetts 10351 \n", "123 New York City New York 10337 \n", "124 Philadelphia Pennsylvania 10256 \n", "125 Cleveland Ohio 10208 \n", "126 New York City New York 10188 \n", "127 New York City New York 10187 \n", "128 Los Angeles California 10178 \n", "129 New York City New Jersey 10144 \n", "130 Los Angeles California 10126 \n", "131 Philadelphia Pennsylvania 10107 \n", "132 Chicago Illinois 10094 \n", "133 Los Angeles California 10065 \n", "\n", "[133 rows x 3 columns]" ] }, "execution_count": 122, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#-- Get the most densely populated US cities from wikipedia (Thank you wikipedia library!) --\n", "html = wp.page(\"List_of_United_States_cities_by_population_density\").html().encode(\"UTF-8\")\n", "df = pd.read_html(html)[1]\n", "df=df.drop([0,1,4,5,6,8],axis=1)\n", "# column 2 of the scraped table holds the city/metro name and column 3 the state\n", "df=df.rename(columns={2:'city',3:'state',7:'density'})\n", "df=df.iloc[1:]\n", "#df['population']=[int(city.split('\\xa0')[-1].split('[')[0].replace(',','')) for city in df['population']]\n", "# strip footnote markers and thousands separators, keep the integer part of the density\n", "df['density']=[int(city.split('\\xa0')[-1].split('[')[0].replace(',','').split('.')[0]) for city in df['density']]\n", "df\n", " #--- ---- ----- ----- ---- ---- ----- ---- ---- ----- --- ---- ---- ----- ---- ---" ] },
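{ "cell_type": "markdown", "metadata": {}, "source": [ "A couple of short illustrative sketches (not executed here). First: the languages cell earlier strips all whitespace, which is why values such as 'DariPersian', 'Frenchlanguages' and 'KirundiandFrench' appear. A gentler cleanup can keep the first listed language readable; `first_language` below is a hypothetical helper name, not something used elsewhere in this notebook." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Sketch only: keep the first listed language without squeezing out its spaces.\n", "# `first_language` is a hypothetical helper, an alternative to the list\n", "# comprehension used in the languages cell above.\n", "import re\n", "\n", "def first_language(entry):\n", "    # drop numbers/percentages and parentheticals, then take the first comma- or semicolon-separated item\n", "    cleaned = re.sub(r'\\d+(?:\\.\\d+)?\\s*%?|\\(.*?\\)', '', entry)\n", "    return re.split(r'[,;]', cleaned)[0].strip()\n", "\n", "# Example: first_language('Dari Persian, Pashtu (both official), other Turkic') -> 'Dari Persian'" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Second: the per-day geocoded search a few cells above raises an IndexError whenever a day returns no statuses. The sketch below assumes the authenticated `api` object (with the JSON parser) created earlier and uses the same `api.search` keywords; `safe_daily_search` is a made-up name. It skips empty days instead of indexing into them." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Sketch only: a defensive variant of the per-day geocoded search used above.\n", "# Assumes `api` and `tweepy` from the earlier cells; `safe_daily_search` is hypothetical.\n", "from datetime import date, timedelta\n", "\n", "def safe_daily_search(search_term, geocode, days=7, per_day=100):\n", "    rows = []\n", "    for i in range(days):\n", "        day = str(date.today() - timedelta(i))\n", "        try:\n", "            result = api.search(search_term, geocode=geocode, until=day, count=per_day, result_type='recent')\n", "        except tweepy.TweepError as err:\n", "            print(day, err)  # e.g. rate limited or bad request; skip this day\n", "            continue\n", "        statuses = result.get('statuses', [])\n", "        if not statuses:  # avoids the IndexError seen above on days with no results\n", "            print(day, 'no tweets returned')\n", "            continue\n", "        rows.extend(statuses)\n", "    return rows\n", "\n", "# Example (not run here): safe_daily_search('cat', '34.0934,56.134,200mi', days=7)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.4" } }, "nbformat": 4, "nbformat_minor": 2 }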