######################################################################## # # Functions for downloading and re-sampling weather-data # for 5 cities in Denmark between 1980-2018. # # The raw data was obtained from: # # National Climatic Data Center (NCDC) in USA # https://www7.ncdc.noaa.gov/CDO/cdoselect.cmd # # Note that the NCDC's database functionality may change soon, and # that the CSV-file needed some manual editing before it could be read. # See the function _convert_raw_data() below for inspiration if you # want to convert a new data-file from NCDC's database. # # Implemented in Python 3.6 # # Usage: # 1) Set the desired storage directory in the data_dir variable. # 2) Call maybe_download_and_extract() to download the data-set # if it is not already located in the given data_dir. # 3) Either call load_original_data() or load_resampled_data() # to load the original or resampled data for use in your program. # # Format: # The raw data-file from NCDC is not included in the downloaded archive, # which instead contains a cleaned-up version of the raw data-file # referred to as the "original data". This data has not yet been resampled. # The original data-file is available as a pickled file for fast reloading # with Pandas, and as a CSV-file for broad compatibility. # ######################################################################## # # This file is part of the TensorFlow Tutorials available at: # # https://github.com/Hvass-Labs/TensorFlow-Tutorials # # Published under the MIT License. See the file LICENSE for details. # # Copyright 2018 by Magnus Erik Hvass Pedersen # ######################################################################## import pandas as pd import os import download ######################################################################## # Directory where you want to download and save the data-set. # Set this before you start calling any of the functions below. data_dir = "data/weather-denmark/" # Full path for the pickled data-file. (Original data). def path_original_data_pickle(): return os.path.join(data_dir, "weather-denmark.pkl") # Full path for the comma-separated text-file. (Original data). def path_original_data_csv(): return os.path.join(data_dir, "weather-denmark.csv") # Full path for the resampled data as a pickled file. def path_resampled_data_pickle(): return os.path.join(data_dir, "weather-denmark-resampled.pkl") # URL for the data-set on the internet. data_url = "https://github.com/Hvass-Labs/weather-denmark/raw/master/weather-denmark.tar.gz" # List of the cities in this data-set. These are cities in Denmark. cities = ['Aalborg', 'Aarhus', 'Esbjerg', 'Odense', 'Roskilde'] ######################################################################## # Private helper-functions. def _date_string(x): """Convert two integers to a string for the date and time.""" date = x[0] # Date. Example: 19801231 time = x[1] # Time. Example: 1230 return "{0}{1:04d}".format(date, time) def _usaf_to_city(usaf): """ The raw data-file uses USAF-codes to identify weather-stations. If you download another data-set from NCDC then you will have to change this function to use the USAF-codes in your new data-file. """ table = \ { 60300: 'Aalborg', 60700: 'Aarhus', 60800: 'Esbjerg', 61200: 'Odense', 61700: 'Roskilde' } return table[usaf] def _convert_raw_data(path): """ This converts a raw data-file obtained from the NCDC database. This function may be useful as an inspiration if you want to download another raw data-file from NCDC, but you will have to modify this function to match the data you have downloaded. Note that you may also have to manually edit the raw data-file, e.g. because the header is not in a proper comma-separated format. """ # The raw CSV-file uses various markers for "not-available" (NA). # (This is one of several oddities with NCDC's file-format.) na_values = ['999', '999.0', '999.9', '9999.9'] # Use Pandas to load the comma-separated file. # Note that you may have to manually edit the file's header # to get this to load correctly. df_raw = pd.read_csv(path, sep=',', header=1, index_col=False, na_values=na_values) # Create a new data-frame containing only the data # we are interested in. df = pd.DataFrame() # Get the city-name / weather-station name from the USAF code. df['City'] = df_raw['USAF '].apply(_usaf_to_city) # Convert the integer date-time to a proper date-time object. datestr = df_raw[['Date ', 'HrMn']].apply(_date_string, axis=1) df['DateTime'] = pd.to_datetime(datestr, format='%Y%m%d%H%M') # Get the data we are interested in. df['Temp'] = df_raw['Temp '] df['Pressure'] = df_raw['Slp '] df['WindSpeed'] = df_raw['Spd '] df['WindDir'] = df_raw['Dir'] # Set the city-name and date-time as the index. df.set_index(['City', 'DateTime'], inplace=True) # Save the new data-frame as a pickle for fast reloading. df.to_pickle(path_original_data_pickle()) # Save the new data-frame as a CSV-file for general readability. df.to_csv(path_original_data_csv()) return df def _resample(df): """ Resample the contents of a Pandas data-frame by first removing empty rows and columns, then up-sampling and interpolating the data for 1-minute intervals, and finally down-sampling to 60-minute intervals. """ # Remove all empty rows. df_res = df.dropna(how='all') # Upsample so the time-series has data for every minute. df_res = df_res.resample('1T') # Fill in missing values. df_res = df_res.interpolate(method='time') # Downsample so the time-series has data for every hour. df_res = df_res.resample('60T') # Finalize the resampling. (Is this really necessary?) df_res = df_res.interpolate() # Remove all empty rows. df_res = df_res.dropna(how='all') return df_res ######################################################################## # Public functions that you may call to download the data-set from # the internet and load the data into memory. def maybe_download_and_extract(): """ Download and extract the weather-data if the data-files don't already exist in the data_dir. """ download.maybe_download_and_extract(url=data_url, download_dir=data_dir) def load_original_data(): """ Load and return the original data that has not been resampled. Note that this is not the raw data obtained from NCDC. It is a cleaned-up version of that data, as written by the function _convert_raw_data() above. """ return pd.read_pickle(path_original_data_pickle()) def load_resampled_data(): """ Load and return the resampled weather-data. This has data-points at regular 60-minute intervals where missing data has been linearly interpolated. This uses a cache-file for saving and quickly reloading the data, so the original data is only resampled once. """ # Path for the cache-file with the resampled data. path = path_resampled_data_pickle() # If the cache-file exists ... if os.path.exists(path): # Reload the cache-file. df = pd.read_pickle(path) else: # Otherwise resample the original data and save it in a cache-file. # Load the original data. df_org = load_original_data() # Split the original data into separate data-frames for each city. df_cities = [df_org.xs(city) for city in cities] # Resample the data for each city. df_resampled = [_resample(df_city) for df_city in df_cities] # Join the resampled data into a single data-frame. df = pd.concat(df_resampled, keys=cities, axis=1, join='inner') # Save the resampled data in a cache-file for quick reloading. df.to_pickle(path) return df ########################################################################