Blog Post 1 - Interactive Data Graphics Using NOAA Climate Data

In this post, we are going to create several interesting, interactive data graphics using the NOAA climate data.

1. Create a Database

First of all, we are going to create a database with three tables: ‘temperatures’, ‘stations’, and ‘countries’. We will keep these as three separate tables in our database.

# Import required packages
import sqlite3
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

# Create a database to store the data
conn = sqlite3.connect("temps.db")

There are few cleanning steps that we’ll make before incorporating the data into our database. Thus, we create a function call ‘prepare_df’ to complete this cleanning step.

def prepare_df(df):
    """
    This function takes the temperature dataframe,
    and it will return another cleaned dataframe to 
    help us incorporate it into our database later.
    """
    df = df.set_index(keys=["ID", "Year"])
    df = df.stack()
    df = df.reset_index()
    df = df.rename(columns = {"level_2"  : "Month" , 0 : "Temp"})
    df["Month"] = df["Month"].str[5:].astype(int)
    # Convert the temperature to degree centigrade
    df["Temp"]  = df["Temp"] / 100
    return(df)

# Add the temps data into the database
df_iter = pd.read_csv("temps.csv", chunksize = 100000)
for df in df_iter:
    df = prepare_df(df)
    df.to_sql("temperatures", conn, if_exists = "append", index = False)

# Add the stations table into the database
stations = pd.read_csv("station-metadata.csv")
stations.to_sql("stations", conn, if_exists = "replace", index = False)

# Add the country table into the database
countries = pd.read_csv("countries.csv")
# Rename some columns to avoid spaces
countries = countries.rename(columns = {"FIPS 10-4":"FIPS_10-4",
                                        "ISO 3166":"ISO_3166"})
countries.to_sql("countries", conn, if_exists = "replace", index = False)

Then, we check if we’ve successfully add the above data into our database by using cursor.

cursor = conn.cursor()
cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
print(cursor.fetchall())

[('temperatures',), ('stations',), ('countries',)]

cursor.execute("SELECT sql FROM sqlite_master WHERE type='table';")
for result in cursor.fetchall():
    print(result[0])

CREATE TABLE "temperatures" (
"ID" TEXT,
  "Year" INTEGER,
  "Month" INTEGER,
  "Temp" REAL
)
CREATE TABLE "stations" (
"ID" TEXT,
  "LATITUDE" REAL,
  "LONGITUDE" REAL,
  "STNELEV" REAL,
  "NAME" TEXT
)
CREATE TABLE "countries" (
"FIPS_10-4" TEXT,
  "ISO_3166" TEXT,
  "Name" TEXT
)

# close the database
conn.close()

2. Write a Query Function

In this part, we are going to write a function call query_climate_database to get a dataframe with specified country, year, and month.

def query_climate_database(country, year_begin, year_end, month):
    '''
    This function return a dataframe of temperature 
    readings for the specified country, in the specified
    date range, in the specified month of the year.
    ---------------------------------------
    Parameters:
        "country": a string giving the name of a 
        country for which data should be returned.
        
        "year_begin" and "year_end": two integers giving 
        the earliest and latest years for which should be returned.
        
        "month": an integer giving the month of
        the year for which should be returned.  
    '''
    # Open the Database
    conn = sqlite3.connect("temps.db")
    # SQL Command
    cmd = \
    f"""
    SELECT S.NAME, S.LATITUDE, S.LONGITUDE,
    C.Name AS Country, T.Year, T.Month, T.Temp
    FROM temperatures T
    LEFT JOIN stations S ON T.ID = S.ID
    LEFT JOIN countries C ON SUBSTRING(T.ID, 1, 2)=C.[FIPS_10-4]
    WHERE T.Year >= {year_begin} AND T.Year <= {year_end} 
    AND T.Month = {month} AND C.Name = '{country}'
    """
    # Output the required dataframe
    df = pd.read_sql_query(cmd, conn)
    # Close the database
    conn.close()
    return(df)

# Testing our function by using the follwing code:
query_climate_database(country = "India", 
                       year_begin = 1980, 
                       year_end = 2020,
                       month = 1)

	NAME	LATITUDE	LONGITUDE	Country	Year	Month	Temp
0	PBO_ANANTAPUR	14.583	77.633	India	1980	1	23.48
1	PBO_ANANTAPUR	14.583	77.633	India	1981	1	24.57
2	PBO_ANANTAPUR	14.583	77.633	India	1982	1	24.19
3	PBO_ANANTAPUR	14.583	77.633	India	1983	1	23.51
4	PBO_ANANTAPUR	14.583	77.633	India	1984	1	24.81
...	...	...	...	...	...	...	...
3147	DARJEELING	27.050	88.270	India	1983	1	5.10
3148	DARJEELING	27.050	88.270	India	1986	1	6.90
3149	DARJEELING	27.050	88.270	India	1994	1	8.10
3150	DARJEELING	27.050	88.270	India	1995	1	5.60
3151	DARJEELING	27.050	88.270	India	1997	1	5.70

3152 rows × 7 columns

3. Write a Geographic Scatter Function for Yearly Temperature Increases

# Import required packages
from plotly import express as px
from sklearn.linear_model import LinearRegression
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
import calendar

def coef(data_group):
    """
    This function compute a simple estimate of
    the year-over-year average change in temperature
    in each month at each station.
    ---------------------------------
    Parameter:
        'data_group': dataframe
    """
    # Indexing X and Y for Fitting the Linear Regression
    x = data_group[["Year"]]
    y = data_group["Temp"]
    # Fit the LR model
    LR = LinearRegression()
    LR.fit(x, y)
    # Return the coefficients
    return LR.coef_[0]

def temperature_coefficient_plot(country, year_begin,
                                 year_end, month, min_obs, **kwargs):
    """
    This function accept five explicit arguments,
    and an undetermined number of keyword arguments.
    It output an interactive geographic scatterplot.
    --------------------------------------------
    Parameters:
        - "country": a string giving the name of a 
        country for which data should be returned.
        
        - "year_begin" and "year_end": two integers giving 
        the earliest and latest years for which should be returned.
        
        - "month": an integer giving the month of
        the year for which should be returned. 
        
        - "min_obs":the minimum required number of
        years of data for any given station.
        
        - "**kwargs": additional keyword arguments
        passed to px.scatter_mapbox()
    """
    # Querying the database according to the input arguments
    df = query_climate_database(country, year_begin, year_end, month)
    
    # Select the only stations that is more than min_obs data
    df = df[df.groupby(["NAME"])["Temp"].transform(len) >= min_obs]
    
    # Compute estimate of year-over-year average change in temperature
    # in each month at each station
    coefs = df.groupby(["NAME"]).apply(coef).reset_index()
    
    # Merge the coefficient into df
    df = df.merge(coefs, on = "NAME")
    
    # Drop the duplicates value
    df = df.drop_duplicates(subset = "NAME")
    
    # Round the temperatures into 4 decimal places
    df[0] = df[0].round(4)
    
    # Rename the columns to appropriate labels
    df = df.rename(columns={0:"Estimated Yearly Increase (°C)"})
    
    # Convert month from number to alphabet
    month_name = calendar.month_name[month]
    
    # Title of the plot
    title = 'Estimates of yearly increase in temperature'\
    f' in {month_name} <br>for stations in {country},'\
    f' years {year_begin} - {year_end}'
    
    # Draw the plot
    fig = px.scatter_mapbox(df,
                            title = title,
                            lat = "LATITUDE",
                            lon = "LONGITUDE",
                            color = "Estimated Yearly Increase (°C)",
                            hover_name = "NAME",
                            color_continuous_midpoint=0,
                            **kwargs
                           )
    return fig

# choose a colormap
color_map = px.colors.diverging.RdGy_r

# Testing the plot
fig = temperature_coefficient_plot("India", 1980, 2020, 1, 
                                   min_obs = 10,
                                   zoom = 2,
                                   width = 900,
                                   height = 500,
                                   mapbox_style="carto-positron",
                                   color_continuous_scale=color_map)
fig.show()

# Write the interactive plot into HTML
import plotly.io as pio
pio.write_html(fig, file = "figure1.html", auto_open = True)

4. Create Two More Interesting Figures

Visualization 1. The Distribution of Temperature in India for January and June, year 1980 - 2020

def temperature_distribution_plot(country, year_begin, year_end,
                                  month1, month2, **kwargs):
    """
    This function plot the temperature distribution over years
    for the specified country, in the specified date range,
    in two specified month.  
    ----------------------------------------
    Parameters:
        - "country": a string giving the name of a 
        country for which data should be returned.
        
        - "year_begin" and "year_end": two integers giving 
        the earliest and latest years for which should be returned.
        
        - "month1" and "month2": Integers giving the months of
        the year for which should be returned. 
    ------------------------------------------
    Return:
        This function return a histogram showing the 
        temperature distribution over years
        for the specified country, in the specified date range,
        in two specified month.
    """
    # query the climate database for the first month input
    df_1 = query_climate_database(country=country, year_begin=year_begin,
                                  year_end=year_end, month=month1)
    
    # query the climate database for the first month input
    df_2 = query_climate_database(country=country, year_begin=year_begin,
                                  year_end=year_end, month=month2)
    
    # Concat the dataframe from 2 different month
    df = pd.concat([df_1, df_2])
    
    # Convert the month number to letters
    month_name_1 = calendar.month_name[month1]
    month_name_2 = calendar.month_name[month2]
    
    # Replace the Month value with alphabet
    df["Month"].replace({month1: month_name_1, month2: month_name_2},
                        inplace=True)

    # Set title
    title = f"Distribution of Temperature in {month_name_1}"\
    f" and {month_name_2} <br>in India"
    
    # Draw the Histogram
    fig = px.histogram(df,
                       title = title,
                       x = "Temp",
                       color = "Month",
                       opacity = 0.6, 
                       barmode='stack',
                       nbins = 35,
                       height= 500,
                       width = 650,
                       **kwargs
                  )
    # Rename x and y labels
    fig.update_layout(xaxis_title_text='Temperature',
                      yaxis_title_text='Count')
    return fig

# Draw our histogram
fig2 = temperature_distribution_plot(country = "India",
                                     year_begin = 1980,
                                     year_end = 2020,
                                     month1 = 1,
                                     month2 = 6)
fig2.show()

pio.write_html(fig2, file = "figure2.html", auto_open = True)

By observing the above histogram, we can see that the temperature in India in January is mainly concentrated between 10-30 degrees Celsius. June is a summer holiday and the temperature is much higher compared to January. As we can see from the graph, most of the temperatures in June are above 25 degrees Celsius, with the highest temperatures reaching over 35 degrees.

Visualization 2. Temperature variation over time in June and December in India - KAKINADA Station

def query_climate_database2(country, station_name,
                            year_begin, year_end, month):
    """
    This function return a dataframe of temperature 
    readings for the specified country and station, in the specified
    date range, in the specified month of the year.
    ---------------------------------------
    Parameters:
        - "country": a string giving the name of a 
        country for which data should be returned.
        
        - "station_name": string giving the name
        of the temperature station.
        
        - "year_begin" and "year_end": two integers giving 
        the earliest and latest years for which should be returned.
        
        - "month": an integer giving the month of
        the year for which should be returned.
    -----------------------------------------
    Return:
        This function returns a dataframe.
    """
    # Open the Database
    conn = sqlite3.connect("temps.db")
    # SQL Command
    cmd = \
    f"""
    SELECT S.NAME, S.LATITUDE, S.LONGITUDE,
    C.Name AS Country, T.Year, T.Month, T.Temp
    FROM temperatures T
    LEFT JOIN stations S ON T.ID = S.ID
    LEFT JOIN countries C ON SUBSTRING(T.ID, 1, 2) = C.[FIPS_10-4]
    WHERE S.NAME = '{station_name}' AND T.Year >= {year_begin} 
    AND T.Year <= {year_end} AND C.Name = '{country}' 
    AND T.Month = {month}
    """
    df = pd.read_sql_query(cmd, conn)
    conn.close()
    return(df)

# Import the calendar package for later
import calendar
import seaborn as sns

def temperature_line_plot(country, station_name, year_begin,
                          year_end, month1, month2, **kwargs):
    """
    This function draw a line plot to show 
    temperature variation over years
    for the specified country, in the specified date range,
    in two specified month, in the specified station.
    ----------------------------------
    Parameters:
        - "country": a string giving the name of a 
        country for which data should be returned.
        
        - "station_name": string giving the name
        of the temperature station.
        
        - "year_begin" and "year_end": two integers giving 
        the earliest and latest years for which should be returned.
        
        - "month1" and "month2": Integers giving the months of
        the year for which should be returned.
    -------------------------------------
    Return:
        This function returns a line plot.
    """
    # Get the station data from our first selected month
    df1 = query_climate_database2(country = country,
                                  station_name = station_name,
                                  year_begin = year_begin,
                                  year_end = year_end,
                                  month = month1)
    # Get the station data from our second selected month
    df2 = query_climate_database2(country = country,
                                  station_name = station_name,
                                  year_begin = year_begin,
                                  year_end = year_end,
                                  month = month2)
    # Combine dataframes from two different months
    df = pd.concat([df1, df2])
    
    # Convert the month number to letters
    month_name_1 = calendar.month_name[month1]
    month_name_2 = calendar.month_name[month2]
    
    # Replace the Month value with alphabet
    df["Month"].replace({month1: month_name_1, month2: month_name_2},
                        inplace=True)
    
    # Set Title
    title = "Temperature Over Time In " + month_name_1+ \
    " and " + month_name_2 +" "+ " <br>- " + country +  \
    " " + station_name + " Station"
    
    # Draw the line plot
    g = px.line(df, x="Year", y="Temp", color = "Month",
                title = title, width = 650
               )
    # Update the xlabel and ylabel
    g.update_layout(xaxis_title_text='Year',
                    yaxis_title_text='Temperature(°C)')
    return(g)

# Testing the function
fig3 = temperature_line_plot(country = "India",
                             station_name = "KAKINADA",
                             year_begin = 1980,
                             year_end = 2020,
                             month1 = 5,
                             month2 = 11)
fig3.show()

pio.write_html(fig3, file = "figure3.html", auto_open = True)

By observing the above line plot, the variation of temperature over time in May and November at Kakinada Station, India. The trend of line shows that the temperature in May is mainly concentrated in 30-33 degrees C. The temperature in November is mainly concentrated in 25-27.5 degrees C. From 1980 to 2020, the temperature at this station is relatively stable in May and November without major fluctuations.

Written on January 21, 2022