Appointment Duration¶

Aim: approximate appointment duration by ICB and staff type, for use in discrete event simulation.¶

  • Find all binned appointments by area
  • Fit probability distributions to the binned data
  • Export to a YAML file

Library imports¶

In [1]:
import os

# print the current working directory
print(os.getcwd())
/workspaces/PrimaryCareDemandAndCapacity/notebooks
In [2]:
# this is a bit of a hack to get relative imports
# to work as if these notebooks were in a package:
# change cwd to the project root if running from the notebooks directory
from os import chdir
from pathlib import Path
if 'notebooks' in str(Path.cwd()):
    chdir('..')

# std library
import yaml

# stats / arrays
import numpy as np
import pandas as pd
import scipy.stats as stats

# project imports from src
from src.schemas import DataCatalog
import src.constants as const
from src.various_methods import PlotCounter

Notebook constants¶

In [3]:
GP_APPOINTMENTS_CATALOG_NAME: str = 'Appointments in General Practice, September 2023'
OUTPUT_DIRECTORIES = ['outputs', 'assumptions']
OUTPUT_YAML_FILE = "outputs/assumptions/appointment_duration.yaml"
NOTEBOOK_ALIAS = 'appointment_duration'
DURATION_BIN_EDGES = [1, 6, 11, 16, 21, 31, 60]  # duration bin edges in minutes: each bin's left edge plus the final right edge


Load from data catalog¶

In [4]:
# load data catalog
data_catalog = DataCatalog.load_from_yaml("data_catalog.yaml")
# keeps track of plot numbers
plot_counter = PlotCounter(name=NOTEBOOK_ALIAS) 

Set up output directories¶

In [5]:
# set up output directories
for directory in ['outputs/assumptions', 'outputs/plots', 'outputs/tables']:
    os.makedirs(directory, exist_ok=True)

Wrangling¶

Part 1 - Cleaning¶

  1. Remove unknown appointments
  2. Convert date columns
  3. Remove unneeded columns
In [6]:
# Load the appointments data
appointments_catalog_entry = data_catalog.get_catalog_entry_by_name(GP_APPOINTMENTS_CATALOG_NAME)
appointments_df = appointments_catalog_entry.load()
# render a preview of the raw data as HTML
html_appts = appointments_df.head(10).to_html()
In [7]:
appointments_df = (
    appointments_df
    # keep only in-scope sub-ICB locations and drop rows with unknown durations
    .loc[appointments_df['SUB_ICB_LOCATION_CODE'].isin(const.SUB_ICB_CODES.keys())
         & (appointments_df['ACTUAL_DURATION'] != "Unknown / Data Quality")]
    .assign(
        # add the alliance column
        Alliance=appointments_df['SUB_ICB_LOCATION_CODE'].map(const.SUB_ICB_CODES),
        # convert the date column to datetime
        Date=pd.to_datetime(appointments_df['Appointment_Date'], format='%d%b%Y'))
    # drop the columns we no longer need
    .drop(columns=['SUB_ICB_LOCATION_CODE', 'SUB_ICB_LOCATION_ONS_CODE',
                   'SUB_ICB_LOCATION_NAME', 'ICB_ONS_CODE', 'REGION_ONS_CODE',
                   'Appointment_Date']))
appointments_df.head()
Out[7]:
        ACTUAL_DURATION  COUNT_OF_APPOINTMENTS                Alliance       Date
195134     6-10 Minutes                   1825  Ipswich & East Suffolk 2021-12-01
195135    21-30 Minutes                    752  Ipswich & East Suffolk 2021-12-01
195136    16-20 Minutes                    786  Ipswich & East Suffolk 2021-12-01
195137    31-60 Minutes                    456  Ipswich & East Suffolk 2021-12-01
195138    11-15 Minutes                   1253  Ipswich & East Suffolk 2021-12-01

Part 2 - Summary statistics¶

  1. Group by Alliance and financial year
  2. Derive the left bin edge for each duration band
In [8]:
appointments_binned_df = (
    appointments_df
    # group into financial years ending on the last business day of March
    .groupby(['Alliance',
              pd.Grouper(key='Date', freq='BA-MAR', label='right'),
              'ACTUAL_DURATION'])
    .sum()
    .reset_index()
    .rename(columns={'Date': 'financial_year'})
    # label each financial year by its starting calendar year, and parse the
    # left edge of each duration bin from strings like "6-10 Minutes";
    # this integer column, not the string labels, drives downstream ordering
    .assign(financial_year=lambda df: df['financial_year'].dt.year - 1,
            left_bin=lambda df: df.ACTUAL_DURATION.map(lambda s: int(s.split("-")[0]))))

appointments_binned_df.head()
Out[8]:
                 Alliance  financial_year ACTUAL_DURATION  COUNT_OF_APPOINTMENTS  left_bin
0  Ipswich & East Suffolk            2021     1-5 Minutes                 140934         1
1  Ipswich & East Suffolk            2021   11-15 Minutes                 106449        11
2  Ipswich & East Suffolk            2021   16-20 Minutes                  67949        16
3  Ipswich & East Suffolk            2021   21-30 Minutes                  65468        21
4  Ipswich & East Suffolk            2021   31-60 Minutes                  40643        31
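
As a sanity check on the financial-year labelling, here is a toy example with hypothetical dates (assuming a pandas version that still accepts the 'BA-MAR' alias; newer releases spell it 'BYE-MAR'). Grouping with label='right' stamps each group with the year-end date, so subtracting one from its year recovers the financial year's starting calendar year:

import pandas as pd

# hypothetical dates straddling a financial year boundary
toy = pd.DataFrame({
    'Date': pd.to_datetime(['2021-12-01', '2022-03-31', '2022-04-01']),
    'n': [1, 1, 1],
})
grouped = toy.groupby(pd.Grouper(key='Date', freq='BA-MAR', label='right')).sum()
print(grouped.index.year - 1)  # -> [2021, 2022], i.e. FY2021/22 and FY2022/23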

Part 3: Convert to dictionaries & create continuous data¶

  1. Create empty dictionaries to be populated
  2. Pivot the data for easier conversion
  3. Populate the dictionaries with
    • Binned data
    • Continuous data (evenly spaced points within each bin)
In [9]:
# create nested dicts to store the binned and continuous data per alliance and year
binned_dict = {k: {k2: None for k2 in range(2021, 2024)} for k in appointments_binned_df.Alliance.unique()}
cont_dict = {k: {k2: None for k2 in range(2021, 2024)} for k in appointments_binned_df.Alliance.unique()}

# pivot the binned data: one row per (Alliance, financial_year), one column per left bin edge
pivot_bins_df = (appointments_binned_df
                 .groupby(['Alliance', 'financial_year', 'left_bin']).sum()  # sum the counts
                 .reset_index()
                 .drop(columns=['ACTUAL_DURATION'])  # drop the duration label column
                 .sort_values(by=['Alliance', 'financial_year', 'left_bin'])
                 .pivot(index=['Alliance', 'financial_year'], columns='left_bin', values='COUNT_OF_APPOINTMENTS')
                 .fillna(0)  # fill any NaNs with 0 (there are none in this data)
                 )

# iterate through the pivot table and build the binned and continuous data
for (alliance, year), row in pivot_bins_df.iterrows():
    # get the bin counts as a numpy array
    bin_counts = row.to_numpy()
    binned_dict[alliance][year] = bin_counts
    # approximate a continuous sample by spreading each bin's count
    # evenly across that bin's duration range
    cont_parts = [
        np.linspace(start=DURATION_BIN_EDGES[i], stop=DURATION_BIN_EDGES[i + 1], num=int(count))
        for i, count in enumerate(bin_counts)
    ]
    cont_dict[alliance][year] = np.concatenate(cont_parts)
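
To make the expansion concrete, a minimal sketch with a hypothetical count: three appointments recorded in the 1-5 minute bin become three evenly spaced durations across that bin. Note that np.linspace includes both endpoints by default, so adjacent bins share an edge value.

import numpy as np

# 3 appointments in the bin spanning 1 to 6 minutes, spread evenly across it
print(np.linspace(start=1, stop=6, num=3))  # -> [1.  3.5 6. ]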

Plotting¶

Histograms with provided bins¶

In [10]:
import seaborn as sns
import matplotlib.pyplot as plt

# one figure per alliance, one panel per financial year
for alliance, dataset in cont_dict.items():
    fig, axes = plt.subplots(1, 3, figsize=(10, 7.5))
    for ax_index, (year, array) in enumerate(dataset.items()):
        ax = sns.histplot(array, kde=False, bins=DURATION_BIN_EDGES, stat='probability', ax=axes[ax_index])
        ax.yaxis.grid(True)
        if ax_index == 0:
            ax.set_title(f"{alliance} {year}")
            ax.set_ylabel("Probability")
        else:
            ax.set_title(f"{year}")
            ax.set_ylabel("")
    plt.savefig(f"outputs/plots/{plot_counter.plot_name}.png", dpi=300)
[Three figures, one per alliance: histograms of appointment duration (probability per bin) for each of the three financial years]
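
As an aside, the continuous expansion is not strictly required for these histograms. Here is a sketch of an equivalent plot drawn straight from the binned counts via histplot's weights argument, shown for one alliance and year from the tables above (assuming that entry of binned_dict is populated):

# place one point at each left bin edge and weight it by that bin's count
counts = binned_dict['Ipswich & East Suffolk'][2021]
sns.histplot(x=DURATION_BIN_EDGES[:-1], weights=counts,
             bins=DURATION_BIN_EDGES, stat='probability')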

Distribution examples¶

In [11]:
# draw example samples from each candidate distribution
lognorm_example = stats.lognorm.rvs(0.5, size=1000, scale=1)
exponential_example = stats.expon.rvs(size=1000, scale=1)

# plot the two example distributions side by side
fig, axes = plt.subplots(1, 2, figsize=(10, 5))
# Plot the lognormal distribution
sns.histplot(lognorm_example, kde=True, bins='auto', color='blue', ax=axes[0])
axes[0].set_title('Lognormal Distribution')
axes[0].set_xlabel('Value')
axes[0].set_ylabel('Frequency')

# Plot the exponential distribution
sns.histplot(exponential_example, kde=True, bins='auto', color='green', ax=axes[1])
axes[1].set_title('Exponential Distribution')
axes[1].set_xlabel('Value')
axes[1].set_ylabel('Frequency')
plt.savefig(f"outputs/plots/{plot_counter.plot_name}.png", dpi=300)
[Figure: histograms of the lognormal and exponential example samples with KDE overlays]

Fit distributions¶

  • Exponential
  • Lognormal
In [12]:
# loop over the continuous data and fit both candidate distributions by MLE
fitted_distributions = {}
for alliance, years in cont_dict.items():
    fitted_distributions.setdefault(alliance, {})
    for year, array in years.items():
        if array is not None:
            # scipy returns (loc, scale) for expon and (shape, loc, scale) for
            # lognorm; cast to plain floats so the values serialise cleanly to YAML
            expon_fit_params = [float(i) for i in stats.expon.fit(array)]
            lognorm_fit_params = [float(i) for i in stats.lognorm.fit(array)]
            fitted_distributions[alliance][year] = {"lognorm": lognorm_fit_params,
                                                    "expon": expon_fit_params}
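
As a quick illustration (the alliance and year are taken from the tables above; the parameter order is scipy's (shape, loc, scale) for lognorm), the stored parameters can be fed back into a frozen distribution to inspect the implied mean duration:

# rebuild a frozen lognormal from the stored parameters and query its mean
example_params = fitted_distributions['Ipswich & East Suffolk'][2023]['lognorm']
frozen = stats.lognorm(*example_params)
print(f"implied mean appointment duration: {frozen.mean():.1f} minutes")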

Plot 'Theoretical' distributions¶

In [13]:
for alliance, years in fitted_distributions.items():
    for year, params in years.items():
        if year == 2023:
            # sample from each fitted distribution and bin as before
            lognorm_samples = stats.lognorm.rvs(*params['lognorm'], size=1000)
            expon_samples = stats.expon.rvs(*params['expon'], size=1000)
            fig, axes = plt.subplots(1, 2, figsize=(10, 7.5))
            ax = sns.histplot(lognorm_samples, kde=True, bins=DURATION_BIN_EDGES, stat='probability', ax=axes[0])
            ax.yaxis.grid(True)
            ax.set_title(f"{alliance} {year} lognormal")
            ax.set_ylabel("Probability")
            ax.set_xlabel("Appointment Time (Minutes)")
            ax = sns.histplot(expon_samples, kde=True, bins=DURATION_BIN_EDGES, stat='probability', ax=axes[1])
            ax.yaxis.grid(True)
            ax.set_xlabel("Appointment Time (Minutes)")
            ax.set_title(f"{alliance} {year} exponential")
            ax.set_ylabel("Probability")
    plt.savefig(f"outputs/plots/{plot_counter.plot_name}.png", dpi=300)
[Three figures, one per alliance: 2023 lognormal and exponential samples binned as probabilities]

Export to YAML¶

Using 2023 data¶

In [14]:
# select only the 2023 fits
distributions_2023 = {alliance: years[2023] for alliance, years in fitted_distributions.items()}
# output to yaml
with open(OUTPUT_YAML_FILE, 'w') as yaml_file:
    yaml.dump(distributions_2023, yaml_file)
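
A minimal sketch of how a downstream discrete event simulation might consume this file (illustrative only, not part of the project code): load the parameters with yaml.safe_load and draw appointment durations from the chosen distribution.

import yaml
import scipy.stats as stats

# load the exported assumptions (the path matches OUTPUT_YAML_FILE above)
with open("outputs/assumptions/appointment_duration.yaml") as f:
    assumptions = yaml.safe_load(f)

# sample five appointment durations (minutes) for one alliance
params = assumptions['Ipswich & East Suffolk']['lognorm']
durations = stats.lognorm.rvs(*params, size=5)
print(durations)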