Preliminary analysis of growth data for efflux experiment

(c) 2024 Justin Bois. This work is licensed under a Creative Commons Attribution License CC-BY 4.0. All code contained herein is licensed under an MIT license.

This document was prepared at Caltech with financial support from the Donna and Benjamin M. Rosen Bioengineering Center.


This tutorial was generated from a Jupyter notebook. You can download the notebook here.


In [1]:
import os
import io

import numpy as np
import polars as pl
import scipy.optimize

import iqplot

import bokeh.io
import bokeh.plotting
bokeh.io.output_notebook()

Here, I begin the analysis of the data for the 2024 efflux pump experiment. You need to adjust the code cell below to give the path to your data sets.

In [2]:
# This is where the data are on my machine. If you are in 
# the same directory as the data sets, then you can use
# data_path = './'
data_path = '/Users/bois/Dropbox/bi1x_2024_fall_data/efflux/'

Data wrangling

We will read in the data. First, we define the functions given in the tutorial for parsing the plate reader output.

In [3]:
def timestamp_to_seconds(t_stamp):
    """Convert H:M:S string to seconds. This is necessary
    to hand-build because the datetime utilities do not handle
    times over 24 hours."""
    h, m, s = t_stamp.split(':')
    return 3600 * int(h) + 60 * int(m) + float(s)


def read_plate_reader(fname, header_row, last_row):
    """Read in time series data from Bi 1x plate reader.

    Parameters
    ----------
    fname : str
        Path to file outputted from plate reader.
    header_row : int
        Row number of the line containing the column headers (zero-indexed)
    last_row : int
        Last row of the file containing the time series data (zero-indexed)

    Returns
    -------
    output : DataFrame
        Polars DataFrame containing time series data.
    """
    # Get relevant contents of file as string
    with open(fname, "r", encoding="ISO-8859-1") as f:
        lines = f.readlines()
    file_str = "".join(lines[header_row : last_row + 1])

    # Read in data frame
    df = pl.read_csv(
        io.StringIO(file_str),
        separator="\t",
        encoding="ISO-8859-1",
    )

    # Rename temperature column
    T_col = [col for col in df.columns if "T°" in col][0]
    df = df.rename({T_col: "temperature (deg C)"})

    # Parse the time column
    df = df.with_columns(
        pl.col("Time")
        .map_elements(timestamp_to_seconds, return_dtype=float)
        .alias("time (s)")
    ).with_columns(
        pl.col('time (s)') - pl.col('time (s)').min()
    )

    # Time in units of hours
    df = df.with_columns((pl.col('time (s)') / 3600).alias('time (hr)'))

    return df
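
As a quick check of the time parser (a small illustrative example, not part of the original notebook), a timestamp past 24 hours converts as expected:

# Hypothetical sanity check: 25 hours, 30 minutes, 15 seconds
timestamp_to_seconds('25:30:15')    # 25*3600 + 30*60 + 15 = 91815.0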

Next, we load in the data.

In [4]:
fname = os.path.join(data_path, '20241122_efflux.txt')
header_row = 53
last_row = 294

# Load in data
df = read_plate_reader(fname, header_row, last_row)

Now we'll add the metadata, first defining a dictionary that maps each well to its strain and antibiotic concentration.

In [5]:
well_metadata = dict(
    A1="MG-0.0",
    A2="WT-0.21",
    A3="MG-0.0",
    A4="WT-0.90",
    A5="MG-0.0",
    A6="WT-3.90",
    A7="MG-0.0",
    A8="WT-16.90",
    A9="MG-0.0",
    A10="WT-73.15",
    A11="MG-0.0",
    A12="WT-208.33",
    B1="3.19-0.0",
    B2="UV5-0.21",
    B3="3.19-0.0",
    B4="UV5-0.90",
    B5="3.19-0.0",
    B6="UV5-3.90",
    B7="3.19-0.0",
    B8="UV5-16.90",
    B9="3.19-0.0",
    B10="UV5-73.15",
    B11="3.19-0.0",
    B12="UV5-208.33",
    C1="IW-0.0",
    C2="MG-0.44",
    C3="IW-0.0",
    C4="MG-1.88",
    C5="IW-0.0",
    C6="MG-8.13",
    C7="IW-0.0",
    C8="MG-35.17",
    C9="IW-0.0",
    C10="MG-152.19",
    C11="IW-0.0",
    C12="MG-312.50",
    D1="WT-0.0",
    D2="3.19-0.44",
    D3="WT-0.0",
    D4="3.19-1.88",
    D5="WT-0.0",
    D6="3.19-8.13",
    D7="WT-0.0",
    D8="3.19-35.17",
    D9="WT-0.0",
    D10="3.19-152.19",
    D11="WT-0.0",
    D12="3.19-312.50",
    E1="UV5-0.0",
    E2="IW-0.44",
    E3="UV5-0.0",
    E4="IW-1.88",
    E5="UV5-0.0",
    E6="IW-8.13",
    E7="UV5-0.0",
    E8="IW-35.17",
    E9="UV5-0.0",
    E10="IW-152.19",
    E11="UV5-0.0",
    E12="IW-312.50",
    F1="MG-0.21",
    F2="WT-0.44",
    F3="MG-0.90",
    F4="WT-1.88",
    F5="MG-3.90",
    F6="WT-8.13",
    F7="MG-16.90",
    F8="WT-35.17",
    F9="MG-73.15",
    F10="WT-152.19",
    F11="MG-208.33",
    F12="WT-312.50",
    G1="3.19-0.21",
    G2="UV5-0.44",
    G3="3.19-0.90",
    G4="UV5-1.88",
    G5="3.19-3.90",
    G6="UV5-8.13",
    G7="3.19-16.90",
    G8="UV5-35.17",
    G9="3.19-73.15",
    G10="UV5-152.19",
    G11="3.19-208.33",
    G12="UV5-312.50",
    H1="IW-0.21",
    H2="blank",
    H3="IW-0.90",
    H4="blank",
    H5="IW-3.90",
    H6="blank",
    H7="IW-16.90",
    H8="blank",
    H9="IW-73.15",
    H10="blank",
    H11="IW-208.33",
    H12="blank",
)
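
A quick sanity check (an optional addition, not part of the original notebook) confirms that the dictionary covers the whole plate:

# Every well of the 96-well plate should have a metadata entry
assert len(well_metadata) == 96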

We should also store the promoter binding energy for each strain.

In [6]:
promoter_binding_energy = {
    'blank': None,
    'MG': np.inf,
    '3.19': -3.19,
    'IW': -4.28,
    'WT': -5.19,
    'UV5': -7.93
}

Now some functions to help us tidy the data set.

In [7]:
def conc(well, well_metadata):
    """Return abx concentration for a given well"""
    strain_conc = well_metadata[well]
    if strain_conc == "blank":
        return 0.0
    else:
        ind = strain_conc.find("-")
        return float(strain_conc[ind + 1 :])


def strain(well, well_metadata):
    """Return strain for a given well"""
    strain_conc = well_metadata[well]
    if strain_conc == "blank":
        return "blank"
    else:
        ind = strain_conc.find("-")
        return strain_conc[:ind]


def tidy(df, well_metadata, date):
    """Convert data frame to tidy format."""
    # Convert to tall format
    df_tidy = df.unpivot(
        index=["Time", "temperature (deg C)", "time (s)", "time (hr)"],
        variable_name="well",
        value_name="absorbance",
    )

    # Add metadata columns
    df_tidy = df_tidy.with_columns(
        pl.col('well').map_elements(lambda x: conc(x, well_metadata), return_dtype=float).alias('conc'),
        pl.col('well').map_elements(lambda x: strain(x, well_metadata), return_dtype=str).alias('strain')
    ).with_columns(
        pl.col('strain').replace_strict(promoter_binding_energy, return_dtype=pl.Float64).alias('promoter_binding_energy')
    )

    # Add the date
    df_tidy = df_tidy.with_columns(date=pl.lit(date))
    
    # Sort, starting with promoter binding energy
    df_tidy = df_tidy.sort(
        by=["promoter_binding_energy", "conc", "well", "time (hr)"]
    )

    return df_tidy
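
As a small illustration (not in the original notebook), the helper functions split the strain-concentration labels like so:

# Well A2 is labeled "WT-0.21": strain WT, 0.21 µM antibiotic
strain("A2", well_metadata)    # returns 'WT'
conc("A2", well_metadata)      # returns 0.21

# Blank wells report an antibiotic concentration of zero
conc("H2", well_metadata)      # returns 0.0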

Now, tidy the data frame.

In [8]:
df = tidy(df, well_metadata, 20241122)

We may also want to look at previous years' data

To look at last term's data, you can directly load a tidy data frame as follows.

In [9]:
df_spring = pl.read_csv(
    os.path.join(data_path, 'tidy_efflux_data_spring_2024.csv'),
    schema=df.schema
)

We can put all the data in a single data frame if we want to analyze them all together.

In [10]:
df = pl.concat((df, df_spring))

We could use all of the data, or pull out a specific experiment. To pull out a specific experiment, we can filter by date. We will use the most recent class's data.

In [11]:
# To pull out the spring data to use
# df = df.filter(pl.col('date')<20241122)

df = df.filter(pl.col('date')==20241122)

Checking the blanks

Let's check all blanks.

In [12]:
p = bokeh.plotting.figure(
    frame_width=350,
    frame_height=200,
    x_axis_label="time (hr)",
    y_axis_label="absorbance",
    tooltips=[('well', '@well')],
)

# Pull out only entries for blanks
df_blank = df.filter(pl.col("strain") == "blank")

# Groupby and iterate, adding to plot
for (well,), sub_df in df_blank.group_by("well"):
    p.line(
        source=sub_df.to_dict(),
        x="time (hr)",
        y="absorbance",
    )

bokeh.io.show(p)

The H12 blank is highly variable, so we will not use that one in our calculations.

We can now subtract the blanks to get the OD600.

In [13]:
# ##################
# You need to give an explanation of what the code in this cell is doing.
# ##################

# Compute blanks
blank = df.filter((pl.col("strain") == "blank") & (pl.col('well') != 'H12'))["absorbance"].mean()

# Subtract blanks
df = df.with_columns((pl.col('absorbance') - blank).alias('OD600'))

# Clean out negative values
df = df.filter(pl.col('OD600') > 0)

Plotting the curves

We now write a function to plot all curves (except blanks) for each concentration of antibiotic.

In [14]:
def plot_curves(df, conc):
    p = bokeh.plotting.figure(
        frame_width=400,
        frame_height=250,
        x_axis_label="time (hr)",
        y_axis_label="OD600",
        y_axis_type="log",
        toolbar_location="above",
        tooltips=[('well', '@well')],
        title=f'conc = {conc} µM',
    )
    
    # Color scheme
    colors = bokeh.palettes.Category10_10
    
    # Pull out only entries for antibiotic concentration of interest (and not blanks)
    inds = (pl.col("conc") == conc) & (pl.col("strain") != "blank")
    df_conc = df.filter(inds)
    
    # Items we will place in the legend
    legend_items = []
    
    # Groupby and iterate, adding to plot
    color_index = 0
    for (strain,), sub_df in df_conc.group_by("strain", maintain_order=True):
        # Lines for a given strain
        lines = []
    
        # Get each well for the strain and add a line to the plot
        for well, subsub_df in sub_df.group_by("well", maintain_order=True):
            lines.append(
                p.line(source=subsub_df.to_dict(), x="time (hr)", y="OD600", color=colors[color_index])
            )
            
        color_index += 1
    
        # Add the lines to the legend
        legend_items.append((strain, lines))
    
    # Create the legend from the items
    legend = bokeh.models.Legend(items=legend_items, click_policy="hide")
    
    # Add the legend to the plot
    p.add_layout(legend, "right")

    return p

We can then use this function to plot all of the curves. For convenience, we make a tabbed plot with one tab per antibiotic concentration. This is also a useful visualization simply for viewing the growth curves.

In [15]:
tabs = []
for c in df['conc'].unique().sort():
    tabs.append(bokeh.models.TabPanel(child=plot_curves(df, c), title=str(c)))

all_plots = bokeh.models.Tabs(tabs=tabs)
bokeh.io.show(all_plots)

In all curves, where there is growth, it is approximately exponential between OD600 values of 0.15 and 0.3.

Obtaining growth rate

Below are functions to help with the regressions to obtain growth rates.

In [16]:
def exp_growth(t, A0, lam):
    """Exponential growth model, A0 * exp(lam * t)."""
    return A0 * np.exp(lam * t)


def fit_growth(t, OD, ODmin=0.15, ODmax=0.3):
    """Obtain estimates for A0 and λ for a given sub-trace"""
    # Pull out linear regime
    inds = np.logical_and(OD >= ODmin, OD <= ODmax)
    t = t[inds]
    OD = OD[inds]
    
    # If the OD never enters the fitting window (no appreciable growth), return zero
    if len(t) == 0:
        return 0.0
    
    # Parameter guesses
    A0_guess = 0.01
    lam_guess = 1.0  # 1/hr

    # Perform optimization
    popt, _ = scipy.optimize.curve_fit(exp_growth, t, OD, p0=(A0_guess, lam_guess))
    
    # Just return growth rate 
    return popt[1]
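
Before fitting every well, it can help to sanity check the procedure on a single trace. The sketch below is an optional addition (well D1 is a zero-antibiotic WT well according to the metadata above); it fits one well and prints the inferred growth rate.

# Pull out a single zero-antibiotic well (D1 is WT-0.0 in the metadata)
sub_df = df.filter(pl.col("well") == "D1")

# Fit the exponential window for this trace
lam_D1 = fit_growth(
    sub_df["time (hr)"].to_numpy(),
    sub_df["OD600"].to_numpy(),
)

print(f"Growth rate for well D1: {lam_D1:.3f} 1/hr")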

As an example, I show how to compute growth rates for all strains with zero antibiotic concentration.

In [17]:
# ##################
# You need to give an explanation of what the code in this cell is doing.
# ##################

# Pull out only entries for zero antibiotic concentration (and not blanks)
df_0abx = df.filter((pl.col("conc") == 0.0) & (pl.col("strain") != "blank"))

# Compute growth rates
growth_rates_0abx = (
    df_0abx.group_by(["strain", "well"], maintain_order=True)
    .agg(
        pl.struct(["time (hr)", "OD600"]).map_elements(
            lambda s: fit_growth(
                s.struct.field("time (hr)").to_numpy(),
                s.struct.field("OD600").to_numpy(),
            ),
            return_dtype=float,
            returns_scalar=True,
        )
        .alias("growth rate (1/hr)")
    )
)
In [18]:
growth_rates_0abx
Out[18]:
shape: (30, 3)
┌────────┬───────┬────────────────────┐
│ strain ┆ well  ┆ growth rate (1/hr) │
│ ---    ┆ ---   ┆ ---                │
│ str    ┆ str   ┆ f64                │
╞════════╪═══════╪════════════════════╡
│ "UV5"  ┆ "E1"  ┆ 0.414492           │
│ "UV5"  ┆ "E11" ┆ 0.526582           │
│ "UV5"  ┆ "E3"  ┆ 0.388895           │
│ "UV5"  ┆ "E5"  ┆ 0.343108           │
│ "UV5"  ┆ "E7"  ┆ 0.2435             │
│ …      ┆ …     ┆ …                  │
│ "MG"   ┆ "A11" ┆ 0.576421           │
│ "MG"   ┆ "A3"  ┆ 0.572663           │
│ "MG"   ┆ "A5"  ┆ 0.532847           │
│ "MG"   ┆ "A7"  ┆ 0.46699            │
│ "MG"   ┆ "A9"  ┆ 0.471456           │
└────────┴───────┴────────────────────┘

Now we can make a plot of the respective growth rates.

In [19]:
bokeh.io.show(
    iqplot.strip(
        growth_rates_0abx,
        q="growth rate (1/hr)",
        cats="strain",
        spread="swarm",
        tooltips=[('well', '@well')],
    )
)

We see some variation in the growth rates from one well to the next. We will take the growth rate for each strain to be given by the median.

In [20]:
# ##################
# You need to give an explanation of what the code in this cell is doing.
# ##################

lam0 = growth_rates_0abx.group_by("strain").agg(
    pl.col("growth rate (1/hr)").median().alias("lam0")
)

# Take a look
lam0
Out[20]:
shape: (5, 2)
┌────────┬──────────┐
│ strain ┆ lam0     │
│ ---    ┆ ---      │
│ str    ┆ f64      │
╞════════╪══════════╡
│ "3.19" ┆ 0.504496 │
│ "IW"   ┆ 0.494827 │
│ "UV5"  ┆ 0.370197 │
│ "WT"   ┆ 0.408679 │
│ "MG"   ┆ 0.552755 │
└────────┴──────────┘

Perhaps unsurprisingly, the growth rates are all similar. For convenience, let's add these to the data frame.

In [21]:
df = df.join(lam0, on='strain')

Solver for theoretical growth rate

Recall from the theory that the growth rate $\lambda$ is found by solving a cubic equation. The following functions may be used to perform that solution. Be sure to read these carefully to understand how they work.
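
Concretely, restating the coefficients that appear in the code below (with all quantities dimensionless), $\lambda$ is the largest root on $[0, 1]$ of the cubic

$$a_3\lambda^3 + a_2\lambda^2 + a_1\lambda + a_0 = 0,$$

where

$$
\begin{aligned}
a_3 &= K + q_\mathrm{max} - 1,\\
a_2 &= 2 - (1 - K)\left(P_\mathrm{in}\,a_\mathrm{ex} + P_\mathrm{out}\right) - (1 - \kappa)\left(q_\mathrm{max} + K\right),\\
a_1 &= -1 - \kappa\left(q_\mathrm{max} + K\right) + \left(2 - (1 - \kappa)K\right)P_\mathrm{out} + \left(1 - (1 - 2K)\kappa\right)P_\mathrm{in}\,a_\mathrm{ex},\\
a_0 &= (1 + \kappa K)\left(\kappa\,P_\mathrm{in}\,a_\mathrm{ex} - P_\mathrm{out}\right).
\end{aligned}
$$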

In [22]:
def solve_growth_rate_dimensionless(
    aex, kappa, K, qmax, Pin, Pout, return_all_roots=False
):
    """Solve for the growth rate for a single antibiotic
    concentration in dimensionless units. All parameters
    are dimensionless. If `return_all_roots` is True,
    returns all physically viable steady state growth rates.
    Otherwise, only the single stable growth rate is returned.
    """
    # No antibiotic: max growth rate
    if aex == 0.0:
        lam = np.array([1.0])
    else:
        # Coefficients for cubic polynomial
        a3 = K + qmax - 1
        a2 = 2 - (1 - K) * (Pin * aex + Pout) - (1 - kappa) * (qmax + K)
        a1 = (
            -1
            - kappa * (qmax + K)
            + (2 - (1 - kappa) * K) * Pout
            + (1 - (1 - 2 * K) * kappa) * Pin * aex
        )
        a0 = (1 + kappa * K) * (kappa * Pin * aex - Pout)

        # Set up the cubic polynomial
        cubic = np.polynomial.Polynomial([a0, a1, a2, a3])

        # Solve the cubic
        lam = np.sort([r for r in cubic.roots() if np.isreal(r) and 0 <= r <= 1])

    # No roots, return 0
    if len(lam) == 0:
        return 0.0

    if return_all_roots:
        return lam if len(lam) > 1 else lam[0]
    else:
        return lam[-1]


def growth_rate(
    aex,            # µM
    qmax,           # µM/hr
    lam0,           # 1/hr
    rmin=19.3,      # µM
    kappa_t=0.061,  # 1/µM
    Pin=2.85,       # 1/hr
    Pout=2.85,      # 1/hr
    Kd=1.0,         # µM
    KM=10.0,        # µM
):
    """Solve for the growth rate for antibiotic concentration, or
    array of antibiotic concentrations, aex. Assumes all
    concentration units are µM and all time units are hours.
    """
    # Nondimensionalize
    kappa = kappa_t * rmin / lam0
    K = KM / Kd
    qmax = qmax / Kd / lam0
    Pin = Pin / lam0
    Pout = Pout / lam0
    aex = aex / Kd

    if np.isscalar(aex):
        result = solve_growth_rate_dimensionless(
            aex, kappa, K, qmax, Pin, Pout, return_all_roots=False
        )
    else:
        result = [
            solve_growth_rate_dimensionless(
                aex_val, kappa, K, qmax, Pin, Pout, return_all_roots=False
            )
            for aex_val in aex
        ]
        result = np.array(result)

    # Return dimensional growth rate
    return result * lam0
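
As a quick sketch of how the solver might be used (the parameter values below are arbitrary placeholders for illustration, not fitted values), one can plot the theoretical growth rate over a range of antibiotic concentrations:

# Antibiotic concentrations at which to evaluate the theory (µM)
aex = np.logspace(-1, 3, 200)

# Placeholder parameters for illustration only
qmax_guess = 100.0  # µM/hr, arbitrary
lam0_guess = 0.5    # 1/hr, comparable to the measured values above

# Theoretical growth rate at each concentration
lam_theor = growth_rate(aex, qmax_guess, lam0_guess)

p = bokeh.plotting.figure(
    frame_width=400,
    frame_height=250,
    x_axis_type="log",
    x_axis_label="antibiotic concentration (µM)",
    y_axis_label="growth rate (1/hr)",
)
p.line(aex, lam_theor, line_width=2)

bokeh.io.show(p)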

You can now proceed with the analysis!