Preliminary analysis of growth data for efflux experiment¶
(c) 2024 Justin Bois. This work is licensed under a Creative Commons Attribution License CC-BY 4.0. All code contained herein is licensed under an MIT license.
This document was prepared at Caltech with support financial support from the Donna and Benjamin M. Rosen Bioengineering Center.

This tutorial was generated from an Jupyter notebook. You can download the notebook here.
import os
import io
import numpy as np
import polars as pl
import scipy.optimize
import iqplot
import bokeh.io
import bokeh.plotting
bokeh.io.output_notebook()
Here, I begin the analysis the data for the 2024 efflux pump experiment. You need to adjust the code cell below to give the path to your data sets.
# This is where the data are on my machine. If you are in
# the same directory as the data sets, then you can use
# data_path = './'
data_path = '/Users/bois/Dropbox/bi1x_2024_fall_data/efflux/'
Data wrangling¶
We will read in the data. First, the function given in the tutorial.
def timestamp_to_seconds(t_stamp):
"""Convert H:M:S string to seconds. This is necessary
to hand-build because the datetime utilities do not handle
times over 24 hours."""
h, m, s = t_stamp.split(':')
return 3600 * int(h) + 60 * int(m) + float(s)
def read_plate_reader(fname, header_row, last_row):
"""Read in time series data from Bi 1x plate reader.
Parameters
----------
fname : str
Path to file outputted from plate reader.
header_row : int
Number of row containing header column (zero-indexed)
last_row : int
Last row of the file containing the time series data (zero-indexed)
Returns
-------
output : DataFrame
Polars DataFrame containing time series data.
"""
# Get relevant contents of file as string
with open(fname, "r", encoding="ISO-8859-1") as f:
lines = f.readlines()
file_str = "".join(lines[header_row : last_row + 1])
# How many total lines to read
n_rows = last_row - header_row
# Read in data frame
df = pl.read_csv(
io.StringIO(file_str),
separator="\t",
encoding="ISO-8859-1",
)
# Rename temperature column
T_col = [col for col in df.columns if "T°" in col][0]
df = df.rename({T_col: "temperature (deg C)"})
# Parse the time column
df = df.with_columns(
pl.col("Time")
.map_elements(timestamp_to_seconds, return_dtype=float)
.alias("time (s)")
).with_columns(
pl.col('time (s)') - pl.col('time (s)').min()
)
# Time in units of hours
df = df.with_columns((pl.col('time (s)') / 3600).alias('time (hr)'))
return df
Next, we load in the data.
fname = os.path.join(data_path, '20241122_efflux.txt')
header_row = 53
last_row = 294
# Load in data
df = read_plate_reader(fname, header_row, last_row)
Now we'll add the metadata, first defining dictionaries.
well_metadata = dict(
A1="MG-0.0",
A2="WT-0.21",
A3="MG-0.0",
A4="WT-0.90",
A5="MG-0.0",
A6="WT-3.90",
A7="MG-0.0",
A8="WT-16.90",
A9="MG-0.0",
A10="WT-73.15",
A11="MG-0.0",
A12="WT-208.33",
B1="3.19-0.0",
B2="UV5-0.21",
B3="3.19-0.0",
B4="UV5-0.90",
B5="3.19-0.0",
B6="UV5-3.90",
B7="3.19-0.0",
B8="UV5-16.90",
B9="3.19-0.0",
B10="UV5-73.15",
B11="3.19-0.0",
B12="UV5-208.33",
C1="IW-0.0",
C2="MG-0.44",
C3="IW-0.0",
C4="MG-1.88",
C5="IW-0.0",
C6="MG-8.13",
C7="IW-0.0",
C8="MG-35.17",
C9="IW-0.0",
C10="MG-152.19",
C11="IW-0.0",
C12="MG-312.50",
D1="WT-0.0",
D2="3.19-0.44",
D3="WT-0.0",
D4="3.19-1.88",
D5="WT-0.0",
D6="3.19-8.13",
D7="WT-0.0",
D8="3.19-35.17",
D9="WT-0.0",
D10="3.19-152.19",
D11="WT-0.0",
D12="3.19-312.50",
E1="UV5-0.0",
E2="IW-0.44",
E3="UV5-0.0",
E4="IW-1.88",
E5="UV5-0.0",
E6="IW-8.13",
E7="UV5-0.0",
E8="IW-35.17",
E9="UV5-0.0",
E10="IW-152.19",
E11="UV5-0.0",
E12="IW-312.50",
F1="MG-0.21",
F2="WT-0.44",
F3="MG-0.90",
F4="WT-1.88",
F5="MG-3.90",
F6="WT-8.13",
F7="MG-16.90",
F8="WT-35.17",
F9="MG-73.15",
F10="WT-152.19",
F11="MG-208.33",
F12="WT-312.50",
G1="3.19-0.21",
G2="UV5-0.44",
G3="3.19-0.90",
G4="UV5-1.88",
G5="3.19-3.90",
G6="UV5-8.13",
G7="3.19-16.90",
G8="UV5-35.17",
G9="3.19-73.15",
G10="UV5-152.19",
G11="3.19-208.33",
G12="UV5-312.50",
H1="IW-0.21",
H2="blank",
H3="IW-0.90",
H4="blank",
H5="IW-3.90",
H6="blank",
H7="IW-16.90",
H8="blank",
H9="IW-73.15",
H10="blank",
H11="IW-208.33",
H12="blank",
)
We should also store the promoter binding energy for each strain.
promoter_binding_energy = {
'blank': None,
'MG': np.inf,
'3.19': -3.19,
'IW': -4.28,
'WT': -5.19,
'UV5': -7.93
}
Now some functions to help us tidy the data set.
def conc(well, well_metadata):
"""Return abx concentration for a given well"""
strain_conc = well_metadata[well]
if strain_conc == "blank":
return 0.0
else:
ind = strain_conc.find("-")
return float(strain_conc[ind + 1 :])
def strain(well, well_metadata):
"""Return strain for a given well"""
strain_conc = well_metadata[well]
if strain_conc == "blank":
return "blank"
else:
ind = strain_conc.find("-")
return strain_conc[:ind]
def tidy(df, well_metadata, date):
"""Convert data frame to tidy format."""
# Convert to tall format
df_tidy = df.unpivot(
index=["Time", "temperature (deg C)", "time (s)", "time (hr)"],
variable_name="well",
value_name="absorbance",
)
# Add metadata columns
df_tidy = df_tidy.with_columns(
pl.col('well').map_elements(lambda x: conc(x, well_metadata), return_dtype=float).alias('conc'),
pl.col('well').map_elements(lambda x: strain(x, well_metadata), return_dtype=str).alias('strain')
).with_columns(
pl.col('strain').replace_strict(promoter_binding_energy, return_dtype=pl.Float64).alias('promoter_binding_energy')
)
# Add the date
df_tidy = df_tidy.with_columns(date=pl.lit(date))
# Sort, starting with promoter binding energy
df_tidy = df_tidy.sort(
by=["promoter_binding_energy", "conc", "well", "time (hr)"]
)
return df_tidy
Now, tidy the date frame.
df = tidy(df, well_metadata, 20241122)
We may also want to look at previous years' data¶
To look at last term's data, you can directly load a tidy data frame as follows.
df_spring = pl.read_csv(
os.path.join(data_path, 'tidy_efflux_data_spring_2024.csv'),
schema=df.schema
)
We can put all the data in a single data frame if we want to analyze them all together.
df = pl.concat((df, df_spring))
We could us all of the data, or pull out a specific experiment. To pull out a specific experiment, we can do it by date. We will use the most recent class's data.
# To pull out the spring data to use
# df = df.filter(pl.col('date')<20241122)
df = df.filter(pl.col('date')==20241122)
Checking the blanks¶
Let's check all blanks.
p = bokeh.plotting.figure(
frame_width=350,
frame_height=200,
x_axis_label="time (hr)",
y_axis_label="absorbance",
tooltips=[('well', '@well')],
)
# Pull out only entries for blanks
df_blank = df.filter(pl.col("strain") == "blank")
# Groupby and iterate, adding to plot
for (well,), sub_df in df_blank.group_by("well"):
p.line(
source=sub_df.to_dict(),
x="time (hr)",
y="absorbance",
)
bokeh.io.show(p)
The H12 blank is highly variable, so we will not use that one in our calculations.
We can now subtract the blanks to get the OD600.
# ##################
# You need to give an explanation of what the code in this cell is doing.
# ##################
# Compute blanks
blank = df.filter((pl.col("strain") == "blank") & (pl.col('well') != 'H12'))["absorbance"].mean()
# Subtract blanks
df = df.with_columns((pl.col('absorbance') - blank).alias('OD600'))
# Clean out negative values
df = df.filter(pl.col('OD600') > 0)
Plotting the curves¶
We now write a function to plot all curves (except blanks) for each concentration of antibiotic.
def plot_curves(df, conc):
p = bokeh.plotting.figure(
frame_width=400,
frame_height=250,
x_axis_label="time (hr)",
y_axis_label="OD600",
y_axis_type="log",
toolbar_location="above",
tooltips=[('well', '@well')],
title=f'conc = {conc} µM',
)
# Color scheme
colors = bokeh.palettes.Category10_10
# Pull out only entries for antibiotic concentration of interest (and not blanks)
inds = (pl.col("conc") == conc) & (pl.col("strain") != "blank")
df_conc = df.filter(inds)
# Items we will place in the legend
legend_items = []
# Groupby and iterate, adding to plot
color_index = 0
for (strain,), sub_df in df_conc.group_by("strain", maintain_order=True):
# Lines for a given strain
lines = []
# Get each well for the strain and add a line to the plot
for well, subsub_df in sub_df.group_by("well", maintain_order=True):
lines.append(
p.line(source=subsub_df.to_dict(), x="time (hr)", y="OD600", color=colors[color_index])
)
color_index += 1
# Add the lines to the legend
legend_items.append((strain, lines))
# Create the legend from the items
legend = bokeh.models.Legend(items=legend_items, click_policy="hide")
# Add the legend to the plot
p.add_layout(legend, "right")
return p
We can then use this to make a plot of all curves. For convenience, we make a plot with tabs containing each concentration. This is also a useful visualization simply to see the growth curves.
tabs = []
for c in df['conc'].unique():
tabs.append(bokeh.models.TabPanel(child=plot_curves(df, c), title=str(c)))
all_plots = bokeh.models.Tabs(tabs=tabs)
bokeh.io.show(bokeh.models.Tabs(tabs=tabs))
In all curves, if there is growth, it is exponential between OD of 0.15 and 0.3.
Obtaining growth rate¶
Below are functions to help with the regressions to obtain growth rates.
def exp_growth(t, A0, lam):
return A0 * np.exp(lam * t)
def fit_growth(t, OD, ODmin=0.15, ODmax=0.3):
"""Obtain estimates for A0 and λ for a given sub-trace"""
# Pull out linear regime
inds = np.logical_and(OD >= ODmin, OD <= ODmax)
t = t[inds]
OD = OD[inds]
# If no growth, return zero
if len(t) == 0:
return 0.0
# Parameter guesses
A0_guess = 0.01
lam_guess = 1.0 # 1/hr
# Perform optimization
popt, _ = scipy.optimize.curve_fit(exp_growth, t, OD, p0=(A0_guess, lam_guess))
# Just return growth rate
return popt[1]
As an example, I show how to compute growth rates for all strains with zero antibiotic concentration.
# ##################
# You need to give an explanation of what the code in this cell is doing.
# ##################
# Pull out only entries for zero antibiotic concentration (and not blanks)
df_0abx = df.filter((pl.col("conc") == 0.0) & (pl.col("strain") != "blank"))
# Compute growth rates
growth_rates_0abx = (
df_0abx.group_by(["strain", "well"], maintain_order=True)
.agg(
pl.struct(["time (hr)", "OD600"]).map_elements(
lambda s: fit_growth(
s.struct.field("time (hr)").to_numpy(),
s.struct.field("OD600").to_numpy(),
),
return_dtype=float,
returns_scalar=True,
)
.alias("growth rate (1/hr)")
)
)
growth_rates_0abx
strain | well | growth rate (1/hr) |
---|---|---|
str | str | f64 |
"UV5" | "E1" | 0.414492 |
"UV5" | "E11" | 0.526582 |
"UV5" | "E3" | 0.388895 |
"UV5" | "E5" | 0.343108 |
"UV5" | "E7" | 0.2435 |
… | … | … |
"MG" | "A11" | 0.576421 |
"MG" | "A3" | 0.572663 |
"MG" | "A5" | 0.532847 |
"MG" | "A7" | 0.46699 |
"MG" | "A9" | 0.471456 |
Now we can make a plot of the respective growth rates.
bokeh.io.show(
iqplot.strip(
growth_rates_0abx,
q="growth rate (1/hr)",
cats="strain",
spread="swarm",
tooltips=[('well', '@well')],
)
)
We see some variation in the growth rates from one day to the next. We will take the growth rate for each strain to be given by the median.
# ##################
# You need to give an explanation of what the code in this cell is doing.
# ##################
lam0 = growth_rates_0abx.group_by("strain").agg(
pl.col("growth rate (1/hr)").median().alias("lam0")
)
# Take a look
lam0
strain | lam0 |
---|---|
str | f64 |
"3.19" | 0.504496 |
"IW" | 0.494827 |
"UV5" | 0.370197 |
"WT" | 0.408679 |
"MG" | 0.552755 |
Perhaps unsurprisingly, the growth rates are all similar. For convenience, let's add these to the data frame.
df = df.join(lam0, on='strain')
Solver for theoretical growth rate¶
Recall from the theory that the growth rate $\lambda$ is found by solving a cubic equation. The following functions may be used to perform that solution. Be sure to read these carefully to understand how they work.
def solve_growth_rate_dimensionless(
aex, kappa, K, qmax, Pin, Pout, return_all_roots=False
):
"""Solve for the growth rate for a single antibiotic
concentation in dimensionless units. All parameters
are dimensionless. If `return_all_roots` is True,
returns all physically viable steady state growth rates.
Otherwise, only the single stable growth rate is returned.
"""
# No antibiotic: max growth rate
if aex == 0.0:
lam = np.array([1.0])
else:
# Coefficients for cubic polynomial
a3 = K + qmax - 1
a2 = 2 - (1 - K) * (Pin * aex + Pout) - (1 - kappa) * (qmax + K)
a1 = (
-1
- kappa * (qmax + K)
+ (2 - (1 - kappa) * K) * Pout
+ (1 - (1 - 2 * K) * kappa) * Pin * aex
)
a0 = (1 + kappa * K) * (kappa * Pin * aex - Pout)
# Set up the cubic polynomial
cubic = np.polynomial.Polynomial([a0, a1, a2, a3])
# Solve the cubic
lam = np.sort([r for r in cubic.roots() if np.isreal(r) and 0 <= r <= 1])
# No roots, return 0
if len(lam) == 0:
return 0.0
if return_all_roots:
return lam if len(lam) > 1 else lam[0]
else:
return lam[-1]
def growth_rate(
aex, # µM
qmax, # µM/hr
lam0, # 1/hr
rmin=19.3, # µM
kappa_t=0.061, # 1/µM
Pin=2.85, # 1/hr
Pout=2.85, # 1/hr
Kd=1.0, # µM
KM=10.0, # µM
):
"""Solve for the growth rate for antibiotic concentration, or
array of antibiotic concentrations, aex. Assumes all
concentration units are µM and all time units are hours.
"""
# Nondimensionalize
kappa = kappa_t * rmin / lam0
K = KM / Kd
qmax = qmax / Kd / lam0
Pin = Pin / lam0
Pout = Pout / lam0
aex = aex / Kd
if np.isscalar(aex):
result = solve_growth_rate_dimensionless(
aex, kappa, K, qmax, Pin, Pout, return_all_roots=False
)
else:
result = [
solve_growth_rate_dimensionless(
aex_val, kappa, K, qmax, Pin, Pout, return_all_roots=False
)
for aex_val in aex
]
result = np.array(result)
# Return dimensional growth rate
return result * lam0
You can now proceed with the analysis!