import os
import fnmatch
import pandas as pd
import numpy as np

from qiime2 import Visualization

# Put the full path to your home working directory.
# Keep the trailing slash: later shell cells build paths by plain
# concatenation (e.g. $home_dir'manifest.txt').
home_dir = "/home/ec2-user/sequencing_analysis/"

# Subdirectory in home_dir containing all of the reads (probably 'soil')
data_dir = "soil/"

# Whether or not to do pre-processing (this takes a looong time)
# Only change to True if you have not done it yet.
pre_process = False

# Full path to the directory with reads
data_dir = os.path.join(home_dir, data_dir)

# These are artifact and visualization directories located within the home directory.
# The trailing slashes are required for the same reason as for home_dir
# (e.g. $artifact_dir'rep-seqs.qza').
artifact_dir = os.path.join(home_dir, 'qiime_artifacts/')
visualization_dir = os.path.join(home_dir, 'qiime_visualizations/')

# Create both directories if they are missing. exist_ok=True makes the call a
# no-op when the directory is already there, so the cell is safe to re-run
# and needs no separate existence check.
os.makedirs(artifact_dir, exist_ok=True)
os.makedirs(visualization_dir, exist_ok=True)

# Build the QIIME 2 paired-end manifest (manifest.txt in home_dir): one row
# per forward-read file found in data_dir, paired with the reverse-read file
# that has the same name with R1 replaced by R2.
if pre_process:
    forward = '*L001_R1_001.fastq.gz'
    forward_suffix = 'L001_R1_001.fastq.gz'
    reverse_suffix = 'L001_R2_001.fastq.gz'
    manifest_path = os.path.join(home_dir, 'manifest.txt')

    with open(manifest_path, 'w') as f:
        # Add formatting rows
        f.write('sample-id\tforward-absolute-filepath\treverse-absolute-filepath\n')

        # os.listdir returns entries in arbitrary order; sorting keeps the
        # manifest identical from run to run.
        for name in sorted(os.listdir(data_dir)):
            if not fnmatch.fnmatch(name, forward):
                continue

            # The sample id is everything before the first underscore.
            sample_id = name.split("_")[0]

            # Strip the known suffix rather than splitting on "L001", so a
            # name that happens to contain "L001" earlier is still handled.
            prefix = name[:-len(forward_suffix)]
            forward_path = os.path.join(data_dir, prefix + forward_suffix)
            reverse_path = os.path.join(data_dir, prefix + reverse_suffix)

            f.write(f'{sample_id}\t{forward_path}\t{reverse_path}\n')

    # Show the manifest for a visual check. A bare expression inside an
    # ``if`` body is not displayed by the notebook, so print it explicitly.
    pd.set_option('display.max_colwidth', None)
    print(pd.read_csv(manifest_path, sep='\t'))

# Import the paired-end reads listed in manifest.txt into a single QIIME 2
# artifact (pair-end-demux.qza) written to the artifact directory.
# Paths are formed by plain concatenation ($home_dir'manifest.txt'), so
# home_dir and artifact_dir must end with a slash.
if pre_process:
    !qiime tools import \
      --type 'SampleData[PairedEndSequencesWithQuality]' \
      --input-path $home_dir'manifest.txt' \
      --output-path $artifact_dir'pair-end-demux.qza' \
      --input-format PairedEndFastqManifestPhred33V2

# Next, create a visualization of demultiplexed samples with quality
if pre_process:
    !qiime demux summarize \
      --i-data $artifact_dir'pair-end-demux.qza' \
      --o-visualization $visualization_dir'pair-end-demux.qzv'

# Load the saved summary. This runs even when pre_process is False, so
# pair-end-demux.qzv must already exist from an earlier pre-processing run.
Visualization.load(os.path.join(visualization_dir, 'pair-end-demux.qzv'))

# Use Dada2 to denoise the sample. This removes spurious reads that are more likely to be sequencing errors than novel colonies
# Forward and reverse reads are both truncated at position 150, and 4 threads are used.
# Three artifacts are written to the artifact directory: the representative
# sequences (rep-seqs.qza), the feature table (saved here as OTU_table.qza)
# and the denoising statistics (stats-dada2.qza).
if pre_process:
    !qiime dada2 denoise-paired \
        --i-demultiplexed-seqs $artifact_dir'pair-end-demux.qza' \
        --p-trunc-len-f 150 \
        --p-trunc-len-r 150 \
        --p-n-threads 4 \
        --o-representative-sequences $artifact_dir'rep-seqs.qza' \
        --o-table $artifact_dir'OTU_table.qza' \
        --o-denoising-stats $artifact_dir'stats-dada2.qza'

# Turn the DADA2 denoising statistics into a viewable table (stats-dada2.qzv)
if pre_process:
    !qiime metadata tabulate \
      --m-input-file $artifact_dir'stats-dada2.qza' \
      --o-visualization $visualization_dir'stats-dada2.qzv'

# Path to the tab-separated sample metadata file in the home directory.
# This path is also passed to the qiime commands in later cells.
metadata = os.path.join(home_dir, 'metadata.tsv')

# Parse the metadata into a dataframe for inspection
df = pd.read_csv(filepath_or_buffer=metadata, sep='\t')

# Show the table
df

# Gives us a table with each feature and its abundance
# (the sample metadata file is attached via --m-sample-metadata-file);
# the result is saved as OTU_table.qzv in the visualization directory.
!qiime feature-table summarize \
  --i-table $artifact_dir'OTU_table.qza' \
  --o-visualization $visualization_dir'OTU_table.qzv' \
  --m-sample-metadata-file $metadata

# Relates the features to the sequences
# (tabulates rep-seqs.qza into rep-seqs.qzv)
!qiime feature-table tabulate-seqs \
  --i-data $artifact_dir'rep-seqs.qza' \
  --o-visualization $visualization_dir'rep-seqs.qzv'

R[write to console]: Warning:
R[write to console]:  ‘timedatectl’ indicates the non-existent timezone name ‘n/a’

R[write to console]: Warning:
R[write to console]:  Your system is mis-configured: ‘/etc/localtime’ is not a symlink

R[write to console]: Warning:
R[write to console]:  It is strongly recommended to set envionment variable TZ to ‘Etc/UCT’ (or equivalent)

Saved Visualization to: /home/ec2-user/sequencing_analysis/qiime_visualizations/OTU_table.qzv
R[write to console]: Warning:
R[write to console]:  ‘timedatectl’ indicates the non-existent timezone name ‘n/a’

R[write to console]: Warning:
R[write to console]:  Your system is mis-configured: ‘/etc/localtime’ is not a symlink

R[write to console]: Warning:
R[write to console]:  It is strongly recommended to set envionment variable TZ to ‘Etc/UCT’ (or equivalent)

Saved Visualization to: /home/ec2-user/sequencing_analysis/qiime_visualizations/rep-seqs.qzv

# Download the pre-trained classifiers into artifact_dir, skipping any whose
# file name is already present there.
# NOTE(review): the check is existence-only, so an empty or partial file left
# behind by a failed download would be treated as present; delete it and
# re-run this cell if a download was interrupted.
if not os.path.exists(os.path.join(artifact_dir, "gg-13-8-99-515-806-nb-classifier.qza")):
    !wget \
      -O $artifact_dir"gg-13-8-99-515-806-nb-classifier.qza" \
      "https://moving-pictures-tutorial.readthedocs.io/en/latest/data/moving-pictures/gg-13-8-99-515-806-nb-classifier.qza"

# NOTE(review): this second classifier is fetched over plain http and is not
# referenced by the classification cell below it (which selects the gg-13-8
# file) - confirm it is still needed before spending the disk space.
if not os.path.exists(os.path.join(artifact_dir, "2022.10.backbone.full-length.nb.sklearn-1.4.2.qza")):
    !wget \
      -O $artifact_dir"2022.10.backbone.full-length.nb.sklearn-1.4.2.qza" \
      "http://ftp.microbio.me/greengenes_release/2022.10/sklearn-1.4.2-compatible-nb-classifiers/2022.10.backbone.full-length.nb.sklearn-1.4.2.qza"

# We'll use the smaller classifier to save disk space
# (file name only; it is appended to artifact_dir in the command below)
classifier = 'gg-13-8-99-515-806-nb-classifier.qza'

# Classify the representative sequences with the chosen classifier and
# write the result to taxonomy.qza in the artifact directory.
!qiime feature-classifier classify-sklearn \
  --i-classifier $artifact_dir$classifier \
  --i-reads $artifact_dir'rep-seqs.qza' \
  --o-classification $artifact_dir'taxonomy.qza'

R[write to console]: Warning:
R[write to console]:  ‘timedatectl’ indicates the non-existent timezone name ‘n/a’

R[write to console]: Warning:
R[write to console]:  Your system is mis-configured: ‘/etc/localtime’ is not a symlink

R[write to console]: Warning:
R[write to console]:  It is strongly recommended to set envionment variable TZ to ‘Etc/UCT’ (or equivalent)

Saved FeatureData[Taxonomy] to: /home/ec2-user/sequencing_analysis/qiime_artifacts/taxonomy.qza

# Turn the taxonomy assignments into a viewable table (taxonomy.qzv)
!qiime metadata tabulate \
  --m-input-file $artifact_dir'taxonomy.qza' \
  --o-visualization $visualization_dir'taxonomy.qzv'

# Build taxonomic bar plots from the feature table, the taxonomy assignments
# and the sample metadata; saved as taxa-bar-plots.qzv.
!qiime taxa barplot \
  --i-table $artifact_dir'OTU_table.qza' \
  --i-taxonomy $artifact_dir'taxonomy.qza' \
  --m-metadata-file $metadata \
  --o-visualization $visualization_dir'taxa-bar-plots.qzv'

R[write to console]: Warning:
R[write to console]:  ‘timedatectl’ indicates the non-existent timezone name ‘n/a’

R[write to console]: Warning:
R[write to console]:  Your system is mis-configured: ‘/etc/localtime’ is not a symlink

R[write to console]: Warning:
R[write to console]:  It is strongly recommended to set envionment variable TZ to ‘Etc/UCT’ (or equivalent)

Saved Visualization to: /home/ec2-user/sequencing_analysis/qiime_visualizations/taxonomy.qzv
R[write to console]: Warning:
R[write to console]:  ‘timedatectl’ indicates the non-existent timezone name ‘n/a’

R[write to console]: Warning:
R[write to console]:  Your system is mis-configured: ‘/etc/localtime’ is not a symlink

R[write to console]: Warning:
R[write to console]:  It is strongly recommended to set envionment variable TZ to ‘Etc/UCT’ (or equivalent)

Saved Visualization to: /home/ec2-user/sequencing_analysis/qiime_visualizations/taxa-bar-plots.qzv

# Display the taxonomic bar plots. The path is built with os.path.join (as
# for the demux summary) so it does not depend on visualization_dir ending
# with a slash.
Visualization.load(os.path.join(visualization_dir, 'taxa-bar-plots.qzv'))

# Input feature table and the destination for its filtered copy
otu_table = os.path.join(artifact_dir, 'OTU_table.qza')
output_file = os.path.join(artifact_dir, 'OTU_table_filtered.qza')

# Generate OTU table, filtering to include only shallow samples
# (the --p-where clause keeps samples whose 'shallow' metadata value is '1')
!qiime feature-table filter-samples \
  --i-table $otu_table \
  --m-metadata-file $metadata \
  --p-where "[shallow]='1'" \
  --o-filtered-table $output_file

# Gives us a table with each feature and its abundance
# for the filtered table; saved as OTU_table_filtered_shallow.qzv
!qiime feature-table summarize \
  --i-table $output_file \
  --o-visualization $visualization_dir'OTU_table_filtered_shallow.qzv' \
  --m-sample-metadata-file $metadata

R[write to console]: Warning:
R[write to console]:  ‘timedatectl’ indicates the non-existent timezone name ‘n/a’

R[write to console]: Warning:
R[write to console]:  Your system is mis-configured: ‘/etc/localtime’ is not a symlink

R[write to console]: Warning:
R[write to console]:  It is strongly recommended to set envionment variable TZ to ‘Etc/UCT’ (or equivalent)

Saved FeatureTable[Frequency] to: /home/ec2-user/sequencing_analysis/qiime_artifacts/OTU_table_filtered.qza
R[write to console]: Warning:
R[write to console]:  ‘timedatectl’ indicates the non-existent timezone name ‘n/a’

R[write to console]: Warning:
R[write to console]:  Your system is mis-configured: ‘/etc/localtime’ is not a symlink

R[write to console]: Warning:
R[write to console]:  It is strongly recommended to set envionment variable TZ to ‘Etc/UCT’ (or equivalent)

Saved Visualization to: /home/ec2-user/sequencing_analysis/qiime_visualizations/OTU_table_filtered_shallow.qzv

# Display the summary of the shallow-only feature table. The path is built
# with os.path.join so it does not depend on visualization_dir ending with a
# slash.
Visualization.load(os.path.join(visualization_dir, 'OTU_table_filtered_shallow.qzv'))

# Display the representative-sequence table
Visualization.load(os.path.join(visualization_dir, 'rep-seqs.qzv'))

# Record the computing environment: Python/IPython versions (-v) and the
# installed versions of pandas, jupyterlab and qiime2 (-p).
%load_ext watermark
%watermark -v -p pandas,jupyterlab,qiime2

Python implementation: CPython
Python version       : 3.10.14
IPython version      : 8.21.0

pandas    : 2.2.2
jupyterlab: 4.4.2
qiime2    : 2025.4.0

	sample-id	team	location	estimated-depth-inches	pH	shallow
0	1	ABORW	Orange Walk	4.5	4.5	0
1	2	RA	Watson Laboratories	3.4	4.9	0
2	3	AC	NO DATA	NO DATA	NO DATA	NO DATA
3	4	LFCG	NO DATA	NO DATA	NO DATA	NO DATA
4	5	JP	Oak Tree near Arms	4.7	4	0
5	6	MLRB	Dabney Garden	4	4.6	0
6	7	NK	West Lilly Pond	2	7	1
7	8	ES	NO DATA	NO DATA	NO DATA	NO DATA
8	9	F4CMR	NO DATA	NO DATA	NO DATA	NO DATA
9	10	GC4	Chen Garden	5.5	6	0
10	11	EY	Tournament Park	2	4	1
11	12	K	Tournament Park	3	4	0
12	13	SECS2	Mud patch from middle of turtle pond	1.5	6	1
13	14	SECS1	Under tree next to turtle pond	3.5	6	0
14	15	MR	NO DATA	NO DATA	NO DATA	NO DATA
15	16	BMRC	Bush in between chemical physics building and ...	1.57	4	1
16	17	ZA	Tournament park by trash bin on left	1.8	4.5	1
17	18	CL	Tournament park by trash bin on left	1.8	4.5	1
18	19	ABOLW	Olive Walk	4	4.6	0

Analysis of sequencing data from the antibiotic resistance experiment¶

Purpose¶

Pre-Processing¶

Creating the Manifest File¶

Importing the sequences¶

Trimming and Denoising¶

Adding metadata¶

Feature Table Creation¶

Taxonomic analysis¶

Using Naive Bayes classifier¶

Onwards and Upwards¶

Computing Environment¶