Banana Slug Genomics

#
# This file contains the instructions for performing an assembly of the NA12878 human
# genome data released by the Broad Institute.
#
# Basic assumptions:
# The data has been downloaded from the 1000 Genomes ftp. The BAM file provided
# has been transformed into 62 FASTQ files (by running Picard RevertSAM, BAMToFastq
# then using 'split' on the command line) with names like reads.000.fastq ... reads.061.fastq.
# Each file should have 20 million reads each (10 million pairs).
#
# The following is a guideline of how to perform the assembly - this is not a script.
# It is up to the user to submit the assembly jobs to a cluster, and make sure they ran
# without error. It is highly recommended that you understand how SGA works first. I suggest
# trying to run the C. elegans example assembly which is in the same directory.
#
# These instructions use a script called "submit" which takes a command either as a string
# or as a batch of commands on stdin (1 command per line) and submits it to the cluter.
# The submit script takes in the amount of memory required by the job (eg --mem 16G) and the
# number of compute cores (--cpu 8). It is the user's responsibility to provide this script.
#
#
# Update 14/07/12:
#   The commands used here are for the original published version of SGA. Since publication,
#   many algorithms have been improved, notable the indexing/merge step. See the SGA wiki
#   for updated best practices on indexing large collections of data.
#
#
# Send questions or comments to js18@sanger.ac.uk
#

#
# Step 1. Preprocess in the input data
#

# Make a directory for the preprocessed data and enter it
mkdir preprocessed; cd preprocessed;

# Preprocess the raw data, which is assumed to be an a directory called input/
for i in `ls ../input/reads.*.fastq`; do F=`basename $i .fastq`; submit "sga preprocess -o $F.pp.fastq --pe-mode 2 $i"; done;

# Leave the preprocessed directory
cd ..
# Step 2. Index the raw data
#

# Make a directory for the indexed data
mkdir index_raw; cd index_raw;

# Symlink in the preprocessed data
ls ../preprocessed/reads.*.pp.fastq | xargs -i ln -s {}

# Build an FM-index for each fastq file
ls *.fastq | xargs -i submit --mem 8G --cpu 8 "sga index --no-reverse -d 5000000 -t 8 {}"

# Use the sga-mergeDriver.pl script to merge the individual indices together until a single index remains

# Merge round 1
sga-mergeDriver.pl -t 8 *.fastq | submit --cpu 8 --mem 8G

# Merge round 2
sga-mergeDriver.pl -t 8 *.fa | submit --cpu 8 --mem 8G

# Merge round 3
sga-mergeDriver.pl -t 8 *.fa | submit --cpu 8 --mem 8G

# Rename intermediate files to have a shorter name
N=0; for i in `ls *.fa*`; do sga-rename.pl $i merged.$N; N=$(($N+1)); done

# Merge round 4
sga-mergeDriver.pl -t 8 *.fa | submit --cpu 8 --mem 16G

# Merge round 5
sga-mergeDriver.pl -t 8 *.fa | submit --cpu 8 --mem 28G

# Merge round 6
sga-mergeDriver.pl -t 8 *.fa | submit --cpu 8 --mem 50G

# Leave the index_raw directory
cd ..

#
# Step 3. Error correct the reads using the index that was just built.
# There should be a file created by the last merge job called "final.bwt"
#

# Make a new directory to perform the correction in
mkdir corrected; cd corrected;

# Symlink the preprocessed reads into this directory.
# I assume the preprocessed reads are in a directory called preprocessed
ls ../preprocessed/reads.*.pp.fastq | xargs -i ln -s {}

# Symlink the index files
ln -s ../index_raw/final.bwt
ln -s ../index_raw/final.fa

# Submit the error correction jobs to the cluster
# We use a 55-mer for correction. The -d parameter lowers the memory usage
# of the index. 8 threads are used for the computation.
ls *.pp.fastq | xargs -i submit --mem 30G --cpu 8 "sga correct -k 55 --learn -d 256 -t 8 -p final {}"

# Leave the correction directory
cd ..

#
# Step 4. Index the corrected data.
#

# Make a directory for the corrected index data
mkdir index_corrected; cd index_corrected;

# Symlink in the preprocessed data
ls ../corrected/reads.*.ec.fa | xargs -i ln -s {}

# Build an FM-index for each fastq file
ls *.fa | xargs -i submit --mem 8G --cpu 8 "sga index -d 5000000 -t 8 {}"

# Use the sga-mergeDriver.pl script to merge the individual indices together until a single index remains

# Merge round 1
sga-mergeDriver.pl -t 8 *.fa | submit --cpu 8 --mem 8G

# Merge round 2
sga-mergeDriver.pl -t 8 *.fa | submit --cpu 8 --mem 8G

# Merge round 3
sga-mergeDriver.pl -t 8 *.fa | submit --cpu 8 --mem 8G

# Rename intermediate files to have a shorter name
N=0; for i in `ls *.fa*`; do sga-rename.pl $i merged.$N; N=$(($N+1)); done

# Merge round 4
sga-mergeDriver.pl -t 8 *.fa | submit --cpu 8 --mem 16G

# Merge round 5
sga-mergeDriver.pl -t 8 *.fa | submit --cpu 8 --mem 28G

# Merge round 6
sga-mergeDriver.pl -t 8 *.fa | submit --cpu 8 --mem 50G

# Merging done, the index is named final.bwt and all the reads are now in final.fa

#
# Step 5: Filter the data to remove duplicates and low-quality reads remaining
# after correction.
#
submit --mem 60G --cpu 8 "sga filter -d 256 -x 2 -t 8 final.fa"

#
# Step 6: Merge together reads that can be unambiguously assembled
#
submit --mem 60G --cpu 8 "sga fm-merge -m 65 -t 8 final.filter.pass.fa"

#
# Step 7: Index the merged sequences
#
submit --mem 24G --cpu 8 "sga index -d 2000000 -t 8 final.filter.pass.merged.fa"

#
# Step 8: Use overlap to construct the string graph of the merged reads
#
submit --mem 8G --cpu 8 "sga overlap -m 65 -t 8 final.filter.pass.merged.fa"

#
# Step 9: Perform the contig assembly
# The minimum overlap value used is 77. Aggressive variant removal parameters are chosen.
# Small repeats at the ends of reads are resolved using the -r 10 parameter. The minimum
# branch length for the trimming algorithm is 200 bp.
#
submit --mem 16G "sga assemble -m 77 -d 0.4 -g 0.1 -r 10 -l 200 final.filter.pass.merged.asqg.gz"

#
# Scaffolding
# Change to a directory to hold the scaffolded assembly
cd ..
mkdir pe; cd pe;

# Symlink the assembly from step 9
ln -s ../index_corrected/*-contigs.fa final-contigs.fa
ln -s ../index_corrected/*-graph.asqg.gz final-graph.asqg.gz

#
# Step 10: Index the contigs with bwa
#
submit --mem 8G "bwa index -a bwtsw final-contigs.fa"

# Symlink the raw reads files
ls ../input/reads.*.fastq | xargs -i ln -s {}

#
# Step 11: Align reads to the contigs and make bam files
#
for i in `ls *.fastq`; do F=`basename $i .fastq`; submit --cpu 8 --mem 12G "sga-align -t 8 --name $F final-contigs.fa $i"; done

# Make a directory for the refsorted and unsorted bams
mkdir initial_sorted_bams
mv *.refsort.bam initial_sorted_bams

mkdir initial_bams
mv *.bam initial_bams

# Merge the refsorted bam
cd initial_sorted_bams
submit --mem 8G "samtools merge merged.refsort.bam *.refsort.bam"

# Merge the unsorted bam
cd ../initial_bams
submit --mem 8G "samtools merge merged.bam *.bam"

# Return to the pe directory
cd ../

#
# Step 12: Make the astat file

submit "sga-astat.py -m 200 initial_sorted_bams/merged.refsort.bam > contigs.astat"

#
# Step 13: Make the distance estimate file
#

submit "sga-bam2de.pl --prefix pe -n 5 -m 200 initial_bams/merged.bam"

#
# Step 14: Build scaffolds
#
submit --mem 8G "sga scaffold -u 25 -m 200 -a contigs.astat --pe pe.de -o scaffolds.m200.u25.scaf final-contigs.fa"

# Step 15: Convert the scaffolds to a fasta file

submit --mem 16G "sga scaffold2fasta -a final-graph.asqg.gz --write-unplaced -m 200 --use-overlap -o scaffolds.m200.build1.fa scaffolds.m200.u25.scaf"
You could leave a comment if you were logged in.
Banana Slug Genomics

User Tools

Site Tools

Page Tools