#!/bin/sh -e

##########################################################################
#   Synopsis:
#       fastq-scum file.fastq[.bz2|.gz|.lz4|.xz|.zst]
#
#   Description:
#       .B fastq-scum
#       scans a FASTQ file for known adapters and poly-A tails of length
#       8 or more, showing matching sequences with the adapters or tails
#       in color.  This helps the user determine optimal parameters for
#       use with fastq-trim.
#
#   Arguments:
#       file    FASTQ file, optionally compressed with bzip2, gzip, lz4, xz, or zstd
#       
#   Returns:
#       0 on success, non-zero error code otherwise
#
#   Examples:
#       fastq-scum file.fastq.xz
#
#   See also:
#       fastq-trim
#       
#   History:
#   Date        Name        Modification
#   2022-01-03  Jason Bacon Begin
##########################################################################

##########################################################################
#   Description:
#       Print a usage message to the standard error and terminate the
#       script with exit status 1.
##########################################################################

usage()
{
    # $0 is passed as an argument rather than expanded inside the format
    # string, so a "%" or "\" in the script path is printed literally
    printf 'Usage: %s file.fastq[.bz2|.gz|.lz4|.xz|.zst]\n' "$0" >&2
    exit 1
}


##########################################################################
#   Main
##########################################################################

# Exactly one argument, the FASTQ file to scan, is required; anything else
# prints the usage message and terminates the script
[ $# -eq 1 ] || usage
file=$1

# Select the command that writes the decompressed file to the standard
# output, based on the text following the last "." in the filename.
# If the filename contains no ".", ext is the entire filename and the
# default arm below rejects it.
ext=${file##*.}
case "$ext" in
xz)
    co=xzcat
    ;;

bz2)
    co=bzcat
    ;;

gz)
    # zcat vs gzcat not standardized
    co="gunzip -c"
    ;;

zst)
    co=zstdcat
    ;;

lz4)
    co=lz4cat
    ;;

fastq|fq)
    # Uncompressed input: pass it through unchanged
    co=cat
    ;;

*)
    # ext is derived from user input, so it is passed as an argument rather
    # than embedded in the format string; a "%" or "\" in the filename is
    # then printed literally.  Diagnostics go to the standard error.
    printf 'Unsupported compression: %s\n' "$ext" >&2
    exit 1
    ;;
esac

# Adapter sequences from FASTQ dist
# Illumina Universal Adapter        AGATCGGAAGAG
# Illumina Small RNA 3' Adapter     TGGAATTCTCGG
# Illumina Small RNA 5' Adapter     GATCGTCGGACT
# Nextera Transposase Sequence      CTGTCTCTTATA
# SOLID Small RNA Adapter           CGCCTTGGCCGT
#
# The last alternative in the pattern matches a run of 8 or more A's at the
# end of a line (poly-A tail).
#
# $co is deliberately left unquoted: it may hold a command plus an option
# ("gunzip -c") and must be split into separate words.  "$file" is quoted so
# that a filename containing whitespace or glob characters reaches the
# decompressor as a single argument.  grep -E is used in place of the
# obsolescent egrep.
#
# The exit status of the script is that of grep: 0 if at least one line
# matched, 1 if none did, greater than 1 on error.
# NOTE(review): every line of the input is searched, not only sequence lines.
# shellcheck disable=SC2086
$co "$file" | grep -E --color \
    'AGATCGGAAGAG|TGGAATTCTCGG|GATCGTCGGACT|CTGTCTCTTATA|CGCCTTGGCCGT|AAAAAAAA+$'
