###################################################
## Exercises for Common Bioinformatics Use Cases ##
###################################################
## NOTE: the srun step below starts an interactive shell, so paste these
## commands one at a time instead of executing this file as a script.

## Log in to a node with srun
srun --x11 --partition=short --mem=2gb --cpus-per-task 4 --ntasks 1 --time 1:00:00 --pty bash -l

## Download Halobacterium proteome and inspect it
wget https://ftp.ncbi.nlm.nih.gov/genomes/genbank/archaea/Halobacterium_salinarum/all_assembly_versions/GCA_004799605.1_ASM479960v1/GCA_004799605.1_ASM479960v1_protein.faa.gz
gunzip GCA_004799605.1_ASM479960v1_protein.faa.gz
mv GCA_004799605.1_ASM479960v1_protein.faa halobacterium.faa
# less halobacterium.faa # press q to quit

## How many protein sequences are stored in the downloaded file?
## Every FASTA record starts with a '>' header line, so count those lines.
## The pattern is anchored with '^' so a '>' inside a description is ignored.
grep '^>' halobacterium.faa | wc -l
grep -c '^>' halobacterium.faa

## How many proteins contain the pattern "WxHxxH" or "WxHxxHH"?
## grep works line by line, so this counts matching LINES: if sequences are
## wrapped over several lines, a motif split across a line break is not seen.
grep -E -c 'W.H..H{1,2}' halobacterium.faa

## Use less to find IDs for pattern matches or use awk
## The awk programs below join the sequence lines of each record before
## testing the pattern, so the header text is never searched and line
## wrapping cannot hide or fake a match. "HH?" is the portable spelling of
## the interval expression "H{1,2}".
awk '
  function emit() { if (seq ~ /W.H..HH?/) print header "\n" seq }
  /^>/ { if (header != "") emit(); header = $0; seq = ""; next }
  { seq = seq $0 }
  END { if (header != "") emit() }
' halobacterium.faa | less

## Same search, but write only the sequence IDs (first word of the header
## without the leading '>') to the file myIDs.
awk '
  function emit() { if (seq ~ /W.H..HH?/) print id }
  /^>/ { if (id != "") emit(); id = substr($1, 2); seq = ""; next }
  { seq = seq $0 }
  END { if (id != "") emit() }
' halobacterium.faa > myIDs

## Create a BLASTable database with makeblastdb (the BLAST+ replacement for formatdb)
module load ncbi-blast/2.2.31+
makeblastdb -in halobacterium.faa -out halobacterium.faa -dbtype prot -hash_index -parse_seqids

## Query BLASTable database by IDs stored in a file (e.g. myIDs)
blastdbcmd -db halobacterium.faa -dbtype prot -entry_batch myIDs -get_dups -out myseq.fasta

## Run BLAST search for sequences stored in myseq.fasta
## -outfmt 0 writes the pairwise report, -outfmt 6 the tabular report.
blastp -query myseq.fasta -db halobacterium.faa -outfmt 0 -evalue 1e-6 -out blastp.out
blastp -query myseq.fasta -db halobacterium.faa -outfmt 6 -evalue 1e-6 -out blastp.tab

## Return system time and host name
date
hostname

## More exercises in Linux Manual
## https://hpcc.ucr.edu/manuals_linux-basics_shell.html

## Submit job to queuing system of cluster
## (i) Create submission script as outlined here: https://bit.ly/2O9qMJm
## (ii) Submit script to cluster as follows
## sbatch script_name.sh