#!/bin/bash -l

######################################################
## HW02: Linux Basics Using Bioinformatics Examples ##
######################################################
## Author: First Last Name
## Last update: 21-Apr-2024
##
## Purpose: download the E. coli reference proteome, count its sequences,
## find proteins matching a short motif, build a BLAST database, run blastp
## with the matching proteins as queries, and submit the tabular result to
## the homework repos on GitHub.
##
## Environment:
##   HW_REPO  directory that contains Homework/HW02 of your cloned homework
##            repos (defaults to $HOME)

# Stop at the first failing command or pipeline stage so that later steps
# never run on missing or partial input.
# NOTE(review): 'set -u' is deliberately left out; 'module' may be a shell
# function that reads unset variables on some systems -- confirm before adding.
set -eo pipefail

readonly PROTEOME_URL='https://ftp.ncbi.nlm.nih.gov/genomes/genbank/bacteria/Escherichia_coli/reference/GCA_000008865.2_ASM886v2/GCA_000008865.2_ASM886v2_protein.faa.gz'
readonly PROTEOME_GZ="${PROTEOME_URL##*/}"  # file name part of the URL
readonly PROTEOME='ecoli.faa'

## Download E. coli proteome and inspect it
wget "${PROTEOME_URL}"
gunzip "${PROTEOME_GZ}"
mv -- "${PROTEOME_GZ%.gz}" "${PROTEOME}"
# less ecoli.faa # press q to quit

## How many protein sequences are stored in the downloaded file?
# Two equivalent ways of counting the FASTA header lines.
grep '>' "${PROTEOME}" | wc -l
grep -c '^>' "${PROTEOME}"
# Answer: 5,153 protein sequences

## How many proteins contain the pattern "WxHxxH" or "WxHxxHH"?
# grep -c counts matching lines; 'grep -E' replaces the deprecated 'egrep'.
grep -E -c 'W.H..H{1,2}' "${PROTEOME}"
# Answer: 15 matches

## Use less to find IDs for pattern matches or use awk
# RS='>' makes every FASTA entry (header plus sequence) one awk record; the
# '>' consumed as record separator is put back in front of each printed entry.
readonly MOTIF_PROG='/W.H..(H){1,2}/ { print ">" $0;}'
# Interactive inspection only (blocks until q is pressed in a terminal):
# awk --posix -v RS='>' "${MOTIF_PROG}" ecoli.faa | less
# Keep the header lines, drop the leading '>', and keep the first
# space-delimited field (the sequence ID).
awk --posix -v RS='>' "${MOTIF_PROG}" "${PROTEOME}" \
  | grep '^>' \
  | cut -c 2- \
  | cut -d ' ' -f 1 > myIDs

## Create a BLASTable database with makeblastdb
module load ncbi-blast
makeblastdb -in "${PROTEOME}" -out "${PROTEOME}" -dbtype prot -hash_index -parse_seqids

## Query BLASTable database by IDs stored in a file (e.g. myIDs)
blastdbcmd -db "${PROTEOME}" -dbtype prot -entry_batch myIDs -get_dups -out myseq.fasta

## Run BLAST search for sequences stored in myseq.fasta
blastp -query myseq.fasta -db "${PROTEOME}" -outfmt 0 -evalue 1e-6 -out blastp.out
blastp -query myseq.fasta -db "${PROTEOME}" -outfmt 6 -evalue 1e-6 -out myresult.txt
# Answer created file with -outfmt 6 format.

## Submission of HW02 to GitHub
# As instructed in the assignment, upload the result file generated with the
# '-outfmt 6' option (here myresult.txt) to your homework repos on GitHub
# under Homework/HW02. The following solution assumes that this is done on
# the HPCC where you have cloned your homework repos. Set HW_REPO to the
# path where your repos is located on the HPCC; when it is unset, $HOME is
# used.
hw02_dir="${HW_REPO:-${HOME}}/Homework/HW02"
cp -- myresult.txt "${hw02_dir}/HW2.txt"
cd -- "${hw02_dir}" || exit 1
git pull                          # optional, just in case there were changes in your repos online
git add HW2.txt                   # adds new file to repos
git commit -am "HW02 submission"  # commit new changes to repos
git push                          # upload changes including your HW02 to GitHub, done!

## Return system time and host name
# This is useful to record on which system you have done your work and at
# what time.
date
hostname
# Answer: Apr 21 16:25:58 PDT 2024
# Answer: skylark or some other host name of a node on HPCC