#!/usr/bin/env python import sys import re import io if len(sys.argv) != 2: print("Usage: fasta_dna_stats ") print("This script is for informational purposes only, and requires that the input file be a DNA (As, Ts, Cs, and Gs) FASTA-formatted file.") quit() ####### Parse input def fasta_hash_from_file(filename): handle = io.open(filename, "rb") seqs_dict = dict() current_id = "" current_seq_list = list() for line in handle: line_list = re.split(r"\s+", line.strip()); if re.match(r"^>", line_list[0]): if current_id != "": seqs_dict[current_id] = "".join(current_seq_list) current_id = re.subn(r"^>", "", line_list[0])[0] current_seq_list = list() else: current_seq_list.append(line_list[0]) seqs_dict[current_id] = "".join(current_seq_list) return seqs_dict def longest_perfect_repeat(seq): result = re.finditer(r"(.{1,10}?)\1{1,}", seq) max = 1 maxrep = seq[0] maxunit = seq[0] for match in result: unit = match.group(1) if len(match.group(0)) > max and len(set(unit)) > 1: max = len(match.group(0)) maxrep = match.group(0) maxunit = match.group(1) return [max, maxrep, maxunit] def gc_content(seq): gccount = 0; ## strings are iterable too ;) for base in seq: if base == "G" or base == "C" or base == "g" or base == "c": gccount = gccount + 1 rounded = int(float(gccount)/len(seq)*1000)/1000.0 return(rounded) def most_common_fivemer(seq): mers = dict() for index in range(0, len(seq) - 5 + 1): fivemer = seq[index:index+5] if mers.has_key(fivemer): mers[fivemer] = mers[fivemer] + 1 else: mers[fivemer] = 1 max = 0 maxmer = "" for fivemer in mers.keys(): if mers[fivemer] > max: max = mers[fivemer] maxmer = fivemer return [maxmer, max] filename = sys.argv[1] seqs_dict = fasta_hash_from_file(filename) print("# Column 1: Sequence ID") print("# Column 2: GC content") print("# Column 3: Length") print("# Column 4: Most common 5mer") print("# Column 5: Count of most common 5mer") print("# Column 6: Repeat unit of longest simple perfect repeat (2 to 10 chars)") print("# Column 7: Length of repeat (in characters)") print("# Column 8: Repeat type (dinucleotide, trinucleotide, etc.)") types = dict() types[1] = "uninucleotide" types[2] = "dinucleotide" types[3] = "trinucleotide" types[4] = "tetranucleotide" types[5] = "pentanucleotide" types[6] = "hexanucleotide" types[7] = "heptanucleotide" types[8] = "octanucleotide" types[9] = "enneanucleotide" types[10] = "decanucleotide" for id in seqs_dict.keys(): sys.stderr.write("Processing sequence ID " + id + "\n") seq = seqs_dict[id] sys.stdout.write(id + "\t") sys.stdout.write(str(gc_content(seq)) + "\t") sys.stdout.write(str(len(seq)) + "\t") maxmer = most_common_fivemer(seq) sys.stdout.write(maxmer[0] + "\t") sys.stdout.write(str(maxmer[1]) + "\t") longest_rep_list = longest_perfect_repeat(seq) sys.stdout.write("unit:" + longest_rep_list[2] + "\t") sys.stdout.write(str(longest_rep_list[0]) + "\t") sys.stdout.write(types[len(longest_rep_list[2])]) sys.stdout.write("\n")