#include #include #include #include #include #include #include #include #include #include #include #include "htslib/sam.h" #include "htslib/bgzf.h" #define MIN(X, Y) (((X) < (Y)) ? (X) : (Y)) #define MAX(X, Y) (((X) > (Y)) ? (X) : (Y)) #define MIN_NON_OVERLAP 20 #define MAX_UNMAPPED_BASES 50 #define MIN_INDEL_SIZE 50 struct line { uint32_t pos, raLen, rapos, qaLen, sclip, eclip, SQO, EQO; char strand; char *chrm; }; // from Dave Larson's extract_sv_reads static inline size_t _pre_seq_bytes(bam1_t const* b) { return (b->core.n_cigar<<2) + b->core.l_qname; } static inline size_t _seq_qual_bytes(bam1_t const* b) { // sequence bytes + quality bytes return (((b->core.l_qseq + 1)>>1) + b->core.l_qseq); } static inline size_t _post_qual_bytes(bam1_t const* b) { // Total - pre-sequence bytes - sequence bytes - quality bytes return (b->l_data - _pre_seq_bytes(b) - _seq_qual_bytes(b)); } bam1_t *drop_seq_qual(bam1_t *_aln) { bam1_t *b = bam_dup1(_aln); uint8_t* data = _aln->data; int m_data = _aln->m_data; // NOTE Copying everything BUT sequence and quality memcpy(data, b->data, _pre_seq_bytes(b)); memcpy(data + _pre_seq_bytes(b), bam_get_aux(b), _post_qual_bytes(b)); *_aln = *b; _aln->l_data = b->l_data - _seq_qual_bytes(b); _aln->core.l_qseq = 0; _aln->m_data = m_data; _aln->data = data; bam_destroy1(b); return _aln; } // This will parse a base 10 int, and change ptr to one char beyond the end of the number. int parseNextInt(char **ptr) { int num = 0; char curChar; for (curChar = (*ptr)[0]; curChar != 0; curChar = (++(*ptr))[0]) { int digit = curChar - '0'; if (digit >= 0 && digit <= 9) num = num*10 + digit; else break; } return num; } // This will the current char, and move the ptr ahead by one. char parseNextOpCode(char **ptr) { return ((*ptr)++)[0]; } // This just test for end of string. bool moreCigarOps(char *ptr) { return (ptr[0] != 0); } void calcAlnOffsets(uint32_t *cigar, uint32_t n_cigar, uint32_t sa_pos, char sa_strand, struct line *l) { l->raLen = 0; l->qaLen = 0; l->sclip = 0; l->eclip = 0; l->SQO = 0; l->EQO = 0; bool first = true; uint32_t k; for (k = 0; k < n_cigar; ++k) { uint32_t opLen = bam_cigar_oplen(cigar[k]); char opCode = bam_cigar_opchr(cigar[k]); if (opCode == 'M' || opCode == '=' || opCode == 'X') { l->raLen += opLen; l->qaLen += opLen; first = false; } else if (opCode == 'S' || opCode == 'H') { if (first) l->sclip += opLen; else l->eclip += opLen; } else if (opCode == 'D' || opCode == 'N') { l->raLen += opLen; } else if (opCode == 'I') { l->qaLen += opLen; } } //*rapos = str2pos(line->fields[POS]); l->rapos = sa_pos; if (sa_strand == '+') { l->pos = l->rapos - l->sclip; l->SQO = l->sclip; l->EQO = l->sclip + l->qaLen - 1; } else { l->pos = l->rapos + l->raLen + l->eclip - 1; l->SQO = l->eclip; l->EQO = l->eclip + l->qaLen - 1; } } void calcOffsets(char *cigar, uint32_t sa_pos, char sa_strand, struct line *l) { l->raLen = 0; l->qaLen = 0; l->sclip = 0; l->eclip = 0; l->SQO = 0; l->EQO = 0; bool first = true; while (moreCigarOps(cigar)) { int opLen = parseNextInt(&cigar); char opCode = parseNextOpCode(&cigar); if (opCode == 'M' || opCode == '=' || opCode == 'X') { l->raLen += opLen; l->qaLen += opLen; first = false; } else if (opCode == 'S' || opCode == 'H') { if (first) l->sclip += opLen; else l->eclip += opLen; } else if (opCode == 'D' || opCode == 'N') { l->raLen += opLen; } else if (opCode == 'I') { l->qaLen += opLen; } } //*rapos = str2pos(line->fields[POS]); l->rapos = sa_pos; if (sa_strand == '+') { l->pos = l->rapos - l->sclip; l->SQO = l->sclip; l->EQO = l->sclip + l->qaLen - 1; } else { l->pos = l->rapos + l->raLen + l->eclip - 1; l->SQO = l->eclip; l->EQO = l->eclip + l->qaLen - 1; } } void split_sa_tag(char *sa_tag, char **chrm, uint32_t *pos, char *strand, char **cigar) { *chrm = strtok(sa_tag, ","); *pos = atoi(strtok(NULL, ",")); *strand = strtok(NULL, ",")[0]; *cigar = strtok(NULL, ","); } int count_tags(char *sa_tag) { uint32_t i, c = 0; for (i = 0; i < strlen(sa_tag); ++i) { if (sa_tag[i] == ';') c += 1; } return c; } int main(int argc, char **argv) { // without these 2 lines, htslib sometimes tries to download a part of the sequence // even though the -f reference was provided. setenv("REF_CACHE", "", 0); setenv("REF_PATH", "fake_value_so_no_download", 0); if (argc < 4) errx(1, "usage\t:%s -f (optional #threads)", argv[0]); int c; int aligned_reads = 0; // track total aligned reads. useful for QC without 2nd pass over full cram. char *fasta = NULL; while (( c = getopt(argc, argv, "f:")) != -1) { fasta = optarg; } char *bam_file_name = argv[optind]; char *split_file_name = argv[1+optind]; char *disc_file_name = argv[2+optind]; int threads = 2; if (argc == 4+optind) { threads = atoi(argv[3+optind]); } samFile *in = sam_open(bam_file_name, "rb"); if(in == NULL) { errx(1, "Unable to open BAM/SAM file."); } // 0x1 | 0x2 | 0x4 | 0x8 | 0x10 | 0x20 | 0x40 | 0x80 | 0x100 | 0x200 | 0x800 // decode everything but base-qual hts_set_opt(in, CRAM_OPT_REQUIRED_FIELDS, 3071); hts_set_opt(in, CRAM_OPT_DECODE_MD, 1); if (fasta != NULL) { hts_set_fai_filename(in, fasta); } if (threads > 1) { hts_set_threads(in, threads); } bam_hdr_t *hdr = sam_hdr_read(in); samFile *disc = sam_open(disc_file_name, "wb1"); samFile *split = sam_open(split_file_name, "wb1"); int r = sam_hdr_write(disc, hdr); r = sam_hdr_write(split, hdr); bam1_t *aln = bam_init1(); int ret; int dropped; while(ret = sam_read1(in, hdr, aln) >= 0) { if (((aln->core.flag) & (BAM_FUNMAP | BAM_FQCFAIL | BAM_FDUP)) != 0) continue; aligned_reads += 1; dropped = 0; if ((((aln->core.flag) & (BAM_FPROPER_PAIR | BAM_FMUNMAP | BAM_FSUPPLEMENTARY)) == 0) && (aln->core.flag & BAM_FPAIRED == 1)) { dropped = 1; aln = drop_seq_qual(aln); r = sam_write1(disc, hdr, aln); } uint8_t *sa = bam_aux_get(aln, "SA"); if (sa == 0) { continue; } char *sa_tag = strdup(bam_aux2Z(sa)); if ( count_tags(sa_tag) == 1) { char *chrm, strand, *cigar; uint32_t pos; split_sa_tag(sa_tag, &chrm, &pos, &strand, &cigar); struct line sa, al; calcOffsets(cigar, pos, strand, &sa); sa.chrm = chrm; sa.strand = strand; calcAlnOffsets(bam_get_cigar(aln), aln->core.n_cigar, aln->core.pos + 1, bam_is_rev(aln) ? '-' : '+', &al); al.chrm = hdr->target_name[aln->core.tid]; al.strand = bam_is_rev(aln) ? '-' : '+'; struct line *left = &al, *right = &sa; if (left->SQO > right->SQO) { left = &sa; right = &al; } int overlap = MAX(1 + MIN(left->EQO, right->EQO) - MAX(left->SQO, right->SQO), 0); int alen1 = 1 + left->EQO - left->SQO; int alen2 = 1 + right->EQO - right->SQO; int mno = MIN(alen1-overlap, alen2-overlap); if (mno < MIN_NON_OVERLAP) continue; if ( (strcmp(left->chrm, right->chrm) == 0) && (left->strand == right->strand) ) { int leftDiag, rightDiag, insSize; if (left->strand == '-') { leftDiag = left->rapos - left->sclip; rightDiag = (right->rapos + right->raLen) - (right->sclip + right->qaLen); insSize = rightDiag - leftDiag; } else { leftDiag = (left->rapos + left->raLen) - (left->sclip + left->qaLen); rightDiag = right->rapos - right->sclip; insSize = leftDiag - rightDiag; } int desert = right->SQO - left->EQO - 1; if ((abs(insSize) < MIN_INDEL_SIZE) || ((desert > 0) && ( (desert - (int)MAX(0, insSize)) > MAX_UNMAPPED_BASES))) continue; } char *qname = bam_get_qname(aln); if ((aln->core.flag & 64) == 64) qname[0] = 'A'; else qname[0] = 'B'; if (dropped == 0) { aln = drop_seq_qual(aln); } r = sam_write1(split, hdr, aln); } free(sa_tag); } bam_destroy1(aln); sam_close(disc); sam_close(split); bam_hdr_destroy(hdr); sam_close(in); fprintf(stderr, "[lumpy_filter] extracted splits and discordants from %d total aligned reads\n", aligned_reads); if(ret < -1) { errx(1, "lumpy_filter: error reading bam: %s\n", bam_file_name); } return 0; }