barplot.bam <- function(vdf=NULL, minlen=1, maxlen=37, poscol="red", negcol="green", main="Sequence length distribution", xlab="Map length", ylab="Count", legend=c("+ve strand","-ve strand"), legendx=NULL, legendy=NULL, plot=TRUE, down=FALSE, sym.axes=TRUE, ...) { # count frequencies in the negative strand neg <- .sum.bam.length(vdf, "-", minlen, maxlen) # count frequencies on the positive strand pos <- .sum.bam.length(vdf, "+", minlen, maxlen) # join the two on column "length" and give the result # sensible column names bavdf <- merge(pos, neg, by="length", sort=FALSE) colnames(bavdf) <- c("lgth","pos","neg") # if the user wants a plot if (plot==TRUE) { # draw the barplot of pos and neg frequencies # and with the user specified options if (down==TRUE) { # if the user wants symmetrical y-axes if (sym.axes==TRUE) { miny <- -1 * max(bavdf[,2:3]) maxy <- max(bavdf[,2:3]) } else { # else let the data decide miny <- -1 * max(bavdf[,3]) maxy <- max(bavdf[,2]) } # plot the poistive counts barplot(as.matrix(t(bavdf[,2])), beside=TRUE, names=bavdf[,1], col=c(poscol), main=main, xlab=xlab, ylab=ylab, ylim=c(miny,maxy), ...) # add zero minus the negative counts barplot(as.matrix(0-t(bavdf[,3])), beside=TRUE, col=c(negcol), add=TRUE, ...) } else { barplot(as.matrix(t(bavdf[,2:3])), beside=TRUE, names=bavdf[,1], col=c(poscol,negcol), main=main, xlab=xlab, ylab=ylab, ...) } # set defaults for legendx and legendy # if no values are provided if (is.null(legendx)) { legendx <- 5 } if (is.null(legendy)) { legendy <- max(bavdf[,2:3])/2 } # draw the legend legend(legendx,legendy,legend=legend,fill=c(poscol,negcol)) } # return the frequencies return(bavdf) } .sum.bam.length <- function(vdf=NULL, str=NULL, minlen=NULL, maxlen=NULL) { # first limit the input to the strand we are interested in vdf <- vdf[vdf$strand==str,] # count the occurrence of each length of read # and convert to a data.frame tbl <- table(vdf$cliplen, useNA="always") tvdf <- as.data.frame(tbl) # The above may not contain every number between # minlen and maxlen, so we create a dummy # data.frame that does and "outer join" it # to the above result, replacing the resulting # NA with zero counts yvdf <- data.frame(Counts=minlen:maxlen) avdf <- merge(yvdf, tvdf, by.x="Counts", by.y="Var1", all.x=TRUE, sort=FALSE) avdf[is.na(avdf)] <- 0 # give it sensible column and row names colnames(avdf) <- c("length","freq") rownames(avdf) <- avdf$length # return return(avdf[order(avdf$length),]) } clip.bam <- function(vdf = NULL) { # set up extra columns with null values vdf$softclip <- rep(0,nrow(vdf)) vdf$leftclip <- rep(0,nrow(vdf)) vdf$rightclip <- rep(0,nrow(vdf)) vdf$clipseq <- as.vector(vdf$seq) # we take a copy of the cigar string # and remove hard clipping information # for convenience vdf$cigar2 <- gsub("\\d+H","",vdf$cigar) # Record the column indices for cigar columns cidx <- grep("cigar", colnames(vdf))[1] cidx2 <- grep("cigar", colnames(vdf))[2] # get the subset of rows with soft-clipping clipr <- grep("S", vdf$cigar) svdf <- vdf[clipr,] # calculate the total number of soft clipped bases # by summing up numbers that are adjacent to an S # in the cigar string svdf$softclip <- apply(svdf,1,function(x){sum(as.numeric(strapply(as.character(x[cidx]),"(\\d+)S", simplify=c)))}) # calculate the number of bases soft clipped from # the left of the read by extracting the number # adjacent to any S at the beginning of the string svdf$leftclip <- apply(svdf,1,function(x){sum(as.numeric(strapply(as.character(x[cidx2]),"^(\\d+)S",simplify=c)))}) # calculate the number of bases soft clipped from # the right of the read by extracting the number # adjacent to any S at the end of the string svdf$rightclip <- apply(svdf,1,function(x){sum(as.numeric(strapply(as.character(x[cidx2]),"(\\d+)S$",simplify=c)))}) # extract the sequence in the middle of leftclip and rightclip # this is the sequence that has actually aligned svdf$clipseq <- substr(svdf$seq, as.numeric(svdf$leftclip)+1, nchar(as.vector(svdf$seq))-as.numeric(svdf$rightclip)) # substitute in the values vdf$softclip[clipr] <- svdf$softclip vdf$leftclip[clipr] <- svdf$leftclip vdf$rightclip[clipr] <- svdf$rightclip vdf$clipseq[clipr] <- svdf$clipseq # calculate the length of the clipped sequence vdf$cliplen <- nchar(vdf$clipseq) # remove the second cigar string # we calculated above vdf <- vdf[,-cidx2] # return the data return(vdf) } dna.complement <- function(x=NULL, reverse=TRUE) { # convert the sequence to upper case x <- toupper(x) # split sequence into a vector vec <- strsplit(x,split="")[[1]] # prepare a vector to hold the # revcom data cvec <- vector(mode="character",length(vec)) # for each base.... for (i in 1:length(vec)) { # complement the relevant bases if (vec[i] == "A") { cvec[i] <- "T" } if (vec[i] == "G") { cvec[i] <- "C" } if (vec[i] == "C") { cvec[i] <- "G" } if (vec[i] == "T") { cvec[i] <- "A" } if (vec[i] == "U") { cvec[i] <- "A" } if (vec[i] == "N") { cvec[i] <- "N" } } # reverse if requested (default) if (reverse == TRUE) { cvec <- rev(cvec) } # return the sequence as a string return(paste(cvec, sep="", collapse="")) } make.pwm <- function(vdf=NULL, minlen=1, maxlen=37, scaled=TRUE, strand="pos", revcom=FALSE, ttou=FALSE) { # set the defaults strand and change it # if we want to look at the negative strand strnum <- "+" if (strand == "neg") { strnum <- "-" # negative strand reads need to be reverse # complemented as BAM always reports alignments # to the positive strand revcom <- TRUE } # filter the data.frame according to the input myvdf <- vdf[vdf$cliplen>=minlen&vdf$cliplen<=maxlen&vdf$strand==strnum,] # create a list to hold the output lst <- list() lst[["A"]] <- vector(mode="numeric", length=maxlen) lst[["C"]] <- vector(mode="numeric", length=maxlen) lst[["G"]] <- vector(mode="numeric", length=maxlen) lst[["T"]] <- vector(mode="numeric", length=maxlen) # iterate through the aligned sequences # in the filtered input for (s in as.vector(myvdf$clipseq)) { # reverse complement the read if needed if (revcom == TRUE) { s <- dna.complement(s) } # convert T to U if the user requests it if (ttou == TRUE) { s <- gsub("T","U", toupper(s)) } # split the sequence into a vector # iterate over each base and record counts # in the appropriate position using the idx # counter vec <- strsplit(s,split="")[[1]] idx <- 1 for (v in vec) { lst[[v]][idx]=lst[[v]][idx]+1 idx <- idx+1 } } # create a matrix and convert # the counts from the list out <- matrix(nrow=4, ncol=maxlen) out[1,] <- lst[["A"]] out[2,] <- lst[["C"]] out[3,] <- lst[["G"]] out[4,] <- lst[["T"]] # provide appropriate row and column names rownames(out) <- c("A","C","G","T") colnames(out) <- 1:maxlen # vector to hold indices of columns with # zero counts skipcol <- vector(mode="numeric") # if we want to scale the data # into proportions (we generally do) if (scaled == TRUE) { # iterate over the columns for(j in 1:ncol(out)) { # if the col sums > 0 divide by the total # otherwise add it to the columns to be skipped if (sum( out[,j] ) > 0) { out[,j] <- out[,j] / sum( out[,j] ) } else { skipcol <- append(skipcol, j) } } } # remove zero sum columns if (length(skipcol) > 0) { out <- out[,-skipcol] } # return the data return(out) } make.simple.consensus <- function(vdf=NULL, reflen=12000) { # create a vector for the consensus cons <- vector(mode="character", length=reflen) # create a list to count bases # at each position cl <- vector("list", reflen) # initialise the list with zeros for (i in 1:reflen) { cl[[i]]["A"]<-0 cl[[i]]["G"]<-0 cl[[i]]["C"]<-0 cl[[i]]["T"]<-0 cl[[i]]["N"]<-0 } # iterate through the input data.frame for (i in 1:nrow(vdf)) { # store the row in r for # convenience r <- vdf[i,] # record the 5p position for convenience cpos <- r$pos # iterate over each base in the clipped sequence for (base in strsplit(toupper(r$clipseq), split="")[[1]]) { # record the base at position cpos # and move to the next position cl[[cpos]][base] <- cl[[cpos]][base] + 1 cpos <- cpos + 1 } } # iterate over the psoitions in the reference for (i in 1:reflen) { # if there are no bases, record an N if (sum(cl[[i]]) == 0) { cons[i] <- "N" } else { # otherwise sort and record the base that occurs # the most # NB does not deal with ties cons[i] <- names(cl[[i]])[order(cl[[i]])[5]] } } # return the consensus as a string fas <- paste(cons, sep="", collapse="") return(fas) } position.barplot <- function(vdf=NULL, minlen=1, maxlen=37, reflen=10000, samp="", plot=TRUE, poscol="red", negcol="green") { # limit the input to only contain reads # of size between minlen and maxlen vdf <- vdf[vdf$cliplen>=minlen&vdf$cliplen<=maxlen,] # get the reference name and size range as strings refname = vdf$rname[1] maps <- paste(minlen, "-", maxlen, sep="") # get counts of reads at each position for # negative strand npos <- .count.reads.by.position(vdf, "-") # get counts of reads at each position for # positive strand ppos <- .count.reads.by.position(vdf, "+") # both of the above may "miss" positions out # due to zero counts so we create a dummy # data.frame with all positions in it # and outer-join each of npos and ppos to it # filling in resulting NS's with 0 counts lgths <- data.frame(position=1:reflen) lpos <- merge(lgths, ppos, all.x=TRUE, sort=TRUE) lneg <- merge(lgths, npos, all.x=TRUE, sort=TRUE) lpos[is.na(lpos)] <- 0 lneg[is.na(lneg)] <- 0 # sort each of the new merged data.frames # by position lpos <- lpos[order(lpos$position),] lneg <- lneg[order(lneg$position),] # if the users has decided to plot if (plot == TRUE) { # calculate the range to be plotted mx <- max(c(lpos[,2],lneg[,2])) rng <- c(-1*mx,mx) # create a title for the plot tit <- paste(samp,"Distribution of",maps,"bp maps along",refname, sep=" ") # create the barplot as counts on the positive strand and 0-counts on the negative strand barplot(as.matrix(lpos[,2]), beside=TRUE, names=lpos[,1], col=c(poscol), xlab="Nucleotide position", ylab="Count", border=poscol, ylim=rng, xaxt="n", main=tit) barplot(as.matrix(0-lneg[,2]), beside=TRUE, add=TRUE, col=c(negcol), border=negcol) # add an axis axis(side=1) } # merge the positive and negative count data.frames # and give them useful column names and return to the user ret <- merge(lpos, lneg, by="position", sort=FALSE) colnames(ret) <- c("position","poscount","negcount") return(ret) } .count.reads.by.position <- function(vdf=NULL, str=NULL) { # subset by strand svdf <- vdf[vdf$strand==str,] if (nrow(svdf) > 0) { # if there are any mappings on this strand # count the number of reads by position bn <- by(svdf$qname, svdf$pos, length) # create data.frame and return it ret <- data.frame(position=as.integer(names(bn)), count=as.vector(bn)) return(ret) } else { # otherwise just return a dummy data.frame return(data.frame(position=1,poscount=0)) } } pwm.heatmap <- function(pwm=NULL, col.fun=colorRampPalette(c("black","red"), space="rgb"), mar=c(3,2,1,1), cex.axis=0.8) { # set the margins par(mar=mar) # draw the image image(z=t(pwm), y=1:nrow(pwm), x=1:ncol(pwm), xaxt="n", yaxt="n", col=col.fun(10), xlab="",ylab="") # add axes axis(side=2, at=1:nrow(pwm), labels=rownames(pwm), cex.axis=cex.axis) # add axes axis(side=1, at=1:ncol(pwm), labels=colnames(pwm), cex.axis=cex.axis) } read.bam <- function (bamfile = NULL, chr = NULL, start = 1, end = 1e+07, what = c("qname", "flag", "rname", "strand", "pos", "qwidth", "mapq", "cigar", "mrnm", "mpos", "isize", "seq"), tag = c("NM"), removeN = TRUE) { # read the data using scanBam from Rsamtools() package if (is.null(chr)) { # if no chromosome if given, read everything param <- ScanBamParam(what = what, tag = tag) bam <- scanBam(bamfile, param = param) } else { # otherwise limit the data by chromosome and the # given start and end which <- IRangesList(chr = IRanges(start, end)) names(which) <- chr param <- ScanBamParam(which = which, what = what, tag = tag) bam <- scanBam(bamfile, param = param) } # the result is in a rather difficult # to use S4 class/list and so we go through # the following steps to coerce the object # to a native data.frame # get the names of the object (the columns/"what") # and apply lapply elts <- names(bam[[1]]) lst <- lapply(elts, function(elt) .unlist(lapply(bam, "[[", elt))) names(lst) <- elts # coerce to DataFrame using the in-built Rsamtools function vdf <- do.call("DataFrame", lst) colnames(vdf) <- names(lst) # remove blanks vdf <- vdf[!is.na(vdf$qwidth), ] # convert important columns to character vdf$seq <- as.character(vdf$seq) vdf$cigar <- as.character(vdf$cigar) # if the user wants to remove sequences # that contain N's, remove them if (removeN == TRUE) { ns <- grep("N", vdf$seq) if (length(ns) > 0) { vdf <- vdf[-ns, ] } } # call the native as.data,frame function # to create a native data.frame and return it vdf <- as.data.frame(vdf, stringsAsFactors = FALSE) return(vdf) } .unlist <- function (x) { # code provided by Martin Morgan (author of Rsamtools) # to assist in coercion of Rsamtools classes to data.frames ## do.call(c, ...) coerces factor to integer, which is undesired x1 <- x[[1L]] if (is.factor(x1)) { structure(unlist(x), class = "factor", levels = levels(x1)) } else { do.call(c, x) } } read.dist.plot <- function (sr = NULL, minlen = 1, maxlen = 37, method = "add", pad = 30, primary = "pos", plot=TRUE, title="5' read distance plot", xlab="Distance", ylab="Count") { # check the input paramters # check method argument is valid if (! method %in% c("pos", "neg", "mean", "multiply", "add", "min", "max")) { print("Method should be one of:") print(c("pos", "neg", "mean", "multiply", "add", "min", "max")) print(paste("You gave:", method)) print("Setting method to mean") method = "mean" } # check primary is one of "pos" or "neg" if (! primary %in% c("pos", "neg")) { print("Primary should be one of:") print(c("pos", "neg")) print("Setting Primary to pos") pri = "pos" } # subset the input according to the # given size range psr <- sr[sr$len >= minlen & sr$len <= maxlen, ] # calculate the end position of each read psr$posend <- psr$pos + (nchar(psr$seq) - 1) # calculate the 5 prime and 3 prime ends # of each read psr$p5 <- psr$pos psr$p3 <- psr$pos psr$p5[psr$strand == "+"] <- psr$pos[psr$strand == "+"] psr$p3[psr$strand == "+"] <- psr$posend[psr$strand == "+"] psr$p3[psr$strand == "-"] <- psr$pos[psr$strand == "-"] psr$p5[psr$strand == "-"] <- psr$posend[psr$strand == "-"] # set the "primary" strand # it is this strand that we will iterate over # and then we will count overlaps on the opposite strand if (primary == "pos") { pri <- psr[psr$strand == "+", ] alt <- psr[psr$strand == "-", ] } else { pri <- psr[psr$strand == "+", ] alt <- psr[psr$strand == "-", ] } # create an empty list for the output lst <- list() # for each entry in the sequence report # for the primary strand for (i in 1:nrow(pri)) { # r contains the row as a vector r <- pri[i, ] # set the beginning of the range in which we will # count reads on the opposite strand. Set to 1 # if it is <= 0 begin <- r$p5 - pad if (begin <= 0) begin <- 1 # set the ending of the range ending <- r$p5 + pad # subset the alternative strand # according to the defined range sn <- alt[alt$p5 >= begin & alt$p5 <= ending, ] # proceed only if there are actually reads.... if (nrow(sn) > 0) { # cum up the counts for reads, by position, within the range ssn <- aggregate(sn$count, by = list(pos = sn$p5), sum) # give the result some useful column names colnames(ssn) <- c("p5", "count") # for every rown in the result for (j in 1:nrow(ssn)) { # e contains the 5p end of the data e <- ssn[j, ]$p5 # cnt is the summed counts cnt <- ssn[j, ]$count # idx is the distance between the query read # and the read on the opposite strand idx <- as.character(e - r$p5) # initialise the output list # if no value has been set yet if (is.null(lst[[idx]])) lst[[idx]] <- 0 # depending on which method has been chosen # record the result appropriately switch(method, mean = { # add the average of counts on the primary and alternative strand lst[[idx]] <- lst[[idx]] + (mean(c(cnt, r$count))) }, pos = { # add just the count of reads on the primary strand lst[[idx]] <- lst[[idx]] + r$count }, neg = { # add just the count of reads on the alternate strand lst[[idx]] <- lst[[idx]] + cnt }, multiply = { # add the product of counts on the primary and alternate strands lst[[idx]] <- lst[[idx]] + (cnt * r$count) }, add = { # add the sum of counts on the primary and alternate strands lst[[idx]] <- lst[[idx]] + sum(c(cnt,r$count)) }, min = { # add the min of counts on the primary and alternate strands lst[[idx]] <- lst[[idx]] + min(c(cnt,r$count)) }, max = { # add the max of counts on the primary and alternate strands lst[[idx]] <- lst[[idx]] + max(c(cnt,r$count)) }) } } } # unlist the result and create a data.frame output ul <- unlist(lst) du <- data.frame(loc = as.numeric(names(ul)), count = as.numeric(ul)) # order by location du <- du[order(du$loc), ] # plot if the user wants it if (plot == TRUE) { plot(du$loc, du$count, type="l", main=title, xlab=xlab, ylab=ylab) } # return the result return(du) } sequence.report <- function(df=NULL, minlen=1, maxlen=37) { # filter the input according to the provided # size range mydf <- df[df$cliplen>=minlen&df$cliplen<=maxlen,] # use ddply to count the occurence of reads # grouped by reference (rname), position (pos), # the actual sequence (clipseq), the sequencelength (cliplen) # and strand (strand) cts <- ddply(mydf, c("rname","pos","clipseq","cliplen","strand"), nrow) # order the result by position, strand, length and count cts <- cts[order(cts$pos, cts$strand, cts$cliplen, cts$V1),] # reorder the columns and rename them with sensible names cts <- cts[,c("rname","pos","strand","clipseq","cliplen","V1")] colnames(cts) <- c("ref","pos","strand","seq","len","count") # return return(cts) } size.position.heatmap <- function(dm=NULL, minlen=1, maxlen=37, start=1, end=1e+07, scale=TRUE, col.fun=colorRampPalette(c("black","red"), space="rgb"), log=FALSE, mar=c(5,4,4,2), main=NULL) { # filter input columns mr <- as.integer(rownames(dm)) mc <- as.integer(colnames(dm)) dm <- dm[,mc>=minlen&mc<=maxlen] dm <- dm[mr>=start&mr<=end,] # reset minlen and maxlen in case # they were inside the range anyway minlen <- min(colnames(dm)) maxlen <- min(colnames(dm)) if (log==TRUE) { dm <- log(dm+1) } # scale it if the user wants if (scale==TRUE) { dm <- t(scale(t(dm))) dm[is.na(dm)] <- min(dm[!is.na(dm)]) } # get rownames as vector of integers r <- as.integer(rownames(dm)) # heatmap of read lengths par(mar=mar) image(z=as.matrix(dm), col=col.fun(32), xaxt="n", yaxt="n", y=1:ncol(dm), x=min(r):max(r), xlab="", ylab="") axis(side=2, at=1:ncol(dm), labels=colnames(dm)) axis(side=1) if (! is.null(main)) { title(main) } } size.strand.bias.plot <- function(bp=NULL, minlen=1, maxlen=37, line.col="red", sym.axes=TRUE, title="Strand bias plot", xlab="+ strand counts", ylab="- strand counts", lty=1, lwd=2, cextxt=1, mar=c(5,4,4,1), tpos=1, ...) { # limit the data according to user input bp <- bp[bp$lgth>=minlen&bp$lgth<=maxlen,] # figure out the axes we will need # depending on whether we want # symmetrical X and Y axes if (sym.axes==TRUE) { miny <- min(bp[,c("pos","neg")]) maxy <- max(bp[,c("pos","neg")]) minx <- miny maxx <- maxy } else { miny <- min(bp[,c("neg")]) maxy <- max(bp[,c("neg")]) minx <- min(bp[,c("pos")]) maxx <- max(bp[,c("pos")]) } # basic scatter plot par(mar=mar) plot(bp$pos, bp$neg, main=title, xlab=xlab, ylab=ylab, xlim=c(minx,maxx), ylim=c(miny, maxy), ...) # add the line of y=x abline(0,1,col=line.col, lwd=lwd, lty=lty) # label the points text(bp$pos, bp$neg, bp$lgth, cex=cextxt, pos=tpos) } stacked.barplot <- function(dm=NULL, minlen=1, maxlen=37, start=1, end=1e+07, internal.margins=c(0,0,0,1), skip.x=2, bicol=NULL, col.fun=rainbow, axis.col="black", main.col="black", main.adj=1, samey=FALSE, ...) { # filter input columns mr <- as.integer(rownames(dm)) mc <- as.integer(colnames(dm)) dm <- dm[,mc>=minlen&mc<=maxlen] dm <- dm[mr>=start&mr<=end,] # reset minlen and maxlen in case # they were inside the range anyway minlen <- min(colnames(dm)) maxlen <- min(colnames(dm)) # figure out how many graphs we have ngraph <- ncol(dm) # warn if it is lots! if (ngraph>7) { print("WARNING: you have chosen to plot a lot of graphs to the current device") print("WARNING: this may be too many for the device to cope with") print("WARNING: if you get an error message, then consider using a bigger device") print("WARNING: with bigger dimensions: see ?png ?jpeg ?pdf") } # create ngraph rows in the device split.screen(c(ngraph,1)) # set the colours if (is.null(bicol)) { cols <- col.fun(ncol(dm)) } else { cols <- rep(bicol, ncol(dm)) } # iterate over the number of graphs for (i in 1:ngraph) { maxy <- max(dm[,i]) miny <- min(dm[,i]) if (samey==TRUE) { maxy <- max(dm) miny <- min(dm) } # select the relevant screen screen(i) # set the margins inside each graph par(mar=internal.margins) # plot the graph plot(rownames(dm), dm[,i], col=cols[i], type="l", cex.axis=0.7, bty="n", xaxt="n", ylim=c(miny, maxy), ...) # only plot x-axis every skip.x graph if (i%%skip.x==0) { axis(side=1, cex.axis=0.7, col=axis.col, col.axis=axis.col) } # plot a title for each graph title(colnames(dm)[i], line=-1, cex.main=0.7, col.main=main.col, adj=main.adj) } # close the screens close.screen(all=TRUE) } summarise.by.length <- function(vdf=NULL, minlen=1, maxlen=37, start=1, end=1e+07, strand=NULL) { # sort out end which has been given # a silly high default number so as not to miss # anything if (end > max(vdf$pos)) { end <- max(vdf$pos) } # filter according to input vdf <- vdf[vdf$cliplen>=minlen&vdf$cliplen<=maxlen&vdf$pos>=start&vdf$pos<=end,] # summarise data for each length of read # count the occurrence at each position if (is.null(strand)) { bypos <- acast(vdf, pos~cliplen, length) } else if (strand=="pos") { bypos <- acast(vdf[vdf$strand=="+",], pos~cliplen, length) } else if (strand=="neg") { bypos <- acast(vdf[vdf$strand=="-",], pos~cliplen, length) } else { print("WARNING: don't understand your strand argument") print("WARNING: sending back data for both strands") bypos <- acast(vdf, pos~cliplen, length) } # in case of blanks, create dummy data.frame dummy <- data.frame(pos=start:end) # and merge it with the results from cast(...) dm <- merge(dummy, bypos, by.x="pos", by.y="row.names", all.x=TRUE, sort=FALSE) # deal with NAs which are zero counts dm[is.na(dm)] <- 0 # order by position dm <- dm[order(dm$pos),] # give the result sensible rownames rownames(dm) <- dm$pos # remove superfluous column dm <- dm[,-1] # return data return(dm) }