{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "## Load readr, which is used to parse the MD5 file supplied by the sequencing facility.\n",
    "## require() already attaches the package when it is installed, so library() is only\n",
    "## needed after a fresh install.\n",
    "if (!require(readr)) {\n",
    "  install.packages(\"readr\")\n",
    "  library(readr)\n",
    "}\n",
    "\n",
    "## The three lines below are user supplied. dataDir is where the data currently is located (either mounted FTP directory or\n",
    "## some temporary directory where you've copied the files to locally), facilityMD5FileName is the MD5 file supplied\n",
    "## by the sequencing facility, considered \"correct\" for the purposes of this script, and owlDir is the directory where files\n",
    "## will be copied to in Owl\n",
    "dataDir <- \"~/Documents/OwlUploader/\"\n",
    "facilityMD5FileName <- \"md5sums.txt\"\n",
    "owlDir <- \"~/Documents/OwlUploader/testDir\"\n",
    "\n",
    "## Choose the command-line MD5 utility for this platform. On Apple builds of R the `md5` tool is\n",
    "## used with -r so that it prints the hash first, like md5sum does; the checking code in the next\n",
    "## cell reads the hash from the first 32 characters of the command output.\n",
    "if (grepl(\"apple\", R.Version()$platform, fixed = TRUE)) {\n",
    "  md5.command <- \"md5 -r\"\n",
    "} else {\n",
    "  md5.command <- \"md5sum\"\n",
    "}\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[1] \"ed902ade9655affa9b9e92c08be9f761 EPI-103_S27_L005_R1_001.fastq.gz initial\"\n",
      "[1] \"4ec237a381460fd7635b77b84bc530f1 EPI-103_S27_L005_R2_001.fastq.gz initial\"\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Parsed with column specification:\n",
      "cols(\n",
      " X1 = col_character(),\n",
      " X2 = col_character(),\n",
      " X3 = col_character()\n",
      ")\n",
      "Parsed with column specification:\n",
      "cols(\n",
      " X1 = col_character(),\n",
      " X2 = col_character()\n",
      ")\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[1] \"EPI-103_S27_L005_R1_001.fastq.gz copied sucessfully, MD5 for copied file is ed902ade9655affa9b9e92c08be9f761\"\n",
      "[1] \"EPI-103_S27_L005_R2_001.fastq.gz copied sucessfully, MD5 for copied file is 4ec237a381460fd7635b77b84bc530f1\"\n"
     ]
    }
   ],
   "source": [
    "## Appends one line of text to a file, creating the file if it does not exist yet.\n",
    "appendLine <- function(line, path) {\n",
    "  cat(line, \"\\n\", sep = \"\", file = path, append = TRUE)\n",
    "}\n",
    "\n",
    "## Runs the MD5 utility chosen in the first cell (md5.command) on a single file.\n",
    "## Returns the 32 character lowercase hex digest, or NA when the command produced no output or\n",
    "## its output does not start with a digest. The path is shell-quoted so that file names\n",
    "## containing spaces or shell metacharacters cannot break the command.\n",
    "fileMD5 <- function(path) {\n",
    "  output <- system(paste(md5.command, shQuote(path)), intern = TRUE)\n",
    "  if (length(output) == 0) {\n",
    "    return(NA_character_)\n",
    "  }\n",
    "  digest <- tolower(substr(output[1], 1, 32))\n",
    "  if (grepl(\"^[0-9a-f]{32}$\", digest)) digest else NA_character_\n",
    "}\n",
    "\n",
    "## Main logic. The first three checks make sure the user has replaced the placeholder values for the\n",
    "## directories and the MD5 file. If a placeholder is still present a reminder is printed and nothing else runs.\n",
    "if (dataDir == \"Input Data Dir Here\") {\n",
    "  print(\"Update dataDir variable with where your data is\")\n",
    "\n",
    "} else if (owlDir == \"Input Owl Directory Here\") {\n",
    "  print(\"Update owlDir with where you want the files copied to Owl\")\n",
    "\n",
    "} else if (facilityMD5FileName == \"File Name Here\") {\n",
    "  print(\"Update Script with MD5 file provided by the facility\")\n",
    "\n",
    "## The else branch proceeds with the checking and copying process\n",
    "} else {\n",
    "  # Fail early, with a clear message, when a directory or the facility MD5 file is missing.\n",
    "  if (!dir.exists(path.expand(dataDir))) {\n",
    "    stop(\"dataDir does not exist: \", dataDir)\n",
    "  }\n",
    "  if (!dir.exists(path.expand(owlDir))) {\n",
    "    stop(\"owlDir does not exist: \", owlDir)\n",
    "  }\n",
    "\n",
    "  # Absolute paths, resolved before the working directory changes. Every file operation below\n",
    "  # uses these, so nothing depends on the current working directory or on dataDir ending in a slash.\n",
    "  dataPath <- normalizePath(dataDir)\n",
    "  owlPath <- normalizePath(owlDir)\n",
    "  facilityPath <- file.path(dataPath, facilityMD5FileName)\n",
    "  if (!file.exists(facilityPath)) {\n",
    "    stop(\"Facility MD5 file not found: \", facilityPath)\n",
    "  }\n",
    "  readmePath <- file.path(owlPath, \"readme.MD\")\n",
    "  checksumPath <- file.path(owlPath, \"checksum.MD5\")\n",
    "  mismatchPath <- file.path(owlPath, \"MD5Mismatch.txt\")\n",
    "\n",
    "  # The working directory is set to dataDir and left there when the cell finishes.\n",
    "  setwd(dataPath)\n",
    "\n",
    "  # Pulls in all of the files whose names end in .gz. list.files() takes a regular expression, not a\n",
    "  # shell glob. May want to add a user supplied option for compression schemes other than gzip.\n",
    "  filenames <- list.files(path = dataPath, pattern = \"[.]gz$\")\n",
    "\n",
    "  # Runs MD5 checks on all of the local files, printing each result as it finishes so progress is visible.\n",
    "  # seq_along() makes the loop a no-op when no files were found.\n",
    "  initialMD5s <- character(length(filenames))\n",
    "  for (i in seq_along(filenames)) {\n",
    "    initialMD5s[i] <- fileMD5(file.path(dataPath, filenames[i]))\n",
    "    print(paste(paste0(initialMD5s[i], \"  \", filenames[i]), \"initial\"))\n",
    "  }\n",
    "  file.MD5s <- data.frame(md5 = initialMD5s, name = filenames, stringsAsFactors = FALSE)\n",
    "\n",
    "  # Reads in and formats the facility MD5 file, removing any all-NA columns that appear because read_delim\n",
    "  # splits on a single space while most MD5 files separate hash and name with two. Then names the columns.\n",
    "  facility.MD5s <- read_delim(facilityPath,\n",
    "                              \" \", escape_double = FALSE, col_names = FALSE,\n",
    "                              trim_ws = TRUE)\n",
    "  facility.MD5s <- facility.MD5s[, !apply(is.na(facility.MD5s), 2, all)]\n",
    "  if (ncol(facility.MD5s) != 2) {\n",
    "    stop(\"Expected two columns (MD5, file name) in \", facilityPath, \" but found \", ncol(facility.MD5s))\n",
    "  }\n",
    "  colnames(facility.MD5s) <- c(\"md5\", \"name\")\n",
    "  # Compare digests case-insensitively and drop the '*' that md5sum puts before names in binary mode.\n",
    "  facility.MD5s$md5 <- tolower(facility.MD5s$md5)\n",
    "  facility.MD5s$name <- sub(\"^[*]\", \"\", facility.MD5s$name)\n",
    "\n",
    "  ## Checks once, before any copying, that the number of local files matches the number of facility entries.\n",
    "  if (nrow(facility.MD5s) != nrow(file.MD5s)) {\n",
    "    print(\"Number of Facility entries does not match number of files, check if all files are present\")\n",
    "  } else {\n",
    "    ## Loop for checking MD5s and initiating copying if MD5s match.\n",
    "    for (i in seq_len(nrow(file.MD5s))) {\n",
    "      fileName <- file.MD5s$name[i]\n",
    "      # match() yields NA when the facility file has no entry for this file name.\n",
    "      facilityMD5 <- facility.MD5s$md5[match(fileName, facility.MD5s$name)]\n",
    "\n",
    "      if (is.na(facilityMD5)) {\n",
    "        # The file cannot be verified, so it is recorded for follow-up and not copied. The loop continues.\n",
    "        print(paste(\"No facility MD5 entry for file \", fileName))\n",
    "        appendLine(fileName, mismatchPath)\n",
    "      } else if (is.na(file.MD5s$md5[i]) || facilityMD5 != file.MD5s$md5[i]) {\n",
    "        # The initial file checksum and the facility checksum do not match: report the file and save its\n",
    "        # name to MD5Mismatch.txt in the Owl directory for further consideration. The loop continues.\n",
    "        print(paste(\"MD5 mismatch between facility and copied file for file \", fileName))\n",
    "        appendLine(fileName, mismatchPath)\n",
    "      } else {\n",
    "        # MD5s match: copy the file to the Owl directory, then re-run the MD5 on the copy and compare it to\n",
    "        # the facility value again. On success the checksum is appended to checksum.MD5 and the file name\n",
    "        # to readme.MD in the Owl directory.\n",
    "        destination <- file.path(owlPath, fileName)\n",
    "        copied <- file.copy(file.path(dataPath, fileName), destination, overwrite = TRUE)\n",
    "        tempMD5 <- if (copied) fileMD5(destination) else NA_character_\n",
    "        if (!is.na(tempMD5) && tempMD5 == facilityMD5) {\n",
    "          appendLine(fileName, readmePath)\n",
    "          # md5sum line format: hash, two spaces, file name.\n",
    "          appendLine(paste0(tempMD5, \"  \", fileName), checksumPath)\n",
    "          print(paste(fileName, \"copied sucessfully, MD5 for copied file is \", file.MD5s$md5[i]))\n",
    "        } else {\n",
    "          # The copy failed or produced the wrong MD5: remove the bad copy from Owl and stop processing.\n",
    "          print(\"Copy Failure. Produced incorrect MD5\")\n",
    "          if (file.exists(destination)) {\n",
    "            file.remove(destination)\n",
    "          }\n",
    "          break\n",
    "        }\n",
    "      }\n",
    "    }\n",
    "  }\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "R",
   "language": "R",
   "name": "ir"
  },
  "language_info": {
   "codemirror_mode": "r",
   "file_extension": ".r",
   "mimetype": "text/x-r-source",
   "name": "R",
   "pygments_lexer": "r",
   "version": "3.3.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}