# Copyright (C) 2015 Khairul Azhar Kasmiran. All rights reserved. # # Standard disclaimer applies: # # THIS CODE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS CODE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # This code was produced using the facilities of Universiti Putra Malaysia, # Malaysia. # ------------------------------------------------------------------------------ # This script assumes that the Samsung data zipfile is in the working directory. # I have tried to make the script as simple as possible. # Load required library. library(dplyr) message("Unzipping HAR dataset...") # Unzip the Samsung data zipfile and change to the data directory. data.zipfilename <- "getdata-projectfiles-UCI HAR Dataset.zip" if (!file.exists(data.zipfilename)) { stop(paste("HAR dataset zipfile (", data.zipfilename, ") not found in", " working directory (", getwd(), "). Please download or copy", " the file into the working directory and try again.", sep="")) } unzip(data.zipfilename) setwd("UCI HAR Dataset") # ------------------------------------------------------------------------------ ### Step 1: Merges the training and the test sets to create one data set. message("Starting Step 1...") # Read the training and test data sets. train.data <- read.table("train/X_train.txt") test.data <- read.table("test/X_test.txt") # Merge the training and test data sets. Activity and subject lists are merged # in Step 4 to simplify Step 2 below. all.data <- rbind(train.data, test.data) # ------------------------------------------------------------------------------ ### Step 2: Extracts only the measurements on the mean and standard deviation ### for each measurement. message("Starting Step 2...") # Read the feature list. feat <- read.table("features.txt", col.names=c("col", "name")) # Get only mean and standard deviation measurements for each measurement. I'm # including only features with "mean()" or "std()" because "measurement" here # appears to refer only to the smartphone sensor signals listed in Table 2 of # the "A Public Domain Dataset for Human Activity Recognition Using Smartphones" # paper. wanted.feat <- feat[grepl("mean()", feat$name, fixed=TRUE) | grepl("std()", feat$name, fixed=TRUE),] # Extract only mean and standard deviation data. all.wanted.data <- all.data[, wanted.feat$col] # ------------------------------------------------------------------------------ ### Step 3: Uses descriptive activity names to name the activities in the data ### set. message("Starting Step 3...") # Read the activity label list. act <- read.table("activity_labels.txt", col.names=c("id", "name")) # Read the training and test activity label ids. train.act <- read.table("train/y_train.txt", col.names="id") test.act <- read.table("test/y_test.txt", col.names="id") # Merge the labels as for the data before (train then test). all.act <- rbind(train.act, test.act) # Convert activity label ids into names. Assumes that ids start from 1 with step # 1. all.act$name <- act[all.act$id,]$name # Just keep the names. all.act <- all.act[,"name", drop=FALSE] # ------------------------------------------------------------------------------ ### Step 4: Appropriately labels the data set with descriptive variable names. message("Starting Step 4...") # Read the subject lists. train.subj <- read.table("train/subject_train.txt", col.names="id") test.subj <- read.table("test/subject_test.txt", col.names="id") # Merge the subjects as for the data before (train then test). all.subj <- rbind(train.subj, test.subj) # Convert subjects from integer to factor. all.subj$id <- as.factor(all.subj$id) # Combine subjects, activities, and mean and std data into Tidy Data Set 1. tidy.data.1 <- cbind(all.subj, all.act, all.wanted.data) # Set variable names of Tidy Data Set 1. names(tidy.data.1) <- c("subject", "activity", as.character(wanted.feat$name)) # ------------------------------------------------------------------------------ ### Step 5: From the data set in step 4, creates a second, independent tidy data ### set with the average of each variable for each activity and each ### subject. message("Starting Step 5...") # Derive average for each variable by subject and activity and put it into Tidy # Data Set 2. tidy.data.2 <- tidy.data.1 %>% group_by(subject, activity) %>% summarise_each(funs(mean)) # Set variable names of Tidy Data Set 2. names(tidy.data.2) <- c("subject", "activity", sapply(wanted.feat$name, function(name) paste("avg", name, sep="-"))) # ------------------------------------------------------------------------------ message("Writing Tidy Data Set 2 to file...") # Write Tidy Data Set 2 to file. write.table(tidy.data.2, file="tidy_data_2.txt", row.names=FALSE) # Change back to the original working directory. setwd("..") message("Done!")