#!/usr/bin/env bash
#
# check-spider-hits.sh v1.2.0
#
# Copyright (C) 2019-2020 Alan Orth
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see https://www.gnu.org/licenses/.
# Exit on first error
set -o errexit

# Defaults used when the corresponding command line option is not given
readonly DEF_SPIDERS_PATTERN_FILE=/dspace/config/spiders/agents/example
readonly DEF_SOLR_URL=http://localhost:8081/solr
readonly DEF_STATISTICS_SHARD=statistics

######

# Name of this script as shown in the usage text. "$0" is quoted so that a
# script path containing spaces is not split into several basename operands,
# and the assignment is kept separate from "readonly" so that a failing
# basename is not masked by readonly's own (always zero) exit status.
PROGNAME=$(basename -- "$0")
readonly PROGNAME

# All command line arguments joined into a single space separated string
readonly ARGS="$*"
# Print the command line help text on stdout and terminate the script with
# exit status 0. The text interpolates PROGNAME and the DEF_* default values.
usage() {
    cat <<EOF
Usage: $PROGNAME [-d] [-f $DEF_SPIDERS_PATTERN_FILE] [-p] [-s $DEF_STATISTICS_SHARD] [-u $DEF_SOLR_URL]
Optional arguments:
-d: print debug messages
-f: path to file containing spider user agent patterns¹ (default: $DEF_SPIDERS_PATTERN_FILE)
-p: purge statistics that match spider user agents
-s: Solr statistics shard, for example statistics or statistics-2018² (default: $DEF_STATISTICS_SHARD)
-u: URL to Solr (default: $DEF_SOLR_URL)
Written by: Alan Orth
¹ DSpace ships an "example" pattern file that works well. Another option is the patterns file maintained by the COUNTER-Robots project.
² If your statistics core has been split into yearly "shards" by DSpace's stats-util you need to search each shard separately.
EOF

    # usage always ends the script
    exit 0
}
# Parse the command line options given as arguments and record them in the
# global variables read by the rest of the script:
#   -d        sets DEBUG=yes
#   -f FILE   sets SPIDERS_PATTERN_FILE (exits 1 if FILE fails the -r test)
#   -p        sets PURGE_SPIDER_HITS=yes
#   -s SHARD  sets STATISTICS_SHARD
#   -u URL    sets SOLR_URL (calls usage unless URL starts with http:// or https://)
# An unknown option or a missing option argument calls usage.
function parse_options() {
    # Keep getopts' bookkeeping local: the loop variable no longer leaks into
    # the global scope, and resetting OPTIND lets the function be called more
    # than once and still parse from the first argument.
    local opt OPTARG
    local OPTIND=1

    while getopts ":df:ps:u:" opt; do
        case "$opt" in
            d)
                DEBUG=yes
                ;;
            f)
                SPIDERS_PATTERN_FILE=$OPTARG

                if ! [[ -r "$SPIDERS_PATTERN_FILE" ]]; then
                    # diagnostics belong on stderr so they are not mixed with results
                    echo "(ERROR) Spider patterns file \"$SPIDERS_PATTERN_FILE\" doesn't exist." >&2

                    exit 1
                fi
                ;;
            p)
                PURGE_SPIDER_HITS=yes
                ;;
            s)
                STATISTICS_SHARD=$OPTARG
                ;;
            u)
                # make sure -u is passed something like a URL
                if ! [[ "$OPTARG" =~ ^https?://.*$ ]]; then
                    usage
                fi

                SOLR_URL=$OPTARG
                ;;
            \?|:)
                # unknown option, or an option that is missing its argument
                usage
                ;;
        esac
    done
}
# Fill in the default for every setting that is still unset or empty after
# option parsing. Values that already have content are left untouched.
envsetup() {
    # Solr URL
    : "${SOLR_URL:=$DEF_SOLR_URL}"

    # file with the spider user agent patterns
    : "${SPIDERS_PATTERN_FILE:=$DEF_SPIDERS_PATTERN_FILE}"

    # Solr statistics shard
    : "${STATISTICS_SHARD:=$DEF_STATISTICS_SHARD}"
}
# Convert one user agent pattern into the regex that goes between the slashes
# of a Solr /regex/ query and print it on stdout (without a trailing newline).
#
# Solr anchors regular expressions implicitly, so an explicit ^ in the first
# position or $ in the last position is removed, and a ".*" wildcard is added
# on every side that was NOT anchored in the original pattern. A $ anywhere
# else in the pattern is left alone and does not count as an anchor.
#
# Arguments: $1 - user agent pattern
function _solr_regex_from_pattern() {
    local regex=$1
    local prefix='.*'
    local suffix='.*'

    if [[ $regex == '^'* ]]; then
        regex=${regex#'^'}
        prefix=''
    fi

    if [[ $regex == *'$' ]]; then
        regex=${regex%'$'}
        suffix=''
    fi

    printf '%s' "${prefix}${regex}${suffix}"
}

# Print $1 with the XML special characters &, < and > replaced by entities so
# it can be embedded as text inside an XML element.
function _xml_escape() {
    printf '%s' "$1" | sed -e 's/&/\&amp;/g' -e 's/</\&lt;/g' -e 's/>/\&gt;/g'
}

# Remove the temporary copy of the patterns file. Installed as an EXIT trap so
# the file is also removed when errexit aborts the script half way through.
function _cleanup_temp_file() {
    if [[ -n ${SPIDERS_PATTERN_FILE_TEMP:-} && -f $SPIDERS_PATTERN_FILE_TEMP ]]; then
        rm -f -- "$SPIDERS_PATTERN_FILE_TEMP"
    fi
}

# Pass the script's arguments to the parsing function unmodified. Quoting "$@"
# keeps arguments that contain spaces (for example a pattern file path) intact.
parse_options "$@"

# set up the defaults
envsetup

if [[ $DEBUG ]]; then
    echo "(DEBUG) Using spiders pattern file: $SPIDERS_PATTERN_FILE"
fi

# Make a temporary copy of the spider file so we can do pattern replacement
# inside it with sed rather than using stdout from sed and having to deal
# with spaces and newlines in bash.
SPIDERS_PATTERN_FILE_TEMP=$(mktemp)
trap _cleanup_temp_file EXIT

cp "$SPIDERS_PATTERN_FILE" "$SPIDERS_PATTERN_FILE_TEMP"

# Read list of spider user agents from the patterns file, converting PCRE-style
# regular expressions to a format that is easier to deal with in bash (spaces!)
# and that Solr supports (ie, patterns are anchored by ^ and $ implicitly, and
# some character types like \d are not supported).
#
# See: https://1opensourcelover.wordpress.com/2013/09/29/solr-regex-tutorial/
#
# For now this seems to be enough:
# - Replace \s with a literal space
# - Replace \d with [0-9] character class
# - Unescape dashes
# - Escape @
#
sed -i -e 's/\\s/ /g' -e 's/\\d/[0-9]/g' -e 's/\\-/-/g' -e 's/@/\\@/g' "$SPIDERS_PATTERN_FILE_TEMP"

# Start a tally of bot hits so we can report the total at the end
BOT_HITS=0

# The "|| [[ -n $spider ]]" part makes sure a last line that is not terminated
# by a newline is still processed.
while read -r spider || [[ -n $spider ]]; do
    # Save the original pattern so we can inform the user later
    original_spider=$spider

    # Skip '#' comment lines and anything that is empty once the anchors are
    # removed (blank lines, a lone ^ or $). Such a line would otherwise turn
    # into a bare wildcard that matches, and with -p purges, every user agent.
    bare_pattern=${spider#'^'}
    bare_pattern=${bare_pattern%'$'}
    if [[ -z $bare_pattern || $spider == '#'* ]]; then
        continue
    fi

    # Skip patterns that contain a plus or percent sign (+ or %) because they
    # are tricky to deal with in Solr. For some reason escaping them seems to
    # work for searches, but not for deletes. I don't have time to figure it
    # out.
    if [[ $spider =~ [%\+] ]]; then
        if [[ $DEBUG ]]; then
            echo "(DEBUG) Skipping spider: $original_spider"
        fi

        continue
    fi

    # Strip explicit anchors and add wildcards on the unanchored sides
    spider=$(_solr_regex_from_pattern "$spider")

    if [[ $DEBUG ]]; then
        echo "(DEBUG) Checking for hits from spider: $original_spider"
    fi

    # Check for hits from this spider in Solr and save results into a variable,
    # setting a custom curl output format so I can get the HTTP status code and
    # Solr response in one request, then tease them out later.
    solr_result=$(curl -s -w "http_code=%{http_code}" "$SOLR_URL/$STATISTICS_SHARD/select" -d "q=userAgent:/$spider/&rows=0")

    # curl appends "http_code=NNN" after the response body: everything after
    # the last "http_code=" is the status, everything before it is the body.
    http_code=${solr_result##*http_code=}
    solr_body=${solr_result%http_code=*}

    # Check the Solr HTTP response code and skip spider if not successful
    if [[ $http_code != 200 ]]; then
        if [[ $DEBUG ]]; then
            echo "(DEBUG) Solr query returned HTTP $http_code, skipping $original_spider."
        fi

        continue
    fi

    # lazy extraction of Solr numFound (relies on sed -E for extended regex)
    numFound=$(printf '%s\n' "$solr_body" | xmllint --format - | grep numFound | sed -E 's/^.*numFound="([0-9]+)".*$/\1/')

    # Only act on a real, positive number; an empty or unparsable value counts
    # as "no hits".
    if [[ $numFound =~ ^[0-9]+$ && $numFound -gt 0 ]]; then
        if [[ $PURGE_SPIDER_HITS ]]; then
            echo "Purging $numFound hits from $original_spider in $STATISTICS_SHARD"

            # Purge the hits and soft commit. The update handler is sent an XML
            # delete-by-query command, so the query text has to be XML-escaped.
            delete_query=$(_xml_escape "userAgent:/$spider/")
            curl -s "$SOLR_URL/$STATISTICS_SHARD/update?softCommit=true" -H "Content-Type: text/xml" --data-binary "<delete><query>$delete_query</query></delete>" > /dev/null 2>&1
        else
            echo "Found $numFound hits from $original_spider in $STATISTICS_SHARD"
        fi

        BOT_HITS=$((BOT_HITS+numFound))
    fi
done < "$SPIDERS_PATTERN_FILE_TEMP"

if [[ $BOT_HITS -gt 0 ]]; then
    echo

    if [[ $PURGE_SPIDER_HITS ]]; then
        echo "Total number of bot hits purged: $BOT_HITS"

        # Hard commit after we're done processing all spiders
        curl -s "$SOLR_URL/$STATISTICS_SHARD/update?commit=true" > /dev/null 2>&1
    else
        echo "Total number of hits from bots: $BOT_HITS"
    fi
fi

# The temporary patterns file is removed by the EXIT trap installed above.
# vim: set expandtab:ts=4:sw=4:bs=2