#!/usr/bin/env bash
#
# Example of the REST calls to create an AU suitable for a wget crawl
# and invoke the wget crawler.  Invoke with -h for options.
#
# External tools used: curl, jq, bc, python3.

# Remove the temp file on every exit path.  The signal handlers exit so that
# an interrupt actually stops the script; the EXIT trap then runs cleanup.
# (No ERR trap: running cleanup on the first failing command would delete the
# temp file while it is still needed and disarm the remaining traps.)
trap cleanup EXIT
trap 'exit 130' SIGINT
trap 'exit 143' SIGTERM

script_dir=$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd -P)
script_name=$(basename "${BASH_SOURCE[0]}")

# ---- constants
readonly cfg_port=24620   # port used for the Config Service requests
readonly crawl_port=24640 # port used for the Crawler Service requests

# ---- defaults
host=localhost
user=""
handle=""
starturl=""
depth=99
wait_secs=""
cancelJobId=""
tmpfile=""
proxy_args=()   # extra curl arguments selected by --proxy
verbose_args=() # extra curl arguments selected by --verbose / --trace

# Print the help text and exit.
usage() {
  cat <<EOF
Usage: ${script_name} [--host <host>] --user <user:pass> --handle <handle> --start <url> [--proxy <host:port>] [--depth <depth>] [--wait <time>] [--verbose]
   or
       ${script_name} [--host <host>] --user <user:pass> --cancel <jobId>

Run (or cancel) a single wget crawl using the LOCKSS Crawler Service.

Available options:

-h, --help          Print this help and exit
--host <host>       Crawler service hostname (default localhost)
--handle <handle>   Named AU handle
--user <user:pass>  Crawler service REST request credentials
--start <url>       (List of) url(s) from which to start crawling
--proxy <host:port> Proxy through host:port
--depth <depth>     The depth to which to follow links (default 99)
--wait <time>       Delay between fetches: a plain integer is milliseconds,
                    or use a number with an s, m, h or d suffix.
                    (Default from system config, which defaults to 3 seconds.)
--cancel <jobId>    Cancel a running or queued crawl
--verbose           Invoke curl with --verbose
--trace             Invoke curl with --trace-ascii /tmp/foo
EOF
  exit
}

# Delete the temp file, if one was created.  Installed as the EXIT trap.
cleanup() {
  trap - SIGINT SIGTERM EXIT
  if [[ -n "${tmpfile}" && -f "${tmpfile}" ]]; then
    rm -f -- "${tmpfile}"
  fi
}

# Print a message on stderr.
msg() {
  echo >&2 -e "${1-}"
}

# Print a message on stderr and exit.
#   $1 - message
#   $2 - exit status (default 1)
die() {
  local msg=$1
  local code=${2-1} # default exit status 1
  msg "$msg"
  exit "$code"
}

# Convert a delay specification to seconds and print it on stdout.
#   $1 - either a plain integer (milliseconds), or a number followed by one
#        of the unit suffixes s, m, h, d (case-insensitive)
# Returns non-zero, with a message on stderr, when the input is malformed.
convert_to_fractional_seconds() {
  local input=$1
  local factor
  local int_re='^[0-9]+$'
  local num_re='^([0-9]+[.]?[0-9]*|[.][0-9]+)$'

  if [[ ${input} =~ ${int_re} ]]; then
    factor=0.001
  else
    case ${input} in
      *[Dd]) factor=86400 ;;
      *[Hh]) factor=3600 ;;
      *[Mm]) factor=60 ;;
      *[Ss]) factor=1 ;;
      *)
        echo "Invalid input:$input" >&2
        return 1
        ;;
    esac
    # Drop the unit suffix and make sure what remains is a number, so that
    # bc is never handed an empty or non-numeric operand.
    input=${input%?}
    if ! [[ ${input} =~ ${num_re} ]]; then
      echo "Invalid input:$1" >&2
      return 1
    fi
  fi
  echo "scale=2; ${input}*${factor}" | bc
}

# ---- command line
while [[ $# -gt 0 ]]; do
  case "$1" in
    -h | --help)
      usage
      ;;
    --host | --user | --proxy | --handle | --start | --depth | --wait | --cancel)
      [[ $# -ge 2 ]] || die "Option ${1} requires a value"
      case "$1" in
        --host) host=$2 ;;
        --user) user=$2 ;;
        --proxy) proxy_args=(--proxy "$2") ;;
        --handle) handle=$2 ;;
        --start) starturl=$2 ;;
        --depth) depth=$2 ;;
        # The conversion runs in a subshell, so its failure has to be
        # propagated explicitly.
        --wait) wait_secs=$(convert_to_fractional_seconds "$2") || exit 1 ;;
        --cancel) cancelJobId=$2 ;;
      esac
      shift 2
      ;;
    --verbose)
      verbose_args=(--verbose)
      shift
      ;;
    --trace)
      verbose_args=(--trace-ascii /tmp/foo)
      shift
      ;;
    *)
      # Unknown option or stray positional argument.  Without this arm a
      # positional argument would never be shifted and the loop would spin.
      msg "Unknown argument: ${1}"
      usage
      ;;
  esac
done

if [[ -z "${user}" || ( -z "${cancelJobId}" && ( -z "${handle}" || -z "${starturl}" ) ) ]]; then
  echo "user:pass, and either cancel jobid or handle, and start URL must be provided"
  usage
fi

# depth is spliced into the JSON request as a bare number.
[[ "${depth}" =~ ^-?[0-9]+$ ]] || die "Invalid depth: ${depth}"

# Common curl invocation, kept as an array so that credentials or proxy
# values containing spaces or glob characters reach curl intact.
curlcmd=(curl "${verbose_args[@]}" -s -S -u "${user}" "${proxy_args[@]}")

# Print the percent-encoded form of $1 on stdout.
# The value is passed as an argument rather than spliced into the Python
# source, so quotes and backslashes in it cannot alter the program.
url_encode() {
  python3 -c 'import sys, urllib.parse; print(urllib.parse.quote(sys.argv[1]))' "$1"
}

# Succeeds when the service on port $1 reports a startupStatus containing
# AUS_STARTED.  A failed request is reported on stderr and treated as
# "not ready".
is_svc_ready() {
  local port=$1
  local api_status
  if ! api_status=$("${curlcmd[@]}" -X GET --header 'Accept: application/json' \
    "http://${host}:${port}/status"); then
    msg "Couldn't get service status"
    return 1
  fi
  jq -r .startupStatus <<<"${api_status}" | grep -q AUS_STARTED
}

# Poll every 5 seconds until the service on port $1 is ready.
#   $1 - port
#   $2 - service name, used only in the progress message
wait_svc_ready() {
  local port=$1
  local svc_name=$2
  local announced=""
  while ! is_svc_ready "${port}"; do
    if [[ -z "${announced}" ]]; then
      echo -n "Waiting for ${svc_name} to be ready ..."
      announced=1
    fi
    sleep 5
  done
  if [[ -n "${announced}" ]]; then
    echo " done."
  fi
}

# Succeeds when the Config Service response for AU $1 contains "auId".
# A failed request is reported on stderr and treated as "not configured".
is_au_configured() {
  local auid=$1
  local encoded_auid cur_config
  encoded_auid=$(url_encode "${auid}")
  if ! cur_config=$("${curlcmd[@]}" -X GET --header 'Accept: application/json' \
    "http://${host}:${cfg_port}/aus/${encoded_auid}"); then
    msg "Couldn't check current AU config"
    return 1
  fi
  grep -q auId <<<"${cur_config}"
}

# ---- main
wait_svc_ready "${cfg_port}" "Config Service"
wait_svc_ready "${crawl_port}" "Crawler Service"

if [[ -n "${cancelJobId}" ]]; then
  echo "Cancelling job: ${cancelJobId}"
  tmpfile=$(mktemp /tmp/crawlreq.XXXXXX) || die "Couldn't create temp file"
  http_status=$("${curlcmd[@]}" -o "${tmpfile}" -w "%{http_code}" -X DELETE \
    --header "Accept: application/json" \
    "http://${host}:${crawl_port}/crawls/${cancelJobId}")
  echo "delete response: $(cat "${tmpfile}")"
  echo "http_status: ${http_status}"
  rm -f -- "${tmpfile}"
  exit 0
fi

# get AUID
auid=$("${curlcmd[@]}" -X POST \
  --header "Content-Type: application/x-www-form-urlencoded" \
  --header "Accept: application/json" \
  --data-urlencode "handle=${handle}" \
  "http://${host}:${cfg_port}/auids" | jq -r .auid)
if [[ -z "${auid}" || "${auid}" == "null" ]]; then
  die "Couldn't obtain an AUID for handle ${handle}"
fi
echo "auid: ${auid}"

if ! is_au_configured "${auid}"; then
  # configure AU
  config="{\"auConfig\": {\"handle\": \"${handle}\", \"features\": \"crawledAu\"}, \"auId\": \"${auid}\"}"
  # NOTE(review): the path ends in the literal string "AUID" rather than the
  # encoded AU id used by is_au_configured -- confirm this is what the
  # service expects.
  "${curlcmd[@]}" -X PUT --header "Content-Type: application/json" \
    --header "Accept: application/json" -d "${config}" \
    "http://${host}:${cfg_port}/aus/AUID"
  echo "AU created; requesting wget crawl ..."
  sleep 5
else
  echo "AU already exists; requesting wget crawl ..."
fi

# start crawl
tmpfile=$(mktemp /tmp/crawlreq.XXXXXX) || die "Couldn't create temp file"
crawl_request="{\"auId\": \"${auid}\", \"crawlDepth\":${depth}, \"crawlKind\": \"newContent\", \"crawlList\":[\"${starturl}\"], \"crawlerId\": \"wget\", \"extraCrawlerData\": {\"span-hosts\":false, \"page-requisites\": true, \"no-parent\": true, \"wait\":\"${wait_secs}\"}, \"forceCrawl\": false, \"priority\":0, \"refetchDepth\":0}"
http_status=$("${curlcmd[@]}" -o "${tmpfile}" -w "%{http_code}" -X POST \
  --header "Content-Type: application/json" \
  --header "Accept: application/json" \
  -d "${crawl_request}" \
  "http://${host}:${crawl_port}/jobs")
exitcode=$?
if [[ "${exitcode}" != 0 ]]; then
  echo "curl exited with status ${exitcode}"
  exit 1
fi

if [[ "${http_status}" == 2* ]]; then
  jobId=$(jq -r .jobId "${tmpfile}")
  echo "Crawl queued, jobId: ${jobId}"
else
  statusCode=$(jq -r .jobStatus.statusCode "${tmpfile}")
  status_msg=$(jq -r .jobStatus.msg "${tmpfile}")
  echo "Request was rejected with ${statusCode}: ${status_msg}"
  # NOTE(review): a rejected request still exits with status 0; kept as is
  # in case callers rely on it -- confirm whether this should be non-zero.
  exit 0
fi

rm -f -- "${tmpfile}"
exit 0