#!/bin/bash

# https://github.com/webrecorder/replayweb.page
# https://replayweb.page/

# Default destination directory (relative to the current directory).
OUT="warc"

#######################################
# Print a summary of every WARC archive found under a directory.
# For each *.warc / *.warc.gz file a heading "# <file name> <size>" is
# printed; if a sibling .cdx index exists, the number of indexed URLs per
# host and a TOTAL line follow.
# Arguments: $1 - directory to scan (recursively)
# Outputs:   the summary on stdout
#######################################
warc_log() {
  local dir=$1 wrc cdx sz
  # The -o alternatives must be grouped: without the parentheses -print0
  # binds only to the last -name test and plain *.warc files are skipped.
  find "$dir" \( -name "*.warc" -o -name "*.warc.gz" \) -print0 |
  while IFS= read -r -d '' wrc; do
    cdx=$(printf '%s\n' "$wrc" | sed -E 's|\.warc(\.gz)?$|.cdx|')
    # Human-readable size as shown by ls; awk copes with padded columns.
    sz=$(ls -lah -- "$wrc" | awk '{print $5}')
    echo "# $(basename -- "$wrc") $sz"
    if [ -f "$cdx" ]; then
      # Each matched URL becomes two lines (a ZZZZTOTAL marker and the host),
      # so a single "uniq -c" yields both per-host counts and the total.
      grep -ohE " https?://[^/]+" "$cdx" \
        | sed 's|.*//|ZZZZTOTAL\n|' \
        | sort \
        | uniq -c \
        | sed "s| ZZZZTOTAL| TOTAL ($sz)|"
    fi
  done
}

# Usage:
#   script --log [DIR]   summarise the archives already present in DIR
#   script [DIR]         use DIR as destination instead of the default
if [ "$1" == "--log" ]; then
  if [ -n "$2" ]; then
    OUT="$2"
  fi
  if [ ! -d "$OUT" ]; then
    echo "$OUT no es un directorio"
    exit 1
  fi
  warc_log "$OUT"
  exit 0
elif [ -n "$1" ]; then
  OUT="$1"
fi

# Validate the destination: it must either not exist yet or be an empty
# directory. It is created only when missing, because running mkdir on an
# already existing (empty) directory fails and would abort the script even
# though that case is accepted by the checks below.
if [ -e "$OUT" ]; then
  if [ ! -d "$OUT" ]; then
    echo "$OUT existe y no es un directorio"
    exit 1
  fi
  if [ -n "$(ls -A -- "$OUT")" ]; then
    echo "$OUT ha de estar vacio"
    exit 1
  fi
elif ! mkdir "$OUT"; then
  echo "No se ha podido crear el directorio $OUT"
  exit 1
fi

exe() {
  # Show the command about to run, prefixed with "$ " and with every
  # occurrence of the home directory abbreviated to "~", then run it.
  # The exit status is that of the executed command.
  CMD=$(sed "s|$HOME|~|g" <<<"\$ $*")
  printf '%s\n' "$CMD"
  "$@"
}
#######################################
# Run wget (through exe) with the fixed crawl/WARC options used by this
# script, followed by the caller's arguments.
# The wget log is written to "<warc-file>.log", where <warc-file> is the
# value of the caller's --warc-file=... argument, and the exit status of
# the command is appended to that log.
# Arguments: extra wget arguments; expected to include --warc-file=PATH
# Outputs:   the command line (printed by exe) on stdout
#######################################
wgt() {
  local arg
  # Fallback base name so the log still gets a sensible path if the caller
  # did not pass --warc-file=...
  local warc_base="wget"
  # Inspect the arguments one by one instead of joining them into a single
  # string, so paths containing spaces are kept intact. The last
  # --warc-file= occurrence wins.
  for arg in "$@"; do
    case "$arg" in
      --warc-file=*) warc_base=${arg#--warc-file=} ;;
    esac
  done
  exe wget --no-check-certificate --no-verbose \
    --execute robots=off \
    --delete-after --no-directories \
    --page-requisites \
    --mirror \
    --no-warc-keep-log \
    --output-file="$warc_base.log" \
    --warc-cdx \
    "$@"
  # $? here is still the status returned by exe (i.e. by wget).
  echo "Exit code: $?" >> "$warc_base.log"
}

# URL: plain-text list of links to archive.
# ROT: HTML index page used as the crawl root.
URL="https://raw.githubusercontent.com/15hack/web-backup/main/out/links.txt"
ROT="https://15hack.github.io/web-backup/out/links.html"
DIR="$(pwd)"

# Work inside a throw-away directory.
TMP="$(mktemp -d)"
if [ -z "$TMP" ]; then
  echo "No se ha podido crear el directorio temporal"
  exit 1
fi
echo "# WKS: $TMP"
echo "# OUT: $OUT"
if ! cd "$TMP"; then
  echo "No se ha podido entrar en $TMP"
  exit 1
fi

# Hide the destination folder in the logs: wget only ever sees the short
# relative symlink "out". A relative destination is resolved against the
# directory the script was started from; an absolute one is used as is
# (prefixing it with $DIR would produce a dangling link).
case "$OUT" in
  /*) ;;
  *) OUT="$DIR/$OUT" ;;
esac
if ! ln -s "$OUT" out; then
  echo "No se ha podido enlazar $OUT"
  exit 1
fi
OUT="out"

# Download the list of links into the workspace.
LNK="$TMP/links.txt"
if ! wget -q -O "$LNK" "$URL"; then
  echo "Error descargando $URL"
  exit 1
fi

#######################################
# Archive one target with wgt, choosing the mode from the argument:
#   - the root index page ($ROT): crawl it, following links to other hosts
#     but only to the domains that appear in the link list $LNK; the
#     archive is named "15M";
#   - an existing file: treat it as a list of URLs (--input-file); the
#     archive is named after the file without its last extension;
#   - anything else: treat it as a single URL; the archive is named after
#     its host.
# Globals:   ROT, LNK, OUT (read)
# Arguments: $1 - root URL, path to a URL list file, or a single URL
# Outputs:   a "# ..." heading on stdout, plus whatever wgt prints
# Returns:   the status of wgt
#######################################
do_wgt() {
  # Locals keep the script-level URL (and friends) from being overwritten.
  local doms infl name url dom
  if [ "$1" == "$ROT" ]; then
    # Third "/"-separated field of a URL is its host; build a unique,
    # comma-separated list for --domains.
    doms=$(cut -d'/' -f3 "$LNK" | sort -u | tr '\n' ',' | sed 's|,$||')
    echo "# $1"
    wgt --span-hosts --domains="$doms" --warc-file="$OUT/15M" "$ROT"
  elif [ -f "$1" ]; then
    infl=$1
    name=$(basename -- "$infl")
    name=${name%.*}
    # Heading shows the list file being processed (the original printed an
    # undefined variable here).
    echo "# $infl"
    wgt --input-file="$infl" --warc-file="$OUT/$name"
  else
    url=$1
    dom=$(printf '%s\n' "$url" | cut -d'/' -f3)
    echo "# $dom"
    wgt --warc-file="$OUT/$dom" "$url"
  fi
}

# Entry point: archive the root index page. Because the argument equals
# $ROT, do_wgt lets wget span hosts, limited to the domains found in the
# downloaded link list.
do_wgt "$ROT"

#grep -E "/mailman/listinfo$" "$LNK" > mail.txt
#grep -vE "/mailman/(pipermail|listinfo)" "$LNK" > webs.txt

#cat mail.txt | sort | uniq -c | sort -n -r | sed 's|^[ 0-9]*||' | awk -F'/' '!visited[$3]++' > out/15M.txt
#cat webs.txt | grep -E "^https?://[^/]*" -oh | sort | uniq -c | sort -n -r | sed 's|^[ 0-9]*||' | awk -F'/' '!visited[$3]++' >> out/15M.txt
#do_wgt out/15M.txt

#cat out/15M.txt | while read URL; do
#  do_wgt "$URL"
#done