#! /bin/bash

## backup-intfic-from-writing_com.sh: Back up an interactive story from the
## website writing.com, and convert the individual story pages into a TWEE
## source file for compiling with the Linux command-line tool "twee2" or
## for importing into the Windows graphical story editor "Twine".
##
## Either tool will transform the entire story into a single large web page
## structured as a javascript application.
##
## Note that as of version 2.x, Twine dropped support for the TWEE format.
## Version 1.4.x of Twine supports import of TWEE files.
##
## Copyright (c) 2018 John M. Abreau
##
## This work is licensed under a Creative Commons Attribution 4.0
## International License. To view a copy of this license, visit
## http://creativecommons.org/licenses/by/4.0/ or send a letter to
## Creative Commons, PO Box 1866, Mountain View, CA 94042, USA.

## usage: backup-intfic-from-writing_com.sh STORY_BASE [ ... ]
##   where STORY_BASE is of the form
##
##     https://www.writing.com/main/interact/item_id/123-Story-title

## This script was written on a Linux system using the GNU bash shell,
## version 4.4.23(1)-release. It's possible it may fail on earlier versions.
##
## Script depends on a number of external tools:
##   * wget
##   * two python 2.x scripts that use the lxml package, both of which must
##     be executable files in the directory the script is started from:
##       + lxml-xpath.py
##       + lxml-chapters-to-twee.py
##   * a ~/.netrc file holding the login and password for the machine
##     www.writing.com

#################################
## Helper functions

## wget wrapper that sends the browser user agent and the saved login
## cookies with every request. Reads the globals UA and cookies, which are
## assigned further down, before the first call.
wwget() { wget -U "$UA" --load-cookies "$cookies" "$@" ; }

## Print all arguments as one banner line on stderr and exit with status 1.
## The arguments are joined with spaces; they are NOT a printf format.
die() { printf '\n\n**** %s ****\n\n\n' "$*" 1>&2 ; exit 1 ; }

## Percent-encode every byte of $1 and write the result to stdout.
## Encoding all bytes (not only the reserved ones) keeps the helper
## byte-safe and locale-independent; form decoders accept it.
urlencode() {
  printf '%s' "$1" | od -An -v -tx1 | tr -d ' \n' | sed 's/../%&/g'
}

## Sleep for a randomised number of seconds with a visible countdown.
##   $1 - nominal delay in seconds (default 10); the actual delay is
##        delay/2 plus a random value in [0, delay).
rsleep() {
  local delay=${1-10}
  # RANDOM % 0 would be a division by zero; nothing to wait for anyway.
  (( delay > 0 )) || return 0
  local rdelay=$((delay/2 + RANDOM % delay))
  local i
  for (( i = rdelay; i >= 1; i-- )); do
    printf '\r Sleeping %s ' "$i"
    sleep 1
  done
  # Overwrite the countdown with blanks so it does not linger on screen.
  printf '\r%20s\r' ''
}

## Log in to the writing.com website and store the cookies in $cookies.
## The login and password are taken from the first "machine www.writing.com"
## entry of ~/.netrc; the file is read token by token, so both the
## one-entry-per-line and the single-line layouts work.
login() {
  echo Logging in to writing.com website

  local creds user pass
  creds=$(awk -v host=www.writing.com '
    { for (i = 1; i <= NF; i++) tok[++n] = $i }
    END {
      for (i = 1; i <= n; i++) {
        key = tok[i]
        if (key == "machine") {
          wanted = (tok[++i] == host)
        } else if (key == "default") {
          wanted = 0
        } else if (key == "login" || key == "password" || key == "account") {
          # Always consume the value so it is never mistaken for a keyword.
          val = tok[++i]
          if (wanted && key == "login" && !have_user) {
            user = val; have_user = 1
          }
          if (wanted && key == "password" && !have_pass) {
            pass = val; have_pass = 1
          }
        }
      }
      print user
      print pass
    }' ~/.netrc)
  { IFS= read -r user; IFS= read -r pass; } <<< "$creds"

  if [[ -z "$user" || -z "$pass" ]]; then
    die "ERROR: No login/password for www.writing.com found in ~/.netrc"
  fi

  # Encode the credentials so characters such as & % + = cannot corrupt
  # the form body.
  local post_data
  post_data="login_username=$(urlencode "$user")"
  post_data+="&login_password=$(urlencode "$pass")&send_to=/"

  wget -q --save-cookies="$cookies" --post-data="$post_data" \
    https://www.writing.com/main/login.php -O /dev/null \
    || printf 'WARNING: login request to writing.com failed\n' 1>&2
}

## Validate that a url is for a writing.com interactive story; exits via
## die otherwise.
##   $1 - url to check (falls back to the global $uri when omitted)
validate_uri() {
  local candidate="${1-${uri-}}"
  case "$candidate" in
    *://www.writing.com/main/interact/item_id/*) ;;
    *://writing.com/main/interact/item_id/*) ;;
    *) die "ERROR: Not a valid writing.com interactive story: |$candidate|" ;;
  esac
}

## Download every chapter in a url list that is not yet present in the
## current directory, pausing a random interval after each download.
##   $1 - file with one chapter url per line
## Each chapter is saved by wget under the last path component of its url.
get_chapters() {
  local list="$1"
  local uri base ref
  local index=0
  local total
  total=$(( $(wc -l < "$list") ))

  # The list is read on fd 3 so commands in the loop body cannot eat it.
  while IFS= read -r uri <&3 || [[ -n "$uri" ]]; do
    index=$((index + 1))
    [[ -n "$uri" ]] || continue
    base="${uri##*/}"
    ref="${uri%/map/*}/action/outline"
    if [[ ! -f "$base" ]]; then
      echo === "$base [$index/$total]" ===
      wwget -q --referer="$ref" "$uri"
      rsleep 30
    fi
  done 3< "$list"

  printf '\n*** Completed a run ***\n\n'
}

## Print the names of downloaded chapter files (names starting with "1")
## in the current directory that contain the site's error page title.
##   $1 - grep pattern identifying a bad page (default: the
##        "temporarily unavailable" title)
list_bad_chapters() {
  local err_title="${1-Interactive Stories Are Temporarily Unavailable}"
  local -a files=()
  local f
  for f in 1*; do
    [[ -f "$f" ]] && files+=("$f")
  done
  # Without this guard grep would be handed the literal pattern "1*".
  (( ${#files[@]} > 0 )) || return 0
  grep -l -- "$err_title" "${files[@]}"
}

## Print the number of bad chapter files in the current directory.
count_bad_chapters() {
  list_bad_chapters "$@" | wc -l
}

## Delete bad and empty chapter files from the current directory so that
## the next get_chapters pass downloads them again.
reap_bad_chapters() {
  local f
  while IFS= read -r f; do
    rm -f -- "$f"
  done < <(list_bad_chapters)

  for f in 1*; do
    [[ -f "$f" ]] || continue
    [[ -s "$f" ]] || rm -f -- "$f"
  done
}

## Print the number of chapter files (names starting with "1") present in
## the current directory.
count_chapter_files() {
  local f n=0
  for f in 1*; do
    [[ -f "$f" ]] && n=$((n + 1))
  done
  printf '%s\n' "$n"
}

## main function to back up one story, called at end of script
##   $1 - story url (already checked by validate_uri)
## Creates a directory named after the story id holding BEGIN, OUTLINE,
## LIST and a chapters/ subdirectory with one file per chapter.
main() {
  local uri="$1"
  local base entry outline total curr

  # extract the story id for use as the base directory
  base="${uri#*://}"
  base="${base#www.}"
  base="${base#writing.com/main/interact/item_id/}"
  base="${base%%/*}"
  base="${base%%\?*}"
  base="${base%%&*}"
  [[ -n "$base" ]] || die "ERROR: Cannot extract a story id from |$uri|"

  mkdir -p -- "$base" || die "ERROR: Cannot create directory $base"
  pushd "$base" > /dev/null || die "ERROR: Cannot enter directory $base"

  # load beginning of story and outline of chapters
  echo Fetch top-level page as BEGIN
  wwget -q -O BEGIN "$uri"
  entry=$("$lxml_xpath" '//a/@href' BEGIN | sort -u | grep -m 1 '/map/1$')
  [[ -n "$entry" ]] || die "ERROR: No first-chapter link found for $base"
  wwget -q -O TEMP "$entry"
  outline=$("$lxml_xpath" '//a/@href' TEMP | sort -u | grep -m 1 '/outline$')
  [[ -n "$outline" ]] || die "ERROR: No outline link found for $base"
  echo Fetch page of all chapter urls as OUTLINE
  wwget -q -O OUTLINE "$outline"
  rm -f TEMP

  # get list of chapter urls from outline
  echo Extract list of chapter urls from OUTLINE as LIST
  "$lxml_xpath" '//a/@href' OUTLINE | grep /map/ | sort -u > LIST

  # create and enter chapters directory (-p so an interrupted backup
  # can be resumed)
  mkdir -p chapters || die "ERROR: Cannot create directory $base/chapters"
  cd chapters || die "ERROR: Cannot enter directory $base/chapters"

  total=$(( $(wc -l < ../LIST) ))
  get_chapters ../LIST

  # Repeat until the number of good chapter files equals the number of
  # urls in LIST. NOTE(review): this assumes every chapter file name
  # starts with "1" and that every url eventually downloads; a url that
  # never succeeds keeps this loop running -- confirm that is intended.
  while true ; do
    reap_bad_chapters
    curr=$(count_chapter_files)
    (( total == curr )) && break
    echo Found $((total - curr)) bad chapters, pausing before next pass...
    rsleep 300
    get_chapters ../LIST
  done

  cd ..
  popd > /dev/null
}

#################################
## Preflight checks

if touch "test.$$" ; then
  rm -f "test.$$"
else
  die ERROR: No write permission in current directory
fi

command -v wget > /dev/null || die ERROR: wget is not installed or not in PATH

if [[ -x lxml-xpath.py ]]; then
  lxml_xpath="$PWD/lxml-xpath.py"
else
  die ERROR: No executable file '"lxml-xpath.py"' in this directory
fi

if [[ ! -f ~/.netrc ]]; then
  die ERROR: No .netrc file in home directory with login credentials
fi

# NOTE(review): chap_to_twee is resolved here but nothing in this file
# invokes it -- confirm whether the TWEE conversion step is run elsewhere.
if [[ -x lxml-chapters-to-twee.py ]]; then
  chap_to_twee="$PWD/lxml-chapters-to-twee.py"
else
  die ERROR: No executable file '"lxml-chapters-to-twee.py"' in this directory
fi

#################################
## Option parsing

##do_login=false
do_login=true

while getopts l opt; do
  case "$opt" in
    l) do_login=true ;;
  esac
done
shift $((OPTIND - 1))

#################################
## Configuration

cache_dir=$HOME/.cache/writing.com
[[ -d "$cache_dir" ]] || mkdir -p "$cache_dir"
cookies="$cache_dir"/cookies

UA='Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:62.0) Gecko/20100101 Firefox/62.0'

#######################################################
###################### MAIN LOOP ######################
#######################################################

echo Validating that urls are interactive stories on writing.com
for uri in "$@"; do
  validate_uri "$uri"
done

if [[ "$do_login" == true ]]; then
  login
fi

storynum=0
for uri in "$@"; do
  storynum=$((storynum + 1))
  echo Fetching story $storynum of $#
  main "$uri"
done